1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************* 7 * 8 * File CUCDTST.C 9 * 10 * Modification History: 11 * Name Description 12 * Madhu Katragadda Ported for C API, added tests for string functions 13 ******************************************************************************** 14 */ 15 16 #include <string.h> 17 #include <math.h> 18 #include <stdlib.h> 19 20 #include "unicode/utypes.h" 21 #include "unicode/uchar.h" 22 #include "unicode/putil.h" 23 #include "unicode/ustring.h" 24 #include "unicode/uloc.h" 25 #include "unicode/unorm2.h" 26 27 #include "cintltst.h" 28 #include "putilimp.h" 29 #include "uparse.h" 30 #include "ucase.h" 31 #include "ubidi_props.h" 32 #include "uprops.h" 33 #include "uset_imp.h" 34 #include "usc_impl.h" 35 #include "udatamem.h" /* for testing ucase_openBinary() */ 36 #include "cucdapi.h" 37 38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 39 40 /* prototypes --------------------------------------------------------------- */ 41 42 static void TestUpperLower(void); 43 static void TestLetterNumber(void); 44 static void TestMisc(void); 45 static void TestPOSIX(void); 46 static void TestControlPrint(void); 47 static void TestIdentifier(void); 48 static void TestUnicodeData(void); 49 static void TestCodeUnit(void); 50 static void TestCodePoint(void); 51 static void TestCharLength(void); 52 static void TestCharNames(void); 53 static void TestMirroring(void); 54 static void TestUScriptRunAPI(void); 55 static void TestAdditionalProperties(void); 56 static void TestNumericProperties(void); 57 static void TestPropertyNames(void); 58 static void TestPropertyValues(void); 59 static void TestConsistency(void); 60 static void 
TestUCase(void); 61 static void TestUBiDiProps(void); 62 static void TestCaseFolding(void); 63 64 /* internal methods used */ 65 static int32_t MakeProp(char* str); 66 static int32_t MakeDir(char* str); 67 68 /* helpers ------------------------------------------------------------------ */ 69 70 static void 71 parseUCDFile(const char *filename, 72 char *fields[][2], int32_t fieldCount, 73 UParseLineFn *lineFn, void *context, 74 UErrorCode *pErrorCode) { 75 char path[256]; 76 char backupPath[256]; 77 78 if(U_FAILURE(*pErrorCode)) { 79 return; 80 } 81 82 /* Look inside ICU_DATA first */ 83 strcpy(path, u_getDataDirectory()); 84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING); 85 strcat(path, filename); 86 87 /* As a fallback, try to guess where the source data was located 88 * at the time ICU was built, and look there. 89 */ 90 strcpy(backupPath, ctest_dataSrcDir()); 91 strcat(backupPath, U_FILE_SEP_STRING); 92 strcat(backupPath, "unidata" U_FILE_SEP_STRING); 93 strcat(backupPath, filename); 94 95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode); 96 if(*pErrorCode==U_FILE_ACCESS_ERROR) { 97 *pErrorCode=U_ZERO_ERROR; 98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode); 99 } 100 if(U_FAILURE(*pErrorCode)) { 101 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode)); 102 } 103 } 104 105 /* test data ---------------------------------------------------------------- */ 106 107 static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD; 108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf"; 109 static const int32_t tagValues[] = 110 { 111 /* Mn */ U_NON_SPACING_MARK, 112 /* Mc */ U_COMBINING_SPACING_MARK, 113 /* Me */ U_ENCLOSING_MARK, 114 /* Nd */ U_DECIMAL_DIGIT_NUMBER, 115 /* Nl */ U_LETTER_NUMBER, 116 /* No */ U_OTHER_NUMBER, 117 /* Zs */ U_SPACE_SEPARATOR, 118 /* Zl */ U_LINE_SEPARATOR, 119 /* Zp */ 
U_PARAGRAPH_SEPARATOR, 120 /* Cc */ U_CONTROL_CHAR, 121 /* Cf */ U_FORMAT_CHAR, 122 /* Cs */ U_SURROGATE, 123 /* Co */ U_PRIVATE_USE_CHAR, 124 /* Cn */ U_UNASSIGNED, 125 /* Lu */ U_UPPERCASE_LETTER, 126 /* Ll */ U_LOWERCASE_LETTER, 127 /* Lt */ U_TITLECASE_LETTER, 128 /* Lm */ U_MODIFIER_LETTER, 129 /* Lo */ U_OTHER_LETTER, 130 /* Pc */ U_CONNECTOR_PUNCTUATION, 131 /* Pd */ U_DASH_PUNCTUATION, 132 /* Ps */ U_START_PUNCTUATION, 133 /* Pe */ U_END_PUNCTUATION, 134 /* Po */ U_OTHER_PUNCTUATION, 135 /* Sm */ U_MATH_SYMBOL, 136 /* Sc */ U_CURRENCY_SYMBOL, 137 /* Sk */ U_MODIFIER_SYMBOL, 138 /* So */ U_OTHER_SYMBOL, 139 /* Pi */ U_INITIAL_PUNCTUATION, 140 /* Pf */ U_FINAL_PUNCTUATION 141 }; 142 143 static const char dirStrings[][5] = { 144 "L", 145 "R", 146 "EN", 147 "ES", 148 "ET", 149 "AN", 150 "CS", 151 "B", 152 "S", 153 "WS", 154 "ON", 155 "LRE", 156 "LRO", 157 "AL", 158 "RLE", 159 "RLO", 160 "PDF", 161 "NSM", 162 "BN" 163 }; 164 165 void addUnicodeTest(TestNode** root); 166 167 void addUnicodeTest(TestNode** root) 168 { 169 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit"); 170 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint"); 171 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength"); 172 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues"); 173 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData"); 174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties"); 175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties"); 176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower"); 177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber"); 178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc"); 179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX"); 180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint"); 181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier"); 182 addTest(root, &TestCharNames, 
"tsutil/cucdtst/TestCharNames"); 183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring"); 184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI"); 185 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript"); 186 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions"); 187 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI"); 188 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames"); 189 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); 190 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency"); 191 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase"); 192 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps"); 193 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding"); 194 } 195 196 /*==================================================== */ 197 /* test u_toupper() and u_tolower() */ 198 /*==================================================== */ 199 static void TestUpperLower() 200 { 201 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000}; 202 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000}; 203 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21); 204 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 205 int32_t i; 206 207 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21); 208 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 209 210 /* 211 Checks LetterLike Symbols which were previously a source of confusion 212 [Bertrand A. D. 
02/04/98] 213 */ 214 for (i=0x2100;i<0x2138;i++) 215 { 216 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */ 217 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132) 218 { 219 if (i != (int)u_tolower(i)) /* itself */ 220 log_err("Failed case conversion with itself: U+%04x\n", i); 221 if (i != (int)u_toupper(i)) 222 log_err("Failed case conversion with itself: U+%04x\n", i); 223 } 224 } 225 226 for(i=0; i < u_strlen(upper); i++){ 227 if(u_tolower(upper[i]) != lower[i]){ 228 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i])); 229 } 230 } 231 232 log_verbose("testing upper lower\n"); 233 for (i = 0; i < 21; i++) { 234 235 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i])) 236 { 237 log_err("Failed isLowerCase test at %c\n", upperTest[i]); 238 } 239 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i])) 240 { 241 log_err("Failed isUpperCase test at %c\n", lowerTest[i]); 242 } 243 else if (upperTest[i] != u_tolower(lowerTest[i])) 244 { 245 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]); 246 } 247 else if (lowerTest[i] != u_toupper(upperTest[i])) 248 { 249 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]); 250 } 251 else if (upperTest[i] != u_tolower(upperTest[i])) 252 { 253 log_err("Failed case conversion with itself: %c\n", upperTest[i]); 254 } 255 else if (lowerTest[i] != u_toupper(lowerTest[i])) 256 { 257 log_err("Failed case conversion with itself: %c\n", lowerTest[i]); 258 } 259 } 260 log_verbose("done testing upper lower\n"); 261 262 log_verbose("testing u_istitle\n"); 263 { 264 static const UChar expected[] = { 265 0x1F88, 266 0x1F89, 267 0x1F8A, 268 0x1F8B, 269 0x1F8C, 270 0x1F8D, 271 0x1F8E, 272 0x1F8F, 273 0x1F88, 274 0x1F89, 275 0x1F8A, 276 0x1F8B, 277 0x1F8C, 278 0x1F8D, 279 0x1F8E, 280 0x1F8F, 281 0x1F98, 282 0x1F99, 283 0x1F9A, 284 0x1F9B, 285 0x1F9C, 286 0x1F9D, 287 0x1F9E, 288 0x1F9F, 289 0x1F98, 
290 0x1F99, 291 0x1F9A, 292 0x1F9B, 293 0x1F9C, 294 0x1F9D, 295 0x1F9E, 296 0x1F9F, 297 0x1FA8, 298 0x1FA9, 299 0x1FAA, 300 0x1FAB, 301 0x1FAC, 302 0x1FAD, 303 0x1FAE, 304 0x1FAF, 305 0x1FA8, 306 0x1FA9, 307 0x1FAA, 308 0x1FAB, 309 0x1FAC, 310 0x1FAD, 311 0x1FAE, 312 0x1FAF, 313 0x1FBC, 314 0x1FBC, 315 0x1FCC, 316 0x1FCC, 317 0x1FFC, 318 0x1FFC, 319 }; 320 int32_t num = sizeof(expected)/sizeof(expected[0]); 321 for(i=0; i<num; i++){ 322 if(!u_istitle(expected[i])){ 323 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]); 324 } 325 } 326 327 } 328 } 329 330 /* compare two sets and verify that their difference or intersection is empty */ 331 static UBool 332 showADiffB(const USet *a, const USet *b, 333 const char *a_name, const char *b_name, 334 UBool expect, UBool diffIsError) { 335 USet *aa; 336 int32_t i, start, end, length; 337 UErrorCode errorCode; 338 339 /* 340 * expect: 341 * TRUE -> a-b should be empty, that is, b should contain all of a 342 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa) 343 */ 344 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) { 345 return TRUE; 346 } 347 348 /* clone a to aa because a is const */ 349 aa=uset_open(1, 0); 350 if(aa==NULL) { 351 /* unusual problem - out of memory? 
*/ 352 return FALSE; 353 } 354 uset_addAll(aa, a); 355 356 /* compute the set in question */ 357 if(expect) { 358 /* a-b */ 359 uset_removeAll(aa, b); 360 } else { 361 /* a&b */ 362 uset_retainAll(aa, b); 363 } 364 365 /* aa is not empty because of the initial tests above; show its contents */ 366 errorCode=U_ZERO_ERROR; 367 i=0; 368 for(;;) { 369 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode); 370 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 371 break; /* done */ 372 } 373 if(U_FAILURE(errorCode)) { 374 log_err("error comparing %s with %s at difference item %d: %s\n", 375 a_name, b_name, i, u_errorName(errorCode)); 376 break; 377 } 378 if(length!=0) { 379 break; /* done with code points, got a string or -1 */ 380 } 381 382 if(diffIsError) { 383 if(expect) { 384 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 385 } else { 386 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 387 } 388 } else { 389 if(expect) { 390 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 391 } else { 392 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 393 } 394 } 395 396 ++i; 397 } 398 399 uset_close(aa); 400 return FALSE; 401 } 402 403 static UBool 404 showAMinusB(const USet *a, const USet *b, 405 const char *a_name, const char *b_name, 406 UBool diffIsError) { 407 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError); 408 } 409 410 static UBool 411 showAIntersectB(const USet *a, const USet *b, 412 const char *a_name, const char *b_name, 413 UBool diffIsError) { 414 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError); 415 } 416 417 static UBool 418 compareUSets(const USet *a, const USet *b, 419 const char *a_name, const char *b_name, 420 UBool diffIsError) { 421 /* 422 * Use an arithmetic & not a logical && so that both branches 423 * are always 
taken and all differences are shown. 424 */ 425 return 426 showAMinusB(a, b, a_name, b_name, diffIsError) & 427 showAMinusB(b, a, b_name, a_name, diffIsError); 428 } 429 430 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */ 431 static void TestLetterNumber() 432 { 433 UChar i = 0x0000; 434 435 log_verbose("Testing for isalpha\n"); 436 for (i = 0x0041; i < 0x005B; i++) { 437 if (!u_isalpha(i)) 438 { 439 log_err("Failed isLetter test at %.4X\n", i); 440 } 441 } 442 for (i = 0x0660; i < 0x066A; i++) { 443 if (u_isalpha(i)) 444 { 445 log_err("Failed isLetter test with numbers at %.4X\n", i); 446 } 447 } 448 449 log_verbose("Testing for isdigit\n"); 450 for (i = 0x0660; i < 0x066A; i++) { 451 if (!u_isdigit(i)) 452 { 453 log_verbose("Failed isNumber test at %.4X\n", i); 454 } 455 } 456 457 log_verbose("Testing for isalnum\n"); 458 for (i = 0x0041; i < 0x005B; i++) { 459 if (!u_isalnum(i)) 460 { 461 log_err("Failed isAlNum test at %.4X\n", i); 462 } 463 } 464 for (i = 0x0660; i < 0x066A; i++) { 465 if (!u_isalnum(i)) 466 { 467 log_err("Failed isAlNum test at %.4X\n", i); 468 } 469 } 470 471 { 472 /* 473 * The following checks work only starting from Unicode 4.0. 474 * Check the version number here. 475 */ 476 static UVersionInfo u401={ 4, 0, 1, 0 }; 477 UVersionInfo version; 478 u_getUnicodeVersion(version); 479 if(version[0]<4 || 0==memcmp(version, u401, 4)) { 480 return; 481 } 482 } 483 484 { 485 /* 486 * Sanity check: 487 * Verify that exactly the digit characters have decimal digit values. 488 * This assumption is used in the implementation of u_digit() 489 * (which checks nt=de) 490 * compared with the parallel java.lang.Character.digit() 491 * (which checks Nd). 492 * 493 * This was not true in Unicode 3.2 and earlier. 494 * Unicode 4.0 fixed discrepancies. 495 * Unicode 4.0.1 re-introduced problems in this area due to an 496 * unintentionally incomplete last-minute change. 
497 */ 498 U_STRING_DECL(digitsPattern, "[:Nd:]", 6); 499 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 500 501 USet *digits, *decimalValues; 502 UErrorCode errorCode; 503 504 U_STRING_INIT(digitsPattern, "[:Nd:]", 6); 505 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 506 errorCode=U_ZERO_ERROR; 507 digits=uset_openPattern(digitsPattern, 6, &errorCode); 508 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode); 509 510 if(U_SUCCESS(errorCode)) { 511 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE); 512 } 513 514 uset_close(digits); 515 uset_close(decimalValues); 516 } 517 } 518 519 static void testSampleCharProps(UBool propFn(UChar32), const char *propName, 520 const UChar32 *sampleChars, int32_t sampleCharsLength, 521 UBool expected) { 522 int32_t i; 523 for (i = 0; i < sampleCharsLength; ++i) { 524 UBool result = propFn(sampleChars[i]); 525 if (result != expected) { 526 log_err("error: character property function %s(U+%04x)=%d is wrong\n", 527 propName, sampleChars[i], result); 528 } 529 } 530 } 531 532 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */ 533 static void TestMisc() 534 { 535 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005}; 536 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74}; 537 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e}; 538 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd}; 539 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2}; 540 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B}; 541 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/ 542 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5}; 543 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE}; 544 static const UChar32 
sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c}; 545 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef}; 546 547 static const int32_t sampleDigitValues[] = {0, 2, 3, 5}; 548 549 uint32_t mask; 550 551 int32_t i; 552 char icuVersion[U_MAX_VERSION_STRING_LENGTH]; 553 UVersionInfo realVersion; 554 555 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH); 556 557 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 558 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 559 560 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 561 sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 562 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 563 sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 564 565 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 566 sampleWhiteSpaces, LENGTHOF(sampleWhiteSpaces), TRUE); 567 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 568 sampleNonWhiteSpaces, LENGTHOF(sampleNonWhiteSpaces), FALSE); 569 570 testSampleCharProps(u_isdefined, "u_isdefined", 571 sampleDefined, LENGTHOF(sampleDefined), TRUE); 572 testSampleCharProps(u_isdefined, "u_isdefined", 573 sampleUndefined, LENGTHOF(sampleUndefined), FALSE); 574 575 testSampleCharProps(u_isbase, "u_isbase", sampleBase, LENGTHOF(sampleBase), TRUE); 576 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, LENGTHOF(sampleNonBase), FALSE); 577 578 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, LENGTHOF(sampleDigits), TRUE); 579 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, LENGTHOF(sampleNonDigits), FALSE); 580 581 for (i = 0; i < LENGTHOF(sampleDigits); i++) { 582 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) { 583 log_err("error: u_charDigitValue(U+04x)=%d != %d\n", 584 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]); 585 } 586 } 587 588 /* Tests the ICU version #*/ 
589 u_getVersion(realVersion); 590 u_versionToString(realVersion, icuVersion); 591 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0) 592 { 593 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion); 594 } 595 #if defined(ICU_VERSION) 596 /* test only happens where we have configure.in with VERSION - sanity check. */ 597 if(strcmp(U_ICU_VERSION, ICU_VERSION)) 598 { 599 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION); 600 } 601 #endif 602 603 /* test U_GC_... */ 604 if( 605 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK || 606 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK || 607 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK || 608 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK || 609 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK || 610 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK 611 ) { 612 log_err("error: U_GET_GC_MASK does not work properly\n"); 613 } 614 615 mask=0; 616 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK; 617 618 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK; 619 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK; 620 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK; 621 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK; 622 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK; 623 624 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK; 625 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK; 626 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK; 627 628 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK; 629 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK; 630 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK; 631 632 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK; 633 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK; 634 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK; 635 636 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK; 637 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK; 638 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK; 639 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK; 640 641 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK; 642 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK; 643 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK; 644 
mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK; 645 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK; 646 647 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK; 648 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK; 649 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK; 650 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK; 651 652 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK; 653 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK; 654 655 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 656 log_err("error: problems with U_GC_XX_MASK constants\n"); 657 } 658 659 mask=0; 660 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK; 661 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK; 662 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK; 663 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK; 664 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK; 665 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK; 666 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK; 667 668 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 669 log_err("error: problems with U_GC_Y_MASK constants\n"); 670 } 671 { 672 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 }; 673 for(i=0; i<10; i++){ 674 if(digit[i]!=u_forDigit(i,10)){ 675 log_err("u_forDigit failed for %i. 
Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10)); 676 } 677 } 678 } 679 680 /* test u_digit() */ 681 { 682 static const struct { 683 UChar32 c; 684 int8_t radix, value; 685 } data[]={ 686 /* base 16 */ 687 { 0x0031, 16, 1 }, 688 { 0x0038, 16, 8 }, 689 { 0x0043, 16, 12 }, 690 { 0x0066, 16, 15 }, 691 { 0x00e4, 16, -1 }, 692 { 0x0662, 16, 2 }, 693 { 0x06f5, 16, 5 }, 694 { 0xff13, 16, 3 }, 695 { 0xff41, 16, 10 }, 696 697 /* base 8 */ 698 { 0x0031, 8, 1 }, 699 { 0x0038, 8, -1 }, 700 { 0x0043, 8, -1 }, 701 { 0x0066, 8, -1 }, 702 { 0x00e4, 8, -1 }, 703 { 0x0662, 8, 2 }, 704 { 0x06f5, 8, 5 }, 705 { 0xff13, 8, 3 }, 706 { 0xff41, 8, -1 }, 707 708 /* base 36 */ 709 { 0x5a, 36, 35 }, 710 { 0x7a, 36, 35 }, 711 { 0xff3a, 36, 35 }, 712 { 0xff5a, 36, 35 }, 713 714 /* wrong radix values */ 715 { 0x0031, 1, -1 }, 716 { 0xff3a, 37, -1 } 717 }; 718 719 for(i=0; i<LENGTHOF(data); ++i) { 720 if(u_digit(data[i].c, data[i].radix)!=data[i].value) { 721 log_err("u_digit(U+%04x, %d)=%d expected %d\n", 722 data[i].c, 723 data[i].radix, 724 u_digit(data[i].c, data[i].radix), 725 data[i].value); 726 } 727 } 728 } 729 } 730 731 /* test C/POSIX-style functions --------------------------------------------- */ 732 733 /* bit flags */ 734 #define ISAL 1 735 #define ISLO 2 736 #define ISUP 4 737 738 #define ISDI 8 739 #define ISXD 0x10 740 741 #define ISAN 0x20 742 743 #define ISPU 0x40 744 #define ISGR 0x80 745 #define ISPR 0x100 746 747 #define ISSP 0x200 748 #define ISBL 0x400 749 #define ISCN 0x800 750 751 /* C/POSIX-style functions, in the same order as the bit flags */ 752 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c); 753 754 static const struct { 755 IsPOSIXClass *fn; 756 const char *name; 757 } posixClasses[]={ 758 { u_isalpha, "isalpha" }, 759 { u_islower, "islower" }, 760 { u_isupper, "isupper" }, 761 { u_isdigit, "isdigit" }, 762 { u_isxdigit, "isxdigit" }, 763 { u_isalnum, "isalnum" }, 764 { u_ispunct, "ispunct" }, 765 { u_isgraph, "isgraph" }, 766 { u_isprint, "isprint" }, 
767 { u_isspace, "isspace" }, 768 { u_isblank, "isblank" }, 769 { u_iscntrl, "iscntrl" } 770 }; 771 772 static const struct { 773 UChar32 c; 774 uint32_t posixResults; 775 } posixData[]={ 776 { 0x0008, ISCN }, /* backspace */ 777 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */ 778 { 0x000a, ISSP| ISCN }, /* LF */ 779 { 0x000c, ISSP| ISCN }, /* FF */ 780 { 0x000d, ISSP| ISCN }, /* CR */ 781 { 0x0020, ISPR|ISSP|ISBL }, /* space */ 782 { 0x0021, ISPU|ISGR|ISPR }, /* ! */ 783 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */ 784 { 0x0040, ISPU|ISGR|ISPR }, /* @ */ 785 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */ 786 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */ 787 { 0x007b, ISPU|ISGR|ISPR }, /* { */ 788 { 0x0085, ISSP| ISCN }, /* NEL */ 789 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */ 790 { 0x00a4, ISGR|ISPR }, /* currency sign */ 791 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */ 792 { 0x0300, ISGR|ISPR }, /* combining grave */ 793 { 0x0600, ISCN }, /* arabic number sign */ 794 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */ 795 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */ 796 { 0x2002, ISPR|ISSP|ISBL }, /* en space */ 797 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */ 798 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */ 799 { 0x200b, ISCN }, /* ZWSP */ 800 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/ 801 { 0x200e, ISCN }, /* LRM */ 802 { 0x2028, ISPR|ISSP| ISCN }, /* LS */ 803 { 0x2029, ISPR|ISSP| ISCN }, /* PS */ 804 { 0x20ac, ISGR|ISPR }, /* Euro */ 805 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */ 806 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */ 807 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */ 808 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */ 809 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */ 810 }; 811 812 static void 813 TestPOSIX() { 814 uint32_t mask; 815 int32_t cl, i; 816 UBool expect; 817 818 mask=1; 819 for(cl=0; cl<12; ++cl) { 820 for(i=0; 
i<LENGTHOF(posixData); ++i) { 821 expect=(UBool)((posixData[i].posixResults&mask)!=0); 822 if(posixClasses[cl].fn(posixData[i].c)!=expect) { 823 log_err("u_%s(U+%04x)=%s is wrong\n", 824 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE"); 825 } 826 } 827 mask<<=1; 828 } 829 } 830 831 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */ 832 static void TestControlPrint() 833 { 834 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b}; 835 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2}; 836 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014}; 837 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b}; 838 UChar32 c; 839 840 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, LENGTHOF(sampleControl), TRUE); 841 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, LENGTHOF(sampleNonControl), FALSE); 842 843 testSampleCharProps(u_isprint, "u_isprint", 844 samplePrintable, LENGTHOF(samplePrintable), TRUE); 845 testSampleCharProps(u_isprint, "u_isprint", 846 sampleNonPrintable, LENGTHOF(sampleNonPrintable), FALSE); 847 848 /* test all ISO 8 controls */ 849 for(c=0; c<=0x9f; ++c) { 850 if(c==0x20) { 851 /* skip ASCII graphic characters and continue with DEL */ 852 c=0x7f; 853 } 854 if(!u_iscntrl(c)) { 855 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c); 856 } 857 if(!u_isISOControl(c)) { 858 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c); 859 } 860 if(u_isprint(c)) { 861 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c); 862 } 863 } 864 865 /* test all Latin-1 graphic characters */ 866 for(c=0x20; c<=0xff; ++c) { 867 if(c==0x7f) { 868 c=0xa0; 869 } else if(c==0xad) { 870 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */ 871 ++c; 872 } 873 if(!u_isprint(c)) { 874 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c); 875 } 876 } 877 } 878 879 /* u_isJavaIDStart, u_isJavaIDPart, 
u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/ 880 static void TestIdentifier() 881 { 882 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f}; 883 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082}; 884 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045}; 885 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020}; 886 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061}; 887 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019}; 888 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045}; 889 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020}; 890 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85}; 891 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061}; 892 893 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 894 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 895 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 896 sampleNonJavaIDStart, LENGTHOF(sampleNonJavaIDStart), FALSE); 897 898 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 899 sampleJavaIDPart, LENGTHOF(sampleJavaIDPart), TRUE); 900 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 901 sampleNonJavaIDPart, LENGTHOF(sampleNonJavaIDPart), FALSE); 902 903 /* IDPart should imply IDStart */ 904 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 905 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 906 907 testSampleCharProps(u_isIDStart, "u_isIDStart", 908 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 909 testSampleCharProps(u_isIDStart, "u_isIDStart", 910 sampleNonUnicodeIDStart, LENGTHOF(sampleNonUnicodeIDStart), FALSE); 911 912 testSampleCharProps(u_isIDPart, "u_isIDPart", 913 sampleUnicodeIDPart, LENGTHOF(sampleUnicodeIDPart), TRUE); 914 testSampleCharProps(u_isIDPart, "u_isIDPart", 915 sampleNonUnicodeIDPart, LENGTHOF(sampleNonUnicodeIDPart), FALSE); 916 917 /* IDPart should imply IDStart */ 918 testSampleCharProps(u_isIDPart, "u_isIDPart", 919 
sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 920 921 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 922 sampleIDIgnore, LENGTHOF(sampleIDIgnore), TRUE); 923 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 924 sampleNonIDIgnore, LENGTHOF(sampleNonIDIgnore), FALSE); 925 } 926 927 /* for each line of UnicodeData.txt, check some of the properties */ 928 /* 929 * ### TODO 930 * This test fails incorrectly if the First or Last code point of a repetitive area 931 * is overridden, which is allowed and is encouraged for the PUAs. 932 * Currently, this means that both area First/Last and override lines are 933 * tested against the properties from the API, 934 * and the area boundary will not match and cause an error. 935 * 936 * This function should detect area boundaries and skip them for the test of individual 937 * code points' properties. 938 * Then it should check that the areas contain all the same properties except where overridden. 939 * For this, it would have had to set a flag for which code points were listed explicitly. 
940 */ 941 static void U_CALLCONV 942 unicodeDataLineFn(void *context, 943 char *fields[][2], int32_t fieldCount, 944 UErrorCode *pErrorCode) 945 { 946 char buffer[100]; 947 char *end; 948 uint32_t value; 949 UChar32 c; 950 int32_t i; 951 int8_t type; 952 953 /* get the character code, field 0 */ 954 c=strtoul(fields[0][0], &end, 16); 955 if(end<=fields[0][0] || end!=fields[0][1]) { 956 log_err("error: syntax error in field 0 at %s\n", fields[0][0]); 957 return; 958 } 959 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) { 960 log_err("error in UnicodeData.txt: code point %lu out of range\n", c); 961 return; 962 } 963 964 /* get general category, field 2 */ 965 *fields[2][1]=0; 966 type = (int8_t)tagValues[MakeProp(fields[2][0])]; 967 if(u_charType(c)!=type) { 968 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type); 969 } 970 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 971 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 972 } 973 974 /* get canonical combining class, field 3 */ 975 value=strtoul(fields[3][0], &end, 10); 976 if(end<=fields[3][0] || end!=fields[3][1]) { 977 log_err("error: syntax error in field 3 at code 0x%lx\n", c); 978 return; 979 } 980 if(value>255) { 981 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value); 982 return; 983 } 984 #if !UCONFIG_NO_NORMALIZATION 985 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) { 986 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value); 987 } 988 #endif 989 990 /* get BiDi category, field 4 */ 991 *fields[4][1]=0; 992 i=MakeDir(fields[4][0]); 993 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) { 994 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]); 
995 } 996 997 /* get ISO Comment, field 11 */ 998 *fields[11][1]=0; 999 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode); 1000 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) { 1001 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n", 1002 c, u_errorName(*pErrorCode), 1003 U_FAILURE(*pErrorCode) ? buffer : "[error]", 1004 fields[11][0]); 1005 } 1006 1007 /* get uppercase mapping, field 12 */ 1008 if(fields[12][0]!=fields[12][1]) { 1009 value=strtoul(fields[12][0], &end, 16); 1010 if(end!=fields[12][1]) { 1011 log_err("error: syntax error in field 12 at code 0x%lx\n", c); 1012 return; 1013 } 1014 if((UChar32)value!=u_toupper(c)) { 1015 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value); 1016 } 1017 } else { 1018 /* no case mapping: the API must map the code point to itself */ 1019 if(c!=u_toupper(c)) { 1020 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c)); 1021 } 1022 } 1023 1024 /* get lowercase mapping, field 13 */ 1025 if(fields[13][0]!=fields[13][1]) { 1026 value=strtoul(fields[13][0], &end, 16); 1027 if(end!=fields[13][1]) { 1028 log_err("error: syntax error in field 13 at code 0x%lx\n", c); 1029 return; 1030 } 1031 if((UChar32)value!=u_tolower(c)) { 1032 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value); 1033 } 1034 } else { 1035 /* no case mapping: the API must map the code point to itself */ 1036 if(c!=u_tolower(c)) { 1037 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c)); 1038 } 1039 } 1040 1041 /* get titlecase mapping, field 14 */ 1042 if(fields[14][0]!=fields[14][1]) { 1043 value=strtoul(fields[14][0], &end, 16); 1044 if(end!=fields[14][1]) { 1045 log_err("error: syntax error in field 14 at code 0x%lx\n", c); 1046 return; 1047 } 1048 if((UChar32)value!=u_totitle(c)) { 1049 log_err("error: 
u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value); 1050 } 1051 } else { 1052 /* no case mapping: the API must map the code point to itself */ 1053 if(c!=u_totitle(c)) { 1054 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c)); 1055 } 1056 } 1057 } 1058 1059 static UBool U_CALLCONV 1060 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1061 static const UChar32 test[][2]={ 1062 {0x41, U_UPPERCASE_LETTER}, 1063 {0x308, U_NON_SPACING_MARK}, 1064 {0xfffe, U_GENERAL_OTHER_TYPES}, 1065 {0xe0041, U_FORMAT_CHAR}, 1066 {0xeffff, U_UNASSIGNED} 1067 }; 1068 1069 int32_t i, count; 1070 1071 if(0!=strcmp((const char *)context, "a1")) { 1072 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n"); 1073 return FALSE; 1074 } 1075 1076 count=LENGTHOF(test); 1077 for(i=0; i<count; ++i) { 1078 if(start<=test[i][0] && test[i][0]<limit) { 1079 if(type!=(UCharCategory)test[i][1]) { 1080 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n", 1081 start, limit, (long)type, test[i][0], test[i][1]); 1082 } 1083 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */ 1084 return i==(count-1) ? 
FALSE : TRUE; 1085 } 1086 } 1087 1088 if(start>test[count-1][0]) { 1089 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n", 1090 start, limit, (long)type); 1091 return FALSE; 1092 } 1093 1094 return TRUE; 1095 } 1096 1097 static UBool U_CALLCONV 1098 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1099 /* default Bidi classes for unassigned code points */ 1100 static const int32_t defaultBidi[][2]={ /* { limit, class } */ 1101 { 0x0590, U_LEFT_TO_RIGHT }, 1102 { 0x0600, U_RIGHT_TO_LEFT }, 1103 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, 1104 { 0x0900, U_RIGHT_TO_LEFT }, 1105 { 0xFB1D, U_LEFT_TO_RIGHT }, 1106 { 0xFB50, U_RIGHT_TO_LEFT }, 1107 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, 1108 { 0xFE70, U_LEFT_TO_RIGHT }, 1109 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, 1110 { 0x10800, U_LEFT_TO_RIGHT }, 1111 { 0x11000, U_RIGHT_TO_LEFT }, 1112 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */ 1113 { 0x1F000, U_RIGHT_TO_LEFT }, 1114 { 0x110000, U_LEFT_TO_RIGHT } 1115 }; 1116 1117 UChar32 c; 1118 int32_t i; 1119 UCharDirection shouldBeDir; 1120 1121 /* 1122 * LineBreak.txt specifies: 1123 * # - Assigned characters that are not listed explicitly are given the value 1124 * # "AL". 1125 * # - Unassigned characters are given the value "XX". 1126 * 1127 * PUA characters are listed explicitly with "XX". 1128 * Verify that no assigned character has "XX". 1129 */ 1130 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) { 1131 c=start; 1132 while(c<limit) { 1133 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) { 1134 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c); 1135 } 1136 ++c; 1137 } 1138 } 1139 1140 /* 1141 * Verify default Bidi classes. 1142 * For recent Unicode versions, see UCD.html. 1143 * 1144 * For older Unicode versions: 1145 * See table 3-7 "Bidirectional Character Types" in UAX #9. 
1146 * http://www.unicode.org/reports/tr9/ 1147 * 1148 * See also DerivedBidiClass.txt for Cn code points! 1149 * 1150 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html) 1151 * changed some default values. 1152 * In particular, non-characters and unassigned Default Ignorable Code Points 1153 * change from L to BN. 1154 * 1155 * UCD.html version 4.0.1 does not yet reflect these changes. 1156 */ 1157 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) { 1158 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */ 1159 c=start; 1160 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) { 1161 if((int32_t)c<defaultBidi[i][0]) { 1162 while(c<limit && (int32_t)c<defaultBidi[i][0]) { 1163 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { 1164 shouldBeDir=U_BOUNDARY_NEUTRAL; 1165 } else { 1166 shouldBeDir=(UCharDirection)defaultBidi[i][1]; 1167 } 1168 1169 if( u_charDirection(c)!=shouldBeDir || 1170 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir 1171 ) { 1172 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n", 1173 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]); 1174 } 1175 ++c; 1176 } 1177 } 1178 } 1179 } 1180 1181 return TRUE; 1182 } 1183 1184 /* tests for several properties */ 1185 static void TestUnicodeData() 1186 { 1187 UVersionInfo expectVersionArray; 1188 UVersionInfo versionArray; 1189 char *fields[15][2]; 1190 UErrorCode errorCode; 1191 UChar32 c; 1192 int8_t type; 1193 1194 u_versionFromString(expectVersionArray, U_UNICODE_VERSION); 1195 u_getUnicodeVersion(versionArray); 1196 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0) 1197 { 1198 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n", 1199 versionArray[0], versionArray[1], versionArray[2], versionArray[3]); 1200 } 1201 1202 #if defined(ICU_UNICODE_VERSION) 1203 /* test only happens where we have configure.in 
with UNICODE_VERSION - sanity check. */ 1204 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION)) 1205 { 1206 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n"); 1207 } 1208 #endif 1209 1210 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) { 1211 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041)); 1212 } 1213 1214 errorCode=U_ZERO_ERROR; 1215 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode); 1216 if(U_FAILURE(errorCode)) { 1217 return; /* if we couldn't parse UnicodeData.txt, we should return */ 1218 } 1219 1220 /* sanity check on repeated properties */ 1221 for(c=0xfffe; c<=0x10ffff;) { 1222 type=u_charType(c); 1223 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1224 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1225 } 1226 if(type!=U_UNASSIGNED) { 1227 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c)); 1228 } 1229 if((c&0xffff)==0xfffe) { 1230 ++c; 1231 } else { 1232 c+=0xffff; 1233 } 1234 } 1235 1236 /* test that PUA is not "unassigned" */ 1237 for(c=0xe000; c<=0x10fffd;) { 1238 type=u_charType(c); 1239 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1240 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1241 } 1242 if(type==U_UNASSIGNED) { 1243 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c); 1244 } else if(type!=U_PRIVATE_USE_CHAR) { 1245 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type); 1246 } 1247 if(c==0xf8ff) { 1248 c=0xf0000; 1249 } else if(c==0xffffd) { 1250 c=0x100000; 1251 } else { 1252 ++c; 1253 } 1254 } 1255 1256 /* test u_enumCharTypes() */ 
1257 u_enumCharTypes(enumTypeRange, "a1"); 1258 1259 /* check default properties */ 1260 u_enumCharTypes(enumDefaultsRange, NULL); 1261 } 1262 1263 static void TestCodeUnit(){ 1264 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0}; 1265 1266 int32_t i; 1267 1268 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){ 1269 UChar c=codeunit[i]; 1270 if(i<4){ 1271 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){ 1272 log_err("ERROR: U+%04x is a single", c); 1273 } 1274 1275 } 1276 if(i >= 4 && i< 8){ 1277 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){ 1278 log_err("ERROR: U+%04x is a first surrogate", c); 1279 } 1280 } 1281 if(i >= 8 && i< 12){ 1282 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){ 1283 log_err("ERROR: U+%04x is a second surrogate", c); 1284 } 1285 } 1286 } 1287 1288 } 1289 1290 static void TestCodePoint(){ 1291 const UChar32 codePoint[]={ 1292 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */ 1293 0xd800, 1294 0xdbff, 1295 0xdc00, 1296 0xdfff, 1297 0xdc04, 1298 0xd821, 1299 /*not a surrogate, valid, isUnicodeChar , not Error*/ 1300 0x20ac, 1301 0xd7ff, 1302 0xe000, 1303 0xe123, 1304 0x0061, 1305 0xe065, 1306 0x20402, 1307 0x24506, 1308 0x23456, 1309 0x20402, 1310 0x10402, 1311 0x23456, 1312 /*not a surrogate, not valid, isUnicodeChar, isError */ 1313 0x0015, 1314 0x009f, 1315 /*not a surrogate, not valid, not isUnicodeChar, isError */ 1316 0xffff, 1317 0xfffe, 1318 }; 1319 int32_t i; 1320 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){ 1321 UChar32 c=codePoint[i]; 1322 if(i<6){ 1323 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){ 1324 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1325 } 1326 if(UTF_IS_VALID(c)){ 1327 log_err("ERROR: isValid() failed for U+%04x\n", c); 1328 } 1329 
if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1330 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1331 } 1332 if(UTF_IS_ERROR(c)){ 1333 log_err("ERROR: isError() failed for U+%04x\n", c); 1334 } 1335 }else if(i >=6 && i<18){ 1336 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1337 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1338 } 1339 if(!UTF_IS_VALID(c)){ 1340 log_err("ERROR: isValid() failed for U+%04x\n", c); 1341 } 1342 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1343 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1344 } 1345 if(UTF_IS_ERROR(c)){ 1346 log_err("ERROR: isError() failed for U+%04x\n", c); 1347 } 1348 }else if(i >=18 && i<20){ 1349 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1350 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1351 } 1352 if(UTF_IS_VALID(c)){ 1353 log_err("ERROR: isValid() failed for U+%04x\n", c); 1354 } 1355 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1356 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1357 } 1358 if(!UTF_IS_ERROR(c)){ 1359 log_err("ERROR: isError() failed for U+%04x\n", c); 1360 } 1361 } 1362 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){ 1363 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1364 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1365 } 1366 if(UTF_IS_VALID(c)){ 1367 log_err("ERROR: isValid() failed for U+%04x\n", c); 1368 } 1369 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1370 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1371 } 1372 if(!UTF_IS_ERROR(c)){ 1373 log_err("ERROR: isError() failed for U+%04x\n", c); 1374 } 1375 } 1376 } 1377 1378 if( 1379 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) || 1380 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) || 1381 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) || 1382 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || 
U_IS_BMP(0x7fffffff) 1383 ) { 1384 log_err("error with U_IS_BMP()\n"); 1385 } 1386 1387 if( 1388 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) || 1389 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) || 1390 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) || 1391 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff) 1392 ) { 1393 log_err("error with U_IS_SUPPLEMENTARY()\n"); 1394 } 1395 } 1396 1397 static void TestCharLength() 1398 { 1399 const int32_t codepoint[]={ 1400 1, 0x0061, 1401 1, 0xe065, 1402 1, 0x20ac, 1403 2, 0x20402, 1404 2, 0x23456, 1405 2, 0x24506, 1406 2, 0x20402, 1407 2, 0x10402, 1408 1, 0xd7ff, 1409 1, 0xe000 1410 }; 1411 1412 int32_t i; 1413 UBool multiple; 1414 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){ 1415 UChar32 c=codepoint[i+1]; 1416 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){ 1417 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c)); 1418 } 1419 multiple=(UBool)(codepoint[i] == 1 ? 
FALSE : TRUE); 1420 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){ 1421 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c); 1422 } 1423 } 1424 } 1425 1426 /*internal functions ----*/ 1427 static int32_t MakeProp(char* str) 1428 { 1429 int32_t result = 0; 1430 char* matchPosition =0; 1431 1432 matchPosition = strstr(tagStrings, str); 1433 if (matchPosition == 0) 1434 { 1435 log_err("unrecognized type letter "); 1436 log_err(str); 1437 } 1438 else 1439 result = (int32_t)((matchPosition - tagStrings) / 2); 1440 return result; 1441 } 1442 1443 static int32_t MakeDir(char* str) 1444 { 1445 int32_t pos = 0; 1446 for (pos = 0; pos < 19; pos++) { 1447 if (strcmp(str, dirStrings[pos]) == 0) { 1448 return pos; 1449 } 1450 } 1451 return -1; 1452 } 1453 1454 /* test u_charName() -------------------------------------------------------- */ 1455 1456 static const struct { 1457 uint32_t code; 1458 const char *name, *oldName, *extName, *alias; 1459 } names[]={ 1460 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, 1461 {0x01a2, "LATIN CAPITAL LETTER OI", 1462 "LATIN CAPITAL LETTER O I", 1463 "LATIN CAPITAL LETTER OI", 1464 "LATIN CAPITAL LETTER GHA"}, 1465 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", 1466 "LATIN SMALL LETTER DOTLESS J BAR HOOK", 1467 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" }, 1468 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "", 1469 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", 1470 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"}, 1471 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" }, 1472 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" }, 1473 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" }, 1474 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" }, 1475 {0xd800, "", "", "<lead surrogate-D800>" }, 1476 {0xdc00, "", "", "<trail surrogate-DC00>" }, 1477 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "FULLWIDTH OPENING PARENTHESIS", "FULLWIDTH LEFT PARENTHESIS" }, 
1478 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" }, 1479 {0xffff, "", "", "<noncharacter-FFFF>" }, 1480 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "", 1481 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 1482 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"}, 1483 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" } 1484 }; 1485 1486 static UBool 1487 enumCharNamesFn(void *context, 1488 UChar32 code, UCharNameChoice nameChoice, 1489 const char *name, int32_t length) { 1490 int32_t *pCount=(int32_t *)context; 1491 const char *expected; 1492 int i; 1493 1494 if(length<=0 || length!=(int32_t)strlen(name)) { 1495 /* should not be called with an empty string or invalid length */ 1496 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length); 1497 return TRUE; 1498 } 1499 1500 ++*pCount; 1501 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) { 1502 if(code==(UChar32)names[i].code) { 1503 switch (nameChoice) { 1504 case U_EXTENDED_CHAR_NAME: 1505 if(0!=strcmp(name, names[i].extName)) { 1506 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName); 1507 } 1508 break; 1509 case U_UNICODE_CHAR_NAME: 1510 if(0!=strcmp(name, names[i].name)) { 1511 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name); 1512 } 1513 break; 1514 case U_UNICODE_10_CHAR_NAME: 1515 expected=names[i].oldName; 1516 if(expected[0]==0 || 0!=strcmp(name, expected)) { 1517 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected); 1518 } 1519 break; 1520 case U_CHAR_NAME_ALIAS: 1521 expected=names[i].alias; 1522 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) { 1523 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected); 1524 } 1525 break; 1526 case U_CHAR_NAME_CHOICE_COUNT: 1527 break; 1528 } 1529 break; 1530 } 1531 } 1532 return TRUE; 1533 } 1534 1535 struct enumExtCharNamesContext { 1536 uint32_t 
length; 1537 int32_t last; 1538 }; 1539 1540 static UBool 1541 enumExtCharNamesFn(void *context, 1542 UChar32 code, UCharNameChoice nameChoice, 1543 const char *name, int32_t length) { 1544 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context; 1545 1546 if (ecncp->last != (int32_t) code - 1) { 1547 if (ecncp->last < 0) { 1548 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1); 1549 } else { 1550 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code); 1551 } 1552 } 1553 ecncp->last = (int32_t) code; 1554 1555 if (!*name) { 1556 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code); 1557 } 1558 1559 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length); 1560 } 1561 1562 /** 1563 * This can be made more efficient by moving it into putil.c and having 1564 * it directly access the ebcdic translation tables. 1565 * TODO: If we get this method in putil.c, then delete it from here. 
1566 */ 1567 static UChar 1568 u_charToUChar(char c) { 1569 UChar uc; 1570 u_charsToUChars(&c, &uc, 1); 1571 return uc; 1572 } 1573 1574 static void 1575 TestCharNames() { 1576 static char name[80]; 1577 UErrorCode errorCode=U_ZERO_ERROR; 1578 struct enumExtCharNamesContext extContext; 1579 const char *expected; 1580 int32_t length; 1581 UChar32 c; 1582 int32_t i; 1583 1584 log_verbose("Testing uprv_getMaxCharNameLength()\n"); 1585 length=uprv_getMaxCharNameLength(); 1586 if(length==0) { 1587 /* no names data available */ 1588 return; 1589 } 1590 if(length<83) { /* Unicode 3.2 max char name length */ 1591 log_err("uprv_getMaxCharNameLength()=%d is too short"); 1592 } 1593 /* ### TODO same tests for max ISO comment length as for max name length */ 1594 1595 log_verbose("Testing u_charName()\n"); 1596 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) { 1597 /* modern Unicode character name */ 1598 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode); 1599 if(U_FAILURE(errorCode)) { 1600 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode)); 1601 return; 1602 } 1603 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) { 1604 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name); 1605 } 1606 1607 /* find the modern name */ 1608 if (*names[i].name) { 1609 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode); 1610 if(U_FAILURE(errorCode)) { 1611 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode)); 1612 return; 1613 } 1614 if(c!=(UChar32)names[i].code) { 1615 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code); 1616 } 1617 } 1618 1619 /* Unicode 1.0 character name */ 1620 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode); 1621 if(U_FAILURE(errorCode)) { 1622 log_err("u_charName(0x%lx - 1.0) error %s\n", 
names[i].code, u_errorName(errorCode)); 1623 return; 1624 } 1625 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) { 1626 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName); 1627 } 1628 1629 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */ 1630 if(names[i].oldName[0]!=0 /* && length>0 */) { 1631 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode); 1632 if(U_FAILURE(errorCode)) { 1633 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode)); 1634 return; 1635 } 1636 if(c!=(UChar32)names[i].code) { 1637 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code); 1638 } 1639 } 1640 1641 /* Unicode character name alias */ 1642 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode); 1643 if(U_FAILURE(errorCode)) { 1644 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode)); 1645 return; 1646 } 1647 expected=names[i].alias; 1648 if(expected==NULL) { 1649 expected=""; 1650 } 1651 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) { 1652 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n", 1653 names[i].code, name, length, expected); 1654 } 1655 1656 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */ 1657 if(expected[0]!=0 /* && length>0 */) { 1658 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode); 1659 if(U_FAILURE(errorCode)) { 1660 log_err("u_charFromName(%s - alias) error %s\n", 1661 expected, u_errorName(errorCode)); 1662 return; 1663 } 1664 if(c!=(UChar32)names[i].code) { 1665 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n", 1666 expected, c, names[i].code); 1667 } 1668 } 1669 } 1670 1671 /* test u_enumCharNames() 
*/ 1672 length=0; 1673 errorCode=U_ZERO_ERROR; 1674 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode); 1675 if(U_FAILURE(errorCode) || length<94140) { 1676 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length); 1677 } 1678 1679 extContext.length = 0; 1680 extContext.last = -1; 1681 errorCode=U_ZERO_ERROR; 1682 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode); 1683 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) { 1684 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length); 1685 } 1686 1687 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */ 1688 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) { 1689 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode)); 1690 } 1691 1692 /* Test getCharNameCharacters */ 1693 if(!getTestOption(QUICK_OPTION)) { 1694 enum { BUFSIZE = 256 }; 1695 UErrorCode ec = U_ZERO_ERROR; 1696 char buf[BUFSIZE]; 1697 int32_t maxLength; 1698 UChar32 cp; 1699 UChar pat[BUFSIZE], dumbPat[BUFSIZE]; 1700 int32_t l1, l2; 1701 UBool map[256]; 1702 UBool ok; 1703 1704 USet* set = uset_open(1, 0); /* empty set */ 1705 USet* dumb = uset_open(1, 0); /* empty set */ 1706 1707 /* 1708 * uprv_getCharNameCharacters() will likely return more lowercase 1709 * letters than actual character names contain because 1710 * it includes all the characters in lowercased names of 1711 * general categories, for the full possible set of extended names. 
1712 */ 1713 { 1714 USetAdder sa={ 1715 NULL, 1716 uset_add, 1717 uset_addRange, 1718 uset_addString, 1719 NULL /* don't need remove() */ 1720 }; 1721 sa.set=set; 1722 uprv_getCharNameCharacters(&sa); 1723 } 1724 1725 /* build set the dumb (but sure-fire) way */ 1726 for (i=0; i<256; ++i) { 1727 map[i] = FALSE; 1728 } 1729 1730 maxLength=0; 1731 for (cp=0; cp<0x110000; ++cp) { 1732 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME, 1733 buf, BUFSIZE, &ec); 1734 if (U_FAILURE(ec)) { 1735 log_err("FAIL: u_charName failed when it shouldn't\n"); 1736 uset_close(set); 1737 uset_close(dumb); 1738 return; 1739 } 1740 if(len>maxLength) { 1741 maxLength=len; 1742 } 1743 1744 for (i=0; i<len; ++i) { 1745 if (!map[(uint8_t) buf[i]]) { 1746 uset_add(dumb, (UChar32)u_charToUChar(buf[i])); 1747 map[(uint8_t) buf[i]] = TRUE; 1748 } 1749 } 1750 1751 /* test for leading/trailing whitespace */ 1752 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') { 1753 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp); 1754 } 1755 } 1756 1757 if(map[(uint8_t)'\t']) { 1758 log_err("u_charName() returned a name with a TAB for some code point\n", cp); 1759 } 1760 1761 length=uprv_getMaxCharNameLength(); 1762 if(length!=maxLength) { 1763 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n", 1764 length, maxLength); 1765 } 1766 1767 /* compare the sets. Where is my uset_equals?!! 
*/ 1768 ok=TRUE; 1769 for(i=0; i<256; ++i) { 1770 if(uset_contains(set, i)!=uset_contains(dumb, i)) { 1771 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) { 1772 /* ignore lowercase a-z that are in set but not in dumb */ 1773 ok=TRUE; 1774 } else { 1775 ok=FALSE; 1776 break; 1777 } 1778 } 1779 } 1780 1781 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec); 1782 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec); 1783 if (U_FAILURE(ec)) { 1784 log_err("FAIL: uset_toPattern failed when it shouldn't\n"); 1785 uset_close(set); 1786 uset_close(dumb); 1787 return; 1788 } 1789 1790 if (l1 >= BUFSIZE) { 1791 l1 = BUFSIZE-1; 1792 pat[l1] = 0; 1793 } 1794 if (l2 >= BUFSIZE) { 1795 l2 = BUFSIZE-1; 1796 dumbPat[l2] = 0; 1797 } 1798 1799 if (!ok) { 1800 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n", 1801 aescstrdup(pat, l1), aescstrdup(dumbPat, l2)); 1802 } else if(getTestOption(VERBOSITY_OPTION)) { 1803 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1)); 1804 } 1805 1806 uset_close(set); 1807 uset_close(dumb); 1808 } 1809 1810 /* ### TODO: test error cases and other interesting things */ 1811 } 1812 1813 /* test u_isMirrored() and u_charMirror() ----------------------------------- */ 1814 1815 static void 1816 TestMirroring() { 1817 USet *set; 1818 UErrorCode errorCode; 1819 1820 UChar32 start, end, c2, c3; 1821 int32_t i; 1822 1823 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1824 1825 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1826 1827 log_verbose("Testing u_isMirrored()\n"); 1828 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) && 1829 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400) 1830 ) 1831 ) { 1832 log_err("u_isMirrored() does not work correctly\n"); 1833 } 1834 1835 log_verbose("Testing u_charMirror()\n"); 1836 
if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 && 1837 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */ 1838 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab && 1839 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 1840 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d 1841 ) 1842 ) { 1843 log_err("u_charMirror() does not work correctly\n"); 1844 } 1845 1846 /* verify that Bidi_Mirroring_Glyph roundtrips */ 1847 errorCode=U_ZERO_ERROR; 1848 set=uset_openPattern(mirroredPattern, 17, &errorCode); 1849 1850 if (U_FAILURE(errorCode)) { 1851 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n"); 1852 } else { 1853 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) { 1854 do { 1855 c2=u_charMirror(start); 1856 c3=u_charMirror(c2); 1857 if(c3!=start) { 1858 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3); 1859 } 1860 } while(++start<=end); 1861 } 1862 } 1863 1864 uset_close(set); 1865 } 1866 1867 1868 struct RunTestData 1869 { 1870 const char *runText; 1871 UScriptCode runCode; 1872 }; 1873 1874 typedef struct RunTestData RunTestData; 1875 1876 static void 1877 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns, 1878 const char *prefix) 1879 { 1880 int32_t run, runStart, runLimit; 1881 UScriptCode runCode; 1882 1883 /* iterate over all the runs */ 1884 run = 0; 1885 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) { 1886 if (runStart != runStarts[run]) { 1887 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n", 1888 prefix, run, runStarts[run], runStart); 1889 } 1890 1891 if 
(runLimit != runStarts[run + 1]) { 1892 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n", 1893 prefix, run, runStarts[run + 1], runLimit); 1894 } 1895 1896 if (runCode != testData[run].runCode) { 1897 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n", 1898 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode)); 1899 } 1900 1901 run += 1; 1902 1903 /* stop when we've seen all the runs we expect to see */ 1904 if (run >= nRuns) { 1905 break; 1906 } 1907 } 1908 1909 /* Complain if we didn't see then number of runs we expected */ 1910 if (run != nRuns) { 1911 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns); 1912 } 1913 } 1914 1915 static void 1916 TestUScriptRunAPI() 1917 { 1918 static const RunTestData testData1[] = { 1919 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI}, 1920 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC}, 1921 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC}, 1922 {"English (", USCRIPT_LATIN}, 1923 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI}, 1924 {") ", USCRIPT_LATIN}, 1925 {"\\u6F22\\u5B75", USCRIPT_HAN}, 1926 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA}, 1927 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA}, 1928 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET} 1929 }; 1930 1931 static const RunTestData testData2[] = { 1932 {"((((((((((abc))))))))))", USCRIPT_LATIN} 1933 }; 1934 1935 static const struct { 1936 const RunTestData *testData; 1937 int32_t nRuns; 1938 } testDataEntries[] = { 1939 {testData1, LENGTHOF(testData1)}, 1940 {testData2, LENGTHOF(testData2)} 1941 }; 1942 1943 static const int32_t nTestEntries = LENGTHOF(testDataEntries); 1944 int32_t testEntry; 1945 1946 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) { 1947 UChar testString[1024]; 1948 int32_t runStarts[256]; 1949 int32_t 
nTestRuns = testDataEntries[testEntry].nRuns; 1950 const RunTestData *testData = testDataEntries[testEntry].testData; 1951 1952 int32_t run, stringLimit; 1953 UScriptRun *scriptRun = NULL; 1954 UErrorCode err; 1955 1956 /* 1957 * Fill in the test string and the runStarts array. 1958 */ 1959 stringLimit = 0; 1960 for (run = 0; run < nTestRuns; run += 1) { 1961 runStarts[run] = stringLimit; 1962 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit); 1963 /*stringLimit -= 1;*/ 1964 } 1965 1966 /* The limit of the last run */ 1967 runStarts[nTestRuns] = stringLimit; 1968 1969 /* 1970 * Make sure that calling uscript_OpenRun with a NULL text pointer 1971 * and a non-zero text length returns the correct error. 1972 */ 1973 err = U_ZERO_ERROR; 1974 scriptRun = uscript_openRun(NULL, stringLimit, &err); 1975 1976 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 1977 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 1978 } 1979 1980 if (scriptRun != NULL) { 1981 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n"); 1982 uscript_closeRun(scriptRun); 1983 } 1984 1985 /* 1986 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 1987 * and a zero text length returns the correct error. 1988 */ 1989 err = U_ZERO_ERROR; 1990 scriptRun = uscript_openRun(testString, 0, &err); 1991 1992 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 1993 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 1994 } 1995 1996 if (scriptRun != NULL) { 1997 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n"); 1998 uscript_closeRun(scriptRun); 1999 } 2000 2001 /* 2002 * Make sure that calling uscript_openRun with a NULL text pointer 2003 * and a zero text length doesn't return an error. 
2004 */ 2005 err = U_ZERO_ERROR; 2006 scriptRun = uscript_openRun(NULL, 0, &err); 2007 2008 if (U_FAILURE(err)) { 2009 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err)); 2010 } 2011 2012 /* Make sure that the empty iterator doesn't find any runs */ 2013 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) { 2014 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n"); 2015 } 2016 2017 /* 2018 * Make sure that calling uscript_setRunText with a NULL text pointer 2019 * and a non-zero text length returns the correct error. 2020 */ 2021 err = U_ZERO_ERROR; 2022 uscript_setRunText(scriptRun, NULL, stringLimit, &err); 2023 2024 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2025 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2026 } 2027 2028 /* 2029 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2030 * and a zero text length returns the correct error. 2031 */ 2032 err = U_ZERO_ERROR; 2033 uscript_setRunText(scriptRun, testString, 0, &err); 2034 2035 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2036 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2037 } 2038 2039 /* 2040 * Now call uscript_setRunText on the empty iterator 2041 * and make sure that it works. 
2042 */ 2043 err = U_ZERO_ERROR; 2044 uscript_setRunText(scriptRun, testString, stringLimit, &err); 2045 2046 if (U_FAILURE(err)) { 2047 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err)); 2048 } else { 2049 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText"); 2050 } 2051 2052 uscript_closeRun(scriptRun); 2053 2054 /* 2055 * Now open an interator over the testString 2056 * using uscript_openRun and make sure that it works 2057 */ 2058 scriptRun = uscript_openRun(testString, stringLimit, &err); 2059 2060 if (U_FAILURE(err)) { 2061 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err)); 2062 } else { 2063 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun"); 2064 } 2065 2066 /* Now reset the iterator, and make sure 2067 * that it still works. 2068 */ 2069 uscript_resetRun(scriptRun); 2070 2071 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun"); 2072 2073 /* Close the iterator */ 2074 uscript_closeRun(scriptRun); 2075 } 2076 } 2077 2078 /* test additional, non-core properties */ 2079 static void 2080 TestAdditionalProperties() { 2081 /* test data for u_charAge() */ 2082 static const struct { 2083 UChar32 c; 2084 UVersionInfo version; 2085 } charAges[]={ 2086 {0x41, { 1, 1, 0, 0 }}, 2087 {0xffff, { 1, 1, 0, 0 }}, 2088 {0x20ab, { 2, 0, 0, 0 }}, 2089 {0x2fffe, { 2, 0, 0, 0 }}, 2090 {0x20ac, { 2, 1, 0, 0 }}, 2091 {0xfb1d, { 3, 0, 0, 0 }}, 2092 {0x3f4, { 3, 1, 0, 0 }}, 2093 {0x10300, { 3, 1, 0, 0 }}, 2094 {0x220, { 3, 2, 0, 0 }}, 2095 {0xff60, { 3, 2, 0, 0 }} 2096 }; 2097 2098 /* test data for u_hasBinaryProperty() */ 2099 static const int32_t 2100 props[][3]={ /* code point, property, value */ 2101 { 0x0627, UCHAR_ALPHABETIC, TRUE }, 2102 { 0x1034a, UCHAR_ALPHABETIC, TRUE }, 2103 { 0x2028, UCHAR_ALPHABETIC, FALSE }, 2104 2105 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE }, 2106 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE }, 2107 2108 { 0x202c, UCHAR_BIDI_CONTROL, 
TRUE },
        { 0x202f, UCHAR_BIDI_CONTROL, FALSE },

        { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
        { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },

        /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
        { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
        { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
        { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
        { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },

        { 0x058a, UCHAR_DASH, TRUE },
        { 0x007e, UCHAR_DASH, FALSE },

        { 0x0c4d, UCHAR_DIACRITIC, TRUE },
        { 0x3000, UCHAR_DIACRITIC, FALSE },

        { 0x0e46, UCHAR_EXTENDER, TRUE },
        { 0x0020, UCHAR_EXTENDER, FALSE },

#if !UCONFIG_NO_NORMALIZATION
        { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
        { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
        { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },

        { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
        { 0x0308, UCHAR_NFD_INERT, FALSE },

        { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
        { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */

        { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
        { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
        { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
        { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
        { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
        { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */

        { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
        { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */

        { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
        { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
        { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
        { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
        { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
        { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
#endif

        { 0x0044, UCHAR_HEX_DIGIT, TRUE },
        { 0xff46, UCHAR_HEX_DIGIT, TRUE },
        { 0x0047, UCHAR_HEX_DIGIT, FALSE },

        { 0x30fb, UCHAR_HYPHEN, TRUE },
        { 0xfe58, UCHAR_HYPHEN, FALSE },

        { 0x2172, UCHAR_ID_CONTINUE, TRUE },
        { 0x0307, UCHAR_ID_CONTINUE, TRUE },
        { 0x005c, UCHAR_ID_CONTINUE, FALSE },

        { 0x2172, UCHAR_ID_START, TRUE },
        { 0x007a, UCHAR_ID_START, TRUE },
        { 0x0039, UCHAR_ID_START, FALSE },

        { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
        { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
        { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },

        { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
        { 0x2029, UCHAR_JOIN_CONTROL, FALSE },

        { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
        { 0x0345, UCHAR_LOWERCASE, TRUE },
        { 0x0030, UCHAR_LOWERCASE, FALSE },

        { 0x1d7a9, UCHAR_MATH, TRUE },
        { 0x2135, UCHAR_MATH, TRUE },
        { 0x0062, UCHAR_MATH, FALSE },

        { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
        { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
        { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },

        { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
        { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
        { 0xd840, UCHAR_QUOTATION_MARK, FALSE },

        { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
        { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },

        { 0x1d44a, UCHAR_UPPERCASE, TRUE },
        { 0x2162, UCHAR_UPPERCASE, TRUE },
        { 0x0345, UCHAR_UPPERCASE, FALSE },

        { 0x0020, UCHAR_WHITE_SPACE, TRUE },
        { 0x202f, UCHAR_WHITE_SPACE, TRUE },
        { 0x3001, UCHAR_WHITE_SPACE, FALSE },

        { 0x0711, UCHAR_XID_CONTINUE, TRUE },
        { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
        { 0x007c, UCHAR_XID_CONTINUE, FALSE },

        { 0x16ee, UCHAR_XID_START, TRUE },
        { 0x23456, UCHAR_XID_START, TRUE },
        { 0x1d1aa, UCHAR_XID_START, FALSE },

        /*
         * Version break:
         * The following properties are only supported starting with the
         * Unicode version indicated in the second field.
         */
        { -1, 0x320, 0 },

        { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
        { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
        { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },

        { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
        { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
        { 0xe0041, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
        { 0xe0100, UCHAR_DEPRECATED, FALSE },

        { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
        { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
        { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
        { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */

        { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
        { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
        { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
        { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },

        { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
        { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },

        { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
        { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },

        { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
        { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },

        { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
        { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },

        { 0x2e9b, UCHAR_RADICAL, TRUE },
        { 0x4e00, UCHAR_RADICAL, FALSE },

        { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
        { 0x0049, UCHAR_SOFT_DOTTED, FALSE },

        { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
        { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },

        { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */

        { 0x002e, UCHAR_S_TERM, TRUE },
        { 0x0061, UCHAR_S_TERM, FALSE },

        { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
        { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
        { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
        { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },

        /* enum/integer type properties */

        /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
        /* test default Bidi classes for unassigned code points */
        { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
        { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
        { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
        { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },

        { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
        { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },

        { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
        { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
        { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
        { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
        { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
        { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
        { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
        { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
        { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
        { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
        { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },

        /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
        { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },

        { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
        { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
        { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
        { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
        { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
        { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
        { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
        { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
        { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },

        { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
        { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
        { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
        { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
        { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
        { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
        { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
        { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
        { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
        { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
        { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
        { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },

        /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
        { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
        { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */

        { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
        { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
        { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
        { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
        { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },

        { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
        { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
        { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
        { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
        { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
        { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
        { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
        { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },

        /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
        { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
        { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
        { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
        { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
        { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
        { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
        { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
        { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
        { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
        { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
        { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
        { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
        { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
        { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
        { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },

        /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */

        /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */

        { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
        { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
        { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
        { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
        { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
        { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
        { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },

        { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
        { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
        { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
        { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

        { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
        { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
        { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
        { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
        { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
        { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */

        { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
        { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
        { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
        { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

        { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
        { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
        { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
        { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
        { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
        { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
        { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

        { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
        { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
        { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
        { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

        { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
        { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
        { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
        { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },

        { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
        { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
        { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
        { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
        { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },

        { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

        { -1, 0x410, 0 }, /* version break for Unicode 4.1 */

        { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
        { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
        { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },

        { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
        { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
        { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
        { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
        { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },

        { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
        { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
        { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },

        { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
        { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
        { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
        { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },

        { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
        { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
        { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
        { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
        { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
        { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },

        { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
        { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
        { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
        { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },

        { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
        { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
        { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
        { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },

        { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
        { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
        { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
        { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },

        { -1, 0x520, 0 }, /* version break for Unicode 5.2 */

        /* test some script codes >127 */
        { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
        { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
        { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },

        { -1, 0x600, 0 }, /* version break for Unicode 6.0 */

        /* value changed in Unicode 6.0 */
        { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },

        /* undefined UProperty values */
        { 0x61, 0x4a7, 0 },
        { 0x234bc, 0x15ed, 0 }
    };

    UVersionInfo version;
    UChar32 c;
    int32_t i, result, uVersion;
    UProperty which;

    /* what is our Unicode version? */
    u_getUnicodeVersion(version);
    uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */

    u_charAge(0x20, version);
    if(version[0]==0) {
        /* no additional properties available */
        log_err("TestAdditionalProperties: no additional properties available, not tested\n");
        return;
    }

    /* test u_charAge() */
    for(i=0; i<LENGTHOF(charAges); ++i) {
        u_charAge(charAges[i].c, version);
        if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
            /* %lx expects a long; UChar32 is a 32-bit type, so cast explicitly */
            log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
                (long)charAges[i].c,
                version[0], version[1], version[2], version[3],
                charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
        }
    }

    if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
        u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
        u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
        u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
        u_getIntPropertyMinValue(0x2345)!=0
    ) {
        log_err("error: u_getIntPropertyMinValue() wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
    }
    if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
    }
    if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
    }
    /*JB#2410*/
    if( u_getIntPropertyMaxValue(0x2345)!=-1) {
        log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
    }
    if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
        log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
    }

    /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
    for(i=0; i<LENGTHOF(props); ++i) {
        const char *whichName;

        if(props[i][0]<0) {
            /* Unicode version break */
            if(uVersion<props[i][1]) {
                break; /* do not test properties that are not yet supported */
            } else {
                continue; /* skip this row */
            }
        }

        c=(UChar32)props[i][0];
        which=(UProperty)props[i][1];
        whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);

        if(which<UCHAR_INT_START) {
            result=u_hasBinaryProperty(c, which);
            if(result!=props[i][2]) {
                log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
                        (long)c, whichName, result, i);
            }
        }

        result=u_getIntPropertyValue(c, which);
        if(result!=props[i][2]) {
            log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
                    (long)c, whichName, result, props[i][2], i);
        }

        /* test separate functions, too */
        switch((UProperty)props[i][1]) {
        case UCHAR_ALPHABETIC:
            if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
                log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
                        (long)props[i][0], result, i);
            }
            break;
        case UCHAR_LOWERCASE:
            if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
                log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
                        (long)props[i][0], result, i);
            }
            break;
        case UCHAR_UPPERCASE:
            if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
                log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
                        (long)props[i][0], result, i);
            }
            break;
        case UCHAR_WHITE_SPACE:
            if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
                log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
                        (long)props[i][0], result, i);
            }
            break;
        default:
            break;
        }
    }
}

/*
 * Checks UCHAR_NUMERIC_TYPE and u_getNumericValue() for the code points
 * in the table below.
 */
static void
TestNumericProperties(void) {
    /* see UnicodeData.txt, DerivedNumericValues.txt */
    static const struct {
        UChar32 c;
        int32_t type;
        double numValue;
    } values[]={
        { 0x0F33, U_NT_NUMERIC, -1./2. },
        { 0x0C66, U_NT_DECIMAL, 0 },
        { 0x96f6, U_NT_NUMERIC, 0 },
        { 0xa833, U_NT_NUMERIC, 1./16. },
        { 0x2152, U_NT_NUMERIC, 1./10. },
        { 0x2151, U_NT_NUMERIC, 1./9. },
        { 0x1245f, U_NT_NUMERIC, 1./8. },
        { 0x2150, U_NT_NUMERIC, 1./7. },
        { 0x2159, U_NT_NUMERIC, 1./6. },
        { 0x09f6, U_NT_NUMERIC, 3./16. },
        { 0x2155, U_NT_NUMERIC, 1./5. },
        { 0x00BD, U_NT_NUMERIC, 1./2. },
        { 0x0031, U_NT_DECIMAL, 1. },
        { 0x4e00, U_NT_NUMERIC, 1. },
        { 0x58f1, U_NT_NUMERIC, 1. },
        { 0x10320, U_NT_NUMERIC, 1. },
        { 0x0F2B, U_NT_NUMERIC, 3./2. },
        { 0x00B2, U_NT_DIGIT, 2. },
        { 0x5f10, U_NT_NUMERIC, 2. },
        { 0x1813, U_NT_DECIMAL, 3. },
        { 0x5f0e, U_NT_NUMERIC, 3. },
        { 0x2173, U_NT_NUMERIC, 4. },
        { 0x8086, U_NT_NUMERIC, 4. },
        { 0x278E, U_NT_DIGIT, 5. },
        { 0x1D7F2, U_NT_DECIMAL, 6. },
        { 0x247A, U_NT_DIGIT, 7. },
        { 0x7396, U_NT_NUMERIC, 9. },
        { 0x1372, U_NT_NUMERIC, 10. },
        { 0x216B, U_NT_NUMERIC, 12. },
        { 0x16EE, U_NT_NUMERIC, 17. },
        { 0x249A, U_NT_NUMERIC, 19. },
        { 0x303A, U_NT_NUMERIC, 30. },
        { 0x5345, U_NT_NUMERIC, 30. },
        { 0x32B2, U_NT_NUMERIC, 37. },
        { 0x1375, U_NT_NUMERIC, 40. },
        { 0x10323, U_NT_NUMERIC, 50. },
        { 0x0BF1, U_NT_NUMERIC, 100. },
        { 0x964c, U_NT_NUMERIC, 100. },
        { 0x217E, U_NT_NUMERIC, 500. },
        { 0x2180, U_NT_NUMERIC, 1000. },
        { 0x4edf, U_NT_NUMERIC, 1000. },
        { 0x2181, U_NT_NUMERIC, 5000.
}, 2695 { 0x137C, U_NT_NUMERIC, 10000. }, 2696 { 0x4e07, U_NT_NUMERIC, 10000. }, 2697 { 0x4ebf, U_NT_NUMERIC, 100000000. }, 2698 { 0x5146, U_NT_NUMERIC, 1000000000000. }, 2699 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2700 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2701 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2702 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2703 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2704 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2705 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2706 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE } 2707 }; 2708 2709 double nv; 2710 UChar32 c; 2711 int32_t i, type; 2712 2713 for(i=0; i<LENGTHOF(values); ++i) { 2714 c=values[i].c; 2715 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE); 2716 nv=u_getNumericValue(c); 2717 2718 if(type!=values[i].type) { 2719 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type); 2720 } 2721 if(0.000001 <= fabs(nv - values[i].numValue)) { 2722 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue); 2723 } 2724 } 2725 } 2726 2727 /** 2728 * Test the property names and property value names API. 2729 */ 2730 static void 2731 TestPropertyNames(void) { 2732 int32_t p, v, choice=0, rev; 2733 UBool atLeastSomething = FALSE; 2734 2735 for (p=0; ; ++p) { 2736 UProperty propEnum = (UProperty)p; 2737 UBool sawProp = FALSE; 2738 if(p > 10 && !atLeastSomething) { 2739 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. 
Quitting this test\n", p, choice); 2740 return; 2741 } 2742 2743 for (choice=0; ; ++choice) { 2744 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice); 2745 if (name) { 2746 if (!sawProp) 2747 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff); 2748 log_verbose("%d=\"%s\"", choice, name); 2749 sawProp = TRUE; 2750 atLeastSomething = TRUE; 2751 2752 /* test reverse mapping */ 2753 rev = u_getPropertyEnum(name); 2754 if (rev != p) { 2755 log_err("Property round-trip failure: %d -> %s -> %d\n", 2756 p, name, rev); 2757 } 2758 } 2759 if (!name && choice>0) break; 2760 } 2761 if (sawProp) { 2762 /* looks like a valid property; check the values */ 2763 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2764 int32_t max = 0; 2765 if (p == UCHAR_CANONICAL_COMBINING_CLASS) { 2766 max = 255; 2767 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) { 2768 /* it's far too slow to iterate all the way up to 2769 the real max, U_GC_P_MASK */ 2770 max = U_GC_NL_MASK; 2771 } else if (p == UCHAR_BLOCK) { 2772 /* UBlockCodes, unlike other values, start at 1 */ 2773 max = 1; 2774 } 2775 log_verbose("\n"); 2776 for (v=-1; ; ++v) { 2777 UBool sawValue = FALSE; 2778 for (choice=0; ; ++choice) { 2779 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice); 2780 if (vname) { 2781 if (!sawValue) log_verbose(" %s, value %d:", pname, v); 2782 log_verbose("%d=\"%s\"", choice, vname); 2783 sawValue = TRUE; 2784 2785 /* test reverse mapping */ 2786 rev = u_getPropertyValueEnum(propEnum, vname); 2787 if (rev != v) { 2788 log_err("Value round-trip failure (%s): %d -> %s -> %d\n", 2789 pname, v, vname, rev); 2790 } 2791 } 2792 if (!vname && choice>0) break; 2793 } 2794 if (sawValue) { 2795 log_verbose("\n"); 2796 } 2797 if (!sawValue && v>=max) break; 2798 } 2799 } 2800 if (!sawProp) { 2801 if (p>=UCHAR_STRING_LIMIT) { 2802 break; 2803 } else if (p>=UCHAR_DOUBLE_LIMIT) { 2804 p = UCHAR_STRING_START - 1; 2805 } else if 
(p>=UCHAR_MASK_LIMIT) { 2806 p = UCHAR_DOUBLE_START - 1; 2807 } else if (p>=UCHAR_INT_LIMIT) { 2808 p = UCHAR_MASK_START - 1; 2809 } else if (p>=UCHAR_BINARY_LIMIT) { 2810 p = UCHAR_INT_START - 1; 2811 } 2812 } 2813 } 2814 } 2815 2816 /** 2817 * Test the property values API. See JB#2410. 2818 */ 2819 static void 2820 TestPropertyValues(void) { 2821 int32_t i, p, min, max; 2822 UErrorCode ec; 2823 2824 /* Min should be 0 for everything. */ 2825 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */ 2826 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) { 2827 UProperty propEnum = (UProperty)p; 2828 min = u_getIntPropertyMinValue(propEnum); 2829 if (min != 0) { 2830 if (p == UCHAR_BLOCK) { 2831 /* This is okay...for now. See JB#2487. 2832 TODO Update this for JB#2487. */ 2833 } else { 2834 const char* name; 2835 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2836 if (name == NULL) 2837 name = "<ERROR>"; 2838 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n", 2839 name, min); 2840 } 2841 } 2842 } 2843 2844 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 || 2845 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) { 2846 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n"); 2847 } 2848 2849 /* Max should be -1 for invalid properties. */ 2850 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE); 2851 if (max != -1) { 2852 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n", 2853 max); 2854 } 2855 2856 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */ 2857 for (i=0; i<2; ++i) { 2858 int32_t script; 2859 const char* desc; 2860 ec = U_ZERO_ERROR; 2861 switch (i) { 2862 case 0: 2863 script = uscript_getScript(-1, &ec); 2864 desc = "uscript_getScript(-1)"; 2865 break; 2866 case 1: 2867 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT); 2868 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)"; 2869 break; 2870 default: 2871 log_err("Internal test error. 
Too many scripts\n"); 2872 return; 2873 } 2874 /* We don't explicitly test ec. It should be U_FAILURE but it 2875 isn't documented as such. */ 2876 if (script != (int32_t)USCRIPT_INVALID_CODE) { 2877 log_err("FAIL: %s = %d, exp. 0\n", 2878 desc, script); 2879 } 2880 } 2881 } 2882 2883 /* various tests for consistency of UCD data and API behavior */ 2884 static void 2885 TestConsistency() { 2886 char buffer[300]; 2887 USet *set1, *set2, *set3, *set4; 2888 UErrorCode errorCode; 2889 2890 UChar32 start, end; 2891 int32_t i, length; 2892 2893 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10); 2894 U_STRING_DECL(dashPattern, "[:Dash:]", 8); 2895 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13); 2896 U_STRING_DECL(formatPattern, "[:Cf:]", 6); 2897 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14); 2898 2899 U_STRING_DECL(mathBlocksPattern, 2900 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 2901 1+32+46+46+45+43+1+1); /* +1 for NUL */ 2902 U_STRING_DECL(mathPattern, "[:Math:]", 8); 2903 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6); 2904 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14); 2905 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 2906 2907 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10); 2908 U_STRING_INIT(dashPattern, "[:Dash:]", 8); 2909 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13); 2910 U_STRING_INIT(formatPattern, "[:Cf:]", 6); 2911 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14); 2912 2913 U_STRING_INIT(mathBlocksPattern, 2914 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 2915 1+32+46+46+45+43+1+1); /* +1 for NUL */ 2916 U_STRING_INIT(mathPattern, "[:Math:]", 8); 2917 U_STRING_INIT(unassignedPattern, 
"[:Cn:]", 6); 2918 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14); 2919 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 2920 2921 /* 2922 * It used to be that UCD.html and its precursors said 2923 * "Those dashes used to mark connections between pieces of words, 2924 * plus the Katakana middle dot." 2925 * 2926 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash 2927 * but not from Hyphen. 2928 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html. 2929 * Therefore, do not show errors when testing the Hyphen property. 2930 */ 2931 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" 2932 "known to the UTC and not considered errors.\n"); 2933 2934 errorCode=U_ZERO_ERROR; 2935 set1=uset_openPattern(hyphenPattern, 10, &errorCode); 2936 set2=uset_openPattern(dashPattern, 8, &errorCode); 2937 if(U_SUCCESS(errorCode)) { 2938 /* remove the Katakana middle dot(s) from set1 */ 2939 uset_remove(set1, 0x30fb); 2940 uset_remove(set1, 0xff65); /* halfwidth variant */ 2941 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE); 2942 } else { 2943 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 2944 } 2945 2946 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */ 2947 set3=uset_openPattern(formatPattern, 6, &errorCode); 2948 set4=uset_openPattern(alphaPattern, 14, &errorCode); 2949 if(U_SUCCESS(errorCode)) { 2950 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE); 2951 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE); 2952 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE); 2953 } else { 2954 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 2955 } 2956 2957 uset_close(set1); 2958 uset_close(set2); 2959 uset_close(set3); 2960 uset_close(set4); 2961 2962 /* 2963 * Check that each lowercase character has "small" in its name 2964 * and not "capital". 
2965 * There are some such characters, some of which seem odd. 2966 * Use the verbose flag to see these notices. 2967 */ 2968 errorCode=U_ZERO_ERROR; 2969 set1=uset_openPattern(lowerPattern, 13, &errorCode); 2970 if(U_SUCCESS(errorCode)) { 2971 for(i=0;; ++i) { 2972 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode); 2973 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 2974 break; /* done */ 2975 } 2976 if(U_FAILURE(errorCode)) { 2977 log_err("error iterating over [:Lowercase:] at item %d: %s\n", 2978 i, u_errorName(errorCode)); 2979 break; 2980 } 2981 if(length!=0) { 2982 break; /* done with code points, got a string or -1 */ 2983 } 2984 2985 while(start<=end) { 2986 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); 2987 if(U_FAILURE(errorCode)) { 2988 log_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode)); 2989 errorCode=U_ZERO_ERROR; 2990 continue; 2991 } 2992 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) && 2993 strstr(buffer, "SMALL CAPITAL")==NULL 2994 ) { 2995 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer); 2996 } 2997 ++start; 2998 } 2999 } 3000 } else { 3001 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3002 } 3003 uset_close(set1); 3004 3005 /* verify that all assigned characters in Math blocks are exactly Math characters */ 3006 errorCode=U_ZERO_ERROR; 3007 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode); 3008 set2=uset_openPattern(mathPattern, 8, &errorCode); 3009 set3=uset_openPattern(unassignedPattern, 6, &errorCode); 3010 if(U_SUCCESS(errorCode)) { 3011 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */ 3012 uset_complement(set3); /* assigned characters */ 3013 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */ 3014 compareUSets(set1, set2, 3015 "[assigned Math block chars]", "[math blocks]&[:Math:]", 3016 TRUE); 3017 } 
else { 3018 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3019 } 3020 uset_close(set1); 3021 uset_close(set2); 3022 uset_close(set3); 3023 3024 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */ 3025 errorCode=U_ZERO_ERROR; 3026 set1=uset_openPattern(unknownPattern, 14, &errorCode); 3027 set2=uset_openPattern(reservedPattern, 20, &errorCode); 3028 if(U_SUCCESS(errorCode)) { 3029 compareUSets(set1, set2, 3030 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]", 3031 TRUE); 3032 } else { 3033 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3034 } 3035 uset_close(set1); 3036 uset_close(set2); 3037 } 3038 3039 /* 3040 * Starting with ICU4C 3.4, the core Unicode properties files 3041 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu) 3042 * are hardcoded in the common DLL and therefore not included 3043 * in the data package any more. 3044 * Test requiring these files are disabled so that 3045 * we need not jump through hoops (like adding snapshots of these files 3046 * to testdata). 3047 * See Jitterbug 4497. 3048 */ 3049 #define HARDCODED_DATA_4497 1 3050 3051 /* API coverage for ucase.c */ 3052 static void TestUCase() { 3053 #if !HARDCODED_DATA_4497 3054 UDataMemory *pData; 3055 UCaseProps *csp; 3056 const UCaseProps *ccsp; 3057 UErrorCode errorCode; 3058 3059 /* coverage for ucase_openBinary() */ 3060 errorCode=U_ZERO_ERROR; 3061 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode); 3062 if(U_FAILURE(errorCode)) { 3063 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3064 u_errorName(errorCode)); 3065 return; 3066 } 3067 3068 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3069 if(U_FAILURE(errorCode)) { 3070 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." 
UCASE_DATA_TYPE ": %s\n", 3071 u_errorName(errorCode)); 3072 udata_close(pData); 3073 return; 3074 } 3075 3076 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */ 3077 log_err("ucase_openBinary() does not seem to return working UCaseProps\n"); 3078 } 3079 3080 ucase_close(csp); 3081 udata_close(pData); 3082 3083 /* coverage for ucase_getDummy() */ 3084 errorCode=U_ZERO_ERROR; 3085 ccsp=ucase_getDummy(&errorCode); 3086 if(ucase_tolower(ccsp, 0x41)!=0x41) { 3087 log_err("ucase_tolower(dummy, A)!=A\n"); 3088 } 3089 #endif 3090 } 3091 3092 /* API coverage for ubidi_props.c */ 3093 static void TestUBiDiProps() { 3094 #if !HARDCODED_DATA_4497 3095 UDataMemory *pData; 3096 UBiDiProps *bdp; 3097 const UBiDiProps *cbdp; 3098 UErrorCode errorCode; 3099 3100 /* coverage for ubidi_openBinary() */ 3101 errorCode=U_ZERO_ERROR; 3102 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode); 3103 if(U_FAILURE(errorCode)) { 3104 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3105 u_errorName(errorCode)); 3106 return; 3107 } 3108 3109 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3110 if(U_FAILURE(errorCode)) { 3111 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." 
UBIDI_DATA_TYPE ": %s\n", 3112 u_errorName(errorCode)); 3113 udata_close(pData); 3114 return; 3115 } 3116 3117 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */ 3118 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n"); 3119 } 3120 3121 ubidi_closeProps(bdp); 3122 udata_close(pData); 3123 3124 /* coverage for ubidi_getDummy() */ 3125 errorCode=U_ZERO_ERROR; 3126 cbdp=ubidi_getDummy(&errorCode); 3127 if(ubidi_getClass(cbdp, 0x20)!=0) { 3128 log_err("ubidi_getClass(dummy, space)!=0\n"); 3129 } 3130 #endif 3131 } 3132 3133 /* test case folding, compare return values with CaseFolding.txt ------------ */ 3134 3135 /* bit set for which case foldings for a character have been tested already */ 3136 enum { 3137 CF_SIMPLE=1, 3138 CF_FULL=2, 3139 CF_TURKIC=4, 3140 CF_ALL=7 3141 }; 3142 3143 static void 3144 testFold(UChar32 c, int which, 3145 UChar32 simple, UChar32 turkic, 3146 const UChar *full, int32_t fullLength, 3147 const UChar *turkicFull, int32_t turkicFullLength) { 3148 UChar s[2], t[32]; 3149 UChar32 c2; 3150 int32_t length, length2; 3151 3152 UErrorCode errorCode=U_ZERO_ERROR; 3153 3154 length=0; 3155 U16_APPEND_UNSAFE(s, length, c); 3156 3157 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) { 3158 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3159 } 3160 if((which&CF_FULL)!=0) { 3161 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode); 3162 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) { 3163 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c); 3164 } 3165 } 3166 if((which&CF_TURKIC)!=0) { 3167 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) { 3168 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3169 } 3170 3171 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); 3172 if(length2!=turkicFullLength || 0!=u_memcmp(t, 
turkicFull, length2)) { 3173 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c); 3174 } 3175 } 3176 } 3177 3178 /* test that c case-folds to itself */ 3179 static void 3180 testFoldToSelf(UChar32 c, int which) { 3181 UChar s[2]; 3182 int32_t length; 3183 3184 length=0; 3185 U16_APPEND_UNSAFE(s, length, c); 3186 testFold(c, which, c, c, s, length, s, length); 3187 } 3188 3189 struct CaseFoldingData { 3190 USet *notSeen; 3191 UChar32 prev, prevSimple; 3192 UChar prevFull[32]; 3193 int32_t prevFullLength; 3194 int which; 3195 }; 3196 typedef struct CaseFoldingData CaseFoldingData; 3197 3198 static void U_CALLCONV 3199 caseFoldingLineFn(void *context, 3200 char *fields[][2], int32_t fieldCount, 3201 UErrorCode *pErrorCode) { 3202 CaseFoldingData *pData=(CaseFoldingData *)context; 3203 char *end; 3204 UChar full[32]; 3205 UChar32 c, prev, simple; 3206 int32_t count; 3207 int which; 3208 char status; 3209 3210 /* get code point */ 3211 c=(UChar32)strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 3212 end=(char *)u_skipWhitespace(end); 3213 if(end<=fields[0][0] || end!=fields[0][1]) { 3214 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 3215 *pErrorCode=U_PARSE_ERROR; 3216 return; 3217 } 3218 3219 /* get the status of this mapping */ 3220 status=*u_skipWhitespace(fields[1][0]); 3221 if(status!='C' && status!='S' && status!='F' && status!='T') { 3222 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 3223 *pErrorCode=U_PARSE_ERROR; 3224 return; 3225 } 3226 3227 /* get the mapping */ 3228 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode); 3229 if(U_FAILURE(*pErrorCode)) { 3230 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 3231 return; 3232 } 3233 3234 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 3235 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) { 3236 simple=c; 3237 } 3238 3239 
if(c!=(prev=pData->prev)) { 3240 /* 3241 * Test remaining mappings for the previous code point. 3242 * If a turkic folding was not mentioned, then it should fold the same 3243 * as the regular simple case folding. 3244 */ 3245 UChar s[2]; 3246 int32_t length; 3247 3248 length=0; 3249 U16_APPEND_UNSAFE(s, length, prev); 3250 testFold(prev, (~pData->which)&CF_ALL, 3251 prev, pData->prevSimple, 3252 s, length, 3253 pData->prevFull, pData->prevFullLength); 3254 pData->prev=pData->prevSimple=c; 3255 length=0; 3256 U16_APPEND_UNSAFE(pData->prevFull, length, c); 3257 pData->prevFullLength=length; 3258 pData->which=0; 3259 } 3260 3261 /* 3262 * Turn the status into a bit set of case foldings to test. 3263 * Remember non-Turkic case foldings as defaults for Turkic mode. 3264 */ 3265 switch(status) { 3266 case 'C': 3267 which=CF_SIMPLE|CF_FULL; 3268 pData->prevSimple=simple; 3269 u_memcpy(pData->prevFull, full, count); 3270 pData->prevFullLength=count; 3271 break; 3272 case 'S': 3273 which=CF_SIMPLE; 3274 pData->prevSimple=simple; 3275 break; 3276 case 'F': 3277 which=CF_FULL; 3278 u_memcpy(pData->prevFull, full, count); 3279 pData->prevFullLength=count; 3280 break; 3281 case 'T': 3282 which=CF_TURKIC; 3283 break; 3284 default: 3285 which=0; 3286 break; /* won't happen because of test above */ 3287 } 3288 3289 testFold(c, which, simple, simple, full, count, full, count); 3290 3291 /* remember which case foldings of c have been tested */ 3292 pData->which|=which; 3293 3294 /* remove c from the set of ones not mentioned in CaseFolding.txt */ 3295 uset_remove(pData->notSeen, c); 3296 } 3297 3298 static void 3299 TestCaseFolding() { 3300 CaseFoldingData data={ NULL }; 3301 char *fields[3][2]; 3302 UErrorCode errorCode; 3303 3304 static char *lastLine= (char *)"10FFFF; C; 10FFFF;"; 3305 3306 errorCode=U_ZERO_ERROR; 3307 /* test BMP & plane 1 - nothing interesting above */ 3308 data.notSeen=uset_open(0, 0x1ffff); 3309 data.prevFullLength=1; /* length of full case folding of U+0000 
*/ 3310 3311 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode); 3312 if(U_SUCCESS(errorCode)) { 3313 int32_t i, start, end; 3314 3315 /* add a pseudo-last line to finish testing of the actual last one */ 3316 fields[0][0]=lastLine; 3317 fields[0][1]=lastLine+6; 3318 fields[1][0]=lastLine+7; 3319 fields[1][1]=lastLine+9; 3320 fields[2][0]=lastLine+10; 3321 fields[2][1]=lastLine+17; 3322 caseFoldingLineFn(&data, fields, 3, &errorCode); 3323 3324 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */ 3325 for(i=0; 3326 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) && 3327 U_SUCCESS(errorCode); 3328 ++i 3329 ) { 3330 do { 3331 testFoldToSelf(start, CF_ALL); 3332 } while(++start<=end); 3333 } 3334 } 3335 3336 uset_close(data.notSeen); 3337 } 3338