1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: props2.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb24 14 * created by: Markus W. Scherer 15 * 16 * Parse more Unicode Character Database files and store 17 * additional Unicode character properties in bit set vectors. 18 */ 19 20 #include <stdio.h> 21 #include "unicode/utypes.h" 22 #include "unicode/uchar.h" 23 #include "unicode/uscript.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "utrie.h" 27 #include "uprops.h" 28 #include "propsvec.h" 29 #include "uparse.h" 30 #include "writesrc.h" 31 #include "genprops.h" 32 33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 34 35 /* data --------------------------------------------------------------------- */ 36 37 static UNewTrie *newTrie; 38 UPropsVectors *pv; 39 40 /* miscellaneous ------------------------------------------------------------ */ 41 42 static char * 43 trimTerminateField(char *s, char *limit) { 44 /* trim leading whitespace */ 45 s=(char *)u_skipWhitespace(s); 46 47 /* trim trailing whitespace */ 48 while(s<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) { 49 --limit; 50 } 51 *limit=0; 52 53 return s; 54 } 55 56 static void 57 parseTwoFieldFile(char *filename, char *basename, 58 const char *ucdFile, const char *suffix, 59 UParseLineFn *lineFn, 60 UErrorCode *pErrorCode) { 61 char *fields[2][2]; 62 63 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 64 return; 65 } 66 67 writeUCDFilename(basename, ucdFile, suffix); 68 69 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode); 70 if(U_FAILURE(*pErrorCode)) { 71 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode)); 72 } 73 } 74 75 static void U_CALLCONV 76 ageLineFn(void *context, 77 char *fields[][2], int32_t fieldCount, 78 UErrorCode *pErrorCode); 79 80 static void 81 parseMultiFieldFile(char *filename, char *basename, 82 const char *ucdFile, const char *suffix, 83 int32_t fieldCount, 84 UParseLineFn *lineFn, 85 UErrorCode *pErrorCode) { 86 char *fields[20][2]; 87 88 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 89 return; 90 } 91 92 writeUCDFilename(basename, ucdFile, suffix); 93 94 u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode); 95 if(U_FAILURE(*pErrorCode)) { 96 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode)); 97 } 98 } 99 100 static void U_CALLCONV 101 numericLineFn(void *context, 102 char *fields[][2], int32_t fieldCount, 103 UErrorCode *pErrorCode); 104 105 /* parse files with single enumerated properties ---------------------------- */ 106 107 struct SingleEnum { 108 const char *ucdFile, *propName; 109 UProperty prop; 110 int32_t vecWord, vecShift; 111 uint32_t vecMask; 112 }; 113 typedef struct SingleEnum SingleEnum; 114 115 static void 116 parseSingleEnumFile(char *filename, char *basename, const char *suffix, 117 const SingleEnum *sen, 118 UErrorCode *pErrorCode); 119 120 static const SingleEnum scriptSingleEnum={ 121 "Scripts", "script", 122 UCHAR_SCRIPT, 123 0, 0, UPROPS_SCRIPT_MASK 124 }; 125 126 static const SingleEnum blockSingleEnum={ 127 "Blocks", "block", 128 UCHAR_BLOCK, 129 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK 130 }; 131 132 static const SingleEnum graphemeClusterBreakSingleEnum={ 133 "GraphemeBreakProperty", "Grapheme_Cluster_Break", 134 UCHAR_GRAPHEME_CLUSTER_BREAK, 135 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK 136 }; 137 138 static const SingleEnum wordBreakSingleEnum={ 139 "WordBreakProperty", "Word_Break", 140 UCHAR_WORD_BREAK, 141 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK 142 }; 143 144 static const SingleEnum sentenceBreakSingleEnum={ 145 "SentenceBreakProperty", "Sentence_Break", 146 UCHAR_SENTENCE_BREAK, 147 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK 148 }; 149 150 static const SingleEnum lineBreakSingleEnum={ 151 "LineBreak", "line break", 152 UCHAR_LINE_BREAK, 153 UPROPS_LB_VWORD, UPROPS_LB_SHIFT, UPROPS_LB_MASK 154 }; 155 156 static const SingleEnum eawSingleEnum={ 157 "EastAsianWidth", "east asian width", 158 UCHAR_EAST_ASIAN_WIDTH, 159 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK 160 }; 161 162 static void U_CALLCONV 163 singleEnumLineFn(void *context, 164 char *fields[][2], int32_t fieldCount, 165 UErrorCode *pErrorCode) { 166 const SingleEnum *sen; 167 char *s; 168 uint32_t start, end, uv; 169 int32_t value; 170 171 sen=(const SingleEnum *)context; 172 173 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 174 if(U_FAILURE(*pErrorCode)) { 175 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]); 176 exit(*pErrorCode); 177 } 178 179 /* parse property alias */ 180 s=trimTerminateField(fields[1][0], fields[1][1]); 181 value=u_getPropertyValueEnum(sen->prop, s); 182 if(value<0) { 183 if(sen->prop==UCHAR_BLOCK) { 184 if(isToken("Greek", s)) { 185 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */ 186 } else if(isToken("Combining Marks for Symbols", s)) { 187 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */ 188 } else if(isToken("Private Use", s)) { 189 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */ 190 } 191 } 192 } 193 if(value<0) { 194 fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n", 195 sen->propName, sen->ucdFile, s); 196 exit(U_PARSE_ERROR); 197 } 198 199 uv=(uint32_t)(value<<sen->vecShift); 200 if((uv&sen->vecMask)!=uv) { 201 fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n", 202 sen->propName, (int)uv, s); 203 exit(U_INTERNAL_PROGRAM_ERROR); 204 } 205 206 if(start==0 && end==0x10ffff) { 207 /* Also set bits for initialValue and errorValue. */ 208 end=UPVEC_MAX_CP; 209 } 210 upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode); 211 if(U_FAILURE(*pErrorCode)) { 212 fprintf(stderr, "genprops error: unable to set %s code: %s\n", 213 sen->propName, u_errorName(*pErrorCode)); 214 exit(*pErrorCode); 215 } 216 } 217 218 static void 219 parseSingleEnumFile(char *filename, char *basename, const char *suffix, 220 const SingleEnum *sen, 221 UErrorCode *pErrorCode) { 222 char *fields[2][2]; 223 224 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 225 return; 226 } 227 228 writeUCDFilename(basename, sen->ucdFile, suffix); 229 230 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode); 231 if(U_FAILURE(*pErrorCode)) { 232 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode)); 233 } 234 } 235 236 /* parse files with multiple binary properties ------------------------------ */ 237 238 struct Binary { 239 const char *propName; 240 int32_t vecWord, vecShift; 241 }; 242 typedef struct Binary Binary; 243 244 struct Binaries { 245 const char *ucdFile; 246 const Binary *binaries; 247 int32_t binariesCount; 248 }; 249 typedef struct Binaries Binaries; 250 251 static const Binary 252 propListNames[]={ 253 { "White_Space", 1, UPROPS_WHITE_SPACE }, 254 { "Dash", 1, UPROPS_DASH }, 255 { "Hyphen", 1, UPROPS_HYPHEN }, 256 { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK }, 257 { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION }, 258 { "Hex_Digit", 1, UPROPS_HEX_DIGIT }, 259 { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT }, 260 { "Ideographic", 1, UPROPS_IDEOGRAPHIC }, 261 { "Diacritic", 1, UPROPS_DIACRITIC }, 262 { "Extender", 1, UPROPS_EXTENDER }, 263 { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT }, 264 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }, 265 { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR }, 266 { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR }, 267 { "Radical", 1, UPROPS_RADICAL }, 268 { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH }, 269 { "Deprecated", 1, UPROPS_DEPRECATED }, 270 { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION }, 271 272 /* new properties in Unicode 4.0.1 */ 273 { "STerm", 1, UPROPS_S_TERM }, 274 { "Variation_Selector", 1, UPROPS_VARIATION_SELECTOR }, 275 276 /* new properties in Unicode 4.1 */ 277 { "Pattern_Syntax", 1, UPROPS_PATTERN_SYNTAX }, 278 { "Pattern_White_Space", 1, UPROPS_PATTERN_WHITE_SPACE } 279 }; 280 281 static const Binaries 282 propListBinaries={ 283 "PropList", propListNames, LENGTHOF(propListNames) 284 }; 285 286 static const Binary 287 derCorePropsNames[]={ 288 { "XID_Start", 1, UPROPS_XID_START }, 289 { "XID_Continue", 1, UPROPS_XID_CONTINUE }, 290 291 /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */ 292 { "Math", 1, UPROPS_MATH }, 293 { "Alphabetic", 1, UPROPS_ALPHABETIC }, 294 { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND }, 295 { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT }, 296 297 /* new properties bits in ICU 2.6/format version 3.2 */ 298 { "ID_Start", 1, UPROPS_ID_START }, 299 { "ID_Continue", 1, UPROPS_ID_CONTINUE }, 300 { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE }, 301 302 /* 303 * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt 304 * to DerivedCoreProperties.txt and deprecates it. 305 */ 306 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK } 307 }; 308 309 static const Binaries 310 derCorePropsBinaries={ 311 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) 312 }; 313 314 static char ignoredProps[100][64]; 315 static int32_t ignoredPropsCount; 316 317 static void 318 addIgnoredProp(char *s, char *limit) { 319 int32_t i; 320 321 s=trimTerminateField(s, limit); 322 for(i=0; i<ignoredPropsCount; ++i) { 323 if(0==uprv_strcmp(ignoredProps[i], s)) { 324 return; 325 } 326 } 327 uprv_strcpy(ignoredProps[ignoredPropsCount++], s); 328 } 329 330 static void U_CALLCONV 331 binariesLineFn(void *context, 332 char *fields[][2], int32_t fieldCount, 333 UErrorCode *pErrorCode) { 334 const Binaries *bin; 335 char *s; 336 uint32_t start, end, uv; 337 int32_t i; 338 339 bin=(const Binaries *)context; 340 341 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 342 if(U_FAILURE(*pErrorCode)) { 343 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); 344 exit(*pErrorCode); 345 } 346 347 /* parse binary property name */ 348 s=(char *)u_skipWhitespace(fields[1][0]); 349 for(i=0;; ++i) { 350 if(i==bin->binariesCount) { 351 /* ignore unrecognized properties */ 352 if(beVerbose) { 353 addIgnoredProp(s, fields[1][1]); 354 } 355 return; 356 } 357 if(isToken(bin->binaries[i].propName, s)) { 358 break; 359 } 360 } 361 362 if(bin->binaries[i].vecShift>=32) { 363 fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n", 364 (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName); 365 exit(U_INTERNAL_PROGRAM_ERROR); 366 } 367 uv=U_MASK(bin->binaries[i].vecShift); 368 369 if(start==0 && end==0x10ffff) { 370 /* Also set bits for initialValue and errorValue. */ 371 end=UPVEC_MAX_CP; 372 } 373 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode); 374 if(U_FAILURE(*pErrorCode)) { 375 fprintf(stderr, "genprops error: unable to set %s code: %s\n", 376 bin->binaries[i].propName, u_errorName(*pErrorCode)); 377 exit(*pErrorCode); 378 } 379 } 380 381 static void 382 parseBinariesFile(char *filename, char *basename, const char *suffix, 383 const Binaries *bin, 384 UErrorCode *pErrorCode) { 385 char *fields[2][2]; 386 int32_t i; 387 388 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 389 return; 390 } 391 392 writeUCDFilename(basename, bin->ucdFile, suffix); 393 394 ignoredPropsCount=0; 395 396 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); 397 if(U_FAILURE(*pErrorCode)) { 398 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); 399 } 400 401 if(beVerbose) { 402 for(i=0; i<ignoredPropsCount; ++i) { 403 printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile); 404 } 405 } 406 } 407 408 /* -------------------------------------------------------------------------- */ 409 410 U_CFUNC void 411 initAdditionalProperties() { 412 UErrorCode errorCode=U_ZERO_ERROR; 413 pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode); 414 if(U_FAILURE(errorCode)) { 415 fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode)); 416 exit(errorCode); 417 } 418 } 419 420 U_CFUNC void 421 exitAdditionalProperties() { 422 utrie_close(newTrie); 423 upvec_close(pv); 424 } 425 426 U_CFUNC void 427 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) { 428 char *basename; 429 430 basename=filename+uprv_strlen(filename); 431 432 /* process various UCD .txt files */ 433 434 /* add Han numeric types & values */ 435 parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode); 436 437 parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode); 438 439 /* 440 * UTR 24 says: 441 * Section 2: 442 * "Common - For characters that may be used 443 * within multiple scripts, 444 * or any unassigned code points." 445 * 446 * Section 4: 447 * "The value COMMON is the default value, 448 * given to all code points that are not 449 * explicitly mentioned in the data file." 450 * 451 * COMMON==USCRIPT_COMMON==0 - nothing to do 452 */ 453 parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode); 454 455 parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode); 456 457 parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode); 458 459 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode); 460 461 parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode); 462 463 parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode); 464 465 parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode); 466 467 /* 468 * LineBreak-4.0.0.txt: 469 * - All code points, assigned and unassigned, that are not listed 470 * explicitly are given the value "XX". 471 * 472 * XX==U_LB_UNKNOWN==0 - nothing to do 473 */ 474 parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode); 475 476 /* 477 * Preset East Asian Width defaults: 478 * 479 * http://www.unicode.org/reports/tr11/#Unassigned 480 * 7.1 Unassigned and Private Use characters 481 * 482 * All unassigned characters are by default classified as non-East Asian neutral, 483 * except for the range U+20000 to U+2FFFD, 484 * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W). 485 * All Private use characters are by default classified as ambiguous, 486 * since their definition depends on context. 487 * 488 * N for all ==0 - nothing to do 489 * A for Private Use 490 * W for plane 2 491 */ 492 *pErrorCode=U_ZERO_ERROR; 493 upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); 494 upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); 495 upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); 496 upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode); 497 if(U_FAILURE(*pErrorCode)) { 498 fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode)); 499 exit(*pErrorCode); 500 } 501 502 /* parse EastAsianWidth.txt */ 503 parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode); 504 505 { 506 UPVecToUTrieContext toUTrie={ NULL, 50000 /* capacity */, 0, TRUE /* latin1Linear */ }; 507 upvec_compact(pv, upvec_compactToUTrieHandler, &toUTrie, pErrorCode); 508 if(U_FAILURE(*pErrorCode)) { 509 fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", 510 u_errorName(*pErrorCode)); 511 exit(*pErrorCode); 512 } 513 newTrie=toUTrie.newTrie; 514 } 515 } 516 517 /* DerivedAge.txt ----------------------------------------------------------- */ 518 519 static void U_CALLCONV 520 ageLineFn(void *context, 521 char *fields[][2], int32_t fieldCount, 522 UErrorCode *pErrorCode) { 523 char *s, *numberLimit; 524 uint32_t value, start, end, version; 525 526 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 527 if(U_FAILURE(*pErrorCode)) { 528 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]); 529 exit(*pErrorCode); 530 } 531 532 /* ignore "unassigned" (the default is already set to 0.0) */ 533 s=(char *)u_skipWhitespace(fields[1][0]); 534 if(0==uprv_strncmp(s, "unassigned", 10)) { 535 return; 536 } 537 538 /* parse version number */ 539 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); 540 if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) { 541 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); 542 *pErrorCode=U_PARSE_ERROR; 543 exit(U_PARSE_ERROR); 544 } 545 version=value<<4; 546 547 /* parse minor version number */ 548 if(*numberLimit=='.') { 549 s=(char *)u_skipWhitespace(numberLimit+1); 550 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); 551 if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) { 552 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]); 553 *pErrorCode=U_PARSE_ERROR; 554 exit(U_PARSE_ERROR); 555 } 556 version|=value; 557 } 558 559 if(start==0 && end==0x10ffff) { 560 /* Also set bits for initialValue and errorValue. */ 561 end=UPVEC_MAX_CP; 562 } 563 upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode); 564 if(U_FAILURE(*pErrorCode)) { 565 fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode)); 566 exit(*pErrorCode); 567 } 568 } 569 570 /* DerivedNumericValues.txt ------------------------------------------------- */ 571 572 static void U_CALLCONV 573 numericLineFn(void *context, 574 char *fields[][2], int32_t fieldCount, 575 UErrorCode *pErrorCode) { 576 Props newProps={ 0 }; 577 char *s, *numberLimit; 578 uint32_t start, end, value, oldProps32; 579 char c; 580 UBool isFraction; 581 582 /* get the code point range */ 583 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 584 if(U_FAILURE(*pErrorCode)) { 585 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]); 586 exit(*pErrorCode); 587 } 588 589 /* 590 * Ignore the 591 * # @missing: 0000..10FFFF; NaN 592 * line from Unicode 5.1's DerivedNumericValues.txt: 593 * The following code cannot parse "NaN", and we don't want to overwrite 594 * the numeric values for all characters after reading most 595 * from UnicodeData.txt already. 596 */ 597 if(start==0 && end==0x10ffff) { 598 return; 599 } 600 601 /* check if the numeric value is a fraction (this code does not handle any) */ 602 isFraction=FALSE; 603 s=uprv_strchr(fields[1][0], '.'); 604 if(s!=NULL) { 605 numberLimit=s+1; 606 while('0'<=(c=*numberLimit++) && c<='9') { 607 if(c!='0') { 608 isFraction=TRUE; 609 break; 610 } 611 } 612 } 613 614 if(isFraction) { 615 value=0; 616 } else { 617 /* parse numeric value */ 618 s=(char *)u_skipWhitespace(fields[1][0]); 619 620 /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */ 621 if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') { 622 /* large integers are encoded in a special way, see store.c */ 623 uint8_t exp=0; 624 625 value=s[0]-'0'; 626 numberLimit=s; 627 while(*(++numberLimit)=='0') { 628 ++exp; 629 } 630 newProps.exponent=exp; 631 } else { 632 /* normal number parsing */ 633 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10); 634 } 635 if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) { 636 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]); 637 exit(U_PARSE_ERROR); 638 } 639 } 640 641 /* 642 * Unicode 4.0.1 removes the third column that used to list the numeric type. 643 * Assume that either the data is the same as in UnicodeData.txt, 644 * or else that the numeric type is "numeric". 645 * This should work because we only expect to add numeric values for 646 * Han characters; for those, UnicodeData.txt lists only ranges without 647 * specific properties for single characters. 648 */ 649 650 /* set the new numeric value */ 651 newProps.code=start; 652 newProps.numericValue=(int32_t)value; /* newly parsed numeric value */ 653 /* the exponent may have been set above */ 654 655 for(; start<=end; ++start) { 656 uint32_t newProps32; 657 int32_t oldNtv; 658 oldProps32=getProps(start); 659 oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32); 660 661 if(isFraction) { 662 if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) { 663 /* this code point was already listed with its numeric value in UnicodeData.txt */ 664 continue; 665 } else { 666 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]); 667 exit(U_PARSE_ERROR); 668 } 669 } 670 671 /* 672 * For simplicity, and because we only expect to set numeric values for Han characters, 673 * for now we only allow to set these values for Lo characters. 674 */ 675 if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) { 676 fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]); 677 exit(U_PARSE_ERROR); 678 } 679 680 /* verify that we do not change an existing value (fractions were excluded above) */ 681 if(oldNtv!=UPROPS_NTV_NONE) { 682 /* the code point already has a value stored */ 683 newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv); 684 newProps32=makeProps(&newProps); 685 if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) { 686 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start); 687 exit(U_PARSE_ERROR); 688 } 689 /* same value, continue */ 690 } else { 691 /* the code point is getting a new numeric value */ 692 newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */ 693 newProps32=makeProps(&newProps); 694 if(beVerbose) { 695 printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n", 696 (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]); 697 } 698 699 addProps(start, newProps32|GET_CATEGORY(oldProps32)); 700 } 701 } 702 } 703 704 /* data serialization ------------------------------------------------------- */ 705 706 U_CFUNC int32_t 707 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) { 708 const uint32_t *pvArray; 709 int32_t pvRows, pvCount; 710 int32_t length; 711 UErrorCode errorCode; 712 713 pvArray=upvec_getArray(pv, &pvRows, NULL); 714 pvCount=pvRows*UPROPS_VECTOR_WORDS; 715 716 errorCode=U_ZERO_ERROR; 717 length=utrie_serialize(newTrie, p, capacity, NULL, TRUE, &errorCode); 718 if(U_FAILURE(errorCode)) { 719 fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode)); 720 exit(errorCode); 721 } 722 if(p!=NULL) { 723 if(beVerbose) { 724 printf("size in bytes of additional props trie:%5u\n", (int)length); 725 } 726 if(f!=NULL) { 727 UTrie trie={ NULL }; 728 UTrie2 *trie2; 729 730 utrie_unserialize(&trie, p, length, &errorCode); 731 if(U_FAILURE(errorCode)) { 732 fprintf( 733 stderr, 734 "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n", 735 u_errorName(errorCode)); 736 exit(errorCode); 737 } 738 739 /* use UTrie2 */ 740 trie2=utrie2_fromUTrie(&trie, trie.initialValue, &errorCode); 741 if(U_FAILURE(errorCode)) { 742 fprintf( 743 stderr, 744 "genprops error: utrie2_fromUTrie() failed - %s\n", 745 u_errorName(errorCode)); 746 exit(errorCode); 747 } 748 { 749 /* delete lead surrogate code unit values */ 750 UChar lead; 751 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 752 for(lead=0xd800; lead<0xdc00; ++lead) { 753 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 754 } 755 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 756 if(U_FAILURE(errorCode)) { 757 fprintf( 758 stderr, 759 "genbidi error: deleting lead surrogate code unit values failed - %s\n", 760 u_errorName(errorCode)); 761 exit(errorCode); 762 } 763 } 764 765 usrc_writeUTrie2Arrays(f, 766 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL, 767 trie2, 768 "\n};\n\n"); 769 usrc_writeUTrie2Struct(f, 770 "static const UTrie2 propsVectorsTrie={\n", 771 trie2, "propsVectorsTrie_index", NULL, 772 "};\n\n"); 773 774 utrie2_close(trie2); 775 } 776 777 p+=length; 778 capacity-=length; 779 780 /* set indexes */ 781 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]= 782 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4; 783 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS; 784 indexes[UPROPS_RESERVED_INDEX]= 785 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount; 786 787 indexes[UPROPS_MAX_VALUES_INDEX]= 788 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)| 789 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)| 790 (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK); 791 indexes[UPROPS_MAX_VALUES_2_INDEX]= 792 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)| 793 (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)| 794 (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)| 795 (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)| 796 ((int32_t)U_DT_COUNT-1); 797 } 798 799 if(p!=NULL && (pvCount*4)<=capacity) { 800 if(f!=NULL) { 801 usrc_writeArray(f, 802 "static const uint32_t propsVectors[%ld]={\n", 803 pvArray, 32, pvCount, 804 "};\n\n"); 805 fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount); 806 fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]); 807 } else { 808 uprv_memcpy(p, pvArray, pvCount*4); 809 } 810 if(beVerbose) { 811 printf("number of additional props vectors: %5u\n", (int)pvRows); 812 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS); 813 } 814 } 815 length+=pvCount*4; 816 817 return length; 818 } 819