1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: gencase.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug28 14 * created by: Markus W. Scherer 15 * 16 * This program reads several of the Unicode character database text files, 17 * parses them, and the case mapping properties for each character. 18 * It then writes a binary file containing the properties 19 * that is designed to be used directly for random-access to 20 * the properties of each Unicode character. 21 */ 22 23 #include <stdio.h> 24 #include "unicode/utypes.h" 25 #include "unicode/uchar.h" 26 #include "unicode/uset.h" 27 #include "unicode/putil.h" 28 #include "unicode/uclean.h" 29 #include "cmemory.h" 30 #include "cstring.h" 31 #include "uarrsort.h" 32 #include "unewdata.h" 33 #include "uoptions.h" 34 #include "uparse.h" 35 #include "uprops.h" 36 #include "propsvec.h" 37 #include "gencase.h" 38 39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 40 41 /* data --------------------------------------------------------------------- */ 42 43 UPropsVectors *pv; 44 45 UBool beVerbose=FALSE, haveCopyright=TRUE; 46 47 /* 48 * Unicode set collecting the case-sensitive characters; 49 * see uchar.h UCHAR_CASE_SENSITIVE. 50 * Add code points from case mappings/foldings in 51 * the root locale and with default options. 52 */ 53 static USet *caseSensitive; 54 55 /* prototypes --------------------------------------------------------------- */ 56 57 static void 58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); 59 60 static void 61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode); 62 63 static void 64 parseDB(const char *filename, UErrorCode *pErrorCode); 65 66 /* parse files with multiple binary properties ------------------------------ */ 67 68 /* TODO: more common code, move functions to uparse.h|c */ 69 70 /* TODO: similar to genprops/props2.c but not the same */ 71 72 struct Binary { 73 const char *propName; 74 int32_t vecWord; 75 uint32_t vecValue, vecMask; 76 }; 77 typedef struct Binary Binary; 78 79 struct Binaries { 80 const char *ucdFile; 81 const Binary *binaries; 82 int32_t binariesCount; 83 }; 84 typedef struct Binaries Binaries; 85 86 static const Binary 87 propListNames[]={ 88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } 89 }; 90 91 static const Binaries 92 propListBinaries={ 93 "PropList", propListNames, LENGTHOF(propListNames) 94 }; 95 96 static const Binary 97 derCorePropsNames[]={ 98 { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, 99 { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK } 100 }; 101 102 static const Binaries 103 derCorePropsBinaries={ 104 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) 105 }; 106 107 /* 108 * Treat Word_Break=MidLetter and MidNumLet as a single binary property. 109 * We need not distinguish between them because both add to case-ignorable. 110 * We ignore all other Word_Break values. 111 */ 112 static const Binary 113 wordBreakNames[]={ 114 { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }, 115 { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } 116 }; 117 118 static const Binaries 119 wordBreakBinaries={ 120 "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames) 121 }; 122 123 static void U_CALLCONV 124 binariesLineFn(void *context, 125 char *fields[][2], int32_t fieldCount, 126 UErrorCode *pErrorCode) { 127 const Binaries *bin; 128 char *s; 129 uint32_t start, end; 130 int32_t i; 131 132 bin=(const Binaries *)context; 133 134 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 135 if(U_FAILURE(*pErrorCode)) { 136 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); 137 exit(*pErrorCode); 138 } 139 140 /* parse binary property name */ 141 s=(char *)u_skipWhitespace(fields[1][0]); 142 for(i=0;; ++i) { 143 if(i==bin->binariesCount) { 144 /* ignore unrecognized properties */ 145 return; 146 } 147 if(isToken(bin->binaries[i].propName, s)) { 148 break; 149 } 150 } 151 152 if(bin->binaries[i].vecMask==0) { 153 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", 154 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); 155 exit(U_INTERNAL_PROGRAM_ERROR); 156 } 157 158 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode); 159 if(U_FAILURE(*pErrorCode)) { 160 fprintf(stderr, "gencase error: unable to set %s, code: %s\n", 161 bin->binaries[i].propName, u_errorName(*pErrorCode)); 162 exit(*pErrorCode); 163 } 164 } 165 166 static void 167 parseBinariesFile(char *filename, char *basename, const char *suffix, 168 const Binaries *bin, 169 UErrorCode *pErrorCode) { 170 char *fields[2][2]; 171 172 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 173 return; 174 } 175 176 writeUCDFilename(basename, bin->ucdFile, suffix); 177 178 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); 179 if(U_FAILURE(*pErrorCode)) { 180 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); 181 } 182 } 183 184 /* -------------------------------------------------------------------------- */ 185 186 enum 187 { 188 HELP_H, 189 HELP_QUESTION_MARK, 190 VERBOSE, 191 COPYRIGHT, 192 DESTDIR, 193 SOURCEDIR, 194 UNICODE_VERSION, 195 ICUDATADIR, 196 CSOURCE 197 }; 198 199 /* Keep these values in sync with the above enums */ 200 static UOption options[]={ 201 UOPTION_HELP_H, 202 UOPTION_HELP_QUESTION_MARK, 203 UOPTION_VERBOSE, 204 UOPTION_COPYRIGHT, 205 UOPTION_DESTDIR, 206 UOPTION_SOURCEDIR, 207 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 208 UOPTION_ICUDATADIR, 209 UOPTION_DEF("csource", 'C', UOPT_NO_ARG) 210 }; 211 212 extern int 213 main(int argc, char* argv[]) { 214 char filename[300]; 215 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; 216 char *basename=NULL; 217 UErrorCode errorCode=U_ZERO_ERROR; 218 219 U_MAIN_INIT_ARGS(argc, argv); 220 221 /* preset then read command line options */ 222 options[DESTDIR].value=u_getDataDirectory(); 223 options[SOURCEDIR].value=""; 224 options[UNICODE_VERSION].value=""; 225 options[ICUDATADIR].value=u_getDataDirectory(); 226 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 227 228 /* error handling, printing usage message */ 229 if(argc<0) { 230 fprintf(stderr, 231 "error in command line argument \"%s\"\n", 232 argv[-argc]); 233 } 234 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 235 /* 236 * Broken into chucks because the C89 standard says the minimum 237 * required supported string length is 509 bytes. 238 */ 239 fprintf(stderr, 240 "Usage: %s [-options] [suffix]\n" 241 "\n" 242 "read the UnicodeData.txt file and other Unicode properties files and\n" 243 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" 244 "\n", 245 argv[0]); 246 fprintf(stderr, 247 "Options:\n" 248 "\t-h or -? or --help this usage text\n" 249 "\t-v or --verbose verbose output\n" 250 "\t-c or --copyright include a copyright notice\n" 251 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" 252 "\t-C or --csource generate a .c source file rather than the .icu binary\n"); 253 fprintf(stderr, 254 "\t-d or --destdir destination directory, followed by the path\n" 255 "\t-s or --sourcedir source directory, followed by the path\n" 256 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 257 "\t followed by path, defaults to %s\n" 258 "\tsuffix suffix that is to be appended with a '-'\n" 259 "\t to the source file basenames before opening;\n" 260 "\t 'gencase new' will read UnicodeData-new.txt etc.\n", 261 u_getDataDirectory()); 262 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 263 } 264 265 /* get the options values */ 266 beVerbose=options[VERBOSE].doesOccur; 267 haveCopyright=options[COPYRIGHT].doesOccur; 268 srcDir=options[SOURCEDIR].value; 269 destDir=options[DESTDIR].value; 270 271 if(argc>=2) { 272 suffix=argv[1]; 273 } else { 274 suffix=NULL; 275 } 276 277 if(options[UNICODE_VERSION].doesOccur) { 278 setUnicodeVersion(options[UNICODE_VERSION].value); 279 } 280 /* else use the default dataVersion in store.c */ 281 282 if (options[ICUDATADIR].doesOccur) { 283 u_setDataDirectory(options[ICUDATADIR].value); 284 } 285 286 /* prepare the filename beginning with the source dir */ 287 uprv_strcpy(filename, srcDir); 288 basename=filename+uprv_strlen(filename); 289 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 290 *basename++=U_FILE_SEP_CHAR; 291 } 292 293 /* initialize */ 294 pv=upvec_open(2, &errorCode); 295 caseSensitive=uset_open(1, 0); /* empty set (start>end) */ 296 297 /* process SpecialCasing.txt */ 298 writeUCDFilename(basename, "SpecialCasing", suffix); 299 parseSpecialCasing(filename, &errorCode); 300 301 /* process CaseFolding.txt */ 302 writeUCDFilename(basename, "CaseFolding", suffix); 303 parseCaseFolding(filename, &errorCode); 304 305 /* process additional properties files */ 306 *basename=0; 307 308 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); 309 310 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); 311 312 if(ucdVersion>=UNI_4_1) { 313 parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode); 314 } 315 316 /* process UnicodeData.txt */ 317 writeUCDFilename(basename, "UnicodeData", suffix); 318 parseDB(filename, &errorCode); 319 320 /* process parsed data */ 321 makeCaseClosure(); 322 323 makeExceptions(); 324 325 if(U_SUCCESS(errorCode)) { 326 /* write the properties data file */ 327 generateData(destDir, options[CSOURCE].doesOccur); 328 } 329 330 u_cleanup(); 331 return errorCode; 332 } 333 334 U_CFUNC void 335 writeUCDFilename(char *basename, const char *filename, const char *suffix) { 336 int32_t length=(int32_t)uprv_strlen(filename); 337 uprv_strcpy(basename, filename); 338 if(suffix!=NULL) { 339 basename[length++]='-'; 340 uprv_strcpy(basename+length, suffix); 341 length+=(int32_t)uprv_strlen(suffix); 342 } 343 uprv_strcpy(basename+length, ".txt"); 344 } 345 346 /* TODO: move to toolutil */ 347 U_CFUNC UBool 348 isToken(const char *token, const char *s) { 349 const char *z; 350 int32_t j; 351 352 s=u_skipWhitespace(s); 353 for(j=0;; ++j) { 354 if(token[j]!=0) { 355 if(s[j]!=token[j]) { 356 break; 357 } 358 } else { 359 z=u_skipWhitespace(s+j); 360 if(*z==';' || *z==0) { 361 return TRUE; 362 } else { 363 break; 364 } 365 } 366 } 367 368 return FALSE; 369 } 370 371 static int32_t 372 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { 373 const char *t, *z; 374 int32_t i, j; 375 376 s=u_skipWhitespace(s); 377 for(i=0; i<countTokens; ++i) { 378 t=tokens[i]; 379 if(t!=NULL) { 380 for(j=0;; ++j) { 381 if(t[j]!=0) { 382 if(s[j]!=t[j]) { 383 break; 384 } 385 } else { 386 z=u_skipWhitespace(s+j); 387 if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') { 388 return i; 389 } else { 390 break; 391 } 392 } 393 } 394 } 395 } 396 return -1; 397 } 398 399 static void 400 _set_addAll(USet *set, const UChar *s, int32_t length) { 401 UChar32 c; 402 int32_t i; 403 404 /* needs length>=0 */ 405 for(i=0; i<length; /* U16_NEXT advances i */) { 406 U16_NEXT(s, i, length, c); 407 uset_add(set, c); 408 } 409 } 410 411 /* parser for SpecialCasing.txt --------------------------------------------- */ 412 413 #define MAX_SPECIAL_CASING_COUNT 500 414 415 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; 416 static int32_t specialCasingCount=0; 417 418 static void U_CALLCONV 419 specialCasingLineFn(void *context, 420 char *fields[][2], int32_t fieldCount, 421 UErrorCode *pErrorCode) { 422 char *end; 423 424 /* get code point */ 425 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 426 end=(char *)u_skipWhitespace(end); 427 if(end<=fields[0][0] || end!=fields[0][1]) { 428 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); 429 *pErrorCode=U_PARSE_ERROR; 430 exit(U_PARSE_ERROR); 431 } 432 433 /* is this a complex mapping? */ 434 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { 435 /* there is some condition text in the fifth field */ 436 specialCasings[specialCasingCount].isComplex=TRUE; 437 438 /* do not store any actual mappings for this */ 439 specialCasings[specialCasingCount].lowerCase[0]=0; 440 specialCasings[specialCasingCount].upperCase[0]=0; 441 specialCasings[specialCasingCount].titleCase[0]=0; 442 } else { 443 /* just set the "complex" flag and get the case mappings */ 444 specialCasings[specialCasingCount].isComplex=FALSE; 445 specialCasings[specialCasingCount].lowerCase[0]= 446 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); 447 specialCasings[specialCasingCount].upperCase[0]= 448 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); 449 specialCasings[specialCasingCount].titleCase[0]= 450 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); 451 if(U_FAILURE(*pErrorCode)) { 452 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); 453 exit(*pErrorCode); 454 } 455 456 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); 457 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); 458 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); 459 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); 460 } 461 462 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { 463 fprintf(stderr, "gencase: too many special casing mappings\n"); 464 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 465 exit(U_INDEX_OUTOFBOUNDS_ERROR); 466 } 467 } 468 469 static int32_t U_CALLCONV 470 compareSpecialCasings(const void *context, const void *left, const void *right) { 471 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; 472 } 473 474 static void 475 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { 476 char *fields[5][2]; 477 int32_t i, j; 478 479 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 480 return; 481 } 482 483 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); 484 485 /* sort the special casing entries by code point */ 486 if(specialCasingCount>0) { 487 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), 488 compareSpecialCasings, NULL, FALSE, pErrorCode); 489 } 490 if(U_FAILURE(*pErrorCode)) { 491 return; 492 } 493 494 /* replace multiple entries for any code point by one "complex" one */ 495 j=0; 496 for(i=1; i<specialCasingCount; ++i) { 497 if(specialCasings[i-1].code==specialCasings[i].code) { 498 /* there is a duplicate code point */ 499 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */ 500 specialCasings[i].isComplex=TRUE; /* make the following one complex */ 501 specialCasings[i].lowerCase[0]=0; 502 specialCasings[i].upperCase[0]=0; 503 specialCasings[i].titleCase[0]=0; 504 ++j; 505 } 506 } 507 508 /* if some entries just were removed, then re-sort */ 509 if(j>0) { 510 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), 511 compareSpecialCasings, NULL, FALSE, pErrorCode); 512 specialCasingCount-=j; 513 } 514 if(U_FAILURE(*pErrorCode)) { 515 return; 516 } 517 518 /* 519 * Add one complex mapping to caseSensitive that was filtered out above: 520 * Greek final Sigma has a conditional mapping but not locale-sensitive, 521 * and it is taken when lowercasing just U+03A3 alone. 522 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 523 */ 524 uset_add(caseSensitive, 0x3c2); 525 } 526 527 /* parser for CaseFolding.txt ----------------------------------------------- */ 528 529 #define MAX_CASE_FOLDING_COUNT 2000 530 531 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; 532 static int32_t caseFoldingCount=0; 533 534 static void U_CALLCONV 535 caseFoldingLineFn(void *context, 536 char *fields[][2], int32_t fieldCount, 537 UErrorCode *pErrorCode) { 538 char *end; 539 static UChar32 prevCode=0; 540 int32_t count; 541 char status; 542 543 /* get code point */ 544 caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 545 end=(char *)u_skipWhitespace(end); 546 if(end<=fields[0][0] || end!=fields[0][1]) { 547 fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 548 *pErrorCode=U_PARSE_ERROR; 549 exit(U_PARSE_ERROR); 550 } 551 552 /* get the status of this mapping */ 553 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); 554 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { 555 fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 556 *pErrorCode=U_PARSE_ERROR; 557 exit(U_PARSE_ERROR); 558 } 559 560 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ 561 if(status=='L') { 562 return; 563 } 564 565 /* get the mapping */ 566 count=caseFoldings[caseFoldingCount].full[0]= 567 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); 568 if(U_FAILURE(*pErrorCode)) { 569 fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 570 exit(*pErrorCode); 571 } 572 573 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 574 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { 575 caseFoldings[caseFoldingCount].simple=0; 576 } 577 578 /* update the case-sensitive set */ 579 if(status!='T') { 580 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); 581 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); 582 } 583 584 /* check the status */ 585 if(status=='S') { 586 /* check if there was a full mapping for this code point before */ 587 if( caseFoldingCount>0 && 588 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && 589 caseFoldings[caseFoldingCount-1].status=='F' 590 ) { 591 /* merge the two entries */ 592 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; 593 return; 594 } 595 } else if(status=='F') { 596 /* check if there was a simple mapping for this code point before */ 597 if( caseFoldingCount>0 && 598 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && 599 caseFoldings[caseFoldingCount-1].status=='S' 600 ) { 601 /* merge the two entries */ 602 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); 603 return; 604 } 605 } else if(status=='I' || status=='T') { 606 /* check if there was a default mapping for this code point before (remove it) */ 607 while(caseFoldingCount>0 && 608 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code 609 ) { 610 prevCode=0; 611 --caseFoldingCount; 612 } 613 /* store only a marker for special handling for cases like dotless i */ 614 caseFoldings[caseFoldingCount].simple=0; 615 caseFoldings[caseFoldingCount].full[0]=0; 616 } 617 618 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ 619 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { 620 fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", 621 (unsigned long)caseFoldings[caseFoldingCount].code, 622 (unsigned long)prevCode); 623 *pErrorCode=U_PARSE_ERROR; 624 exit(U_PARSE_ERROR); 625 } 626 prevCode=caseFoldings[caseFoldingCount].code; 627 628 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { 629 fprintf(stderr, "gencase: too many case folding mappings\n"); 630 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 631 exit(U_INDEX_OUTOFBOUNDS_ERROR); 632 } 633 } 634 635 static void 636 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { 637 char *fields[3][2]; 638 639 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 640 return; 641 } 642 643 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); 644 } 645 646 /* parser for UnicodeData.txt ----------------------------------------------- */ 647 648 /* general categories */ 649 const char *const 650 genCategoryNames[U_CHAR_CATEGORY_COUNT]={ 651 "Cn", 652 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", 653 "Mc", "Nd", "Nl", "No", 654 "Zs", "Zl", "Zp", 655 "Cc", "Cf", "Co", "Cs", 656 "Pd", "Ps", "Pe", "Pc", "Po", 657 "Sm", "Sc", "Sk", "So", 658 "Pi", "Pf" 659 }; 660 661 static int32_t specialCasingIndex=0, caseFoldingIndex=0; 662 663 static void U_CALLCONV 664 unicodeDataLineFn(void *context, 665 char *fields[][2], int32_t fieldCount, 666 UErrorCode *pErrorCode) { 667 Props p; 668 char *end; 669 static UChar32 prevCode=0; 670 UChar32 value; 671 int32_t i; 672 673 /* reset the properties */ 674 uprv_memset(&p, 0, sizeof(Props)); 675 676 /* get the character code, field 0 */ 677 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); 678 if(end<=fields[0][0] || end!=fields[0][1]) { 679 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); 680 *pErrorCode=U_PARSE_ERROR; 681 exit(U_PARSE_ERROR); 682 } 683 684 /* get general category, field 2 */ 685 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); 686 if(i>=0) { 687 p.gc=(uint8_t)i; 688 } else { 689 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", 690 fields[2][0], (unsigned long)p.code); 691 *pErrorCode=U_PARSE_ERROR; 692 exit(U_PARSE_ERROR); 693 } 694 695 /* get canonical combining class, field 3 */ 696 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); 697 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { 698 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); 699 *pErrorCode=U_PARSE_ERROR; 700 exit(U_PARSE_ERROR); 701 } 702 p.cc=(uint8_t)value; 703 704 /* get uppercase mapping, field 12 */ 705 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); 706 if(end!=fields[12][1]) { 707 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", 708 (unsigned long)p.code); 709 *pErrorCode=U_PARSE_ERROR; 710 exit(U_PARSE_ERROR); 711 } 712 if(value!=0 && value!=p.code) { 713 p.upperCase=value; 714 uset_add(caseSensitive, p.code); 715 uset_add(caseSensitive, value); 716 } 717 718 /* get lowercase value, field 13 */ 719 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); 720 if(end!=fields[13][1]) { 721 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", 722 (unsigned long)p.code); 723 *pErrorCode=U_PARSE_ERROR; 724 exit(U_PARSE_ERROR); 725 } 726 if(value!=0 && value!=p.code) { 727 p.lowerCase=value; 728 uset_add(caseSensitive, p.code); 729 uset_add(caseSensitive, value); 730 } 731 732 /* get titlecase value, field 14 */ 733 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); 734 if(end!=fields[14][1]) { 735 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", 736 (unsigned long)p.code); 737 *pErrorCode=U_PARSE_ERROR; 738 exit(U_PARSE_ERROR); 739 } 740 if(value!=0 && value!=p.code) { 741 p.titleCase=value; 742 uset_add(caseSensitive, p.code); 743 uset_add(caseSensitive, value); 744 } 745 746 /* set additional properties from previously parsed files */ 747 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) { 748 p.specialCasing=specialCasings+specialCasingIndex++; 749 } else { 750 p.specialCasing=NULL; 751 } 752 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) { 753 p.caseFolding=caseFoldings+caseFoldingIndex++; 754 755 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */ 756 if( p.caseFolding->status=='C' && 757 p.caseFolding->simple==p.lowerCase 758 ) { 759 p.caseFolding=NULL; 760 } 761 } else { 762 p.caseFolding=NULL; 763 } 764 765 /* check for non-character code points */ 766 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { 767 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", 768 (unsigned long)p.code); 769 *pErrorCode=U_PARSE_ERROR; 770 exit(U_PARSE_ERROR); 771 } 772 773 /* check that the code points (p.code) are in ascending order */ 774 if(p.code<=prevCode && p.code>0) { 775 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", 776 (unsigned long)p.code, (unsigned long)prevCode); 777 *pErrorCode=U_PARSE_ERROR; 778 exit(U_PARSE_ERROR); 779 } 780 781 /* properties for a single code point */ 782 setProps(&p); 783 784 prevCode=p.code; 785 } 786 787 static void 788 parseDB(const char *filename, UErrorCode *pErrorCode) { 789 char *fields[15][2]; 790 UChar32 start, end; 791 int32_t i; 792 793 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 794 return; 795 } 796 797 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); 798 799 /* are all sub-properties consumed? */ 800 if(specialCasingIndex<specialCasingCount) { 801 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n"); 802 *pErrorCode=U_PARSE_ERROR; 803 exit(U_PARSE_ERROR); 804 } 805 if(caseFoldingIndex<caseFoldingCount) { 806 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n"); 807 *pErrorCode=U_PARSE_ERROR; 808 exit(U_PARSE_ERROR); 809 } 810 811 if(U_FAILURE(*pErrorCode)) { 812 return; 813 } 814 815 for(i=0; 816 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode); 817 ++i 818 ) { 819 addCaseSensitive(start, end); 820 } 821 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 822 *pErrorCode=U_ZERO_ERROR; 823 } 824 } 825 826 /* 827 * Hey, Emacs, please set the following: 828 * 829 * Local Variables: 830 * indent-tabs-mode: nil 831 * End: 832 * 833 */ 834