1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: genprops.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999dec08 14 * created by: Markus W. Scherer 15 * 16 * This program reads several of the Unicode character database text files, 17 * parses them, and extracts most of the properties for each character. 18 * It then writes a binary file containing the properties 19 * that is designed to be used directly for random-access to 20 * the properties of each Unicode character. 21 */ 22 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include "unicode/utypes.h" 26 #include "unicode/uchar.h" 27 #include "unicode/putil.h" 28 #include "unicode/uclean.h" 29 #include "cmemory.h" 30 #include "cstring.h" 31 #include "unewdata.h" 32 #include "uoptions.h" 33 #include "uparse.h" 34 #include "uprops.h" 35 #include "propsvec.h" 36 37 U_CDECL_BEGIN 38 #include "genprops.h" 39 U_CDECL_END 40 41 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 42 43 UBool beVerbose=FALSE, haveCopyright=TRUE; 44 45 /* prototypes --------------------------------------------------------------- */ 46 47 static void 48 parseDB(const char *filename, UErrorCode *pErrorCode); 49 50 /* -------------------------------------------------------------------------- */ 51 52 enum 53 { 54 HELP_H, 55 HELP_QUESTION_MARK, 56 VERBOSE, 57 COPYRIGHT, 58 DESTDIR, 59 SOURCEDIR, 60 UNICODE_VERSION, 61 ICUDATADIR, 62 CSOURCE 63 }; 64 65 /* Keep these values in sync with the above enums */ 66 static UOption options[]={ 67 UOPTION_HELP_H, 68 UOPTION_HELP_QUESTION_MARK, 69 UOPTION_VERBOSE, 70 UOPTION_COPYRIGHT, 71 UOPTION_DESTDIR, 72 UOPTION_SOURCEDIR, 73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 74 UOPTION_ICUDATADIR, 75 UOPTION_DEF("csource", 'C', UOPT_NO_ARG) 76 }; 77 78 extern int 79 main(int argc, char* argv[]) { 80 char filename[300]; 81 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; 82 char *basename=NULL; 83 UErrorCode errorCode=U_ZERO_ERROR; 84 85 U_MAIN_INIT_ARGS(argc, argv); 86 87 /* preset then read command line options */ 88 options[DESTDIR].value=u_getDataDirectory(); 89 options[SOURCEDIR].value=""; 90 options[UNICODE_VERSION].value=""; 91 options[ICUDATADIR].value=u_getDataDirectory(); 92 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 93 94 /* error handling, printing usage message */ 95 if(argc<0) { 96 fprintf(stderr, 97 "error in command line argument \"%s\"\n", 98 argv[-argc]); 99 } 100 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 101 /* 102 * Broken into chucks because the C89 standard says the minimum 103 * required supported string length is 509 bytes. 104 */ 105 fprintf(stderr, 106 "Usage: %s [-options] [suffix]\n" 107 "\n" 108 "read the UnicodeData.txt file and other Unicode properties files and\n" 109 "create a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" 110 "\n", 111 argv[0]); 112 fprintf(stderr, 113 "Options:\n" 114 "\t-h or -? or --help this usage text\n" 115 "\t-v or --verbose verbose output\n" 116 "\t-c or --copyright include a copyright notice\n" 117 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" 118 "\t-C or --csource generate a .c source file rather than the .icu binary\n"); 119 fprintf(stderr, 120 "\t-d or --destdir destination directory, followed by the path\n" 121 "\t-s or --sourcedir source directory, followed by the path\n" 122 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 123 "\t followed by path, defaults to %s\n" 124 "\tsuffix suffix that is to be appended with a '-'\n" 125 "\t to the source file basenames before opening;\n" 126 "\t 'genprops new' will read UnicodeData-new.txt etc.\n", 127 u_getDataDirectory()); 128 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 129 } 130 131 /* get the options values */ 132 beVerbose=options[VERBOSE].doesOccur; 133 haveCopyright=options[COPYRIGHT].doesOccur; 134 srcDir=options[SOURCEDIR].value; 135 destDir=options[DESTDIR].value; 136 137 if(argc>=2) { 138 suffix=argv[1]; 139 } else { 140 suffix=NULL; 141 } 142 143 if(options[UNICODE_VERSION].doesOccur) { 144 setUnicodeVersion(options[UNICODE_VERSION].value); 145 } 146 /* else use the default dataVersion in store.c */ 147 148 if (options[ICUDATADIR].doesOccur) { 149 u_setDataDirectory(options[ICUDATADIR].value); 150 } 151 152 /* prepare the filename beginning with the source dir */ 153 uprv_strcpy(filename, srcDir); 154 basename=filename+uprv_strlen(filename); 155 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 156 *basename++=U_FILE_SEP_CHAR; 157 } 158 159 /* initialize */ 160 initStore(); 161 162 /* process UnicodeData.txt */ 163 writeUCDFilename(basename, "UnicodeData", suffix); 164 parseDB(filename, &errorCode); 165 166 /* process additional properties files */ 167 *basename=0; 168 generateAdditionalProperties(filename, suffix, &errorCode); 169 170 /* process parsed data */ 171 if(U_SUCCESS(errorCode)) { 172 /* write the properties data file */ 173 generateData(destDir, options[CSOURCE].doesOccur); 174 } 175 176 exitStore(); 177 u_cleanup(); 178 return errorCode; 179 } 180 181 U_CFUNC void 182 writeUCDFilename(char *basename, const char *filename, const char *suffix) { 183 int32_t length=(int32_t)uprv_strlen(filename); 184 uprv_strcpy(basename, filename); 185 if(suffix!=NULL) { 186 basename[length++]='-'; 187 uprv_strcpy(basename+length, suffix); 188 length+=(int32_t)uprv_strlen(suffix); 189 } 190 uprv_strcpy(basename+length, ".txt"); 191 } 192 193 U_CFUNC UBool 194 isToken(const char *token, const char *s) { 195 const char *z; 196 int32_t j; 197 198 s=u_skipWhitespace(s); 199 for(j=0;; ++j) { 200 if(token[j]!=0) { 201 if(s[j]!=token[j]) { 202 break; 203 } 204 } else { 205 z=u_skipWhitespace(s+j); 206 if(*z==';' || *z==0) { 207 return TRUE; 208 } else { 209 break; 210 } 211 } 212 } 213 214 return FALSE; 215 } 216 217 U_CFUNC int32_t 218 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { 219 const char *t, *z; 220 int32_t i, j; 221 222 s=u_skipWhitespace(s); 223 for(i=0; i<countTokens; ++i) { 224 t=tokens[i]; 225 if(t!=NULL) { 226 for(j=0;; ++j) { 227 if(t[j]!=0) { 228 if(s[j]!=t[j]) { 229 break; 230 } 231 } else { 232 z=u_skipWhitespace(s+j); 233 if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') { 234 return i; 235 } else { 236 break; 237 } 238 } 239 } 240 } 241 } 242 return -1; 243 } 244 245 /* parser for UnicodeData.txt ----------------------------------------------- */ 246 247 /* general categories */ 248 const char *const 249 genCategoryNames[U_CHAR_CATEGORY_COUNT]={ 250 "Cn", 251 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", 252 "Mc", "Nd", "Nl", "No", 253 "Zs", "Zl", "Zp", 254 "Cc", "Cf", "Co", "Cs", 255 "Pd", "Ps", "Pe", "Pc", "Po", 256 "Sm", "Sc", "Sk", "So", 257 "Pi", "Pf" 258 }; 259 260 const char *const 261 decompositionTypeNames[U_DT_COUNT]={ 262 NULL, 263 NULL, 264 "compat", 265 "circle", 266 "final", 267 "font", 268 "fraction", 269 "initial", 270 "isolated", 271 "medial", 272 "narrow", 273 "noBreak", 274 "small", 275 "square", 276 "sub", 277 "super", 278 "vertical", 279 "wide" 280 }; 281 282 static struct { 283 uint32_t first, last, props; 284 char name[80]; 285 } unicodeAreas[32]; 286 287 static int32_t unicodeAreaIndex=0; 288 289 static void U_CALLCONV 290 unicodeDataLineFn(void *context, 291 char *fields[][2], int32_t fieldCount, 292 UErrorCode *pErrorCode) { 293 Props p; 294 char *end; 295 static uint32_t prevCode=0; 296 uint32_t value; 297 int32_t i; 298 299 /* reset the properties */ 300 uprv_memset(&p, 0, sizeof(Props)); 301 302 /* get the character code, field 0 */ 303 p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); 304 if(end<=fields[0][0] || end!=fields[0][1]) { 305 fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]); 306 *pErrorCode=U_PARSE_ERROR; 307 exit(U_PARSE_ERROR); 308 } 309 310 /* get general category, field 2 */ 311 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); 312 if(i>=0) { 313 p.generalCategory=(uint8_t)i; 314 } else { 315 fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", 316 fields[2][0], (unsigned long)p.code); 317 *pErrorCode=U_PARSE_ERROR; 318 exit(U_PARSE_ERROR); 319 } 320 321 /* get decomposition type, field 5 */ 322 if(fields[5][0]<fields[5][1]) { 323 /* there is some decomposition */ 324 if(*fields[5][0]!='<') { 325 /* canonical */ 326 i=U_DT_CANONICAL; 327 } else { 328 /* get compatibility type */ 329 end=fields[5][0]+1; 330 while(end<fields[5][1] && *end!='>') { 331 ++end; 332 } 333 *end='#'; 334 i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1); 335 if(i<0) { 336 fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n", 337 fields[5][0], (unsigned long)p.code); 338 *pErrorCode=U_PARSE_ERROR; 339 exit(U_PARSE_ERROR); 340 } 341 } 342 upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode); 343 if(U_FAILURE(*pErrorCode)) { 344 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode)); 345 exit(*pErrorCode); 346 } 347 } 348 349 /* decimal digit value, field 6 */ 350 if(fields[6][0]<fields[6][1]) { 351 value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10); 352 if(end!=fields[6][1] || value>0x7fff) { 353 fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", 354 (unsigned long)p.code); 355 *pErrorCode=U_PARSE_ERROR; 356 exit(U_PARSE_ERROR); 357 } 358 p.numericValue=(int32_t)value; 359 p.numericType=1; 360 } 361 362 /* digit value, field 7 */ 363 if(fields[7][0]<fields[7][1]) { 364 value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10); 365 if(end!=fields[7][1] || value>0x7fff) { 366 fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n", 367 (unsigned long)p.code); 368 *pErrorCode=U_PARSE_ERROR; 369 exit(U_PARSE_ERROR); 370 } 371 if(p.numericType==0) { 372 p.numericValue=(int32_t)value; 373 p.numericType=2; 374 } else if((int32_t)value!=p.numericValue) { 375 fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n", 376 (unsigned long)p.code); 377 *pErrorCode=U_PARSE_ERROR; 378 exit(U_PARSE_ERROR); 379 } 380 } 381 382 /* numeric value, field 8 */ 383 if(fields[8][0]<fields[8][1]) { 384 char *s=fields[8][0]; 385 UBool isNegative; 386 387 /* get a possible minus sign */ 388 if(*s=='-') { 389 isNegative=TRUE; 390 ++s; 391 } else { 392 isNegative=FALSE; 393 } 394 395 value=(uint32_t)uprv_strtoul(s, &end, 10); 396 if(value>0 && *end=='/') { 397 /* field 8 may contain a fractional value, get the denominator */ 398 if(p.numericType>0) { 399 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n", 400 (unsigned long)p.code); 401 *pErrorCode=U_PARSE_ERROR; 402 exit(U_PARSE_ERROR); 403 } 404 405 p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10); 406 if(p.denominator==0) { 407 fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n", 408 (unsigned long)p.code); 409 *pErrorCode=U_PARSE_ERROR; 410 exit(U_PARSE_ERROR); 411 } 412 } 413 if(end!=fields[8][1] || value>0x7fffffff) { 414 fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", 415 (unsigned long)p.code); 416 *pErrorCode=U_PARSE_ERROR; 417 exit(U_PARSE_ERROR); 418 } 419 420 if(p.numericType==0) { 421 if(isNegative) { 422 p.numericValue=-(int32_t)value; 423 } else { 424 p.numericValue=(int32_t)value; 425 } 426 p.numericType=3; 427 } else if((int32_t)value!=p.numericValue) { 428 fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n", 429 (unsigned long)p.code); 430 *pErrorCode=U_PARSE_ERROR; 431 exit(U_PARSE_ERROR); 432 } 433 } 434 435 value=makeProps(&p); 436 437 if(*fields[1][0]=='<') { 438 /* first or last entry of a Unicode area */ 439 size_t length=fields[1][1]-fields[1][0]; 440 441 if(length<9) { 442 /* name too short for an area name */ 443 } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) { 444 /* set the current area */ 445 if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) { 446 length-=9; 447 unicodeAreas[unicodeAreaIndex].first=p.code; 448 unicodeAreas[unicodeAreaIndex].props=value; 449 uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length); 450 unicodeAreas[unicodeAreaIndex].name[length]=0; 451 } else { 452 /* error: a previous area is incomplete */ 453 fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name); 454 *pErrorCode=U_PARSE_ERROR; 455 exit(U_PARSE_ERROR); 456 } 457 return; 458 } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) { 459 /* check that the current area matches, and complete it with the last code point */ 460 length-=8; 461 if( unicodeAreas[unicodeAreaIndex].props==value && 462 0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) && 463 unicodeAreas[unicodeAreaIndex].name[length]==0 && 464 unicodeAreas[unicodeAreaIndex].first<p.code 465 ) { 466 unicodeAreas[unicodeAreaIndex].last=p.code; 467 if(beVerbose) { 468 printf("Unicode area U+%04lx..U+%04lx \"%s\"\n", 469 (unsigned long)unicodeAreas[unicodeAreaIndex].first, 470 (unsigned long)unicodeAreas[unicodeAreaIndex].last, 471 unicodeAreas[unicodeAreaIndex].name); 472 } 473 unicodeAreas[++unicodeAreaIndex].first=0xffffffff; 474 } else { 475 /* error: different properties between first & last, different area name, first>=last */ 476 fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name); 477 *pErrorCode=U_PARSE_ERROR; 478 exit(U_PARSE_ERROR); 479 } 480 return; 481 } else { 482 /* not an area name */ 483 } 484 } 485 486 /* check for non-character code points */ 487 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { 488 fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n", 489 (unsigned long)p.code); 490 *pErrorCode=U_PARSE_ERROR; 491 exit(U_PARSE_ERROR); 492 } 493 494 /* check that the code points (p.code) are in ascending order */ 495 if(p.code<=prevCode && p.code>0) { 496 fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", 497 (unsigned long)p.code, (unsigned long)prevCode); 498 *pErrorCode=U_PARSE_ERROR; 499 exit(U_PARSE_ERROR); 500 } 501 prevCode=p.code; 502 503 /* properties for a single code point */ 504 addProps(p.code, value); 505 } 506 507 /* set repeated properties for the areas */ 508 static void 509 repeatAreaProps() { 510 uint32_t puaProps; 511 int32_t i; 512 UBool hasPlane15PUA, hasPlane16PUA; 513 UErrorCode errorCode; 514 515 /* 516 * UnicodeData.txt before 3.0.1 did not contain the PUAs on 517 * planes 15 and 16. 518 * If that is the case, then we add them here, using the properties 519 * from the BMP PUA. 520 */ 521 puaProps=0; 522 hasPlane15PUA=hasPlane16PUA=FALSE; 523 524 for(i=0; i<unicodeAreaIndex; ++i) { 525 repeatProps(unicodeAreas[i].first, 526 unicodeAreas[i].last, 527 unicodeAreas[i].props); 528 if(unicodeAreas[i].first==0xe000) { 529 puaProps=unicodeAreas[i].props; 530 } else if(unicodeAreas[i].first==0xf0000) { 531 hasPlane15PUA=TRUE; 532 } else if(unicodeAreas[i].first==0x100000) { 533 hasPlane16PUA=TRUE; 534 } 535 } 536 537 if(puaProps!=0) { 538 if(!hasPlane15PUA) { 539 repeatProps(0xf0000, 0xffffd, puaProps); 540 } 541 if(!hasPlane16PUA) { 542 repeatProps(0x100000, 0x10fffd, puaProps); 543 } 544 } 545 546 /* Hangul have canonical decompositions */ 547 errorCode=U_ZERO_ERROR; 548 upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode); 549 if(U_FAILURE(errorCode)) { 550 fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode)); 551 exit(errorCode); 552 } 553 } 554 555 static void 556 parseDB(const char *filename, UErrorCode *pErrorCode) { 557 char *fields[15][2]; 558 559 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 560 return; 561 } 562 563 /* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */ 564 unicodeAreas[0].first=0xffffffff; 565 566 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); 567 568 if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) { 569 fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n", 570 unicodeAreas[unicodeAreaIndex].name, 571 (unsigned long)unicodeAreas[unicodeAreaIndex].first); 572 *pErrorCode=U_PARSE_ERROR; 573 exit(U_PARSE_ERROR); 574 } 575 576 repeatAreaProps(); 577 578 if(U_FAILURE(*pErrorCode)) { 579 return; 580 } 581 } 582 583 /* 584 * Hey, Emacs, please set the following: 585 * 586 * Local Variables: 587 * indent-tabs-mode: nil 588 * End: 589 * 590 */ 591