1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: gennames.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999sep30 14 * created by: Markus W. Scherer 15 * 16 * This program reads the Unicode character database text file, 17 * parses it, and extracts the character code, 18 * the "modern" character name, and optionally the 19 * Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment. 20 * It then tokenizes and compresses the names and builds 21 * compact binary tables for random-access lookup 22 * in a u_charName() API function. 23 * 24 * unames.icu file format (after UDataInfo header etc. - see udata.c) 25 * (all data is static const) 26 * 27 * UDataInfo fields: 28 * dataFormat "unam" 29 * formatVersion 1.0 30 * dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0 31 * 32 * -- data-based names 33 * uint32_t tokenStringOffset, 34 * groupsOffset, 35 * groupStringOffset, 36 * algNamesOffset; 37 * 38 * uint16_t tokenCount; 39 * uint16_t tokenTable[tokenCount]; 40 * 41 * char tokenStrings[]; -- padded to even count 42 * 43 * -- strings (groupStrings) are tokenized as follows: 44 * for each character c 45 * if(c>=tokenCount) write that character c directly 46 * else 47 * token=tokenTable[c]; 48 * if(token==0xfffe) -- lead byte of double-byte token 49 * token=tokenTable[c<<8|next character]; 50 * if(token==-1) 51 * write c directly 52 * else 53 * tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;) 54 * append zero-terminated tokenString; 55 * 56 * Different strings for a code point - normal name, 1.0 name, and ISO comment - 57 * are separated by ';'. 58 * 59 * uint16_t groupCount; 60 * struct { 61 * uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5 62 * uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset 63 * uint16_t offsetLow; 64 * } groupTable[groupCount]; 65 * 66 * char groupStrings[]; -- padded to 4-count 67 * 68 * -- The actual, tokenized group strings are not zero-terminated because 69 * that would take up too much space. 70 * Instead, they are preceeded by their length, written in a variable-length sequence: 71 * For each of the 32 group strings, one or two nibbles are stored for its length. 72 * Nibbles (4-bit values, half-bytes) are read MSB first. 73 * A nibble with a value of 0..11 directly indicates the length of the name string. 74 * A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m 75 * by (((n-12)<<4)|m)+12, reaching values of 12..75. 76 * These lengths are sequentially for each tokenized string, not for the de-tokenized result. 77 * For the de-tokenizing, see token description above; the strings immediately follow the 78 * 32 lengths. 79 * 80 * -- algorithmic names 81 * 82 * typedef struct AlgorithmicRange { 83 * uint32_t rangeStart, rangeEnd; 84 * uint8_t algorithmType, algorithmVariant; 85 * uint16_t rangeSize; 86 * } AlgorithmicRange; 87 * 88 * uint32_t algRangesCount; -- number of data blocks for ranges of 89 * algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames) 90 * 91 * struct { 92 * AlgorithmicRange algRange; 93 * uint8_t algRangeData[]; -- padded to 4-count except in last range 94 * } algRanges[algNamesCount]; 95 * -- not a real array because each part has a different size 96 * of algRange.rangeSize (including AlgorithmicRange) 97 * 98 * -- algorithmic range types: 99 * 100 * 0 Names are formed from a string prefix that is stored in 101 * the algRangeData (zero-terminated), followed by the Unicode code point 102 * of the character in hexadecimal digits; 103 * algRange.algorithmVariant digits are written 104 * 105 * 1 Names are formed by calculating modulo-factors of the code point value as follows: 106 * algRange.algorithmVariant is the count of modulo factors 107 * algRangeData contains 108 * uint16_t factors[algRange.algorithmVariant]; 109 * char strings[]; 110 * the first zero-terminated string is written as the prefix; then: 111 * 112 * The rangeStart is subtracted; with the difference, here "code": 113 * for(i=algRange.algorithmVariant-1 to 0 step -1) 114 * index[i]=code%factor[i]; 115 * code/=factor[i]; 116 * 117 * The strings after the prefix are short pieces that are then appended to the result 118 * according to index[0..algRange.algorithmVariant-1]. 119 */ 120 121 #include <stdio.h> 122 #include "unicode/utypes.h" 123 #include "unicode/putil.h" 124 #include "unicode/uclean.h" 125 #include "unicode/udata.h" 126 #include "cmemory.h" 127 #include "cstring.h" 128 #include "uarrsort.h" 129 #include "unewdata.h" 130 #include "uoptions.h" 131 #include "uparse.h" 132 133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 134 135 #define STRING_STORE_SIZE 1000000 136 #define GROUP_STORE_SIZE 5000 137 138 #define GROUP_SHIFT 5 139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT) 140 #define GROUP_MASK (LINES_PER_GROUP-1) 141 142 #define MAX_LINE_COUNT 50000 143 #define MAX_WORD_COUNT 20000 144 #define MAX_GROUP_COUNT 5000 145 146 #define DATA_NAME "unames" 147 #define DATA_TYPE "icu" 148 #define VERSION_STRING "unam" 149 #define NAME_SEPARATOR_CHAR ';' 150 151 #define ISO_DATA_NAME "ucomment" 152 153 /* Unicode versions --------------------------------------------------------- */ 154 155 enum { 156 UNI_1_0, 157 UNI_1_1, 158 UNI_2_0, 159 UNI_3_0, 160 UNI_3_1, 161 UNI_3_2, 162 UNI_4_0, 163 UNI_4_0_1, 164 UNI_4_1, 165 UNI_5_0, 166 UNI_5_1, 167 UNI_5_2, 168 UNI_VER_COUNT 169 }; 170 171 static const UVersionInfo 172 unicodeVersions[]={ 173 { 1, 0, 0, 0 }, 174 { 1, 1, 0, 0 }, 175 { 2, 0, 0, 0 }, 176 { 3, 0, 0, 0 }, 177 { 3, 1, 0, 0 }, 178 { 3, 2, 0, 0 }, 179 { 4, 0, 0, 0 }, 180 { 4, 0, 1, 0 }, 181 { 4, 1, 0, 0 }, 182 { 5, 0, 0, 0 }, 183 { 5, 1, 0, 0 }, 184 { 5, 2, 0, 0 } 185 }; 186 187 static int32_t ucdVersion=UNI_5_2; 188 189 static int32_t 190 findUnicodeVersion(const UVersionInfo version) { 191 int32_t i; 192 193 for(i=0; /* while(version>unicodeVersions[i]) {} */ 194 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; 195 ++i) {} 196 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { 197 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ 198 } 199 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ 200 } 201 202 /* generator data ----------------------------------------------------------- */ 203 204 /* UDataInfo cf. udata.h */ 205 static UDataInfo dataInfo={ 206 sizeof(UDataInfo), 207 0, 208 209 U_IS_BIG_ENDIAN, 210 U_CHARSET_FAMILY, 211 sizeof(UChar), 212 0, 213 214 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */ 215 {1, 0, 0, 0}, /* formatVersion */ 216 {3, 0, 0, 0} /* dataVersion */ 217 }; 218 219 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE; 220 221 typedef struct Options { 222 UBool storeNames; 223 UBool store10Names; 224 UBool storeISOComments; 225 } Options; 226 227 /* 228 * Pair of code point and name alias. 229 * Try to keep sizeof(CpNameAlias) a multiple of 4 to avoid padding. 230 */ 231 typedef struct CpNameAlias { 232 uint32_t code; 233 char nameAlias[124]; 234 } CpNameAlias; 235 236 static CpNameAlias cpNameAliases[50]; 237 238 static uint32_t cpNameAliasesIndex=0, cpNameAliasesTop=0; 239 240 static uint8_t stringStore[STRING_STORE_SIZE], 241 groupStore[GROUP_STORE_SIZE], 242 lineLengths[LINES_PER_GROUP]; 243 244 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop; 245 246 typedef struct { 247 uint32_t code; 248 int16_t length; 249 uint8_t *s; 250 } Line; 251 252 typedef struct { 253 int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */ 254 int16_t count; 255 int16_t length; 256 uint8_t *s; 257 } Word; 258 259 static Line lines[MAX_LINE_COUNT]; 260 static Word words[MAX_WORD_COUNT]; 261 262 static uint32_t lineCount=0, wordCount=0; 263 264 static int16_t leadByteCount; 265 266 #define LEADBYTE_LIMIT 16 267 268 static int16_t tokens[LEADBYTE_LIMIT*256]; 269 static uint32_t tokenCount; 270 271 /* prototypes --------------------------------------------------------------- */ 272 273 static void 274 init(void); 275 276 static void 277 parseNameAliases(const char *filename, Options *options); 278 279 static void 280 parseDB(const char *filename, Options *options); 281 282 static void 283 parseName(char *name, int16_t length); 284 285 static int16_t 286 skipNoise(char *line, int16_t start, int16_t limit); 287 288 static int16_t 289 getWord(char *line, int16_t start, int16_t limit); 290 291 static void 292 compress(void); 293 294 static void 295 compressLines(void); 296 297 static int16_t 298 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop); 299 300 static int32_t 301 compareWords(const void *context, const void *word1, const void *word2); 302 303 static void 304 generateData(const char *dataDir, Options *options); 305 306 static uint32_t 307 generateAlgorithmicData(UNewDataMemory *pData, Options *options); 308 309 static int16_t 310 findToken(uint8_t *s, int16_t length); 311 312 static Word * 313 findWord(char *s, int16_t length); 314 315 static Word * 316 addWord(char *s, int16_t length); 317 318 static void 319 countWord(Word *word); 320 321 static void 322 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count); 323 324 static void 325 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length); 326 327 static uint32_t 328 addToken(uint8_t *s, int16_t length); 329 330 static void 331 appendLineLength(int16_t length); 332 333 static void 334 appendLineLengthNibble(uint8_t nibble); 335 336 static uint8_t * 337 allocLine(int32_t length); 338 339 static uint8_t * 340 allocWord(uint32_t length); 341 342 /* -------------------------------------------------------------------------- */ 343 344 enum { 345 HELP_H, 346 HELP_QUESTION_MARK, 347 VERBOSE, 348 QUIET, 349 COPYRIGHT, 350 DESTDIR, 351 UNICODE, 352 UNICODE1_NAMES, 353 NO_ISO_COMMENTS, 354 ONLY_ISO_COMMENTS 355 }; 356 357 static UOption options[]={ 358 UOPTION_HELP_H, 359 UOPTION_HELP_QUESTION_MARK, 360 UOPTION_VERBOSE, 361 UOPTION_QUIET, 362 UOPTION_COPYRIGHT, 363 UOPTION_DESTDIR, 364 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, 365 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }, 366 { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, 367 { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } 368 }; 369 370 extern int 371 main(int argc, char* argv[]) { 372 UVersionInfo version; 373 Options moreOptions={ TRUE, FALSE, TRUE }; 374 UErrorCode errorCode = U_ZERO_ERROR; 375 376 U_MAIN_INIT_ARGS(argc, argv); 377 378 /* Initialize ICU */ 379 u_init(&errorCode); 380 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) { 381 /* Note: u_init() will try to open ICU property data. 382 * failures here are expected when building ICU from scratch. 383 * ignore them. 384 */ 385 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n", 386 argv[0], u_errorName(errorCode)); 387 exit(1); 388 } 389 390 /* preset then read command line options */ 391 options[DESTDIR].value=u_getDataDirectory(); 392 options[UNICODE].value="4.1"; 393 argc=u_parseArgs(argc, argv, LENGTHOF(options), options); 394 395 /* error handling, printing usage message */ 396 if(argc<0) { 397 fprintf(stderr, 398 "error in command line argument \"%s\"\n", 399 argv[-argc]); 400 } else if(argc<2) { 401 argc=-1; 402 } 403 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 404 /* 405 * Broken into chucks because the C89 standard says the minimum 406 * required supported string length is 509 bytes. 407 */ 408 fprintf(stderr, 409 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] [filename_ud [filename_na]]\n" 410 "\n" 411 "Read the UnicodeData.txt file and \n" 412 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n" 413 "\n" 414 "\tfilename_ud absolute path/filename for the UnicodeData.txt file\n" 415 "\t (default: standard input)\n" 416 "\tfilename_na absolute path/filename for the NameAliases.txt file\n" 417 "\t (default: no name aliases)\n" 418 "\n", 419 argv[0]); 420 fprintf(stderr, 421 "Options:\n" 422 "\t-h or -? or --help this usage text\n" 423 "\t-v or --verbose verbose output\n" 424 "\t-q or --quiet no output\n" 425 "\t-c or --copyright include a copyright notice\n" 426 "\t-d or --destdir destination directory, followed by the path\n" 427 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); 428 fprintf(stderr, 429 "\t-1 or --unicode1-names store Unicode 1.0 character names\n" 430 "\t --no-iso-comments do not store ISO comments\n" 431 "\t --only-iso-comments write ucomment.icu with only ISO comments\n"); 432 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 433 } 434 435 /* get the options values */ 436 beVerbose=options[VERBOSE].doesOccur; 437 beQuiet=options[QUIET].doesOccur; 438 haveCopyright=options[COPYRIGHT].doesOccur; 439 moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur; 440 moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur; 441 if(options[ONLY_ISO_COMMENTS].doesOccur) { 442 moreOptions.storeNames=moreOptions.store10Names=FALSE; 443 moreOptions.storeISOComments=TRUE; 444 } 445 446 /* set the Unicode version */ 447 u_versionFromString(version, options[UNICODE].value); 448 uprv_memcpy(dataInfo.dataVersion, version, 4); 449 ucdVersion=findUnicodeVersion(version); 450 451 init(); 452 if(argc>=3) { 453 parseNameAliases(argv[2], &moreOptions); 454 } 455 parseDB(argc>=2 ? argv[1] : "-", &moreOptions); 456 compress(); 457 generateData(options[DESTDIR].value, &moreOptions); 458 459 u_cleanup(); 460 return 0; 461 } 462 463 static void 464 init() { 465 int i; 466 467 for(i=0; i<256; ++i) { 468 tokens[i]=0; 469 } 470 } 471 472 /* parsing ------------------------------------------------------------------ */ 473 474 /* get a name, strip leading and trailing whitespace */ 475 static int16_t 476 getName(char **pStart, char *limit) { 477 /* strip leading whitespace */ 478 char *start=(char *)u_skipWhitespace(*pStart); 479 480 /* strip trailing whitespace */ 481 while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) { 482 --limit; 483 } 484 485 /* return results */ 486 *pStart=start; 487 return (int16_t)(limit-start); 488 } 489 490 static void U_CALLCONV 491 nameAliasesLineFn(void *context, 492 char *fields[][2], int32_t fieldCount, 493 UErrorCode *pErrorCode) { 494 char *name; 495 int16_t length=0; 496 static uint32_t prevCode=0; 497 uint32_t code=0; 498 499 if(U_FAILURE(*pErrorCode)) { 500 return; 501 } 502 /* get the character code */ 503 code=uprv_strtoul(fields[0][0], NULL, 16); 504 505 /* get the character name */ 506 name=fields[1][0]; 507 length=getName(&name, fields[1][1]); 508 if(length==0 || length>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) { 509 fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n", 510 name, (unsigned long)code); 511 *pErrorCode=U_PARSE_ERROR; 512 exit(U_PARSE_ERROR); 513 } 514 515 /* check for non-character code points */ 516 if(!U_IS_UNICODE_CHAR(code)) { 517 fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n", 518 (unsigned long)code); 519 *pErrorCode=U_PARSE_ERROR; 520 exit(U_PARSE_ERROR); 521 } 522 523 /* check that the code points (code) are in ascending order */ 524 if(code<=prevCode && code>0) { 525 fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n", 526 (unsigned long)code, (unsigned long)prevCode); 527 *pErrorCode=U_PARSE_ERROR; 528 exit(U_PARSE_ERROR); 529 } 530 prevCode=code; 531 532 if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) { 533 fprintf(stderr, "gennames: error - too many name aliases\n"); 534 *pErrorCode=U_PARSE_ERROR; 535 exit(U_PARSE_ERROR); 536 } 537 cpNameAliases[cpNameAliasesTop].code=code; 538 uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length); 539 cpNameAliases[cpNameAliasesTop].nameAlias[length]=0; 540 ++cpNameAliasesTop; 541 542 parseName(name, length); 543 } 544 545 static void U_CALLCONV 546 lineFn(void *context, 547 char *fields[][2], int32_t fieldCount, 548 UErrorCode *pErrorCode) { 549 Options *storeOptions=(Options *)context; 550 char *names[4]; 551 int16_t lengths[4]={ 0, 0, 0, 0 }; 552 static uint32_t prevCode=0; 553 uint32_t code=0; 554 555 if(U_FAILURE(*pErrorCode)) { 556 return; 557 } 558 /* get the character code */ 559 code=uprv_strtoul(fields[0][0], NULL, 16); 560 561 /* get the character name */ 562 if(storeOptions->storeNames) { 563 names[0]=fields[1][0]; 564 lengths[0]=getName(names+0, fields[1][1]); 565 if(names[0][0]=='<') { 566 /* do not store pseudo-names in <> brackets */ 567 lengths[0]=0; 568 } 569 } 570 571 /* store 1.0 names */ 572 /* get the second character name, the one from Unicode 1.0 */ 573 if(storeOptions->store10Names) { 574 names[1]=fields[10][0]; 575 lengths[1]=getName(names+1, fields[10][1]); 576 if(names[1][0]=='<') { 577 /* do not store pseudo-names in <> brackets */ 578 lengths[1]=0; 579 } 580 } 581 582 /* get the ISO 10646 comment */ 583 if(storeOptions->storeISOComments) { 584 names[2]=fields[11][0]; 585 lengths[2]=getName(names+2, fields[11][1]); 586 } 587 588 if(lengths[0]+lengths[1]+lengths[2]==0) { 589 return; 590 } 591 592 /* check for non-character code points */ 593 if(!U_IS_UNICODE_CHAR(code)) { 594 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n", 595 (unsigned long)code); 596 *pErrorCode=U_PARSE_ERROR; 597 exit(U_PARSE_ERROR); 598 } 599 600 /* check that the code points (code) are in ascending order */ 601 if(code<=prevCode && code>0) { 602 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", 603 (unsigned long)code, (unsigned long)prevCode); 604 *pErrorCode=U_PARSE_ERROR; 605 exit(U_PARSE_ERROR); 606 } 607 prevCode=code; 608 609 parseName(names[0], lengths[0]); 610 parseName(names[1], lengths[1]); 611 parseName(names[2], lengths[2]); 612 613 if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) { 614 if(code==cpNameAliases[cpNameAliasesIndex].code) { 615 names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias; 616 lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias); 617 ++cpNameAliasesIndex; 618 } else { 619 fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n", 620 (unsigned long)code); 621 *pErrorCode=U_PARSE_ERROR; 622 exit(U_PARSE_ERROR); 623 } 624 } 625 626 /* 627 * set the count argument to 628 * 1: only store regular names, or only store ISO 10646 comments 629 * 2: store regular and 1.0 names 630 * 3: store names and ISO 10646 comment 631 * 4: also store name alias 632 * 633 * addLine() will ignore empty trailing names 634 */ 635 if(storeOptions->storeNames) { 636 /* store names and comments as parsed according to storeOptions */ 637 addLine(code, names, lengths, LENGTHOF(names)); 638 } else { 639 /* store only ISO 10646 comments */ 640 addLine(code, names+2, lengths+2, 1); 641 } 642 } 643 644 static void 645 parseNameAliases(const char *filename, Options *storeOptions) { 646 char *fields[2][2]; 647 UErrorCode errorCode=U_ZERO_ERROR; 648 649 if(!storeOptions->storeNames) { 650 return; 651 } 652 u_parseDelimitedFile(filename, ';', fields, 2, nameAliasesLineFn, NULL, &errorCode); 653 if(U_FAILURE(errorCode)) { 654 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); 655 exit(errorCode); 656 } 657 658 if(!beQuiet) { 659 printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop); 660 } 661 } 662 663 static void 664 parseDB(const char *filename, Options *storeOptions) { 665 char *fields[15][2]; 666 UErrorCode errorCode=U_ZERO_ERROR; 667 668 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode); 669 if(U_FAILURE(errorCode)) { 670 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); 671 exit(errorCode); 672 } 673 if(cpNameAliasesIndex<cpNameAliasesTop) { 674 fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n", 675 (unsigned long)cpNameAliases[cpNameAliasesIndex].code); 676 exit(U_PARSE_ERROR); 677 } 678 679 if(!beQuiet) { 680 printf("size of all names in the database: %lu\n", 681 (unsigned long)lineTop); 682 printf("number of named Unicode characters: %lu\n", 683 (unsigned long)lineCount); 684 printf("number of words in the dictionary from these names: %lu\n", 685 (unsigned long)wordCount); 686 } 687 } 688 689 static void 690 parseName(char *name, int16_t length) { 691 int16_t start=0, limit, wordLength/*, prevStart=-1*/; 692 Word *word; 693 694 while(start<length) { 695 /* skip any "noise" characters */ 696 limit=skipNoise(name, start, length); 697 if(start<limit) { 698 /*prevStart=-1;*/ 699 start=limit; 700 } 701 if(start==length) { 702 break; 703 } 704 705 /* get a word and add it if it is longer than 1 */ 706 limit=getWord(name, start, length); 707 wordLength=(int16_t)(limit-start); 708 if(wordLength>1) { 709 word=findWord(name+start, wordLength); 710 if(word==NULL) { 711 word=addWord(name+start, wordLength); 712 } 713 countWord(word); 714 } 715 716 #if 0 717 /* 718 * if there was a word before this 719 * (with no noise in between), then add the pair of words, too 720 */ 721 if(prevStart!=-1) { 722 wordLength=limit-prevStart; 723 word=findWord(name+prevStart, wordLength); 724 if(word==NULL) { 725 word=addWord(name+prevStart, wordLength); 726 } 727 countWord(word); 728 } 729 #endif 730 731 /*prevStart=start;*/ 732 start=limit; 733 } 734 } 735 736 static UBool U_INLINE 737 isWordChar(char c) { 738 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */ 739 ('J'<=c && c<='R') || 740 ('S'<=c && c<='Z') || 741 742 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */ 743 ('j'<=c && c<='r') || 744 ('s'<=c && c<='z') || 745 746 ('0'<=c && c<='9'); 747 } 748 749 static int16_t 750 skipNoise(char *line, int16_t start, int16_t limit) { 751 /* skip anything that is not part of a word in this sense */ 752 while(start<limit && !isWordChar(line[start])) { 753 ++start; 754 } 755 756 return start; 757 } 758 759 static int16_t 760 getWord(char *line, int16_t start, int16_t limit) { 761 char c=0; /* initialize to avoid a compiler warning although the code was safe */ 762 763 /* a unicode character name word consists of A-Z0-9 */ 764 while(start<limit && isWordChar(line[start])) { 765 ++start; 766 } 767 768 /* include a following space or dash */ 769 if(start<limit && ((c=line[start])==' ' || c=='-')) { 770 ++start; 771 } 772 773 return start; 774 } 775 776 /* compressing -------------------------------------------------------------- */ 777 778 static void 779 compress() { 780 uint32_t i, letterCount; 781 int16_t wordNumber; 782 UErrorCode errorCode; 783 784 /* sort the words in reverse order by weight */ 785 errorCode=U_ZERO_ERROR; 786 uprv_sortArray(words, wordCount, sizeof(Word), 787 compareWords, NULL, FALSE, &errorCode); 788 789 /* remove the words that do not save anything */ 790 while(wordCount>0 && words[wordCount-1].weight<1) { 791 --wordCount; 792 } 793 794 /* count the letters in the token range */ 795 letterCount=0; 796 for(i=LEADBYTE_LIMIT; i<256; ++i) { 797 if(tokens[i]==-1) { 798 ++letterCount; 799 } 800 } 801 if(!beQuiet) { 802 printf("number of letters used in the names: %d\n", (int)letterCount); 803 } 804 805 /* do we need double-byte tokens? */ 806 if(wordCount+letterCount<=256) { 807 /* no, single-byte tokens are enough */ 808 leadByteCount=0; 809 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { 810 if(tokens[i]!=-1) { 811 tokens[i]=wordNumber; 812 if(beVerbose) { 813 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 814 (int)i, (long)words[wordNumber].weight, 815 words[wordNumber].length, words[wordNumber].s); 816 } 817 ++wordNumber; 818 } 819 } 820 tokenCount=i; 821 } else { 822 /* 823 * The tokens that need two token bytes 824 * get their weight reduced by their count 825 * because they save less. 826 */ 827 tokenCount=256-letterCount; 828 for(i=tokenCount; i<wordCount; ++i) { 829 words[i].weight-=words[i].count; 830 } 831 832 /* sort these words in reverse order by weight */ 833 errorCode=U_ZERO_ERROR; 834 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word), 835 compareWords, NULL, FALSE, &errorCode); 836 837 /* remove the words that do not save anything */ 838 while(wordCount>0 && words[wordCount-1].weight<1) { 839 --wordCount; 840 } 841 842 /* how many tokens and lead bytes do we have now? */ 843 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1); 844 /* 845 * adjust upwards to take into account that 846 * double-byte tokens must not 847 * use NAME_SEPARATOR_CHAR as a second byte 848 */ 849 tokenCount+=(tokenCount-256+254)/255; 850 851 leadByteCount=(int16_t)(tokenCount>>8); 852 if(leadByteCount<LEADBYTE_LIMIT) { 853 /* adjust for the real number of lead bytes */ 854 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount; 855 } else { 856 /* limit the number of lead bytes */ 857 leadByteCount=LEADBYTE_LIMIT-1; 858 tokenCount=LEADBYTE_LIMIT*256; 859 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1); 860 /* adjust again to skip double-byte tokens with ';' */ 861 wordCount-=(tokenCount-256+254)/255; 862 } 863 864 /* set token 0 to word 0 */ 865 tokens[0]=0; 866 if(beVerbose) { 867 printf("tokens[0x000]: word%8ld \"%.*s\"\n", 868 (long)words[0].weight, 869 words[0].length, words[0].s); 870 } 871 wordNumber=1; 872 873 /* set the lead byte tokens */ 874 for(i=1; (int16_t)i<=leadByteCount; ++i) { 875 tokens[i]=-2; 876 } 877 878 /* set the tokens */ 879 for(; i<256; ++i) { 880 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */ 881 if(tokens[i]!=-1) { 882 tokens[i]=wordNumber; 883 if(beVerbose) { 884 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 885 (int)i, (long)words[wordNumber].weight, 886 words[wordNumber].length, words[wordNumber].s); 887 } 888 ++wordNumber; 889 } 890 } 891 892 /* continue above 255 where there are no letters */ 893 for(; (uint32_t)wordNumber<wordCount; ++i) { 894 if((i&0xff)==NAME_SEPARATOR_CHAR) { 895 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 896 } else { 897 tokens[i]=wordNumber; 898 if(beVerbose) { 899 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 900 (int)i, (long)words[wordNumber].weight, 901 words[wordNumber].length, words[wordNumber].s); 902 } 903 ++wordNumber; 904 } 905 } 906 tokenCount=i; /* should be already tokenCount={i or i+1} */ 907 } 908 909 if(!beQuiet) { 910 printf("number of lead bytes: %d\n", leadByteCount); 911 printf("number of single-byte tokens: %lu\n", 912 (unsigned long)256-letterCount-leadByteCount); 913 printf("number of tokens: %lu\n", (unsigned long)tokenCount); 914 } 915 916 compressLines(); 917 } 918 919 static void 920 compressLines() { 921 Line *line=NULL; 922 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */, 923 groupMSB=0xffff, lineCount2; 924 int16_t groupTop=0; 925 926 /* store the groups like lines, with compressed data after raw strings */ 927 groupBottom=lineTop; 928 lineCount2=lineCount; 929 lineCount=0; 930 931 /* loop over all lines */ 932 while(i<lineCount2) { 933 line=lines+i++; 934 inLine=line->code; 935 936 /* segment the lines to groups of 32 */ 937 if(inLine>>GROUP_SHIFT!=groupMSB) { 938 /* finish the current group with empty lines */ 939 while((++outLine&GROUP_MASK)!=0) { 940 appendLineLength(0); 941 } 942 943 /* store the group like a line */ 944 if(groupTop>0) { 945 if(groupTop>GROUP_STORE_SIZE) { 946 fprintf(stderr, "gennames: group store overflow\n"); 947 exit(U_BUFFER_OVERFLOW_ERROR); 948 } 949 addGroup(groupMSB, groupStore, groupTop); 950 } 951 952 /* start the new group */ 953 lineLengthsTop=0; 954 groupTop=0; 955 groupMSB=inLine>>GROUP_SHIFT; 956 outLine=(inLine&~GROUP_MASK)-1; 957 } 958 959 /* write empty lines between the previous line in the group and this one */ 960 while(++outLine<inLine) { 961 appendLineLength(0); 962 } 963 964 /* write characters and tokens for this line */ 965 appendLineLength(compressLine(line->s, line->length, &groupTop)); 966 } 967 968 /* finish and store the last group */ 969 if(line && groupMSB!=0xffff) { 970 /* finish the current group with empty lines */ 971 while((++outLine&GROUP_MASK)!=0) { 972 appendLineLength(0); 973 } 974 975 /* store the group like a line */ 976 if(groupTop>0) { 977 if(groupTop>GROUP_STORE_SIZE) { 978 fprintf(stderr, "gennames: group store overflow\n"); 979 exit(U_BUFFER_OVERFLOW_ERROR); 980 } 981 addGroup(groupMSB, groupStore, groupTop); 982 } 983 } 984 985 if(!beQuiet) { 986 printf("number of groups: %lu\n", (unsigned long)lineCount); 987 } 988 } 989 990 static int16_t 991 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { 992 int16_t start, limit, token, groupTop=*pGroupTop; 993 994 start=0; 995 do { 996 /* write any "noise" characters */ 997 limit=skipNoise((char *)s, start, length); 998 while(start<limit) { 999 groupStore[groupTop++]=s[start++]; 1000 } 1001 1002 if(start==length) { 1003 break; 1004 } 1005 1006 /* write a word, as token or directly */ 1007 limit=getWord((char *)s, start, length); 1008 if(limit-start==1) { 1009 groupStore[groupTop++]=s[start++]; 1010 } else { 1011 token=findToken(s+start, (int16_t)(limit-start)); 1012 if(token!=-1) { 1013 if(token>0xff) { 1014 groupStore[groupTop++]=(uint8_t)(token>>8); 1015 } 1016 groupStore[groupTop++]=(uint8_t)token; 1017 start=limit; 1018 } else { 1019 while(start<limit) { 1020 groupStore[groupTop++]=s[start++]; 1021 } 1022 } 1023 } 1024 } while(start<length); 1025 1026 length=(int16_t)(groupTop-*pGroupTop); 1027 *pGroupTop=groupTop; 1028 return length; 1029 } 1030 1031 static int32_t 1032 compareWords(const void *context, const void *word1, const void *word2) { 1033 /* reverse sort by word weight */ 1034 return ((Word *)word2)->weight-((Word *)word1)->weight; 1035 } 1036 1037 /* generate output data ----------------------------------------------------- */ 1038 1039 static void 1040 generateData(const char *dataDir, Options *storeOptions) { 1041 UNewDataMemory *pData; 1042 UErrorCode errorCode=U_ZERO_ERROR; 1043 uint16_t groupWords[3]; 1044 uint32_t i, groupTop=lineTop, offset, size, 1045 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; 1046 long dataLength; 1047 int16_t token; 1048 1049 pData=udata_create(dataDir, 1050 DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME, 1051 &dataInfo, 1052 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 1053 if(U_FAILURE(errorCode)) { 1054 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode); 1055 exit(errorCode); 1056 } 1057 1058 /* first, see how much space we need, and prepare the token strings */ 1059 for(i=0; i<tokenCount; ++i) { 1060 token=tokens[i]; 1061 if(token!=-1 && token!=-2) { 1062 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop); 1063 } 1064 } 1065 1066 /* 1067 * Required padding for data swapping: 1068 * The token table undergoes a permutation during data swapping when the 1069 * input and output charsets are different. 1070 * The token table cannot grow during swapping, so we need to make sure that 1071 * the table is long enough for successful in-place permutation. 1072 * 1073 * We simply round up tokenCount to the next multiple of 256 to account for 1074 * all possible permutations. 1075 * 1076 * An optimization is possible if we only ever swap between ASCII and EBCDIC: 1077 * 1078 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used 1079 * and will be swapped between ASCII and EBCDIC between 1080 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon). 1081 * This should be the only -1 entry in tokens[256..511] on which the data 1082 * swapper bases its trail byte permutation map (trailMap[]). 1083 * 1084 * It would be sufficient to increase tokenCount so that its lower 8 bits 1085 * are at least 0x5e+1 to make room for swapping between the two semicolons. 1086 * For values higher than 0x5e, the trail byte permutation map (trailMap[]) 1087 * should always be an identity map, where we do not need additional room. 1088 */ 1089 i=tokenCount; 1090 tokenCount=(tokenCount+0xff)&~0xff; 1091 if(!beQuiet && i<tokenCount) { 1092 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i)); 1093 } 1094 for(; i<tokenCount; ++i) { 1095 if((i&0xff)==NAME_SEPARATOR_CHAR) { 1096 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 1097 } else { 1098 tokens[i]=0; /* unused token for padding */ 1099 } 1100 } 1101 1102 /* 1103 * Calculate the total size in bytes of the data including: 1104 * - the offset to the token strings, uint32_t (4) 1105 * - the offset to the group table, uint32_t (4) 1106 * - the offset to the group strings, uint32_t (4) 1107 * - the offset to the algorithmic names, uint32_t (4) 1108 * 1109 * - the number of tokens, uint16_t (2) 1110 * - the token table, uint16_t[tokenCount] (2*tokenCount) 1111 * 1112 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded 1113 * 1114 * - the number of groups, uint16_t (2) 1115 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount] 1116 * 1117 * - the group strings (groupTop-groupBottom), 2-padded 1118 * 1119 * - the size of the data for the algorithmic names 1120 */ 1121 tokenStringOffset=4+4+4+4+2+2*tokenCount; 1122 groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1; 1123 groupStringOffset=groupsOffset+2+6*lineCount; 1124 algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3; 1125 1126 offset=generateAlgorithmicData(NULL, storeOptions); 1127 size=algNamesOffset+offset; 1128 1129 if(!beQuiet) { 1130 printf("size of the Unicode Names data:\n" 1131 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n", 1132 (unsigned long)size, (unsigned long)(lineTop-groupTop), 1133 (unsigned long)(groupTop-groupBottom), (unsigned long)offset); 1134 } 1135 1136 /* write the data to the file */ 1137 /* offsets */ 1138 udata_write32(pData, tokenStringOffset); 1139 udata_write32(pData, groupsOffset); 1140 udata_write32(pData, groupStringOffset); 1141 udata_write32(pData, algNamesOffset); 1142 1143 /* token table */ 1144 udata_write16(pData, (uint16_t)tokenCount); 1145 udata_writeBlock(pData, tokens, 2*tokenCount); 1146 1147 /* token strings */ 1148 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop); 1149 if((lineTop-groupTop)&1) { 1150 /* 2-padding */ 1151 udata_writePadding(pData, 1); 1152 } 1153 1154 /* group table */ 1155 udata_write16(pData, (uint16_t)lineCount); 1156 for(i=0; i<lineCount; ++i) { 1157 /* groupMSB */ 1158 groupWords[0]=(uint16_t)lines[i].code; 1159 1160 /* offset */ 1161 offset = (uint32_t)((lines[i].s - stringStore)-groupBottom); 1162 groupWords[1]=(uint16_t)(offset>>16); 1163 groupWords[2]=(uint16_t)(offset); 1164 udata_writeBlock(pData, groupWords, 6); 1165 } 1166 1167 /* group strings */ 1168 udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom); 1169 1170 /* 4-align the algorithmic names data */ 1171 udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom))); 1172 1173 generateAlgorithmicData(pData, storeOptions); 1174 1175 /* finish up */ 1176 dataLength=udata_finish(pData, &errorCode); 1177 if(U_FAILURE(errorCode)) { 1178 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode); 1179 exit(errorCode); 1180 } 1181 1182 if(dataLength!=(long)size) { 1183 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n", 1184 dataLength, (unsigned long)size); 1185 exit(U_INTERNAL_PROGRAM_ERROR); 1186 } 1187 } 1188 1189 /* the structure for algorithmic names needs to be 4-aligned */ 1190 typedef struct AlgorithmicRange { 1191 uint32_t rangeStart, rangeEnd; 1192 uint8_t algorithmType, algorithmVariant; 1193 uint16_t rangeSize; 1194 } AlgorithmicRange; 1195 1196 static uint32_t 1197 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) { 1198 static char prefix[] = "CJK UNIFIED IDEOGRAPH-"; 1199 # define PREFIX_LENGTH 23 1200 # define PREFIX_LENGTH_4 24 1201 uint32_t countAlgRanges; 1202 1203 static AlgorithmicRange cjkExtA={ 1204 0x3400, 0x4db5, 1205 0, 4, 1206 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1207 }; 1208 static AlgorithmicRange cjk={ 1209 0x4e00, 0x9fa5, 1210 0, 4, 1211 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1212 }; 1213 static AlgorithmicRange cjkExtB={ 1214 0x20000, 0x2a6d6, 1215 0, 5, 1216 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1217 }; 1218 static AlgorithmicRange cjkExtC={ 1219 0x2a700, 0x2b734, 1220 0, 5, 1221 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1222 }; 1223 1224 static char jamo[]= 1225 "HANGUL SYLLABLE \0" 1226 1227 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0" 1228 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" 1229 1230 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" 1231 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" 1232 "YU\0EU\0YI\0I\0" 1233 1234 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" 1235 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" 1236 "S\0SS\0NG\0J\0C\0K\0T\0P\0H" 1237 ; 1238 1239 static AlgorithmicRange hangul={ 1240 0xac00, 0xd7a3, 1241 1, 3, 1242 sizeof(AlgorithmicRange)+6+sizeof(jamo) 1243 }; 1244 1245 /* modulo factors, maximum 8 */ 1246 /* 3 factors: 19, 21, 28, most-to-least-significant */ 1247 static uint16_t hangulFactors[3]={ 1248 19, 21, 28 1249 }; 1250 1251 uint32_t size; 1252 1253 size=0; 1254 1255 if(ucdVersion>=UNI_5_2) { 1256 /* Unicode 5.2 and up has a longer CJK Unihan range than before */ 1257 cjk.rangeEnd=0x9FCB; 1258 } else if(ucdVersion>=UNI_5_1) { 1259 /* Unicode 5.1 and up has a longer CJK Unihan range than before */ 1260 cjk.rangeEnd=0x9FC3; 1261 } else if(ucdVersion>=UNI_4_1) { 1262 /* Unicode 4.1 and up has a longer CJK Unihan range than before */ 1263 cjk.rangeEnd=0x9FBB; 1264 } 1265 1266 /* number of ranges of algorithmic names */ 1267 if(!storeOptions->storeNames) { 1268 countAlgRanges=0; 1269 } else if(ucdVersion>=UNI_5_2) { 1270 /* Unicode 5.2 and up has 5 ranges including CJK Extension C */ 1271 countAlgRanges=5; 1272 } else if(ucdVersion>=UNI_3_1) { 1273 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */ 1274 countAlgRanges=4; 1275 } else if(ucdVersion>=UNI_3_0) { 1276 /* Unicode 3.0 has 3 ranges including CJK Extension A */ 1277 countAlgRanges=3; 1278 } else { 1279 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */ 1280 countAlgRanges=2; 1281 } 1282 1283 if(pData!=NULL) { 1284 udata_write32(pData, countAlgRanges); 1285 } else { 1286 size+=4; 1287 } 1288 if(countAlgRanges==0) { 1289 return size; 1290 } 1291 1292 /* 1293 * each range: 1294 * uint32_t rangeStart 1295 * uint32_t rangeEnd 1296 * uint8_t algorithmType 1297 * uint8_t algorithmVariant 1298 * uint16_t size of range data 1299 * uint8_t[size] data 1300 */ 1301 1302 /* range 0: cjk extension a */ 1303 if(countAlgRanges>=3) { 1304 if(pData!=NULL) { 1305 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange)); 1306 udata_writeString(pData, prefix, PREFIX_LENGTH); 1307 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1308 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1309 } 1310 } else { 1311 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1312 } 1313 } 1314 1315 /* range 1: cjk */ 1316 if(pData!=NULL) { 1317 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange)); 1318 udata_writeString(pData, prefix, PREFIX_LENGTH); 1319 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1320 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1321 } 1322 } else { 1323 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1324 } 1325 1326 /* range 2: hangul syllables */ 1327 if(pData!=NULL) { 1328 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange)); 1329 udata_writeBlock(pData, hangulFactors, 6); 1330 udata_writeString(pData, jamo, sizeof(jamo)); 1331 } else { 1332 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo); 1333 } 1334 1335 /* range 3: cjk extension b */ 1336 if(countAlgRanges>=4) { 1337 if(pData!=NULL) { 1338 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange)); 1339 udata_writeString(pData, prefix, PREFIX_LENGTH); 1340 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1341 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1342 } 1343 } else { 1344 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1345 } 1346 } 1347 1348 /* range 4: cjk extension c */ 1349 if(countAlgRanges>=5) { 1350 if(pData!=NULL) { 1351 udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange)); 1352 udata_writeString(pData, prefix, PREFIX_LENGTH); 1353 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1354 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1355 } 1356 } else { 1357 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1358 } 1359 } 1360 1361 return size; 1362 } 1363 1364 /* helpers ------------------------------------------------------------------ */ 1365 1366 static int16_t 1367 findToken(uint8_t *s, int16_t length) { 1368 int16_t i, token; 1369 1370 for(i=0; i<(int16_t)tokenCount; ++i) { 1371 token=tokens[i]; 1372 if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) { 1373 return i; 1374 } 1375 } 1376 1377 return -1; 1378 } 1379 1380 static Word * 1381 findWord(char *s, int16_t length) { 1382 uint32_t i; 1383 1384 for(i=0; i<wordCount; ++i) { 1385 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) { 1386 return words+i; 1387 } 1388 } 1389 1390 return NULL; 1391 } 1392 1393 static Word * 1394 addWord(char *s, int16_t length) { 1395 uint8_t *stringStart; 1396 Word *word; 1397 1398 if(wordCount==MAX_WORD_COUNT) { 1399 fprintf(stderr, "gennames: too many words\n"); 1400 exit(U_BUFFER_OVERFLOW_ERROR); 1401 } 1402 1403 stringStart=allocWord(length); 1404 uprv_memcpy(stringStart, s, length); 1405 1406 word=words+wordCount; 1407 1408 /* 1409 * Initialize the weight with the costs for this token: 1410 * a zero-terminated string and a 16-bit offset. 1411 */ 1412 word->weight=-(length+1+2); 1413 word->count=0; 1414 word->length=length; 1415 word->s=stringStart; 1416 1417 ++wordCount; 1418 1419 return word; 1420 } 1421 1422 static void 1423 countWord(Word *word) { 1424 /* add to the weight the savings: the length of the word minus 1 byte for the token */ 1425 word->weight+=word->length-1; 1426 ++word->count; 1427 } 1428 1429 static void 1430 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) { 1431 uint8_t *stringStart; 1432 Line *line; 1433 int16_t i, length; 1434 1435 if(lineCount==MAX_LINE_COUNT) { 1436 fprintf(stderr, "gennames: too many lines\n"); 1437 exit(U_BUFFER_OVERFLOW_ERROR); 1438 } 1439 1440 /* find the last non-empty name */ 1441 while(count>0 && lengths[count-1]==0) { 1442 --count; 1443 } 1444 if(count==0) { 1445 return; /* should not occur: caller should not have called */ 1446 } 1447 1448 /* there will be (count-1) separator characters */ 1449 i=count; 1450 length=count-1; 1451 1452 /* add lengths of strings */ 1453 while(i>0) { 1454 length+=lengths[--i]; 1455 } 1456 1457 /* allocate line memory */ 1458 stringStart=allocLine(length); 1459 1460 /* copy all strings into the line memory */ 1461 length=0; /* number of chars copied so far */ 1462 for(i=0; i<count; ++i) { 1463 if(i>0) { 1464 stringStart[length++]=NAME_SEPARATOR_CHAR; 1465 } 1466 if(lengths[i]>0) { 1467 uprv_memcpy(stringStart+length, names[i], lengths[i]); 1468 length+=lengths[i]; 1469 } 1470 } 1471 1472 line=lines+lineCount; 1473 1474 line->code=code; 1475 line->length=length; 1476 line->s=stringStart; 1477 1478 ++lineCount; 1479 1480 /* prevent a character value that is actually in a name from becoming a token */ 1481 while(length>0) { 1482 tokens[stringStart[--length]]=-1; 1483 } 1484 } 1485 1486 static void 1487 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { 1488 uint8_t *stringStart; 1489 Line *line; 1490 1491 if(lineCount==MAX_LINE_COUNT) { 1492 fprintf(stderr, "gennames: too many groups\n"); 1493 exit(U_BUFFER_OVERFLOW_ERROR); 1494 } 1495 1496 /* store the line lengths first, then the strings */ 1497 lineLengthsTop=(lineLengthsTop+1)/2; 1498 stringStart=allocLine(lineLengthsTop+length); 1499 uprv_memcpy(stringStart, lineLengths, lineLengthsTop); 1500 uprv_memcpy(stringStart+lineLengthsTop, strings, length); 1501 1502 line=lines+lineCount; 1503 1504 line->code=groupMSB; 1505 line->length=length; 1506 line->s=stringStart; 1507 1508 ++lineCount; 1509 } 1510 1511 static uint32_t 1512 addToken(uint8_t *s, int16_t length) { 1513 uint8_t *stringStart; 1514 1515 stringStart=allocLine(length+1); 1516 uprv_memcpy(stringStart, s, length); 1517 stringStart[length]=0; 1518 1519 return (uint32_t)(stringStart - stringStore); 1520 } 1521 1522 static void 1523 appendLineLength(int16_t length) { 1524 if(length>=76) { 1525 fprintf(stderr, "gennames: compressed line too long\n"); 1526 exit(U_BUFFER_OVERFLOW_ERROR); 1527 } 1528 if(length>=12) { 1529 length-=12; 1530 appendLineLengthNibble((uint8_t)((length>>4)|12)); 1531 } 1532 appendLineLengthNibble((uint8_t)length); 1533 } 1534 1535 static void 1536 appendLineLengthNibble(uint8_t nibble) { 1537 if((lineLengthsTop&1)==0) { 1538 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4); 1539 } else { 1540 lineLengths[lineLengthsTop/2]|=nibble&0xf; 1541 } 1542 ++lineLengthsTop; 1543 } 1544 1545 static uint8_t * 1546 allocLine(int32_t length) { 1547 uint32_t top=lineTop+length; 1548 uint8_t *p; 1549 1550 if(top>wordBottom) { 1551 fprintf(stderr, "gennames: out of memory\n"); 1552 exit(U_MEMORY_ALLOCATION_ERROR); 1553 } 1554 p=stringStore+lineTop; 1555 lineTop=top; 1556 return p; 1557 } 1558 1559 static uint8_t * 1560 allocWord(uint32_t length) { 1561 uint32_t bottom=wordBottom-length; 1562 1563 if(lineTop>bottom) { 1564 fprintf(stderr, "gennames: out of memory\n"); 1565 exit(U_MEMORY_ALLOCATION_ERROR); 1566 } 1567 wordBottom=bottom; 1568 return stringStore+bottom; 1569 } 1570 1571 /* 1572 * Hey, Emacs, please set the following: 1573 * 1574 * Local Variables: 1575 * indent-tabs-mode: nil 1576 * End: 1577 * 1578 */ 1579