/*
*******************************************************************************
*
*   Copyright (C) 1999-2008, International Business Machines
*   Corporation and others. All Rights Reserved.
*
*******************************************************************************
*   file name: gennames.c
*   encoding: US-ASCII
*   tab size: 8 (not used)
*   indentation:4
*
*   created on: 1999sep30
*   created by: Markus W. Scherer
*
*   This program reads the Unicode character database text file,
*   parses it, and extracts the character code,
*   the "modern" character name, and optionally the
*   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
*   It then tokenizes and compresses the names and builds
*   compact binary tables for random-access lookup
*   in a u_charName() API function.
*
* unames.icu file format (after UDataInfo header etc. - see udata.c)
* (all data is static const)
*
* UDataInfo fields:
*   dataFormat "unam"
*   formatVersion 1.0
*   dataVersion = Unicode version from -u or --unicode command line option, defaults to 4.1
*
* -- data-based names
* uint32_t tokenStringOffset,
*          groupsOffset,
*          groupStringOffset,
*          algNamesOffset;
*
* uint16_t tokenCount;
* uint16_t tokenTable[tokenCount];
*
* char tokenStrings[]; -- padded to even count
*
* -- strings (groupStrings) are tokenized as follows:
*   for each character c
*       if(c>=tokenCount) write that character c directly
*   else
*       token=tokenTable[c];
*       if(token==0xfffe) -- lead byte of double-byte token
*           token=tokenTable[c<<8|next character];
*       if(token==-1)
*           write c directly
*       else
*           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
*           append zero-terminated tokenString;
*
*   Different strings for a code point - normal name, 1.0 name, and ISO comment -
*   are separated by ';'.
*
* uint16_t groupCount;
* struct {
*   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
*   uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32 bit-offset
*   uint16_t offsetLow;
* } groupTable[groupCount];
*
* char groupStrings[]; -- padded to 4-count
*
* -- The actual, tokenized group strings are not zero-terminated because
*   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
*   For each of the 32 group strings, one or two nibbles are stored for its length.
*   Nibbles (4-bit values, half-bytes) are read MSB first.
*   A nibble with a value of 0..11 directly indicates the length of the name string.
*   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
*   by (((n-12)<<4)|m)+12, reaching values of 12..75.
*   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
*   For the de-tokenizing, see token description above; the strings immediately follow the
*   32 lengths.
*
* -- algorithmic names
*
* typedef struct AlgorithmicRange {
*   uint32_t rangeStart, rangeEnd;
*   uint8_t algorithmType, algorithmVariant;
*   uint16_t rangeSize;
* } AlgorithmicRange;
*
* uint32_t algRangesCount; -- number of data blocks for ranges of
*               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
*
* struct {
*   AlgorithmicRange algRange;
*   uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
* -- not a real array because each part has a different size
*    of algRange.rangeSize (including AlgorithmicRange)
*
* -- algorithmic range types:
*
* 0 Names are formed from a string prefix that is stored in
*   the algRangeData (zero-terminated), followed by the Unicode code point
*   of the character in hexadecimal digits;
*   algRange.algorithmVariant digits are written
*
* 1 Names are formed by calculating modulo-factors of the code point value as follows:
*   algRange.algorithmVariant is the count of modulo factors
*   algRangeData contains
*       uint16_t factors[algRange.algorithmVariant];
*       char strings[];
*   the first zero-terminated string is written as the prefix; then:
*
*   The rangeStart is subtracted; with the difference, here "code":
*   for(i=algRange.algorithmVariant-1 to 0 step -1)
*       index[i]=code%factor[i];
*       code/=factor[i];
*
*   The strings after the prefix are short pieces that are then appended to the result
*   according to index[0..algRange.algorithmVariant-1].
119 */ 120 121 #include <stdio.h> 122 #include "unicode/utypes.h" 123 #include "unicode/putil.h" 124 #include "unicode/uclean.h" 125 #include "unicode/udata.h" 126 #include "cmemory.h" 127 #include "cstring.h" 128 #include "uarrsort.h" 129 #include "unewdata.h" 130 #include "uoptions.h" 131 #include "uparse.h" 132 133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 134 135 #define STRING_STORE_SIZE 1000000 136 #define GROUP_STORE_SIZE 5000 137 138 #define GROUP_SHIFT 5 139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT) 140 #define GROUP_MASK (LINES_PER_GROUP-1) 141 142 #define MAX_LINE_COUNT 50000 143 #define MAX_WORD_COUNT 20000 144 #define MAX_GROUP_COUNT 5000 145 146 #define DATA_NAME "unames" 147 #define DATA_TYPE "icu" 148 #define VERSION_STRING "unam" 149 #define NAME_SEPARATOR_CHAR ';' 150 151 #define ISO_DATA_NAME "ucomment" 152 153 /* Unicode versions --------------------------------------------------------- */ 154 155 enum { 156 UNI_1_0, 157 UNI_1_1, 158 UNI_2_0, 159 UNI_3_0, 160 UNI_3_1, 161 UNI_3_2, 162 UNI_4_0, 163 UNI_4_0_1, 164 UNI_4_1, 165 UNI_5_0, 166 UNI_5_1, 167 UNI_VER_COUNT 168 }; 169 170 static const UVersionInfo 171 unicodeVersions[]={ 172 { 1, 0, 0, 0 }, 173 { 1, 1, 0, 0 }, 174 { 2, 0, 0, 0 }, 175 { 3, 0, 0, 0 }, 176 { 3, 1, 0, 0 }, 177 { 3, 2, 0, 0 }, 178 { 4, 0, 0, 0 }, 179 { 4, 0, 1, 0 }, 180 { 4, 1, 0, 0 }, 181 { 5, 0, 0, 0 }, 182 { 5, 1, 0, 0 } 183 }; 184 185 static int32_t ucdVersion=UNI_5_1; 186 187 static int32_t 188 findUnicodeVersion(const UVersionInfo version) { 189 int32_t i; 190 191 for(i=0; /* while(version>unicodeVersions[i]) {} */ 192 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; 193 ++i) {} 194 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { 195 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ 196 } 197 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ 198 } 199 200 /* generator 
data ----------------------------------------------------------- */ 201 202 /* UDataInfo cf. udata.h */ 203 static UDataInfo dataInfo={ 204 sizeof(UDataInfo), 205 0, 206 207 U_IS_BIG_ENDIAN, 208 U_CHARSET_FAMILY, 209 sizeof(UChar), 210 0, 211 212 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */ 213 {1, 0, 0, 0}, /* formatVersion */ 214 {3, 0, 0, 0} /* dataVersion */ 215 }; 216 217 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE; 218 219 typedef struct Options { 220 UBool storeNames; 221 UBool store10Names; 222 UBool storeISOComments; 223 } Options; 224 225 static uint8_t stringStore[STRING_STORE_SIZE], 226 groupStore[GROUP_STORE_SIZE], 227 lineLengths[LINES_PER_GROUP]; 228 229 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop; 230 231 typedef struct { 232 uint32_t code; 233 int16_t length; 234 uint8_t *s; 235 } Line; 236 237 typedef struct { 238 int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */ 239 int16_t count; 240 int16_t length; 241 uint8_t *s; 242 } Word; 243 244 static Line lines[MAX_LINE_COUNT]; 245 static Word words[MAX_WORD_COUNT]; 246 247 static uint32_t lineCount=0, wordCount=0; 248 249 static int16_t leadByteCount; 250 251 #define LEADBYTE_LIMIT 16 252 253 static int16_t tokens[LEADBYTE_LIMIT*256]; 254 static uint32_t tokenCount; 255 256 /* prototypes --------------------------------------------------------------- */ 257 258 static void 259 init(void); 260 261 static void 262 parseDB(const char *filename, Options *options); 263 264 static void 265 parseName(char *name, int16_t length); 266 267 static int16_t 268 skipNoise(char *line, int16_t start, int16_t limit); 269 270 static int16_t 271 getWord(char *line, int16_t start, int16_t limit); 272 273 static void 274 compress(void); 275 276 static void 277 compressLines(void); 278 279 static int16_t 280 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop); 281 282 static int32_t 283 compareWords(const void *context, 
const void *word1, const void *word2); 284 285 static void 286 generateData(const char *dataDir, Options *options); 287 288 static uint32_t 289 generateAlgorithmicData(UNewDataMemory *pData, Options *options); 290 291 static int16_t 292 findToken(uint8_t *s, int16_t length); 293 294 static Word * 295 findWord(char *s, int16_t length); 296 297 static Word * 298 addWord(char *s, int16_t length); 299 300 static void 301 countWord(Word *word); 302 303 static void 304 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count); 305 306 static void 307 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length); 308 309 static uint32_t 310 addToken(uint8_t *s, int16_t length); 311 312 static void 313 appendLineLength(int16_t length); 314 315 static void 316 appendLineLengthNibble(uint8_t nibble); 317 318 static uint8_t * 319 allocLine(int32_t length); 320 321 static uint8_t * 322 allocWord(uint32_t length); 323 324 /* -------------------------------------------------------------------------- */ 325 326 enum { 327 HELP_H, 328 HELP_QUESTION_MARK, 329 VERBOSE, 330 QUIET, 331 COPYRIGHT, 332 DESTDIR, 333 UNICODE, 334 UNICODE1_NAMES, 335 NO_ISO_COMMENTS, 336 ONLY_ISO_COMMENTS 337 }; 338 339 static UOption options[]={ 340 UOPTION_HELP_H, 341 UOPTION_HELP_QUESTION_MARK, 342 UOPTION_VERBOSE, 343 UOPTION_QUIET, 344 UOPTION_COPYRIGHT, 345 UOPTION_DESTDIR, 346 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, 347 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 }, 348 { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, 349 { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } 350 }; 351 352 extern int 353 main(int argc, char* argv[]) { 354 UVersionInfo version; 355 Options moreOptions={ TRUE, FALSE, TRUE }; 356 UErrorCode errorCode = U_ZERO_ERROR; 357 358 U_MAIN_INIT_ARGS(argc, argv); 359 360 /* Initialize ICU */ 361 u_init(&errorCode); 362 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) { 363 /* Note: u_init() will 
try to open ICU property data. 364 * failures here are expected when building ICU from scratch. 365 * ignore them. 366 */ 367 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n", 368 argv[0], u_errorName(errorCode)); 369 exit(1); 370 } 371 372 /* preset then read command line options */ 373 options[DESTDIR].value=u_getDataDirectory(); 374 options[UNICODE].value="4.1"; 375 argc=u_parseArgs(argc, argv, LENGTHOF(options), options); 376 377 /* error handling, printing usage message */ 378 if(argc<0) { 379 fprintf(stderr, 380 "error in command line argument \"%s\"\n", 381 argv[-argc]); 382 } else if(argc<2) { 383 argc=-1; 384 } 385 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 386 /* 387 * Broken into chucks because the C89 standard says the minimum 388 * required supported string length is 509 bytes. 389 */ 390 fprintf(stderr, 391 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n" 392 "\n" 393 "Read the UnicodeData.txt file and \n" 394 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n" 395 "\n" 396 "\tfilename absolute path/filename for the Unicode database text file\n" 397 "\t\t(default: standard input)\n" 398 "\n", 399 argv[0]); 400 fprintf(stderr, 401 "Options:\n" 402 "\t-h or -? or --help this usage text\n" 403 "\t-v or --verbose verbose output\n" 404 "\t-q or --quiet no output\n" 405 "\t-c or --copyright include a copyright notice\n" 406 "\t-d or --destdir destination directory, followed by the path\n" 407 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); 408 fprintf(stderr, 409 "\t-1 or --unicode1-names store Unicode 1.0 character names\n" 410 "\t --no-iso-comments do not store ISO comments\n" 411 "\t --only-iso-comments write ucomment.icu with only ISO comments\n"); 412 return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 413 } 414 415 /* get the options values */ 416 beVerbose=options[VERBOSE].doesOccur; 417 beQuiet=options[QUIET].doesOccur; 418 haveCopyright=options[COPYRIGHT].doesOccur; 419 moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur; 420 moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur; 421 if(options[ONLY_ISO_COMMENTS].doesOccur) { 422 moreOptions.storeNames=moreOptions.store10Names=FALSE; 423 moreOptions.storeISOComments=TRUE; 424 } 425 426 /* set the Unicode version */ 427 u_versionFromString(version, options[UNICODE].value); 428 uprv_memcpy(dataInfo.dataVersion, version, 4); 429 ucdVersion=findUnicodeVersion(version); 430 431 init(); 432 parseDB(argc>=2 ? argv[1] : "-", &moreOptions); 433 compress(); 434 generateData(options[DESTDIR].value, &moreOptions); 435 436 u_cleanup(); 437 return 0; 438 } 439 440 static void 441 init() { 442 int i; 443 444 for(i=0; i<256; ++i) { 445 tokens[i]=0; 446 } 447 } 448 449 /* parsing ------------------------------------------------------------------ */ 450 451 /* get a name, strip leading and trailing whitespace */ 452 static int16_t 453 getName(char **pStart, char *limit) { 454 /* strip leading whitespace */ 455 char *start=(char *)u_skipWhitespace(*pStart); 456 457 /* strip trailing whitespace */ 458 while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) { 459 --limit; 460 } 461 462 /* return results */ 463 *pStart=start; 464 return (int16_t)(limit-start); 465 } 466 467 static void U_CALLCONV 468 lineFn(void *context, 469 char *fields[][2], int32_t fieldCount, 470 UErrorCode *pErrorCode) { 471 Options *storeOptions=(Options *)context; 472 char *names[3]; 473 int16_t lengths[3]={ 0, 0, 0 }; 474 static uint32_t prevCode=0; 475 uint32_t code=0; 476 477 if(U_FAILURE(*pErrorCode)) { 478 return; 479 } 480 /* get the character code */ 481 code=uprv_strtoul(fields[0][0], NULL, 16); 482 483 /* get the character name */ 484 if(storeOptions->storeNames) { 485 
names[0]=fields[1][0]; 486 lengths[0]=getName(names+0, fields[1][1]); 487 if(names[0][0]=='<') { 488 /* do not store pseudo-names in <> brackets */ 489 lengths[0]=0; 490 } 491 } 492 493 /* store 1.0 names */ 494 /* get the second character name, the one from Unicode 1.0 */ 495 if(storeOptions->store10Names) { 496 names[1]=fields[10][0]; 497 lengths[1]=getName(names+1, fields[10][1]); 498 if(names[1][0]=='<') { 499 /* do not store pseudo-names in <> brackets */ 500 lengths[1]=0; 501 } 502 } 503 504 /* get the ISO 10646 comment */ 505 if(storeOptions->storeISOComments) { 506 names[2]=fields[11][0]; 507 lengths[2]=getName(names+2, fields[11][1]); 508 } 509 510 if(lengths[0]+lengths[1]+lengths[2]==0) { 511 return; 512 } 513 514 /* check for non-character code points */ 515 if(!U_IS_UNICODE_CHAR(code)) { 516 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n", 517 (unsigned long)code); 518 *pErrorCode=U_PARSE_ERROR; 519 exit(U_PARSE_ERROR); 520 } 521 522 /* check that the code points (code) are in ascending order */ 523 if(code<=prevCode && code>0) { 524 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", 525 (unsigned long)code, (unsigned long)prevCode); 526 *pErrorCode=U_PARSE_ERROR; 527 exit(U_PARSE_ERROR); 528 } 529 prevCode=code; 530 531 parseName(names[0], lengths[0]); 532 parseName(names[1], lengths[1]); 533 parseName(names[2], lengths[2]); 534 535 /* 536 * set the count argument to 537 * 1: only store regular names, or only store ISO 10646 comments 538 * 2: store regular and 1.0 names 539 * 3: store names and ISO 10646 comment 540 * 541 * addLine() will ignore empty trailing names 542 */ 543 if(storeOptions->storeNames) { 544 /* store names and comments as parsed according to storeOptions */ 545 addLine(code, names, lengths, 3); 546 } else { 547 /* store only ISO 10646 comments */ 548 addLine(code, names+2, lengths+2, 1); 549 } 550 } 551 552 static void 553 parseDB(const char *filename, 
Options *storeOptions) { 554 char *fields[15][2]; 555 UErrorCode errorCode=U_ZERO_ERROR; 556 557 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode); 558 if(U_FAILURE(errorCode)) { 559 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); 560 exit(errorCode); 561 } 562 563 if(!beQuiet) { 564 printf("size of all names in the database: %lu\n", 565 (unsigned long)lineTop); 566 printf("number of named Unicode characters: %lu\n", 567 (unsigned long)lineCount); 568 printf("number of words in the dictionary from these names: %lu\n", 569 (unsigned long)wordCount); 570 } 571 } 572 573 static void 574 parseName(char *name, int16_t length) { 575 int16_t start=0, limit, wordLength/*, prevStart=-1*/; 576 Word *word; 577 578 while(start<length) { 579 /* skip any "noise" characters */ 580 limit=skipNoise(name, start, length); 581 if(start<limit) { 582 /*prevStart=-1;*/ 583 start=limit; 584 } 585 if(start==length) { 586 break; 587 } 588 589 /* get a word and add it if it is longer than 1 */ 590 limit=getWord(name, start, length); 591 wordLength=(int16_t)(limit-start); 592 if(wordLength>1) { 593 word=findWord(name+start, wordLength); 594 if(word==NULL) { 595 word=addWord(name+start, wordLength); 596 } 597 countWord(word); 598 } 599 600 #if 0 601 /* 602 * if there was a word before this 603 * (with no noise in between), then add the pair of words, too 604 */ 605 if(prevStart!=-1) { 606 wordLength=limit-prevStart; 607 word=findWord(name+prevStart, wordLength); 608 if(word==NULL) { 609 word=addWord(name+prevStart, wordLength); 610 } 611 countWord(word); 612 } 613 #endif 614 615 /*prevStart=start;*/ 616 start=limit; 617 } 618 } 619 620 static UBool U_INLINE 621 isWordChar(char c) { 622 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */ 623 ('J'<=c && c<='R') || 624 ('S'<=c && c<='Z') || 625 626 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */ 627 ('j'<=c && c<='r') || 628 ('s'<=c && c<='z') || 629 630 ('0'<=c && 
c<='9'); 631 } 632 633 static int16_t 634 skipNoise(char *line, int16_t start, int16_t limit) { 635 /* skip anything that is not part of a word in this sense */ 636 while(start<limit && !isWordChar(line[start])) { 637 ++start; 638 } 639 640 return start; 641 } 642 643 static int16_t 644 getWord(char *line, int16_t start, int16_t limit) { 645 char c=0; /* initialize to avoid a compiler warning although the code was safe */ 646 647 /* a unicode character name word consists of A-Z0-9 */ 648 while(start<limit && isWordChar(line[start])) { 649 ++start; 650 } 651 652 /* include a following space or dash */ 653 if(start<limit && ((c=line[start])==' ' || c=='-')) { 654 ++start; 655 } 656 657 return start; 658 } 659 660 /* compressing -------------------------------------------------------------- */ 661 662 static void 663 compress() { 664 uint32_t i, letterCount; 665 int16_t wordNumber; 666 UErrorCode errorCode; 667 668 /* sort the words in reverse order by weight */ 669 errorCode=U_ZERO_ERROR; 670 uprv_sortArray(words, wordCount, sizeof(Word), 671 compareWords, NULL, FALSE, &errorCode); 672 673 /* remove the words that do not save anything */ 674 while(wordCount>0 && words[wordCount-1].weight<1) { 675 --wordCount; 676 } 677 678 /* count the letters in the token range */ 679 letterCount=0; 680 for(i=LEADBYTE_LIMIT; i<256; ++i) { 681 if(tokens[i]==-1) { 682 ++letterCount; 683 } 684 } 685 if(!beQuiet) { 686 printf("number of letters used in the names: %d\n", (int)letterCount); 687 } 688 689 /* do we need double-byte tokens? 
*/ 690 if(wordCount+letterCount<=256) { 691 /* no, single-byte tokens are enough */ 692 leadByteCount=0; 693 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { 694 if(tokens[i]!=-1) { 695 tokens[i]=wordNumber; 696 if(beVerbose) { 697 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 698 (int)i, (long)words[wordNumber].weight, 699 words[wordNumber].length, words[wordNumber].s); 700 } 701 ++wordNumber; 702 } 703 } 704 tokenCount=i; 705 } else { 706 /* 707 * The tokens that need two token bytes 708 * get their weight reduced by their count 709 * because they save less. 710 */ 711 tokenCount=256-letterCount; 712 for(i=tokenCount; i<wordCount; ++i) { 713 words[i].weight-=words[i].count; 714 } 715 716 /* sort these words in reverse order by weight */ 717 errorCode=U_ZERO_ERROR; 718 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word), 719 compareWords, NULL, FALSE, &errorCode); 720 721 /* remove the words that do not save anything */ 722 while(wordCount>0 && words[wordCount-1].weight<1) { 723 --wordCount; 724 } 725 726 /* how many tokens and lead bytes do we have now? 
*/ 727 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1); 728 /* 729 * adjust upwards to take into account that 730 * double-byte tokens must not 731 * use NAME_SEPARATOR_CHAR as a second byte 732 */ 733 tokenCount+=(tokenCount-256+254)/255; 734 735 leadByteCount=(int16_t)(tokenCount>>8); 736 if(leadByteCount<LEADBYTE_LIMIT) { 737 /* adjust for the real number of lead bytes */ 738 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount; 739 } else { 740 /* limit the number of lead bytes */ 741 leadByteCount=LEADBYTE_LIMIT-1; 742 tokenCount=LEADBYTE_LIMIT*256; 743 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1); 744 /* adjust again to skip double-byte tokens with ';' */ 745 wordCount-=(tokenCount-256+254)/255; 746 } 747 748 /* set token 0 to word 0 */ 749 tokens[0]=0; 750 if(beVerbose) { 751 printf("tokens[0x000]: word%8ld \"%.*s\"\n", 752 (long)words[0].weight, 753 words[0].length, words[0].s); 754 } 755 wordNumber=1; 756 757 /* set the lead byte tokens */ 758 for(i=1; (int16_t)i<=leadByteCount; ++i) { 759 tokens[i]=-2; 760 } 761 762 /* set the tokens */ 763 for(; i<256; ++i) { 764 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */ 765 if(tokens[i]!=-1) { 766 tokens[i]=wordNumber; 767 if(beVerbose) { 768 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 769 (int)i, (long)words[wordNumber].weight, 770 words[wordNumber].length, words[wordNumber].s); 771 } 772 ++wordNumber; 773 } 774 } 775 776 /* continue above 255 where there are no letters */ 777 for(; (uint32_t)wordNumber<wordCount; ++i) { 778 if((i&0xff)==NAME_SEPARATOR_CHAR) { 779 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 780 } else { 781 tokens[i]=wordNumber; 782 if(beVerbose) { 783 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 784 (int)i, (long)words[wordNumber].weight, 785 words[wordNumber].length, words[wordNumber].s); 786 } 787 ++wordNumber; 788 } 789 } 790 tokenCount=i; /* should be already tokenCount={i or i+1} */ 791 } 792 793 if(!beQuiet) { 794 printf("number 
of lead bytes: %d\n", leadByteCount); 795 printf("number of single-byte tokens: %lu\n", 796 (unsigned long)256-letterCount-leadByteCount); 797 printf("number of tokens: %lu\n", (unsigned long)tokenCount); 798 } 799 800 compressLines(); 801 } 802 803 static void 804 compressLines() { 805 Line *line=NULL; 806 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */, 807 groupMSB=0xffff, lineCount2; 808 int16_t groupTop=0; 809 810 /* store the groups like lines, with compressed data after raw strings */ 811 groupBottom=lineTop; 812 lineCount2=lineCount; 813 lineCount=0; 814 815 /* loop over all lines */ 816 while(i<lineCount2) { 817 line=lines+i++; 818 inLine=line->code; 819 820 /* segment the lines to groups of 32 */ 821 if(inLine>>GROUP_SHIFT!=groupMSB) { 822 /* finish the current group with empty lines */ 823 while((++outLine&GROUP_MASK)!=0) { 824 appendLineLength(0); 825 } 826 827 /* store the group like a line */ 828 if(groupTop>0) { 829 if(groupTop>GROUP_STORE_SIZE) { 830 fprintf(stderr, "gennames: group store overflow\n"); 831 exit(U_BUFFER_OVERFLOW_ERROR); 832 } 833 addGroup(groupMSB, groupStore, groupTop); 834 } 835 836 /* start the new group */ 837 lineLengthsTop=0; 838 groupTop=0; 839 groupMSB=inLine>>GROUP_SHIFT; 840 outLine=(inLine&~GROUP_MASK)-1; 841 } 842 843 /* write empty lines between the previous line in the group and this one */ 844 while(++outLine<inLine) { 845 appendLineLength(0); 846 } 847 848 /* write characters and tokens for this line */ 849 appendLineLength(compressLine(line->s, line->length, &groupTop)); 850 } 851 852 /* finish and store the last group */ 853 if(line && groupMSB!=0xffff) { 854 /* finish the current group with empty lines */ 855 while((++outLine&GROUP_MASK)!=0) { 856 appendLineLength(0); 857 } 858 859 /* store the group like a line */ 860 if(groupTop>0) { 861 if(groupTop>GROUP_STORE_SIZE) { 862 fprintf(stderr, "gennames: group store overflow\n"); 863 exit(U_BUFFER_OVERFLOW_ERROR); 864 } 865 addGroup(groupMSB, 
groupStore, groupTop); 866 } 867 } 868 869 if(!beQuiet) { 870 printf("number of groups: %lu\n", (unsigned long)lineCount); 871 } 872 } 873 874 static int16_t 875 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { 876 int16_t start, limit, token, groupTop=*pGroupTop; 877 878 start=0; 879 do { 880 /* write any "noise" characters */ 881 limit=skipNoise((char *)s, start, length); 882 while(start<limit) { 883 groupStore[groupTop++]=s[start++]; 884 } 885 886 if(start==length) { 887 break; 888 } 889 890 /* write a word, as token or directly */ 891 limit=getWord((char *)s, start, length); 892 if(limit-start==1) { 893 groupStore[groupTop++]=s[start++]; 894 } else { 895 token=findToken(s+start, (int16_t)(limit-start)); 896 if(token!=-1) { 897 if(token>0xff) { 898 groupStore[groupTop++]=(uint8_t)(token>>8); 899 } 900 groupStore[groupTop++]=(uint8_t)token; 901 start=limit; 902 } else { 903 while(start<limit) { 904 groupStore[groupTop++]=s[start++]; 905 } 906 } 907 } 908 } while(start<length); 909 910 length=(int16_t)(groupTop-*pGroupTop); 911 *pGroupTop=groupTop; 912 return length; 913 } 914 915 static int32_t 916 compareWords(const void *context, const void *word1, const void *word2) { 917 /* reverse sort by word weight */ 918 return ((Word *)word2)->weight-((Word *)word1)->weight; 919 } 920 921 /* generate output data ----------------------------------------------------- */ 922 923 static void 924 generateData(const char *dataDir, Options *storeOptions) { 925 UNewDataMemory *pData; 926 UErrorCode errorCode=U_ZERO_ERROR; 927 uint16_t groupWords[3]; 928 uint32_t i, groupTop=lineTop, offset, size, 929 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; 930 long dataLength; 931 int16_t token; 932 933 pData=udata_create(dataDir, 934 DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME, 935 &dataInfo, 936 haveCopyright ? 
U_COPYRIGHT_STRING : NULL, &errorCode); 937 if(U_FAILURE(errorCode)) { 938 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode); 939 exit(errorCode); 940 } 941 942 /* first, see how much space we need, and prepare the token strings */ 943 for(i=0; i<tokenCount; ++i) { 944 token=tokens[i]; 945 if(token!=-1 && token!=-2) { 946 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop); 947 } 948 } 949 950 /* 951 * Required padding for data swapping: 952 * The token table undergoes a permutation during data swapping when the 953 * input and output charsets are different. 954 * The token table cannot grow during swapping, so we need to make sure that 955 * the table is long enough for successful in-place permutation. 956 * 957 * We simply round up tokenCount to the next multiple of 256 to account for 958 * all possible permutations. 959 * 960 * An optimization is possible if we only ever swap between ASCII and EBCDIC: 961 * 962 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used 963 * and will be swapped between ASCII and EBCDIC between 964 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon). 965 * This should be the only -1 entry in tokens[256..511] on which the data 966 * swapper bases its trail byte permutation map (trailMap[]). 967 * 968 * It would be sufficient to increase tokenCount so that its lower 8 bits 969 * are at least 0x5e+1 to make room for swapping between the two semicolons. 970 * For values higher than 0x5e, the trail byte permutation map (trailMap[]) 971 * should always be an identity map, where we do not need additional room. 
972 */ 973 i=tokenCount; 974 tokenCount=(tokenCount+0xff)&~0xff; 975 if(!beQuiet && i<tokenCount) { 976 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i)); 977 } 978 for(; i<tokenCount; ++i) { 979 if((i&0xff)==NAME_SEPARATOR_CHAR) { 980 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 981 } else { 982 tokens[i]=0; /* unused token for padding */ 983 } 984 } 985 986 /* 987 * Calculate the total size in bytes of the data including: 988 * - the offset to the token strings, uint32_t (4) 989 * - the offset to the group table, uint32_t (4) 990 * - the offset to the group strings, uint32_t (4) 991 * - the offset to the algorithmic names, uint32_t (4) 992 * 993 * - the number of tokens, uint16_t (2) 994 * - the token table, uint16_t[tokenCount] (2*tokenCount) 995 * 996 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded 997 * 998 * - the number of groups, uint16_t (2) 999 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount] 1000 * 1001 * - the group strings (groupTop-groupBottom), 2-padded 1002 * 1003 * - the size of the data for the algorithmic names 1004 */ 1005 tokenStringOffset=4+4+4+4+2+2*tokenCount; 1006 groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1; 1007 groupStringOffset=groupsOffset+2+6*lineCount; 1008 algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3; 1009 1010 offset=generateAlgorithmicData(NULL, storeOptions); 1011 size=algNamesOffset+offset; 1012 1013 if(!beQuiet) { 1014 printf("size of the Unicode Names data:\n" 1015 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n", 1016 (unsigned long)size, (unsigned long)(lineTop-groupTop), 1017 (unsigned long)(groupTop-groupBottom), (unsigned long)offset); 1018 } 1019 1020 /* write the data to the file */ 1021 /* offsets */ 1022 udata_write32(pData, tokenStringOffset); 1023 udata_write32(pData, 
groupsOffset); 1024 udata_write32(pData, groupStringOffset); 1025 udata_write32(pData, algNamesOffset); 1026 1027 /* token table */ 1028 udata_write16(pData, (uint16_t)tokenCount); 1029 udata_writeBlock(pData, tokens, 2*tokenCount); 1030 1031 /* token strings */ 1032 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop); 1033 if((lineTop-groupTop)&1) { 1034 /* 2-padding */ 1035 udata_writePadding(pData, 1); 1036 } 1037 1038 /* group table */ 1039 udata_write16(pData, (uint16_t)lineCount); 1040 for(i=0; i<lineCount; ++i) { 1041 /* groupMSB */ 1042 groupWords[0]=(uint16_t)lines[i].code; 1043 1044 /* offset */ 1045 offset = (uint32_t)((lines[i].s - stringStore)-groupBottom); 1046 groupWords[1]=(uint16_t)(offset>>16); 1047 groupWords[2]=(uint16_t)(offset); 1048 udata_writeBlock(pData, groupWords, 6); 1049 } 1050 1051 /* group strings */ 1052 udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom); 1053 1054 /* 4-align the algorithmic names data */ 1055 udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom))); 1056 1057 generateAlgorithmicData(pData, storeOptions); 1058 1059 /* finish up */ 1060 dataLength=udata_finish(pData, &errorCode); 1061 if(U_FAILURE(errorCode)) { 1062 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode); 1063 exit(errorCode); 1064 } 1065 1066 if(dataLength!=(long)size) { 1067 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n", 1068 dataLength, (unsigned long)size); 1069 exit(U_INTERNAL_PROGRAM_ERROR); 1070 } 1071 } 1072 1073 /* the structure for algorithmic names needs to be 4-aligned */ 1074 typedef struct AlgorithmicRange { 1075 uint32_t rangeStart, rangeEnd; 1076 uint8_t algorithmType, algorithmVariant; 1077 uint16_t rangeSize; 1078 } AlgorithmicRange; 1079 1080 static uint32_t 1081 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) { 1082 static char prefix[] = "CJK UNIFIED IDEOGRAPH-"; 1083 # define PREFIX_LENGTH 23 1084 # define 
PREFIX_LENGTH_4 24 1085 uint32_t countAlgRanges; 1086 1087 static AlgorithmicRange cjkExtA={ 1088 0x3400, 0x4db5, 1089 0, 4, 1090 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1091 }; 1092 static AlgorithmicRange cjk={ 1093 0x4e00, 0x9fa5, 1094 0, 4, 1095 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1096 }; 1097 static AlgorithmicRange cjkExtB={ 1098 0x20000, 0x2a6d6, 1099 0, 5, 1100 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 1101 }; 1102 1103 static char jamo[]= 1104 "HANGUL SYLLABLE \0" 1105 1106 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0" 1107 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" 1108 1109 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" 1110 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" 1111 "YU\0EU\0YI\0I\0" 1112 1113 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" 1114 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" 1115 "S\0SS\0NG\0J\0C\0K\0T\0P\0H" 1116 ; 1117 1118 static AlgorithmicRange hangul={ 1119 0xac00, 0xd7a3, 1120 1, 3, 1121 sizeof(AlgorithmicRange)+6+sizeof(jamo) 1122 }; 1123 1124 /* modulo factors, maximum 8 */ 1125 /* 3 factors: 19, 21, 28, most-to-least-significant */ 1126 static uint16_t hangulFactors[3]={ 1127 19, 21, 28 1128 }; 1129 1130 uint32_t size; 1131 1132 size=0; 1133 1134 if(ucdVersion>=UNI_5_1) { 1135 /* Unicode 5.1 and up has a longer CJK Unihan range than before */ 1136 cjk.rangeEnd=0x9FC3; 1137 } else if(ucdVersion>=UNI_4_1) { 1138 /* Unicode 4.1 and up has a longer CJK Unihan range than before */ 1139 cjk.rangeEnd=0x9FBB; 1140 } 1141 1142 /* number of ranges of algorithmic names */ 1143 if(!storeOptions->storeNames) { 1144 countAlgRanges=0; 1145 } else if(ucdVersion>=UNI_3_1) { 1146 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */ 1147 countAlgRanges=4; 1148 } else if(ucdVersion>=UNI_3_0) { 1149 /* Unicode 3.0 has 3 ranges including CJK Extension A */ 1150 countAlgRanges=3; 1151 } else { 1152 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */ 1153 countAlgRanges=2; 1154 } 1155 1156 if(pData!=NULL) { 1157 udata_write32(pData, countAlgRanges); 1158 } else { 1159 size+=4; 
1160 } 1161 if(countAlgRanges==0) { 1162 return size; 1163 } 1164 1165 /* 1166 * each range: 1167 * uint32_t rangeStart 1168 * uint32_t rangeEnd 1169 * uint8_t algorithmType 1170 * uint8_t algorithmVariant 1171 * uint16_t size of range data 1172 * uint8_t[size] data 1173 */ 1174 1175 /* range 0: cjk extension a */ 1176 if(countAlgRanges>=3) { 1177 if(pData!=NULL) { 1178 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange)); 1179 udata_writeString(pData, prefix, PREFIX_LENGTH); 1180 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1181 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1182 } 1183 } else { 1184 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1185 } 1186 } 1187 1188 /* range 1: cjk */ 1189 if(pData!=NULL) { 1190 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange)); 1191 udata_writeString(pData, prefix, PREFIX_LENGTH); 1192 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1193 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1194 } 1195 } else { 1196 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1197 } 1198 1199 /* range 2: hangul syllables */ 1200 if(pData!=NULL) { 1201 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange)); 1202 udata_writeBlock(pData, hangulFactors, 6); 1203 udata_writeString(pData, jamo, sizeof(jamo)); 1204 } else { 1205 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo); 1206 } 1207 1208 /* range 3: cjk extension b */ 1209 if(countAlgRanges>=4) { 1210 if(pData!=NULL) { 1211 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange)); 1212 udata_writeString(pData, prefix, PREFIX_LENGTH); 1213 if(PREFIX_LENGTH<PREFIX_LENGTH_4) { 1214 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH); 1215 } 1216 } else { 1217 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4; 1218 } 1219 } 1220 1221 return size; 1222 } 1223 1224 /* helpers ------------------------------------------------------------------ */ 1225 1226 static int16_t 1227 findToken(uint8_t *s, int16_t length) { 1228 int16_t i, token; 1229 1230 for(i=0; i<(int16_t)tokenCount; 
++i) { 1231 token=tokens[i]; 1232 if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) { 1233 return i; 1234 } 1235 } 1236 1237 return -1; 1238 } 1239 1240 static Word * 1241 findWord(char *s, int16_t length) { 1242 uint32_t i; 1243 1244 for(i=0; i<wordCount; ++i) { 1245 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) { 1246 return words+i; 1247 } 1248 } 1249 1250 return NULL; 1251 } 1252 1253 static Word * 1254 addWord(char *s, int16_t length) { 1255 uint8_t *stringStart; 1256 Word *word; 1257 1258 if(wordCount==MAX_WORD_COUNT) { 1259 fprintf(stderr, "gennames: too many words\n"); 1260 exit(U_BUFFER_OVERFLOW_ERROR); 1261 } 1262 1263 stringStart=allocWord(length); 1264 uprv_memcpy(stringStart, s, length); 1265 1266 word=words+wordCount; 1267 1268 /* 1269 * Initialize the weight with the costs for this token: 1270 * a zero-terminated string and a 16-bit offset. 1271 */ 1272 word->weight=-(length+1+2); 1273 word->count=0; 1274 word->length=length; 1275 word->s=stringStart; 1276 1277 ++wordCount; 1278 1279 return word; 1280 } 1281 1282 static void 1283 countWord(Word *word) { 1284 /* add to the weight the savings: the length of the word minus 1 byte for the token */ 1285 word->weight+=word->length-1; 1286 ++word->count; 1287 } 1288 1289 static void 1290 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) { 1291 uint8_t *stringStart; 1292 Line *line; 1293 int16_t i, length; 1294 1295 if(lineCount==MAX_LINE_COUNT) { 1296 fprintf(stderr, "gennames: too many lines\n"); 1297 exit(U_BUFFER_OVERFLOW_ERROR); 1298 } 1299 1300 /* find the last non-empty name */ 1301 while(count>0 && lengths[count-1]==0) { 1302 --count; 1303 } 1304 if(count==0) { 1305 return; /* should not occur: caller should not have called */ 1306 } 1307 1308 /* there will be (count-1) separator characters */ 1309 i=count; 1310 length=count-1; 1311 1312 /* add lengths of strings */ 1313 while(i>0) { 1314 length+=lengths[--i]; 1315 } 
1316 1317 /* allocate line memory */ 1318 stringStart=allocLine(length); 1319 1320 /* copy all strings into the line memory */ 1321 length=0; /* number of chars copied so far */ 1322 for(i=0; i<count; ++i) { 1323 if(i>0) { 1324 stringStart[length++]=NAME_SEPARATOR_CHAR; 1325 } 1326 if(lengths[i]>0) { 1327 uprv_memcpy(stringStart+length, names[i], lengths[i]); 1328 length+=lengths[i]; 1329 } 1330 } 1331 1332 line=lines+lineCount; 1333 1334 line->code=code; 1335 line->length=length; 1336 line->s=stringStart; 1337 1338 ++lineCount; 1339 1340 /* prevent a character value that is actually in a name from becoming a token */ 1341 while(length>0) { 1342 tokens[stringStart[--length]]=-1; 1343 } 1344 } 1345 1346 static void 1347 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { 1348 uint8_t *stringStart; 1349 Line *line; 1350 1351 if(lineCount==MAX_LINE_COUNT) { 1352 fprintf(stderr, "gennames: too many groups\n"); 1353 exit(U_BUFFER_OVERFLOW_ERROR); 1354 } 1355 1356 /* store the line lengths first, then the strings */ 1357 lineLengthsTop=(lineLengthsTop+1)/2; 1358 stringStart=allocLine(lineLengthsTop+length); 1359 uprv_memcpy(stringStart, lineLengths, lineLengthsTop); 1360 uprv_memcpy(stringStart+lineLengthsTop, strings, length); 1361 1362 line=lines+lineCount; 1363 1364 line->code=groupMSB; 1365 line->length=length; 1366 line->s=stringStart; 1367 1368 ++lineCount; 1369 } 1370 1371 static uint32_t 1372 addToken(uint8_t *s, int16_t length) { 1373 uint8_t *stringStart; 1374 1375 stringStart=allocLine(length+1); 1376 uprv_memcpy(stringStart, s, length); 1377 stringStart[length]=0; 1378 1379 return (uint32_t)(stringStart - stringStore); 1380 } 1381 1382 static void 1383 appendLineLength(int16_t length) { 1384 if(length>=76) { 1385 fprintf(stderr, "gennames: compressed line too long\n"); 1386 exit(U_BUFFER_OVERFLOW_ERROR); 1387 } 1388 if(length>=12) { 1389 length-=12; 1390 appendLineLengthNibble((uint8_t)((length>>4)|12)); 1391 } 1392 
appendLineLengthNibble((uint8_t)length); 1393 } 1394 1395 static void 1396 appendLineLengthNibble(uint8_t nibble) { 1397 if((lineLengthsTop&1)==0) { 1398 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4); 1399 } else { 1400 lineLengths[lineLengthsTop/2]|=nibble&0xf; 1401 } 1402 ++lineLengthsTop; 1403 } 1404 1405 static uint8_t * 1406 allocLine(int32_t length) { 1407 uint32_t top=lineTop+length; 1408 uint8_t *p; 1409 1410 if(top>wordBottom) { 1411 fprintf(stderr, "gennames: out of memory\n"); 1412 exit(U_MEMORY_ALLOCATION_ERROR); 1413 } 1414 p=stringStore+lineTop; 1415 lineTop=top; 1416 return p; 1417 } 1418 1419 static uint8_t * 1420 allocWord(uint32_t length) { 1421 uint32_t bottom=wordBottom-length; 1422 1423 if(lineTop>bottom) { 1424 fprintf(stderr, "gennames: out of memory\n"); 1425 exit(U_MEMORY_ALLOCATION_ERROR); 1426 } 1427 wordBottom=bottom; 1428 return stringStore+bottom; 1429 } 1430 1431 /* 1432 * Hey, Emacs, please set the following: 1433 * 1434 * Local Variables: 1435 * indent-tabs-mode: nil 1436 * End: 1437 * 1438 */ 1439