Home | History | Annotate | Download | only in gennames
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennames.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads the Unicode character database text file,
     17 *   parses it, and extracts the character code,
     18 *   the "modern" character name, and optionally the
     19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
     20 *   It then tokenizes and compresses the names and builds
     21 *   compact binary tables for random-access lookup
     22 *   in a u_charName() API function.
     23 *
     24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
     25 * (all data is static const)
     26 *
     27 * UDataInfo fields:
     28 *   dataFormat "unam"
     29 *   formatVersion 1.0
     30 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
     31 *
     32 * -- data-based names
     33 * uint32_t tokenStringOffset,
     34 *          groupsOffset,
     35 *          groupStringOffset,
     36 *          algNamesOffset;
     37 *
     38 * uint16_t tokenCount;
     39 * uint16_t tokenTable[tokenCount];
     40 *
     41 * char     tokenStrings[]; -- padded to even count
     42 *
     43 * -- strings (groupStrings) are tokenized as follows:
     44 *   for each character c
     45 *       if(c>=tokenCount) write that character c directly
     46 *   else
     47 *       token=tokenTable[c];
     48 *       if(token==0xfffe) -- lead byte of double-byte token
     49 *           token=tokenTable[c<<8|next character];
     50 *       if(token==-1)
     51 *           write c directly
     52 *       else
     53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
     54 *           append zero-terminated tokenString;
     55 *
     56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
     57 *    are separated by ';'.
     58 *
     59 * uint16_t groupCount;
     60 * struct {
     61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
     62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
     63 *   uint16_t offsetLow;
     64 * } groupTable[groupCount];
     65 *
     66 * char     groupStrings[]; -- padded to 4-count
     67 *
     68 * -- The actual, tokenized group strings are not zero-terminated because
     69 *   that would take up too much space.
     70 *   Instead, they are preceeded by their length, written in a variable-length sequence:
     71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
     72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
     73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
     74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
     75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
     76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
     77 *   For the de-tokenizing, see token description above; the strings immediately follow the
     78 *   32 lengths.
     79 *
     80 * -- algorithmic names
     81 *
     82 * typedef struct AlgorithmicRange {
     83 *     uint32_t rangeStart, rangeEnd;
     84 *     uint8_t algorithmType, algorithmVariant;
     85 *     uint16_t rangeSize;
     86 * } AlgorithmicRange;
     87 *
     88 * uint32_t algRangesCount; -- number of data blocks for ranges of
     89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
     90 *
     91 * struct {
     92 *     AlgorithmicRange algRange;
     93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
     94 * } algRanges[algNamesCount];
     95 * -- not a real array because each part has a different size
     96 *    of algRange.rangeSize (including AlgorithmicRange)
     97 *
     98 * -- algorithmic range types:
     99 *
    100 * 0 Names are formed from a string prefix that is stored in
    101 *   the algRangeData (zero-terminated), followed by the Unicode code point
    102 *   of the character in hexadecimal digits;
    103 *   algRange.algorithmVariant digits are written
    104 *
    105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
    106 *   algRange.algorithmVariant is the count of modulo factors
    107 *   algRangeData contains
    108 *       uint16_t factors[algRange.algorithmVariant];
    109 *       char strings[];
    110 *   the first zero-terminated string is written as the prefix; then:
    111 *
    112 *   The rangeStart is subtracted; with the difference, here "code":
    113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
    114 *       index[i]=code%factor[i];
    115 *       code/=factor[i];
    116 *
    117 *   The strings after the prefix are short pieces that are then appended to the result
    118 *   according to index[0..algRange.algorithmVariant-1].
    119 */
    120 
    121 #include <stdio.h>
    122 #include "unicode/utypes.h"
    123 #include "unicode/putil.h"
    124 #include "unicode/uclean.h"
    125 #include "unicode/udata.h"
    126 #include "cmemory.h"
    127 #include "cstring.h"
    128 #include "uarrsort.h"
    129 #include "unewdata.h"
    130 #include "uoptions.h"
    131 #include "uparse.h"
    132 
    133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    134 
    135 #define STRING_STORE_SIZE 1000000
    136 #define GROUP_STORE_SIZE 5000
    137 
    138 #define GROUP_SHIFT 5
    139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
    140 #define GROUP_MASK (LINES_PER_GROUP-1)
    141 
    142 #define MAX_LINE_COUNT 50000
    143 #define MAX_WORD_COUNT 20000
    144 #define MAX_GROUP_COUNT 5000
    145 
    146 #define DATA_NAME "unames"
    147 #define DATA_TYPE "icu"
    148 #define VERSION_STRING "unam"
    149 #define NAME_SEPARATOR_CHAR ';'
    150 
    151 #define ISO_DATA_NAME "ucomment"
    152 
    153 /* Unicode versions --------------------------------------------------------- */
    154 
    155 enum {
    156     UNI_1_0,
    157     UNI_1_1,
    158     UNI_2_0,
    159     UNI_3_0,
    160     UNI_3_1,
    161     UNI_3_2,
    162     UNI_4_0,
    163     UNI_4_0_1,
    164     UNI_4_1,
    165     UNI_5_0,
    166     UNI_5_1,
    167     UNI_VER_COUNT
    168 };
    169 
    170 static const UVersionInfo
    171 unicodeVersions[]={
    172     { 1, 0, 0, 0 },
    173     { 1, 1, 0, 0 },
    174     { 2, 0, 0, 0 },
    175     { 3, 0, 0, 0 },
    176     { 3, 1, 0, 0 },
    177     { 3, 2, 0, 0 },
    178     { 4, 0, 0, 0 },
    179     { 4, 0, 1, 0 },
    180     { 4, 1, 0, 0 },
    181     { 5, 0, 0, 0 },
    182     { 5, 1, 0, 0 }
    183 };
    184 
    185 static int32_t ucdVersion=UNI_5_1;
    186 
    187 static int32_t
    188 findUnicodeVersion(const UVersionInfo version) {
    189     int32_t i;
    190 
    191     for(i=0; /* while(version>unicodeVersions[i]) {} */
    192         i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
    193         ++i) {}
    194     if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
    195         --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
    196     }
    197     return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
    198 }
    199 
    200 /* generator data ----------------------------------------------------------- */
    201 
    202 /* UDataInfo cf. udata.h */
    203 static UDataInfo dataInfo={
    204     sizeof(UDataInfo),
    205     0,
    206 
    207     U_IS_BIG_ENDIAN,
    208     U_CHARSET_FAMILY,
    209     sizeof(UChar),
    210     0,
    211 
    212     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    213     {1, 0, 0, 0},                 /* formatVersion */
    214     {3, 0, 0, 0}                  /* dataVersion */
    215 };
    216 
    217 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
    218 
    219 typedef struct Options {
    220     UBool storeNames;
    221     UBool store10Names;
    222     UBool storeISOComments;
    223 } Options;
    224 
    225 static uint8_t stringStore[STRING_STORE_SIZE],
    226                groupStore[GROUP_STORE_SIZE],
    227                lineLengths[LINES_PER_GROUP];
    228 
    229 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
    230 
    231 typedef struct {
    232     uint32_t code;
    233     int16_t length;
    234     uint8_t *s;
    235 } Line;
    236 
    237 typedef struct {
    238     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
    239     int16_t count;
    240     int16_t length;
    241     uint8_t *s;
    242 } Word;
    243 
    244 static Line lines[MAX_LINE_COUNT];
    245 static Word words[MAX_WORD_COUNT];
    246 
    247 static uint32_t lineCount=0, wordCount=0;
    248 
    249 static int16_t leadByteCount;
    250 
    251 #define LEADBYTE_LIMIT 16
    252 
    253 static int16_t tokens[LEADBYTE_LIMIT*256];
    254 static uint32_t tokenCount;
    255 
    256 /* prototypes --------------------------------------------------------------- */
    257 
    258 static void
    259 init(void);
    260 
    261 static void
    262 parseDB(const char *filename, Options *options);
    263 
    264 static void
    265 parseName(char *name, int16_t length);
    266 
    267 static int16_t
    268 skipNoise(char *line, int16_t start, int16_t limit);
    269 
    270 static int16_t
    271 getWord(char *line, int16_t start, int16_t limit);
    272 
    273 static void
    274 compress(void);
    275 
    276 static void
    277 compressLines(void);
    278 
    279 static int16_t
    280 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
    281 
    282 static int32_t
    283 compareWords(const void *context, const void *word1, const void *word2);
    284 
    285 static void
    286 generateData(const char *dataDir, Options *options);
    287 
    288 static uint32_t
    289 generateAlgorithmicData(UNewDataMemory *pData, Options *options);
    290 
    291 static int16_t
    292 findToken(uint8_t *s, int16_t length);
    293 
    294 static Word *
    295 findWord(char *s, int16_t length);
    296 
    297 static Word *
    298 addWord(char *s, int16_t length);
    299 
    300 static void
    301 countWord(Word *word);
    302 
    303 static void
    304 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
    305 
    306 static void
    307 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
    308 
    309 static uint32_t
    310 addToken(uint8_t *s, int16_t length);
    311 
    312 static void
    313 appendLineLength(int16_t length);
    314 
    315 static void
    316 appendLineLengthNibble(uint8_t nibble);
    317 
    318 static uint8_t *
    319 allocLine(int32_t length);
    320 
    321 static uint8_t *
    322 allocWord(uint32_t length);
    323 
    324 /* -------------------------------------------------------------------------- */
    325 
    326 enum {
    327     HELP_H,
    328     HELP_QUESTION_MARK,
    329     VERBOSE,
    330     QUIET,
    331     COPYRIGHT,
    332     DESTDIR,
    333     UNICODE,
    334     UNICODE1_NAMES,
    335     NO_ISO_COMMENTS,
    336     ONLY_ISO_COMMENTS
    337 };
    338 
    339 static UOption options[]={
    340     UOPTION_HELP_H,
    341     UOPTION_HELP_QUESTION_MARK,
    342     UOPTION_VERBOSE,
    343     UOPTION_QUIET,
    344     UOPTION_COPYRIGHT,
    345     UOPTION_DESTDIR,
    346     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
    347     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
    348     { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    349     { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
    350 };
    351 
    352 extern int
    353 main(int argc, char* argv[]) {
    354     UVersionInfo version;
    355     Options moreOptions={ TRUE, FALSE, TRUE };
    356     UErrorCode errorCode = U_ZERO_ERROR;
    357 
    358     U_MAIN_INIT_ARGS(argc, argv);
    359 
    360     /* Initialize ICU */
    361     u_init(&errorCode);
    362     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
    363         /* Note: u_init() will try to open ICU property data.
    364          *       failures here are expected when building ICU from scratch.
    365          *       ignore them.
    366          */
    367         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
    368             argv[0], u_errorName(errorCode));
    369         exit(1);
    370     }
    371 
    372     /* preset then read command line options */
    373     options[DESTDIR].value=u_getDataDirectory();
    374     options[UNICODE].value="4.1";
    375     argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
    376 
    377     /* error handling, printing usage message */
    378     if(argc<0) {
    379         fprintf(stderr,
    380             "error in command line argument \"%s\"\n",
    381             argv[-argc]);
    382     } else if(argc<2) {
    383         argc=-1;
    384     }
    385     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    386         /*
    387          * Broken into chucks because the C89 standard says the minimum
    388          * required supported string length is 509 bytes.
    389          */
    390         fprintf(stderr,
    391             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
    392             "\n"
    393             "Read the UnicodeData.txt file and \n"
    394             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
    395             "\n"
    396             "\tfilename  absolute path/filename for the Unicode database text file\n"
    397             "\t\t(default: standard input)\n"
    398             "\n",
    399             argv[0]);
    400         fprintf(stderr,
    401             "Options:\n"
    402             "\t-h or -? or --help  this usage text\n"
    403             "\t-v or --verbose     verbose output\n"
    404             "\t-q or --quiet       no output\n"
    405             "\t-c or --copyright   include a copyright notice\n"
    406             "\t-d or --destdir     destination directory, followed by the path\n"
    407             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
    408         fprintf(stderr,
    409             "\t-1 or --unicode1-names     store Unicode 1.0 character names\n"
    410             "\t      --no-iso-comments    do not store ISO comments\n"
    411             "\t      --only-iso-comments  write ucomment.icu with only ISO comments\n");
    412         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    413     }
    414 
    415     /* get the options values */
    416     beVerbose=options[VERBOSE].doesOccur;
    417     beQuiet=options[QUIET].doesOccur;
    418     haveCopyright=options[COPYRIGHT].doesOccur;
    419     moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
    420     moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
    421     if(options[ONLY_ISO_COMMENTS].doesOccur) {
    422         moreOptions.storeNames=moreOptions.store10Names=FALSE;
    423         moreOptions.storeISOComments=TRUE;
    424     }
    425 
    426     /* set the Unicode version */
    427     u_versionFromString(version, options[UNICODE].value);
    428     uprv_memcpy(dataInfo.dataVersion, version, 4);
    429     ucdVersion=findUnicodeVersion(version);
    430 
    431     init();
    432     parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
    433     compress();
    434     generateData(options[DESTDIR].value, &moreOptions);
    435 
    436     u_cleanup();
    437     return 0;
    438 }
    439 
    440 static void
    441 init() {
    442     int i;
    443 
    444     for(i=0; i<256; ++i) {
    445         tokens[i]=0;
    446     }
    447 }
    448 
    449 /* parsing ------------------------------------------------------------------ */
    450 
    451 /* get a name, strip leading and trailing whitespace */
    452 static int16_t
    453 getName(char **pStart, char *limit) {
    454     /* strip leading whitespace */
    455     char *start=(char *)u_skipWhitespace(*pStart);
    456 
    457     /* strip trailing whitespace */
    458     while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
    459         --limit;
    460     }
    461 
    462     /* return results */
    463     *pStart=start;
    464     return (int16_t)(limit-start);
    465 }
    466 
    467 static void U_CALLCONV
    468 lineFn(void *context,
    469        char *fields[][2], int32_t fieldCount,
    470        UErrorCode *pErrorCode) {
    471     Options *storeOptions=(Options *)context;
    472     char *names[3];
    473     int16_t lengths[3]={ 0, 0, 0 };
    474     static uint32_t prevCode=0;
    475     uint32_t code=0;
    476 
    477     if(U_FAILURE(*pErrorCode)) {
    478         return;
    479     }
    480     /* get the character code */
    481     code=uprv_strtoul(fields[0][0], NULL, 16);
    482 
    483     /* get the character name */
    484     if(storeOptions->storeNames) {
    485         names[0]=fields[1][0];
    486         lengths[0]=getName(names+0, fields[1][1]);
    487         if(names[0][0]=='<') {
    488             /* do not store pseudo-names in <> brackets */
    489             lengths[0]=0;
    490         }
    491     }
    492 
    493     /* store 1.0 names */
    494     /* get the second character name, the one from Unicode 1.0 */
    495     if(storeOptions->store10Names) {
    496         names[1]=fields[10][0];
    497         lengths[1]=getName(names+1, fields[10][1]);
    498         if(names[1][0]=='<') {
    499             /* do not store pseudo-names in <> brackets */
    500             lengths[1]=0;
    501         }
    502     }
    503 
    504     /* get the ISO 10646 comment */
    505     if(storeOptions->storeISOComments) {
    506         names[2]=fields[11][0];
    507         lengths[2]=getName(names+2, fields[11][1]);
    508     }
    509 
    510     if(lengths[0]+lengths[1]+lengths[2]==0) {
    511         return;
    512     }
    513 
    514     /* check for non-character code points */
    515     if(!U_IS_UNICODE_CHAR(code)) {
    516         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
    517                 (unsigned long)code);
    518         *pErrorCode=U_PARSE_ERROR;
    519         exit(U_PARSE_ERROR);
    520     }
    521 
    522     /* check that the code points (code) are in ascending order */
    523     if(code<=prevCode && code>0) {
    524         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
    525                 (unsigned long)code, (unsigned long)prevCode);
    526         *pErrorCode=U_PARSE_ERROR;
    527         exit(U_PARSE_ERROR);
    528     }
    529     prevCode=code;
    530 
    531     parseName(names[0], lengths[0]);
    532     parseName(names[1], lengths[1]);
    533     parseName(names[2], lengths[2]);
    534 
    535     /*
    536      * set the count argument to
    537      * 1: only store regular names, or only store ISO 10646 comments
    538      * 2: store regular and 1.0 names
    539      * 3: store names and ISO 10646 comment
    540      *
    541      * addLine() will ignore empty trailing names
    542      */
    543     if(storeOptions->storeNames) {
    544         /* store names and comments as parsed according to storeOptions */
    545         addLine(code, names, lengths, 3);
    546     } else {
    547         /* store only ISO 10646 comments */
    548         addLine(code, names+2, lengths+2, 1);
    549     }
    550 }
    551 
    552 static void
    553 parseDB(const char *filename, Options *storeOptions) {
    554     char *fields[15][2];
    555     UErrorCode errorCode=U_ZERO_ERROR;
    556 
    557     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
    558     if(U_FAILURE(errorCode)) {
    559         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
    560         exit(errorCode);
    561     }
    562 
    563     if(!beQuiet) {
    564         printf("size of all names in the database: %lu\n",
    565             (unsigned long)lineTop);
    566         printf("number of named Unicode characters: %lu\n",
    567             (unsigned long)lineCount);
    568         printf("number of words in the dictionary from these names: %lu\n",
    569             (unsigned long)wordCount);
    570     }
    571 }
    572 
    573 static void
    574 parseName(char *name, int16_t length) {
    575     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
    576     Word *word;
    577 
    578     while(start<length) {
    579         /* skip any "noise" characters */
    580         limit=skipNoise(name, start, length);
    581         if(start<limit) {
    582             /*prevStart=-1;*/
    583             start=limit;
    584         }
    585         if(start==length) {
    586             break;
    587         }
    588 
    589         /* get a word and add it if it is longer than 1 */
    590         limit=getWord(name, start, length);
    591         wordLength=(int16_t)(limit-start);
    592         if(wordLength>1) {
    593             word=findWord(name+start, wordLength);
    594             if(word==NULL) {
    595                 word=addWord(name+start, wordLength);
    596             }
    597             countWord(word);
    598         }
    599 
    600 #if 0
    601         /*
    602          * if there was a word before this
    603          * (with no noise in between), then add the pair of words, too
    604          */
    605         if(prevStart!=-1) {
    606             wordLength=limit-prevStart;
    607             word=findWord(name+prevStart, wordLength);
    608             if(word==NULL) {
    609                 word=addWord(name+prevStart, wordLength);
    610             }
    611             countWord(word);
    612         }
    613 #endif
    614 
    615         /*prevStart=start;*/
    616         start=limit;
    617     }
    618 }
    619 
    620 static UBool U_INLINE
    621 isWordChar(char c) {
    622     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
    623            ('J'<=c && c<='R') ||
    624            ('S'<=c && c<='Z') ||
    625 
    626            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
    627            ('j'<=c && c<='r') ||
    628            ('s'<=c && c<='z') ||
    629 
    630            ('0'<=c && c<='9');
    631 }
    632 
    633 static int16_t
    634 skipNoise(char *line, int16_t start, int16_t limit) {
    635     /* skip anything that is not part of a word in this sense */
    636     while(start<limit && !isWordChar(line[start])) {
    637         ++start;
    638     }
    639 
    640     return start;
    641 }
    642 
    643 static int16_t
    644 getWord(char *line, int16_t start, int16_t limit) {
    645     char c=0; /* initialize to avoid a compiler warning although the code was safe */
    646 
    647     /* a unicode character name word consists of A-Z0-9 */
    648     while(start<limit && isWordChar(line[start])) {
    649         ++start;
    650     }
    651 
    652     /* include a following space or dash */
    653     if(start<limit && ((c=line[start])==' ' || c=='-')) {
    654         ++start;
    655     }
    656 
    657     return start;
    658 }
    659 
    660 /* compressing -------------------------------------------------------------- */
    661 
    662 static void
    663 compress() {
    664     uint32_t i, letterCount;
    665     int16_t wordNumber;
    666     UErrorCode errorCode;
    667 
    668     /* sort the words in reverse order by weight */
    669     errorCode=U_ZERO_ERROR;
    670     uprv_sortArray(words, wordCount, sizeof(Word),
    671                     compareWords, NULL, FALSE, &errorCode);
    672 
    673     /* remove the words that do not save anything */
    674     while(wordCount>0 && words[wordCount-1].weight<1) {
    675         --wordCount;
    676     }
    677 
    678     /* count the letters in the token range */
    679     letterCount=0;
    680     for(i=LEADBYTE_LIMIT; i<256; ++i) {
    681         if(tokens[i]==-1) {
    682             ++letterCount;
    683         }
    684     }
    685     if(!beQuiet) {
    686         printf("number of letters used in the names: %d\n", (int)letterCount);
    687     }
    688 
    689     /* do we need double-byte tokens? */
    690     if(wordCount+letterCount<=256) {
    691         /* no, single-byte tokens are enough */
    692         leadByteCount=0;
    693         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
    694             if(tokens[i]!=-1) {
    695                 tokens[i]=wordNumber;
    696                 if(beVerbose) {
    697                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    698                             (int)i, (long)words[wordNumber].weight,
    699                             words[wordNumber].length, words[wordNumber].s);
    700                 }
    701                 ++wordNumber;
    702             }
    703         }
    704         tokenCount=i;
    705     } else {
    706         /*
    707          * The tokens that need two token bytes
    708          * get their weight reduced by their count
    709          * because they save less.
    710          */
    711         tokenCount=256-letterCount;
    712         for(i=tokenCount; i<wordCount; ++i) {
    713             words[i].weight-=words[i].count;
    714         }
    715 
    716         /* sort these words in reverse order by weight */
    717         errorCode=U_ZERO_ERROR;
    718         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
    719                         compareWords, NULL, FALSE, &errorCode);
    720 
    721         /* remove the words that do not save anything */
    722         while(wordCount>0 && words[wordCount-1].weight<1) {
    723             --wordCount;
    724         }
    725 
    726         /* how many tokens and lead bytes do we have now? */
    727         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
    728         /*
    729          * adjust upwards to take into account that
    730          * double-byte tokens must not
    731          * use NAME_SEPARATOR_CHAR as a second byte
    732          */
    733         tokenCount+=(tokenCount-256+254)/255;
    734 
    735         leadByteCount=(int16_t)(tokenCount>>8);
    736         if(leadByteCount<LEADBYTE_LIMIT) {
    737             /* adjust for the real number of lead bytes */
    738             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
    739         } else {
    740             /* limit the number of lead bytes */
    741             leadByteCount=LEADBYTE_LIMIT-1;
    742             tokenCount=LEADBYTE_LIMIT*256;
    743             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
    744             /* adjust again to skip double-byte tokens with ';' */
    745             wordCount-=(tokenCount-256+254)/255;
    746         }
    747 
    748         /* set token 0 to word 0 */
    749         tokens[0]=0;
    750         if(beVerbose) {
    751             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
    752                     (long)words[0].weight,
    753                     words[0].length, words[0].s);
    754         }
    755         wordNumber=1;
    756 
    757         /* set the lead byte tokens */
    758         for(i=1; (int16_t)i<=leadByteCount; ++i) {
    759             tokens[i]=-2;
    760         }
    761 
    762         /* set the tokens */
    763         for(; i<256; ++i) {
    764             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
    765             if(tokens[i]!=-1) {
    766                 tokens[i]=wordNumber;
    767                 if(beVerbose) {
    768                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    769                             (int)i, (long)words[wordNumber].weight,
    770                             words[wordNumber].length, words[wordNumber].s);
    771                 }
    772                 ++wordNumber;
    773             }
    774         }
    775 
    776         /* continue above 255 where there are no letters */
    777         for(; (uint32_t)wordNumber<wordCount; ++i) {
    778             if((i&0xff)==NAME_SEPARATOR_CHAR) {
    779                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
    780             } else {
    781                 tokens[i]=wordNumber;
    782                 if(beVerbose) {
    783                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    784                             (int)i, (long)words[wordNumber].weight,
    785                             words[wordNumber].length, words[wordNumber].s);
    786                 }
    787                 ++wordNumber;
    788             }
    789         }
    790         tokenCount=i; /* should be already tokenCount={i or i+1} */
    791     }
    792 
    793     if(!beQuiet) {
    794         printf("number of lead bytes: %d\n", leadByteCount);
    795         printf("number of single-byte tokens: %lu\n",
    796             (unsigned long)256-letterCount-leadByteCount);
    797         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
    798     }
    799 
    800     compressLines();
    801 }
    802 
    803 static void
    804 compressLines() {
    805     Line *line=NULL;
    806     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
    807              groupMSB=0xffff, lineCount2;
    808     int16_t groupTop=0;
    809 
    810     /* store the groups like lines, with compressed data after raw strings */
    811     groupBottom=lineTop;
    812     lineCount2=lineCount;
    813     lineCount=0;
    814 
    815     /* loop over all lines */
    816     while(i<lineCount2) {
    817         line=lines+i++;
    818         inLine=line->code;
    819 
    820         /* segment the lines to groups of 32 */
    821         if(inLine>>GROUP_SHIFT!=groupMSB) {
    822             /* finish the current group with empty lines */
    823             while((++outLine&GROUP_MASK)!=0) {
    824                 appendLineLength(0);
    825             }
    826 
    827             /* store the group like a line */
    828             if(groupTop>0) {
    829                 if(groupTop>GROUP_STORE_SIZE) {
    830                     fprintf(stderr, "gennames: group store overflow\n");
    831                     exit(U_BUFFER_OVERFLOW_ERROR);
    832                 }
    833                 addGroup(groupMSB, groupStore, groupTop);
    834             }
    835 
    836             /* start the new group */
    837             lineLengthsTop=0;
    838             groupTop=0;
    839             groupMSB=inLine>>GROUP_SHIFT;
    840             outLine=(inLine&~GROUP_MASK)-1;
    841         }
    842 
    843         /* write empty lines between the previous line in the group and this one */
    844         while(++outLine<inLine) {
    845             appendLineLength(0);
    846         }
    847 
    848         /* write characters and tokens for this line */
    849         appendLineLength(compressLine(line->s, line->length, &groupTop));
    850     }
    851 
    852     /* finish and store the last group */
    853     if(line && groupMSB!=0xffff) {
    854         /* finish the current group with empty lines */
    855         while((++outLine&GROUP_MASK)!=0) {
    856             appendLineLength(0);
    857         }
    858 
    859         /* store the group like a line */
    860         if(groupTop>0) {
    861             if(groupTop>GROUP_STORE_SIZE) {
    862                 fprintf(stderr, "gennames: group store overflow\n");
    863                 exit(U_BUFFER_OVERFLOW_ERROR);
    864             }
    865             addGroup(groupMSB, groupStore, groupTop);
    866         }
    867     }
    868 
    869     if(!beQuiet) {
    870         printf("number of groups: %lu\n", (unsigned long)lineCount);
    871     }
    872 }
    873 
    874 static int16_t
    875 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
    876     int16_t start, limit, token, groupTop=*pGroupTop;
    877 
    878     start=0;
    879     do {
    880         /* write any "noise" characters */
    881         limit=skipNoise((char *)s, start, length);
    882         while(start<limit) {
    883             groupStore[groupTop++]=s[start++];
    884         }
    885 
    886         if(start==length) {
    887             break;
    888         }
    889 
    890         /* write a word, as token or directly */
    891         limit=getWord((char *)s, start, length);
    892         if(limit-start==1) {
    893             groupStore[groupTop++]=s[start++];
    894         } else {
    895             token=findToken(s+start, (int16_t)(limit-start));
    896             if(token!=-1) {
    897                 if(token>0xff) {
    898                     groupStore[groupTop++]=(uint8_t)(token>>8);
    899                 }
    900                 groupStore[groupTop++]=(uint8_t)token;
    901                 start=limit;
    902             } else {
    903                 while(start<limit) {
    904                     groupStore[groupTop++]=s[start++];
    905                 }
    906             }
    907         }
    908     } while(start<length);
    909 
    910     length=(int16_t)(groupTop-*pGroupTop);
    911     *pGroupTop=groupTop;
    912     return length;
    913 }
    914 
    915 static int32_t
    916 compareWords(const void *context, const void *word1, const void *word2) {
    917     /* reverse sort by word weight */
    918     return ((Word *)word2)->weight-((Word *)word1)->weight;
    919 }
    920 
    921 /* generate output data ----------------------------------------------------- */
    922 
    923 static void
    924 generateData(const char *dataDir, Options *storeOptions) {
    925     UNewDataMemory *pData;
    926     UErrorCode errorCode=U_ZERO_ERROR;
    927     uint16_t groupWords[3];
    928     uint32_t i, groupTop=lineTop, offset, size,
    929              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
    930     long dataLength;
    931     int16_t token;
    932 
    933     pData=udata_create(dataDir,
    934                        DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
    935                        &dataInfo,
    936                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    937     if(U_FAILURE(errorCode)) {
    938         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
    939         exit(errorCode);
    940     }
    941 
    942     /* first, see how much space we need, and prepare the token strings */
    943     for(i=0; i<tokenCount; ++i) {
    944         token=tokens[i];
    945         if(token!=-1 && token!=-2) {
    946             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
    947         }
    948     }
    949 
    950     /*
    951      * Required padding for data swapping:
    952      * The token table undergoes a permutation during data swapping when the
    953      * input and output charsets are different.
    954      * The token table cannot grow during swapping, so we need to make sure that
    955      * the table is long enough for successful in-place permutation.
    956      *
    957      * We simply round up tokenCount to the next multiple of 256 to account for
    958      * all possible permutations.
    959      *
    960      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
    961      *
    962      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
    963      * and will be swapped between ASCII and EBCDIC between
    964      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
    965      * This should be the only -1 entry in tokens[256..511] on which the data
    966      * swapper bases its trail byte permutation map (trailMap[]).
    967      *
    968      * It would be sufficient to increase tokenCount so that its lower 8 bits
    969      * are at least 0x5e+1 to make room for swapping between the two semicolons.
    970      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
    971      * should always be an identity map, where we do not need additional room.
    972      */
    973     i=tokenCount;
    974     tokenCount=(tokenCount+0xff)&~0xff;
    975     if(!beQuiet && i<tokenCount) {
    976         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
    977     }
    978     for(; i<tokenCount; ++i) {
    979         if((i&0xff)==NAME_SEPARATOR_CHAR) {
    980             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
    981         } else {
    982             tokens[i]=0; /* unused token for padding */
    983         }
    984     }
    985 
    986     /*
    987      * Calculate the total size in bytes of the data including:
    988      * - the offset to the token strings, uint32_t (4)
    989      * - the offset to the group table, uint32_t (4)
    990      * - the offset to the group strings, uint32_t (4)
    991      * - the offset to the algorithmic names, uint32_t (4)
    992      *
    993      * - the number of tokens, uint16_t (2)
    994      * - the token table, uint16_t[tokenCount] (2*tokenCount)
    995      *
    996      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
    997      *
    998      * - the number of groups, uint16_t (2)
    999      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
   1000      *
   1001      * - the group strings (groupTop-groupBottom), 2-padded
   1002      *
   1003      * - the size of the data for the algorithmic names
   1004      */
   1005     tokenStringOffset=4+4+4+4+2+2*tokenCount;
   1006     groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
   1007     groupStringOffset=groupsOffset+2+6*lineCount;
   1008     algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
   1009 
   1010     offset=generateAlgorithmicData(NULL, storeOptions);
   1011     size=algNamesOffset+offset;
   1012 
   1013     if(!beQuiet) {
   1014         printf("size of the Unicode Names data:\n"
   1015                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
   1016                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
   1017                 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
   1018     }
   1019 
   1020     /* write the data to the file */
   1021     /* offsets */
   1022     udata_write32(pData, tokenStringOffset);
   1023     udata_write32(pData, groupsOffset);
   1024     udata_write32(pData, groupStringOffset);
   1025     udata_write32(pData, algNamesOffset);
   1026 
   1027     /* token table */
   1028     udata_write16(pData, (uint16_t)tokenCount);
   1029     udata_writeBlock(pData, tokens, 2*tokenCount);
   1030 
   1031     /* token strings */
   1032     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
   1033     if((lineTop-groupTop)&1) {
   1034         /* 2-padding */
   1035         udata_writePadding(pData, 1);
   1036     }
   1037 
   1038     /* group table */
   1039     udata_write16(pData, (uint16_t)lineCount);
   1040     for(i=0; i<lineCount; ++i) {
   1041         /* groupMSB */
   1042         groupWords[0]=(uint16_t)lines[i].code;
   1043 
   1044         /* offset */
   1045         offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
   1046         groupWords[1]=(uint16_t)(offset>>16);
   1047         groupWords[2]=(uint16_t)(offset);
   1048         udata_writeBlock(pData, groupWords, 6);
   1049     }
   1050 
   1051     /* group strings */
   1052     udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
   1053 
   1054     /* 4-align the algorithmic names data */
   1055     udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
   1056 
   1057     generateAlgorithmicData(pData, storeOptions);
   1058 
   1059     /* finish up */
   1060     dataLength=udata_finish(pData, &errorCode);
   1061     if(U_FAILURE(errorCode)) {
   1062         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
   1063         exit(errorCode);
   1064     }
   1065 
   1066     if(dataLength!=(long)size) {
   1067         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
   1068 dataLength, (unsigned long)size);
   1069         exit(U_INTERNAL_PROGRAM_ERROR);
   1070     }
   1071 }
   1072 
   1073 /* the structure for algorithmic names needs to be 4-aligned */
   1074 typedef struct AlgorithmicRange {
   1075     uint32_t rangeStart, rangeEnd;
   1076     uint8_t algorithmType, algorithmVariant;
   1077     uint16_t rangeSize;
   1078 } AlgorithmicRange;
   1079 
   1080 static uint32_t
   1081 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
   1082     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
   1083 #   define PREFIX_LENGTH 23
   1084 #   define PREFIX_LENGTH_4 24
   1085     uint32_t countAlgRanges;
   1086 
   1087     static AlgorithmicRange cjkExtA={
   1088         0x3400, 0x4db5,
   1089         0, 4,
   1090         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1091     };
   1092     static AlgorithmicRange cjk={
   1093         0x4e00, 0x9fa5,
   1094         0, 4,
   1095         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1096     };
   1097     static AlgorithmicRange cjkExtB={
   1098         0x20000, 0x2a6d6,
   1099         0, 5,
   1100         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1101     };
   1102 
   1103     static char jamo[]=
   1104         "HANGUL SYLLABLE \0"
   1105 
   1106         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
   1107         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
   1108 
   1109         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
   1110         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
   1111         "YU\0EU\0YI\0I\0"
   1112 
   1113         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
   1114         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
   1115         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
   1116     ;
   1117 
   1118     static AlgorithmicRange hangul={
   1119         0xac00, 0xd7a3,
   1120         1, 3,
   1121         sizeof(AlgorithmicRange)+6+sizeof(jamo)
   1122     };
   1123 
   1124     /* modulo factors, maximum 8 */
   1125     /* 3 factors: 19, 21, 28, most-to-least-significant */
   1126     static uint16_t hangulFactors[3]={
   1127         19, 21, 28
   1128     };
   1129 
   1130     uint32_t size;
   1131 
   1132     size=0;
   1133 
   1134     if(ucdVersion>=UNI_5_1) {
   1135         /* Unicode 5.1 and up has a longer CJK Unihan range than before */
   1136         cjk.rangeEnd=0x9FC3;
   1137     } else if(ucdVersion>=UNI_4_1) {
   1138         /* Unicode 4.1 and up has a longer CJK Unihan range than before */
   1139         cjk.rangeEnd=0x9FBB;
   1140     }
   1141 
   1142     /* number of ranges of algorithmic names */
   1143     if(!storeOptions->storeNames) {
   1144         countAlgRanges=0;
   1145     } else if(ucdVersion>=UNI_3_1) {
   1146         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
   1147         countAlgRanges=4;
   1148     } else if(ucdVersion>=UNI_3_0) {
   1149         /* Unicode 3.0 has 3 ranges including CJK Extension A */
   1150         countAlgRanges=3;
   1151     } else {
   1152         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
   1153         countAlgRanges=2;
   1154     }
   1155 
   1156     if(pData!=NULL) {
   1157         udata_write32(pData, countAlgRanges);
   1158     } else {
   1159         size+=4;
   1160     }
   1161     if(countAlgRanges==0) {
   1162         return size;
   1163     }
   1164 
   1165     /*
   1166      * each range:
   1167      * uint32_t rangeStart
   1168      * uint32_t rangeEnd
   1169      * uint8_t algorithmType
   1170      * uint8_t algorithmVariant
   1171      * uint16_t size of range data
   1172      * uint8_t[size] data
   1173      */
   1174 
   1175     /* range 0: cjk extension a */
   1176     if(countAlgRanges>=3) {
   1177         if(pData!=NULL) {
   1178             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
   1179             udata_writeString(pData, prefix, PREFIX_LENGTH);
   1180             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1181                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1182             }
   1183         } else {
   1184             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1185         }
   1186     }
   1187 
   1188     /* range 1: cjk */
   1189     if(pData!=NULL) {
   1190         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
   1191         udata_writeString(pData, prefix, PREFIX_LENGTH);
   1192         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1193             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1194         }
   1195     } else {
   1196         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1197     }
   1198 
   1199     /* range 2: hangul syllables */
   1200     if(pData!=NULL) {
   1201         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
   1202         udata_writeBlock(pData, hangulFactors, 6);
   1203         udata_writeString(pData, jamo, sizeof(jamo));
   1204     } else {
   1205         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
   1206     }
   1207 
   1208     /* range 3: cjk extension b */
   1209     if(countAlgRanges>=4) {
   1210         if(pData!=NULL) {
   1211             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
   1212             udata_writeString(pData, prefix, PREFIX_LENGTH);
   1213             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1214                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1215             }
   1216         } else {
   1217             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1218         }
   1219     }
   1220 
   1221     return size;
   1222 }
   1223 
   1224 /* helpers ------------------------------------------------------------------ */
   1225 
   1226 static int16_t
   1227 findToken(uint8_t *s, int16_t length) {
   1228     int16_t i, token;
   1229 
   1230     for(i=0; i<(int16_t)tokenCount; ++i) {
   1231         token=tokens[i];
   1232         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
   1233             return i;
   1234         }
   1235     }
   1236 
   1237     return -1;
   1238 }
   1239 
   1240 static Word *
   1241 findWord(char *s, int16_t length) {
   1242     uint32_t i;
   1243 
   1244     for(i=0; i<wordCount; ++i) {
   1245         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
   1246             return words+i;
   1247         }
   1248     }
   1249 
   1250     return NULL;
   1251 }
   1252 
   1253 static Word *
   1254 addWord(char *s, int16_t length) {
   1255     uint8_t *stringStart;
   1256     Word *word;
   1257 
   1258     if(wordCount==MAX_WORD_COUNT) {
   1259         fprintf(stderr, "gennames: too many words\n");
   1260         exit(U_BUFFER_OVERFLOW_ERROR);
   1261     }
   1262 
   1263     stringStart=allocWord(length);
   1264     uprv_memcpy(stringStart, s, length);
   1265 
   1266     word=words+wordCount;
   1267 
   1268     /*
   1269      * Initialize the weight with the costs for this token:
   1270      * a zero-terminated string and a 16-bit offset.
   1271      */
   1272     word->weight=-(length+1+2);
   1273     word->count=0;
   1274     word->length=length;
   1275     word->s=stringStart;
   1276 
   1277     ++wordCount;
   1278 
   1279     return word;
   1280 }
   1281 
   1282 static void
   1283 countWord(Word *word) {
   1284     /* add to the weight the savings: the length of the word minus 1 byte for the token */
   1285     word->weight+=word->length-1;
   1286     ++word->count;
   1287 }
   1288 
   1289 static void
   1290 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
   1291     uint8_t *stringStart;
   1292     Line *line;
   1293     int16_t i, length;
   1294 
   1295     if(lineCount==MAX_LINE_COUNT) {
   1296         fprintf(stderr, "gennames: too many lines\n");
   1297         exit(U_BUFFER_OVERFLOW_ERROR);
   1298     }
   1299 
   1300     /* find the last non-empty name */
   1301     while(count>0 && lengths[count-1]==0) {
   1302         --count;
   1303     }
   1304     if(count==0) {
   1305         return; /* should not occur: caller should not have called */
   1306     }
   1307 
   1308     /* there will be (count-1) separator characters */
   1309     i=count;
   1310     length=count-1;
   1311 
   1312     /* add lengths of strings */
   1313     while(i>0) {
   1314         length+=lengths[--i];
   1315     }
   1316 
   1317     /* allocate line memory */
   1318     stringStart=allocLine(length);
   1319 
   1320     /* copy all strings into the line memory */
   1321     length=0; /* number of chars copied so far */
   1322     for(i=0; i<count; ++i) {
   1323         if(i>0) {
   1324             stringStart[length++]=NAME_SEPARATOR_CHAR;
   1325         }
   1326         if(lengths[i]>0) {
   1327             uprv_memcpy(stringStart+length, names[i], lengths[i]);
   1328             length+=lengths[i];
   1329         }
   1330     }
   1331 
   1332     line=lines+lineCount;
   1333 
   1334     line->code=code;
   1335     line->length=length;
   1336     line->s=stringStart;
   1337 
   1338     ++lineCount;
   1339 
   1340     /* prevent a character value that is actually in a name from becoming a token */
   1341     while(length>0) {
   1342         tokens[stringStart[--length]]=-1;
   1343     }
   1344 }
   1345 
   1346 static void
   1347 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
   1348     uint8_t *stringStart;
   1349     Line *line;
   1350 
   1351     if(lineCount==MAX_LINE_COUNT) {
   1352         fprintf(stderr, "gennames: too many groups\n");
   1353         exit(U_BUFFER_OVERFLOW_ERROR);
   1354     }
   1355 
   1356     /* store the line lengths first, then the strings */
   1357     lineLengthsTop=(lineLengthsTop+1)/2;
   1358     stringStart=allocLine(lineLengthsTop+length);
   1359     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
   1360     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
   1361 
   1362     line=lines+lineCount;
   1363 
   1364     line->code=groupMSB;
   1365     line->length=length;
   1366     line->s=stringStart;
   1367 
   1368     ++lineCount;
   1369 }
   1370 
   1371 static uint32_t
   1372 addToken(uint8_t *s, int16_t length) {
   1373     uint8_t *stringStart;
   1374 
   1375     stringStart=allocLine(length+1);
   1376     uprv_memcpy(stringStart, s, length);
   1377     stringStart[length]=0;
   1378 
   1379     return (uint32_t)(stringStart - stringStore);
   1380 }
   1381 
   1382 static void
   1383 appendLineLength(int16_t length) {
   1384     if(length>=76) {
   1385         fprintf(stderr, "gennames: compressed line too long\n");
   1386         exit(U_BUFFER_OVERFLOW_ERROR);
   1387     }
   1388     if(length>=12) {
   1389         length-=12;
   1390         appendLineLengthNibble((uint8_t)((length>>4)|12));
   1391     }
   1392     appendLineLengthNibble((uint8_t)length);
   1393 }
   1394 
   1395 static void
   1396 appendLineLengthNibble(uint8_t nibble) {
   1397     if((lineLengthsTop&1)==0) {
   1398         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
   1399     } else {
   1400         lineLengths[lineLengthsTop/2]|=nibble&0xf;
   1401     }
   1402     ++lineLengthsTop;
   1403 }
   1404 
   1405 static uint8_t *
   1406 allocLine(int32_t length) {
   1407     uint32_t top=lineTop+length;
   1408     uint8_t *p;
   1409 
   1410     if(top>wordBottom) {
   1411         fprintf(stderr, "gennames: out of memory\n");
   1412         exit(U_MEMORY_ALLOCATION_ERROR);
   1413     }
   1414     p=stringStore+lineTop;
   1415     lineTop=top;
   1416     return p;
   1417 }
   1418 
   1419 static uint8_t *
   1420 allocWord(uint32_t length) {
   1421     uint32_t bottom=wordBottom-length;
   1422 
   1423     if(lineTop>bottom) {
   1424         fprintf(stderr, "gennames: out of memory\n");
   1425         exit(U_MEMORY_ALLOCATION_ERROR);
   1426     }
   1427     wordBottom=bottom;
   1428     return stringStore+bottom;
   1429 }
   1430 
   1431 /*
   1432  * Hey, Emacs, please set the following:
   1433  *
   1434  * Local Variables:
   1435  * indent-tabs-mode: nil
   1436  * End:
   1437  *
   1438  */
   1439