Home | History | Annotate | Download | only in gennames
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gennames.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads the Unicode character database text file,
     17 *   parses it, and extracts the character code,
     18 *   the "modern" character name, and optionally the
     19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
     20 *   It then tokenizes and compresses the names and builds
     21 *   compact binary tables for random-access lookup
     22 *   in a u_charName() API function.
     23 *
     24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
     25 * (all data is static const)
     26 *
     27 * UDataInfo fields:
     28 *   dataFormat "unam"
     29 *   formatVersion 1.0
     30 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
     31 *
     32 * -- data-based names
     33 * uint32_t tokenStringOffset,
     34 *          groupsOffset,
     35 *          groupStringOffset,
     36 *          algNamesOffset;
     37 *
     38 * uint16_t tokenCount;
     39 * uint16_t tokenTable[tokenCount];
     40 *
     41 * char     tokenStrings[]; -- padded to even count
     42 *
     43 * -- strings (groupStrings) are tokenized as follows:
     44 *   for each character c
     45 *       if(c>=tokenCount) write that character c directly
     46 *   else
     47 *       token=tokenTable[c];
     48 *       if(token==0xfffe) -- lead byte of double-byte token
     49 *           token=tokenTable[c<<8|next character];
     50 *       if(token==-1)
     51 *           write c directly
     52 *       else
     53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
     54 *           append zero-terminated tokenString;
     55 *
     56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
     57 *    are separated by ';'.
     58 *
     59 * uint16_t groupCount;
     60 * struct {
     61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
     62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
     63 *   uint16_t offsetLow;
     64 * } groupTable[groupCount];
     65 *
     66 * char     groupStrings[]; -- padded to 4-count
     67 *
     68 * -- The actual, tokenized group strings are not zero-terminated because
     69 *   that would take up too much space.
     70 *   Instead, they are preceeded by their length, written in a variable-length sequence:
     71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
     72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
     73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
     74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
     75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
     76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
     77 *   For the de-tokenizing, see token description above; the strings immediately follow the
     78 *   32 lengths.
     79 *
     80 * -- algorithmic names
     81 *
     82 * typedef struct AlgorithmicRange {
     83 *     uint32_t rangeStart, rangeEnd;
     84 *     uint8_t algorithmType, algorithmVariant;
     85 *     uint16_t rangeSize;
     86 * } AlgorithmicRange;
     87 *
     88 * uint32_t algRangesCount; -- number of data blocks for ranges of
     89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
     90 *
     91 * struct {
     92 *     AlgorithmicRange algRange;
     93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
     94 * } algRanges[algNamesCount];
     95 * -- not a real array because each part has a different size
     96 *    of algRange.rangeSize (including AlgorithmicRange)
     97 *
     98 * -- algorithmic range types:
     99 *
    100 * 0 Names are formed from a string prefix that is stored in
    101 *   the algRangeData (zero-terminated), followed by the Unicode code point
    102 *   of the character in hexadecimal digits;
    103 *   algRange.algorithmVariant digits are written
    104 *
    105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
    106 *   algRange.algorithmVariant is the count of modulo factors
    107 *   algRangeData contains
    108 *       uint16_t factors[algRange.algorithmVariant];
    109 *       char strings[];
    110 *   the first zero-terminated string is written as the prefix; then:
    111 *
    112 *   The rangeStart is subtracted; with the difference, here "code":
    113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
    114 *       index[i]=code%factor[i];
    115 *       code/=factor[i];
    116 *
    117 *   The strings after the prefix are short pieces that are then appended to the result
    118 *   according to index[0..algRange.algorithmVariant-1].
    119 */
    120 
    121 #include <stdio.h>
    122 #include "unicode/utypes.h"
    123 #include "unicode/putil.h"
    124 #include "unicode/uclean.h"
    125 #include "unicode/udata.h"
    126 #include "cmemory.h"
    127 #include "cstring.h"
    128 #include "uarrsort.h"
    129 #include "unewdata.h"
    130 #include "uoptions.h"
    131 #include "uparse.h"
    132 
    133 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    134 
    135 #define STRING_STORE_SIZE 1000000
    136 #define GROUP_STORE_SIZE 5000
    137 
    138 #define GROUP_SHIFT 5
    139 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
    140 #define GROUP_MASK (LINES_PER_GROUP-1)
    141 
    142 #define MAX_LINE_COUNT 50000
    143 #define MAX_WORD_COUNT 20000
    144 #define MAX_GROUP_COUNT 5000
    145 
    146 #define DATA_NAME "unames"
    147 #define DATA_TYPE "icu"
    148 #define VERSION_STRING "unam"
    149 #define NAME_SEPARATOR_CHAR ';'
    150 
    151 #define ISO_DATA_NAME "ucomment"
    152 
    153 /* Unicode versions --------------------------------------------------------- */
    154 
    155 enum {
    156     UNI_1_0,
    157     UNI_1_1,
    158     UNI_2_0,
    159     UNI_3_0,
    160     UNI_3_1,
    161     UNI_3_2,
    162     UNI_4_0,
    163     UNI_4_0_1,
    164     UNI_4_1,
    165     UNI_5_0,
    166     UNI_5_1,
    167     UNI_5_2,
    168     UNI_VER_COUNT
    169 };
    170 
    171 static const UVersionInfo
    172 unicodeVersions[]={
    173     { 1, 0, 0, 0 },
    174     { 1, 1, 0, 0 },
    175     { 2, 0, 0, 0 },
    176     { 3, 0, 0, 0 },
    177     { 3, 1, 0, 0 },
    178     { 3, 2, 0, 0 },
    179     { 4, 0, 0, 0 },
    180     { 4, 0, 1, 0 },
    181     { 4, 1, 0, 0 },
    182     { 5, 0, 0, 0 },
    183     { 5, 1, 0, 0 },
    184     { 5, 2, 0, 0 }
    185 };
    186 
    187 static int32_t ucdVersion=UNI_5_2;
    188 
    189 static int32_t
    190 findUnicodeVersion(const UVersionInfo version) {
    191     int32_t i;
    192 
    193     for(i=0; /* while(version>unicodeVersions[i]) {} */
    194         i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
    195         ++i) {}
    196     if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
    197         --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
    198     }
    199     return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
    200 }
    201 
    202 /* generator data ----------------------------------------------------------- */
    203 
    204 /* UDataInfo cf. udata.h */
    205 static UDataInfo dataInfo={
    206     sizeof(UDataInfo),
    207     0,
    208 
    209     U_IS_BIG_ENDIAN,
    210     U_CHARSET_FAMILY,
    211     sizeof(UChar),
    212     0,
    213 
    214     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    215     {1, 0, 0, 0},                 /* formatVersion */
    216     {3, 0, 0, 0}                  /* dataVersion */
    217 };
    218 
    219 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
    220 
    221 typedef struct Options {
    222     UBool storeNames;
    223     UBool store10Names;
    224     UBool storeISOComments;
    225 } Options;
    226 
    227 /*
    228  * Pair of code point and name alias.
    229  * Try to keep sizeof(CpNameAlias) a multiple of 4 to avoid padding.
    230  */
    231 typedef struct CpNameAlias {
    232     uint32_t code;
    233     char nameAlias[124];
    234 } CpNameAlias;
    235 
    236 static CpNameAlias cpNameAliases[50];
    237 
    238 static uint32_t cpNameAliasesIndex=0, cpNameAliasesTop=0;
    239 
    240 static uint8_t stringStore[STRING_STORE_SIZE],
    241                groupStore[GROUP_STORE_SIZE],
    242                lineLengths[LINES_PER_GROUP];
    243 
    244 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
    245 
    246 typedef struct {
    247     uint32_t code;
    248     int16_t length;
    249     uint8_t *s;
    250 } Line;
    251 
    252 typedef struct {
    253     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
    254     int16_t count;
    255     int16_t length;
    256     uint8_t *s;
    257 } Word;
    258 
    259 static Line lines[MAX_LINE_COUNT];
    260 static Word words[MAX_WORD_COUNT];
    261 
    262 static uint32_t lineCount=0, wordCount=0;
    263 
    264 static int16_t leadByteCount;
    265 
    266 #define LEADBYTE_LIMIT 16
    267 
    268 static int16_t tokens[LEADBYTE_LIMIT*256];
    269 static uint32_t tokenCount;
    270 
    271 /* prototypes --------------------------------------------------------------- */
    272 
    273 static void
    274 init(void);
    275 
    276 static void
    277 parseNameAliases(const char *filename, Options *options);
    278 
    279 static void
    280 parseDB(const char *filename, Options *options);
    281 
    282 static void
    283 parseName(char *name, int16_t length);
    284 
    285 static int16_t
    286 skipNoise(char *line, int16_t start, int16_t limit);
    287 
    288 static int16_t
    289 getWord(char *line, int16_t start, int16_t limit);
    290 
    291 static void
    292 compress(void);
    293 
    294 static void
    295 compressLines(void);
    296 
    297 static int16_t
    298 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
    299 
    300 static int32_t
    301 compareWords(const void *context, const void *word1, const void *word2);
    302 
    303 static void
    304 generateData(const char *dataDir, Options *options);
    305 
    306 static uint32_t
    307 generateAlgorithmicData(UNewDataMemory *pData, Options *options);
    308 
    309 static int16_t
    310 findToken(uint8_t *s, int16_t length);
    311 
    312 static Word *
    313 findWord(char *s, int16_t length);
    314 
    315 static Word *
    316 addWord(char *s, int16_t length);
    317 
    318 static void
    319 countWord(Word *word);
    320 
    321 static void
    322 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
    323 
    324 static void
    325 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
    326 
    327 static uint32_t
    328 addToken(uint8_t *s, int16_t length);
    329 
    330 static void
    331 appendLineLength(int16_t length);
    332 
    333 static void
    334 appendLineLengthNibble(uint8_t nibble);
    335 
    336 static uint8_t *
    337 allocLine(int32_t length);
    338 
    339 static uint8_t *
    340 allocWord(uint32_t length);
    341 
    342 /* -------------------------------------------------------------------------- */
    343 
    344 enum {
    345     HELP_H,
    346     HELP_QUESTION_MARK,
    347     VERBOSE,
    348     QUIET,
    349     COPYRIGHT,
    350     DESTDIR,
    351     UNICODE,
    352     UNICODE1_NAMES,
    353     NO_ISO_COMMENTS,
    354     ONLY_ISO_COMMENTS
    355 };
    356 
    357 static UOption options[]={
    358     UOPTION_HELP_H,
    359     UOPTION_HELP_QUESTION_MARK,
    360     UOPTION_VERBOSE,
    361     UOPTION_QUIET,
    362     UOPTION_COPYRIGHT,
    363     UOPTION_DESTDIR,
    364     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
    365     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
    366     { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    367     { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
    368 };
    369 
    370 extern int
    371 main(int argc, char* argv[]) {
    372     UVersionInfo version;
    373     Options moreOptions={ TRUE, FALSE, TRUE };
    374     UErrorCode errorCode = U_ZERO_ERROR;
    375 
    376     U_MAIN_INIT_ARGS(argc, argv);
    377 
    378     /* Initialize ICU */
    379     u_init(&errorCode);
    380     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
    381         /* Note: u_init() will try to open ICU property data.
    382          *       failures here are expected when building ICU from scratch.
    383          *       ignore them.
    384          */
    385         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
    386             argv[0], u_errorName(errorCode));
    387         exit(1);
    388     }
    389 
    390     /* preset then read command line options */
    391     options[DESTDIR].value=u_getDataDirectory();
    392     options[UNICODE].value="4.1";
    393     argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
    394 
    395     /* error handling, printing usage message */
    396     if(argc<0) {
    397         fprintf(stderr,
    398             "error in command line argument \"%s\"\n",
    399             argv[-argc]);
    400     } else if(argc<2) {
    401         argc=-1;
    402     }
    403     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    404         /*
    405          * Broken into chucks because the C89 standard says the minimum
    406          * required supported string length is 509 bytes.
    407          */
    408         fprintf(stderr,
    409             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] [filename_ud [filename_na]]\n"
    410             "\n"
    411             "Read the UnicodeData.txt file and \n"
    412             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
    413             "\n"
    414             "\tfilename_ud  absolute path/filename for the UnicodeData.txt file\n"
    415             "\t             (default: standard input)\n"
    416             "\tfilename_na  absolute path/filename for the NameAliases.txt file\n"
    417             "\t             (default: no name aliases)\n"
    418             "\n",
    419             argv[0]);
    420         fprintf(stderr,
    421             "Options:\n"
    422             "\t-h or -? or --help  this usage text\n"
    423             "\t-v or --verbose     verbose output\n"
    424             "\t-q or --quiet       no output\n"
    425             "\t-c or --copyright   include a copyright notice\n"
    426             "\t-d or --destdir     destination directory, followed by the path\n"
    427             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
    428         fprintf(stderr,
    429             "\t-1 or --unicode1-names     store Unicode 1.0 character names\n"
    430             "\t      --no-iso-comments    do not store ISO comments\n"
    431             "\t      --only-iso-comments  write ucomment.icu with only ISO comments\n");
    432         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    433     }
    434 
    435     /* get the options values */
    436     beVerbose=options[VERBOSE].doesOccur;
    437     beQuiet=options[QUIET].doesOccur;
    438     haveCopyright=options[COPYRIGHT].doesOccur;
    439     moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
    440     moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
    441     if(options[ONLY_ISO_COMMENTS].doesOccur) {
    442         moreOptions.storeNames=moreOptions.store10Names=FALSE;
    443         moreOptions.storeISOComments=TRUE;
    444     }
    445 
    446     /* set the Unicode version */
    447     u_versionFromString(version, options[UNICODE].value);
    448     uprv_memcpy(dataInfo.dataVersion, version, 4);
    449     ucdVersion=findUnicodeVersion(version);
    450 
    451     init();
    452     if(argc>=3) {
    453         parseNameAliases(argv[2], &moreOptions);
    454     }
    455     parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
    456     compress();
    457     generateData(options[DESTDIR].value, &moreOptions);
    458 
    459     u_cleanup();
    460     return 0;
    461 }
    462 
    463 static void
    464 init() {
    465     int i;
    466 
    467     for(i=0; i<256; ++i) {
    468         tokens[i]=0;
    469     }
    470 }
    471 
    472 /* parsing ------------------------------------------------------------------ */
    473 
    474 /* get a name, strip leading and trailing whitespace */
    475 static int16_t
    476 getName(char **pStart, char *limit) {
    477     /* strip leading whitespace */
    478     char *start=(char *)u_skipWhitespace(*pStart);
    479 
    480     /* strip trailing whitespace */
    481     while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
    482         --limit;
    483     }
    484 
    485     /* return results */
    486     *pStart=start;
    487     return (int16_t)(limit-start);
    488 }
    489 
    490 static void U_CALLCONV
    491 nameAliasesLineFn(void *context,
    492        char *fields[][2], int32_t fieldCount,
    493        UErrorCode *pErrorCode) {
    494     char *name;
    495     int16_t length=0;
    496     static uint32_t prevCode=0;
    497     uint32_t code=0;
    498 
    499     if(U_FAILURE(*pErrorCode)) {
    500         return;
    501     }
    502     /* get the character code */
    503     code=uprv_strtoul(fields[0][0], NULL, 16);
    504 
    505     /* get the character name */
    506     name=fields[1][0];
    507     length=getName(&name, fields[1][1]);
    508     if(length==0 || length>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
    509         fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
    510                 name, (unsigned long)code);
    511         *pErrorCode=U_PARSE_ERROR;
    512         exit(U_PARSE_ERROR);
    513     }
    514 
    515     /* check for non-character code points */
    516     if(!U_IS_UNICODE_CHAR(code)) {
    517         fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n",
    518                 (unsigned long)code);
    519         *pErrorCode=U_PARSE_ERROR;
    520         exit(U_PARSE_ERROR);
    521     }
    522 
    523     /* check that the code points (code) are in ascending order */
    524     if(code<=prevCode && code>0) {
    525         fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n",
    526                 (unsigned long)code, (unsigned long)prevCode);
    527         *pErrorCode=U_PARSE_ERROR;
    528         exit(U_PARSE_ERROR);
    529     }
    530     prevCode=code;
    531 
    532     if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) {
    533         fprintf(stderr, "gennames: error - too many name aliases\n");
    534         *pErrorCode=U_PARSE_ERROR;
    535         exit(U_PARSE_ERROR);
    536     }
    537     cpNameAliases[cpNameAliasesTop].code=code;
    538     uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length);
    539     cpNameAliases[cpNameAliasesTop].nameAlias[length]=0;
    540     ++cpNameAliasesTop;
    541 
    542     parseName(name, length);
    543 }
    544 
    545 static void U_CALLCONV
    546 lineFn(void *context,
    547        char *fields[][2], int32_t fieldCount,
    548        UErrorCode *pErrorCode) {
    549     Options *storeOptions=(Options *)context;
    550     char *names[4];
    551     int16_t lengths[4]={ 0, 0, 0, 0 };
    552     static uint32_t prevCode=0;
    553     uint32_t code=0;
    554 
    555     if(U_FAILURE(*pErrorCode)) {
    556         return;
    557     }
    558     /* get the character code */
    559     code=uprv_strtoul(fields[0][0], NULL, 16);
    560 
    561     /* get the character name */
    562     if(storeOptions->storeNames) {
    563         names[0]=fields[1][0];
    564         lengths[0]=getName(names+0, fields[1][1]);
    565         if(names[0][0]=='<') {
    566             /* do not store pseudo-names in <> brackets */
    567             lengths[0]=0;
    568         }
    569     }
    570 
    571     /* store 1.0 names */
    572     /* get the second character name, the one from Unicode 1.0 */
    573     if(storeOptions->store10Names) {
    574         names[1]=fields[10][0];
    575         lengths[1]=getName(names+1, fields[10][1]);
    576         if(names[1][0]=='<') {
    577             /* do not store pseudo-names in <> brackets */
    578             lengths[1]=0;
    579         }
    580     }
    581 
    582     /* get the ISO 10646 comment */
    583     if(storeOptions->storeISOComments) {
    584         names[2]=fields[11][0];
    585         lengths[2]=getName(names+2, fields[11][1]);
    586     }
    587 
    588     if(lengths[0]+lengths[1]+lengths[2]==0) {
    589         return;
    590     }
    591 
    592     /* check for non-character code points */
    593     if(!U_IS_UNICODE_CHAR(code)) {
    594         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
    595                 (unsigned long)code);
    596         *pErrorCode=U_PARSE_ERROR;
    597         exit(U_PARSE_ERROR);
    598     }
    599 
    600     /* check that the code points (code) are in ascending order */
    601     if(code<=prevCode && code>0) {
    602         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
    603                 (unsigned long)code, (unsigned long)prevCode);
    604         *pErrorCode=U_PARSE_ERROR;
    605         exit(U_PARSE_ERROR);
    606     }
    607     prevCode=code;
    608 
    609     parseName(names[0], lengths[0]);
    610     parseName(names[1], lengths[1]);
    611     parseName(names[2], lengths[2]);
    612 
    613     if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) {
    614         if(code==cpNameAliases[cpNameAliasesIndex].code) {
    615             names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias;
    616             lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias);
    617             ++cpNameAliasesIndex;
    618         } else {
    619             fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
    620                     (unsigned long)code);
    621             *pErrorCode=U_PARSE_ERROR;
    622             exit(U_PARSE_ERROR);
    623         }
    624     }
    625 
    626     /*
    627      * set the count argument to
    628      * 1: only store regular names, or only store ISO 10646 comments
    629      * 2: store regular and 1.0 names
    630      * 3: store names and ISO 10646 comment
    631      * 4: also store name alias
    632      *
    633      * addLine() will ignore empty trailing names
    634      */
    635     if(storeOptions->storeNames) {
    636         /* store names and comments as parsed according to storeOptions */
    637         addLine(code, names, lengths, LENGTHOF(names));
    638     } else {
    639         /* store only ISO 10646 comments */
    640         addLine(code, names+2, lengths+2, 1);
    641     }
    642 }
    643 
    644 static void
    645 parseNameAliases(const char *filename, Options *storeOptions) {
    646     char *fields[2][2];
    647     UErrorCode errorCode=U_ZERO_ERROR;
    648 
    649     if(!storeOptions->storeNames) {
    650         return;
    651     }
    652     u_parseDelimitedFile(filename, ';', fields, 2, nameAliasesLineFn, NULL, &errorCode);
    653     if(U_FAILURE(errorCode)) {
    654         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
    655         exit(errorCode);
    656     }
    657 
    658     if(!beQuiet) {
    659         printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop);
    660     }
    661 }
    662 
    663 static void
    664 parseDB(const char *filename, Options *storeOptions) {
    665     char *fields[15][2];
    666     UErrorCode errorCode=U_ZERO_ERROR;
    667 
    668     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
    669     if(U_FAILURE(errorCode)) {
    670         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
    671         exit(errorCode);
    672     }
    673     if(cpNameAliasesIndex<cpNameAliasesTop) {
    674         fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
    675                 (unsigned long)cpNameAliases[cpNameAliasesIndex].code);
    676         exit(U_PARSE_ERROR);
    677     }
    678 
    679     if(!beQuiet) {
    680         printf("size of all names in the database: %lu\n",
    681             (unsigned long)lineTop);
    682         printf("number of named Unicode characters: %lu\n",
    683             (unsigned long)lineCount);
    684         printf("number of words in the dictionary from these names: %lu\n",
    685             (unsigned long)wordCount);
    686     }
    687 }
    688 
    689 static void
    690 parseName(char *name, int16_t length) {
    691     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
    692     Word *word;
    693 
    694     while(start<length) {
    695         /* skip any "noise" characters */
    696         limit=skipNoise(name, start, length);
    697         if(start<limit) {
    698             /*prevStart=-1;*/
    699             start=limit;
    700         }
    701         if(start==length) {
    702             break;
    703         }
    704 
    705         /* get a word and add it if it is longer than 1 */
    706         limit=getWord(name, start, length);
    707         wordLength=(int16_t)(limit-start);
    708         if(wordLength>1) {
    709             word=findWord(name+start, wordLength);
    710             if(word==NULL) {
    711                 word=addWord(name+start, wordLength);
    712             }
    713             countWord(word);
    714         }
    715 
    716 #if 0
    717         /*
    718          * if there was a word before this
    719          * (with no noise in between), then add the pair of words, too
    720          */
    721         if(prevStart!=-1) {
    722             wordLength=limit-prevStart;
    723             word=findWord(name+prevStart, wordLength);
    724             if(word==NULL) {
    725                 word=addWord(name+prevStart, wordLength);
    726             }
    727             countWord(word);
    728         }
    729 #endif
    730 
    731         /*prevStart=start;*/
    732         start=limit;
    733     }
    734 }
    735 
    736 static UBool U_INLINE
    737 isWordChar(char c) {
    738     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
    739            ('J'<=c && c<='R') ||
    740            ('S'<=c && c<='Z') ||
    741 
    742            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
    743            ('j'<=c && c<='r') ||
    744            ('s'<=c && c<='z') ||
    745 
    746            ('0'<=c && c<='9');
    747 }
    748 
    749 static int16_t
    750 skipNoise(char *line, int16_t start, int16_t limit) {
    751     /* skip anything that is not part of a word in this sense */
    752     while(start<limit && !isWordChar(line[start])) {
    753         ++start;
    754     }
    755 
    756     return start;
    757 }
    758 
    759 static int16_t
    760 getWord(char *line, int16_t start, int16_t limit) {
    761     char c=0; /* initialize to avoid a compiler warning although the code was safe */
    762 
    763     /* a unicode character name word consists of A-Z0-9 */
    764     while(start<limit && isWordChar(line[start])) {
    765         ++start;
    766     }
    767 
    768     /* include a following space or dash */
    769     if(start<limit && ((c=line[start])==' ' || c=='-')) {
    770         ++start;
    771     }
    772 
    773     return start;
    774 }
    775 
    776 /* compressing -------------------------------------------------------------- */
    777 
    778 static void
    779 compress() {
    780     uint32_t i, letterCount;
    781     int16_t wordNumber;
    782     UErrorCode errorCode;
    783 
    784     /* sort the words in reverse order by weight */
    785     errorCode=U_ZERO_ERROR;
    786     uprv_sortArray(words, wordCount, sizeof(Word),
    787                     compareWords, NULL, FALSE, &errorCode);
    788 
    789     /* remove the words that do not save anything */
    790     while(wordCount>0 && words[wordCount-1].weight<1) {
    791         --wordCount;
    792     }
    793 
    794     /* count the letters in the token range */
    795     letterCount=0;
    796     for(i=LEADBYTE_LIMIT; i<256; ++i) {
    797         if(tokens[i]==-1) {
    798             ++letterCount;
    799         }
    800     }
    801     if(!beQuiet) {
    802         printf("number of letters used in the names: %d\n", (int)letterCount);
    803     }
    804 
    805     /* do we need double-byte tokens? */
    806     if(wordCount+letterCount<=256) {
    807         /* no, single-byte tokens are enough */
    808         leadByteCount=0;
    809         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
    810             if(tokens[i]!=-1) {
    811                 tokens[i]=wordNumber;
    812                 if(beVerbose) {
    813                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    814                             (int)i, (long)words[wordNumber].weight,
    815                             words[wordNumber].length, words[wordNumber].s);
    816                 }
    817                 ++wordNumber;
    818             }
    819         }
    820         tokenCount=i;
    821     } else {
    822         /*
    823          * The tokens that need two token bytes
    824          * get their weight reduced by their count
    825          * because they save less.
    826          */
    827         tokenCount=256-letterCount;
    828         for(i=tokenCount; i<wordCount; ++i) {
    829             words[i].weight-=words[i].count;
    830         }
    831 
    832         /* sort these words in reverse order by weight */
    833         errorCode=U_ZERO_ERROR;
    834         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
    835                         compareWords, NULL, FALSE, &errorCode);
    836 
    837         /* remove the words that do not save anything */
    838         while(wordCount>0 && words[wordCount-1].weight<1) {
    839             --wordCount;
    840         }
    841 
    842         /* how many tokens and lead bytes do we have now? */
    843         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
    844         /*
    845          * adjust upwards to take into account that
    846          * double-byte tokens must not
    847          * use NAME_SEPARATOR_CHAR as a second byte
    848          */
    849         tokenCount+=(tokenCount-256+254)/255;
    850 
    851         leadByteCount=(int16_t)(tokenCount>>8);
    852         if(leadByteCount<LEADBYTE_LIMIT) {
    853             /* adjust for the real number of lead bytes */
    854             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
    855         } else {
    856             /* limit the number of lead bytes */
    857             leadByteCount=LEADBYTE_LIMIT-1;
    858             tokenCount=LEADBYTE_LIMIT*256;
    859             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
    860             /* adjust again to skip double-byte tokens with ';' */
    861             wordCount-=(tokenCount-256+254)/255;
    862         }
    863 
    864         /* set token 0 to word 0 */
    865         tokens[0]=0;
    866         if(beVerbose) {
    867             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
    868                     (long)words[0].weight,
    869                     words[0].length, words[0].s);
    870         }
    871         wordNumber=1;
    872 
    873         /* set the lead byte tokens */
    874         for(i=1; (int16_t)i<=leadByteCount; ++i) {
    875             tokens[i]=-2;
    876         }
    877 
    878         /* set the tokens */
    879         for(; i<256; ++i) {
    880             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
    881             if(tokens[i]!=-1) {
    882                 tokens[i]=wordNumber;
    883                 if(beVerbose) {
    884                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    885                             (int)i, (long)words[wordNumber].weight,
    886                             words[wordNumber].length, words[wordNumber].s);
    887                 }
    888                 ++wordNumber;
    889             }
    890         }
    891 
    892         /* continue above 255 where there are no letters */
    893         for(; (uint32_t)wordNumber<wordCount; ++i) {
    894             if((i&0xff)==NAME_SEPARATOR_CHAR) {
    895                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
    896             } else {
    897                 tokens[i]=wordNumber;
    898                 if(beVerbose) {
    899                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
    900                             (int)i, (long)words[wordNumber].weight,
    901                             words[wordNumber].length, words[wordNumber].s);
    902                 }
    903                 ++wordNumber;
    904             }
    905         }
    906         tokenCount=i; /* should be already tokenCount={i or i+1} */
    907     }
    908 
    909     if(!beQuiet) {
    910         printf("number of lead bytes: %d\n", leadByteCount);
    911         printf("number of single-byte tokens: %lu\n",
    912             (unsigned long)256-letterCount-leadByteCount);
    913         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
    914     }
    915 
    916     compressLines();
    917 }
    918 
    919 static void
    920 compressLines() {
    921     Line *line=NULL;
    922     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
    923              groupMSB=0xffff, lineCount2;
    924     int16_t groupTop=0;
    925 
    926     /* store the groups like lines, with compressed data after raw strings */
    927     groupBottom=lineTop;
    928     lineCount2=lineCount;
    929     lineCount=0;
    930 
    931     /* loop over all lines */
    932     while(i<lineCount2) {
    933         line=lines+i++;
    934         inLine=line->code;
    935 
    936         /* segment the lines to groups of 32 */
    937         if(inLine>>GROUP_SHIFT!=groupMSB) {
    938             /* finish the current group with empty lines */
    939             while((++outLine&GROUP_MASK)!=0) {
    940                 appendLineLength(0);
    941             }
    942 
    943             /* store the group like a line */
    944             if(groupTop>0) {
    945                 if(groupTop>GROUP_STORE_SIZE) {
    946                     fprintf(stderr, "gennames: group store overflow\n");
    947                     exit(U_BUFFER_OVERFLOW_ERROR);
    948                 }
    949                 addGroup(groupMSB, groupStore, groupTop);
    950             }
    951 
    952             /* start the new group */
    953             lineLengthsTop=0;
    954             groupTop=0;
    955             groupMSB=inLine>>GROUP_SHIFT;
    956             outLine=(inLine&~GROUP_MASK)-1;
    957         }
    958 
    959         /* write empty lines between the previous line in the group and this one */
    960         while(++outLine<inLine) {
    961             appendLineLength(0);
    962         }
    963 
    964         /* write characters and tokens for this line */
    965         appendLineLength(compressLine(line->s, line->length, &groupTop));
    966     }
    967 
    968     /* finish and store the last group */
    969     if(line && groupMSB!=0xffff) {
    970         /* finish the current group with empty lines */
    971         while((++outLine&GROUP_MASK)!=0) {
    972             appendLineLength(0);
    973         }
    974 
    975         /* store the group like a line */
    976         if(groupTop>0) {
    977             if(groupTop>GROUP_STORE_SIZE) {
    978                 fprintf(stderr, "gennames: group store overflow\n");
    979                 exit(U_BUFFER_OVERFLOW_ERROR);
    980             }
    981             addGroup(groupMSB, groupStore, groupTop);
    982         }
    983     }
    984 
    985     if(!beQuiet) {
    986         printf("number of groups: %lu\n", (unsigned long)lineCount);
    987     }
    988 }
    989 
    990 static int16_t
    991 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
    992     int16_t start, limit, token, groupTop=*pGroupTop;
    993 
    994     start=0;
    995     do {
    996         /* write any "noise" characters */
    997         limit=skipNoise((char *)s, start, length);
    998         while(start<limit) {
    999             groupStore[groupTop++]=s[start++];
   1000         }
   1001 
   1002         if(start==length) {
   1003             break;
   1004         }
   1005 
   1006         /* write a word, as token or directly */
   1007         limit=getWord((char *)s, start, length);
   1008         if(limit-start==1) {
   1009             groupStore[groupTop++]=s[start++];
   1010         } else {
   1011             token=findToken(s+start, (int16_t)(limit-start));
   1012             if(token!=-1) {
   1013                 if(token>0xff) {
   1014                     groupStore[groupTop++]=(uint8_t)(token>>8);
   1015                 }
   1016                 groupStore[groupTop++]=(uint8_t)token;
   1017                 start=limit;
   1018             } else {
   1019                 while(start<limit) {
   1020                     groupStore[groupTop++]=s[start++];
   1021                 }
   1022             }
   1023         }
   1024     } while(start<length);
   1025 
   1026     length=(int16_t)(groupTop-*pGroupTop);
   1027     *pGroupTop=groupTop;
   1028     return length;
   1029 }
   1030 
   1031 static int32_t
   1032 compareWords(const void *context, const void *word1, const void *word2) {
   1033     /* reverse sort by word weight */
   1034     return ((Word *)word2)->weight-((Word *)word1)->weight;
   1035 }
   1036 
   1037 /* generate output data ----------------------------------------------------- */
   1038 
   1039 static void
   1040 generateData(const char *dataDir, Options *storeOptions) {
   1041     UNewDataMemory *pData;
   1042     UErrorCode errorCode=U_ZERO_ERROR;
   1043     uint16_t groupWords[3];
   1044     uint32_t i, groupTop=lineTop, offset, size,
   1045              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
   1046     long dataLength;
   1047     int16_t token;
   1048 
   1049     pData=udata_create(dataDir,
   1050                        DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
   1051                        &dataInfo,
   1052                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
   1053     if(U_FAILURE(errorCode)) {
   1054         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
   1055         exit(errorCode);
   1056     }
   1057 
   1058     /* first, see how much space we need, and prepare the token strings */
   1059     for(i=0; i<tokenCount; ++i) {
   1060         token=tokens[i];
   1061         if(token!=-1 && token!=-2) {
   1062             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
   1063         }
   1064     }
   1065 
   1066     /*
   1067      * Required padding for data swapping:
   1068      * The token table undergoes a permutation during data swapping when the
   1069      * input and output charsets are different.
   1070      * The token table cannot grow during swapping, so we need to make sure that
   1071      * the table is long enough for successful in-place permutation.
   1072      *
   1073      * We simply round up tokenCount to the next multiple of 256 to account for
   1074      * all possible permutations.
   1075      *
   1076      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
   1077      *
   1078      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
   1079      * and will be swapped between ASCII and EBCDIC between
   1080      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
   1081      * This should be the only -1 entry in tokens[256..511] on which the data
   1082      * swapper bases its trail byte permutation map (trailMap[]).
   1083      *
   1084      * It would be sufficient to increase tokenCount so that its lower 8 bits
   1085      * are at least 0x5e+1 to make room for swapping between the two semicolons.
   1086      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
   1087      * should always be an identity map, where we do not need additional room.
   1088      */
   1089     i=tokenCount;
   1090     tokenCount=(tokenCount+0xff)&~0xff;
   1091     if(!beQuiet && i<tokenCount) {
   1092         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
   1093     }
   1094     for(; i<tokenCount; ++i) {
   1095         if((i&0xff)==NAME_SEPARATOR_CHAR) {
   1096             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
   1097         } else {
   1098             tokens[i]=0; /* unused token for padding */
   1099         }
   1100     }
   1101 
   1102     /*
   1103      * Calculate the total size in bytes of the data including:
   1104      * - the offset to the token strings, uint32_t (4)
   1105      * - the offset to the group table, uint32_t (4)
   1106      * - the offset to the group strings, uint32_t (4)
   1107      * - the offset to the algorithmic names, uint32_t (4)
   1108      *
   1109      * - the number of tokens, uint16_t (2)
   1110      * - the token table, uint16_t[tokenCount] (2*tokenCount)
   1111      *
   1112      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
   1113      *
   1114      * - the number of groups, uint16_t (2)
   1115      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
   1116      *
   1117      * - the group strings (groupTop-groupBottom), 2-padded
   1118      *
   1119      * - the size of the data for the algorithmic names
   1120      */
   1121     tokenStringOffset=4+4+4+4+2+2*tokenCount;
   1122     groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
   1123     groupStringOffset=groupsOffset+2+6*lineCount;
   1124     algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
   1125 
   1126     offset=generateAlgorithmicData(NULL, storeOptions);
   1127     size=algNamesOffset+offset;
   1128 
   1129     if(!beQuiet) {
   1130         printf("size of the Unicode Names data:\n"
   1131                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
   1132                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
   1133                 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
   1134     }
   1135 
   1136     /* write the data to the file */
   1137     /* offsets */
   1138     udata_write32(pData, tokenStringOffset);
   1139     udata_write32(pData, groupsOffset);
   1140     udata_write32(pData, groupStringOffset);
   1141     udata_write32(pData, algNamesOffset);
   1142 
   1143     /* token table */
   1144     udata_write16(pData, (uint16_t)tokenCount);
   1145     udata_writeBlock(pData, tokens, 2*tokenCount);
   1146 
   1147     /* token strings */
   1148     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
   1149     if((lineTop-groupTop)&1) {
   1150         /* 2-padding */
   1151         udata_writePadding(pData, 1);
   1152     }
   1153 
   1154     /* group table */
   1155     udata_write16(pData, (uint16_t)lineCount);
   1156     for(i=0; i<lineCount; ++i) {
   1157         /* groupMSB */
   1158         groupWords[0]=(uint16_t)lines[i].code;
   1159 
   1160         /* offset */
   1161         offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
   1162         groupWords[1]=(uint16_t)(offset>>16);
   1163         groupWords[2]=(uint16_t)(offset);
   1164         udata_writeBlock(pData, groupWords, 6);
   1165     }
   1166 
   1167     /* group strings */
   1168     udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
   1169 
   1170     /* 4-align the algorithmic names data */
   1171     udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
   1172 
   1173     generateAlgorithmicData(pData, storeOptions);
   1174 
   1175     /* finish up */
   1176     dataLength=udata_finish(pData, &errorCode);
   1177     if(U_FAILURE(errorCode)) {
   1178         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
   1179         exit(errorCode);
   1180     }
   1181 
   1182     if(dataLength!=(long)size) {
   1183         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
   1184 dataLength, (unsigned long)size);
   1185         exit(U_INTERNAL_PROGRAM_ERROR);
   1186     }
   1187 }
   1188 
   1189 /* the structure for algorithmic names needs to be 4-aligned */
   1190 typedef struct AlgorithmicRange {
   1191     uint32_t rangeStart, rangeEnd;
   1192     uint8_t algorithmType, algorithmVariant;
   1193     uint16_t rangeSize;
   1194 } AlgorithmicRange;
   1195 
   1196 static uint32_t
   1197 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
   1198     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
   1199 #   define PREFIX_LENGTH 23
   1200 #   define PREFIX_LENGTH_4 24
   1201     uint32_t countAlgRanges;
   1202 
   1203     static AlgorithmicRange cjkExtA={
   1204         0x3400, 0x4db5,
   1205         0, 4,
   1206         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1207     };
   1208     static AlgorithmicRange cjk={
   1209         0x4e00, 0x9fa5,
   1210         0, 4,
   1211         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1212     };
   1213     static AlgorithmicRange cjkExtB={
   1214         0x20000, 0x2a6d6,
   1215         0, 5,
   1216         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1217     };
   1218     static AlgorithmicRange cjkExtC={
   1219         0x2a700, 0x2b734,
   1220         0, 5,
   1221         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
   1222     };
   1223 
   1224     static char jamo[]=
   1225         "HANGUL SYLLABLE \0"
   1226 
   1227         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
   1228         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
   1229 
   1230         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
   1231         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
   1232         "YU\0EU\0YI\0I\0"
   1233 
   1234         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
   1235         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
   1236         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
   1237     ;
   1238 
   1239     static AlgorithmicRange hangul={
   1240         0xac00, 0xd7a3,
   1241         1, 3,
   1242         sizeof(AlgorithmicRange)+6+sizeof(jamo)
   1243     };
   1244 
   1245     /* modulo factors, maximum 8 */
   1246     /* 3 factors: 19, 21, 28, most-to-least-significant */
   1247     static uint16_t hangulFactors[3]={
   1248         19, 21, 28
   1249     };
   1250 
   1251     uint32_t size;
   1252 
   1253     size=0;
   1254 
   1255     if(ucdVersion>=UNI_5_2) {
   1256         /* Unicode 5.2 and up has a longer CJK Unihan range than before */
   1257         cjk.rangeEnd=0x9FCB;
   1258     } else if(ucdVersion>=UNI_5_1) {
   1259         /* Unicode 5.1 and up has a longer CJK Unihan range than before */
   1260         cjk.rangeEnd=0x9FC3;
   1261     } else if(ucdVersion>=UNI_4_1) {
   1262         /* Unicode 4.1 and up has a longer CJK Unihan range than before */
   1263         cjk.rangeEnd=0x9FBB;
   1264     }
   1265 
   1266     /* number of ranges of algorithmic names */
   1267     if(!storeOptions->storeNames) {
   1268         countAlgRanges=0;
   1269     } else if(ucdVersion>=UNI_5_2) {
   1270         /* Unicode 5.2 and up has 5 ranges including CJK Extension C */
   1271         countAlgRanges=5;
   1272     } else if(ucdVersion>=UNI_3_1) {
   1273         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
   1274         countAlgRanges=4;
   1275     } else if(ucdVersion>=UNI_3_0) {
   1276         /* Unicode 3.0 has 3 ranges including CJK Extension A */
   1277         countAlgRanges=3;
   1278     } else {
   1279         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
   1280         countAlgRanges=2;
   1281     }
   1282 
   1283     if(pData!=NULL) {
   1284         udata_write32(pData, countAlgRanges);
   1285     } else {
   1286         size+=4;
   1287     }
   1288     if(countAlgRanges==0) {
   1289         return size;
   1290     }
   1291 
   1292     /*
   1293      * each range:
   1294      * uint32_t rangeStart
   1295      * uint32_t rangeEnd
   1296      * uint8_t algorithmType
   1297      * uint8_t algorithmVariant
   1298      * uint16_t size of range data
   1299      * uint8_t[size] data
   1300      */
   1301 
   1302     /* range 0: cjk extension a */
   1303     if(countAlgRanges>=3) {
   1304         if(pData!=NULL) {
   1305             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
   1306             udata_writeString(pData, prefix, PREFIX_LENGTH);
   1307             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1308                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1309             }
   1310         } else {
   1311             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1312         }
   1313     }
   1314 
   1315     /* range 1: cjk */
   1316     if(pData!=NULL) {
   1317         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
   1318         udata_writeString(pData, prefix, PREFIX_LENGTH);
   1319         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1320             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1321         }
   1322     } else {
   1323         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1324     }
   1325 
   1326     /* range 2: hangul syllables */
   1327     if(pData!=NULL) {
   1328         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
   1329         udata_writeBlock(pData, hangulFactors, 6);
   1330         udata_writeString(pData, jamo, sizeof(jamo));
   1331     } else {
   1332         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
   1333     }
   1334 
   1335     /* range 3: cjk extension b */
   1336     if(countAlgRanges>=4) {
   1337         if(pData!=NULL) {
   1338             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
   1339             udata_writeString(pData, prefix, PREFIX_LENGTH);
   1340             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1341                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1342             }
   1343         } else {
   1344             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1345         }
   1346     }
   1347 
   1348     /* range 4: cjk extension c */
   1349     if(countAlgRanges>=5) {
   1350         if(pData!=NULL) {
   1351             udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange));
   1352             udata_writeString(pData, prefix, PREFIX_LENGTH);
   1353             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
   1354                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
   1355             }
   1356         } else {
   1357             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
   1358         }
   1359     }
   1360 
   1361     return size;
   1362 }
   1363 
   1364 /* helpers ------------------------------------------------------------------ */
   1365 
   1366 static int16_t
   1367 findToken(uint8_t *s, int16_t length) {
   1368     int16_t i, token;
   1369 
   1370     for(i=0; i<(int16_t)tokenCount; ++i) {
   1371         token=tokens[i];
   1372         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
   1373             return i;
   1374         }
   1375     }
   1376 
   1377     return -1;
   1378 }
   1379 
   1380 static Word *
   1381 findWord(char *s, int16_t length) {
   1382     uint32_t i;
   1383 
   1384     for(i=0; i<wordCount; ++i) {
   1385         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
   1386             return words+i;
   1387         }
   1388     }
   1389 
   1390     return NULL;
   1391 }
   1392 
   1393 static Word *
   1394 addWord(char *s, int16_t length) {
   1395     uint8_t *stringStart;
   1396     Word *word;
   1397 
   1398     if(wordCount==MAX_WORD_COUNT) {
   1399         fprintf(stderr, "gennames: too many words\n");
   1400         exit(U_BUFFER_OVERFLOW_ERROR);
   1401     }
   1402 
   1403     stringStart=allocWord(length);
   1404     uprv_memcpy(stringStart, s, length);
   1405 
   1406     word=words+wordCount;
   1407 
   1408     /*
   1409      * Initialize the weight with the costs for this token:
   1410      * a zero-terminated string and a 16-bit offset.
   1411      */
   1412     word->weight=-(length+1+2);
   1413     word->count=0;
   1414     word->length=length;
   1415     word->s=stringStart;
   1416 
   1417     ++wordCount;
   1418 
   1419     return word;
   1420 }
   1421 
   1422 static void
   1423 countWord(Word *word) {
   1424     /* add to the weight the savings: the length of the word minus 1 byte for the token */
   1425     word->weight+=word->length-1;
   1426     ++word->count;
   1427 }
   1428 
   1429 static void
   1430 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
   1431     uint8_t *stringStart;
   1432     Line *line;
   1433     int16_t i, length;
   1434 
   1435     if(lineCount==MAX_LINE_COUNT) {
   1436         fprintf(stderr, "gennames: too many lines\n");
   1437         exit(U_BUFFER_OVERFLOW_ERROR);
   1438     }
   1439 
   1440     /* find the last non-empty name */
   1441     while(count>0 && lengths[count-1]==0) {
   1442         --count;
   1443     }
   1444     if(count==0) {
   1445         return; /* should not occur: caller should not have called */
   1446     }
   1447 
   1448     /* there will be (count-1) separator characters */
   1449     i=count;
   1450     length=count-1;
   1451 
   1452     /* add lengths of strings */
   1453     while(i>0) {
   1454         length+=lengths[--i];
   1455     }
   1456 
   1457     /* allocate line memory */
   1458     stringStart=allocLine(length);
   1459 
   1460     /* copy all strings into the line memory */
   1461     length=0; /* number of chars copied so far */
   1462     for(i=0; i<count; ++i) {
   1463         if(i>0) {
   1464             stringStart[length++]=NAME_SEPARATOR_CHAR;
   1465         }
   1466         if(lengths[i]>0) {
   1467             uprv_memcpy(stringStart+length, names[i], lengths[i]);
   1468             length+=lengths[i];
   1469         }
   1470     }
   1471 
   1472     line=lines+lineCount;
   1473 
   1474     line->code=code;
   1475     line->length=length;
   1476     line->s=stringStart;
   1477 
   1478     ++lineCount;
   1479 
   1480     /* prevent a character value that is actually in a name from becoming a token */
   1481     while(length>0) {
   1482         tokens[stringStart[--length]]=-1;
   1483     }
   1484 }
   1485 
   1486 static void
   1487 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
   1488     uint8_t *stringStart;
   1489     Line *line;
   1490 
   1491     if(lineCount==MAX_LINE_COUNT) {
   1492         fprintf(stderr, "gennames: too many groups\n");
   1493         exit(U_BUFFER_OVERFLOW_ERROR);
   1494     }
   1495 
   1496     /* store the line lengths first, then the strings */
   1497     lineLengthsTop=(lineLengthsTop+1)/2;
   1498     stringStart=allocLine(lineLengthsTop+length);
   1499     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
   1500     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
   1501 
   1502     line=lines+lineCount;
   1503 
   1504     line->code=groupMSB;
   1505     line->length=length;
   1506     line->s=stringStart;
   1507 
   1508     ++lineCount;
   1509 }
   1510 
   1511 static uint32_t
   1512 addToken(uint8_t *s, int16_t length) {
   1513     uint8_t *stringStart;
   1514 
   1515     stringStart=allocLine(length+1);
   1516     uprv_memcpy(stringStart, s, length);
   1517     stringStart[length]=0;
   1518 
   1519     return (uint32_t)(stringStart - stringStore);
   1520 }
   1521 
   1522 static void
   1523 appendLineLength(int16_t length) {
   1524     if(length>=76) {
   1525         fprintf(stderr, "gennames: compressed line too long\n");
   1526         exit(U_BUFFER_OVERFLOW_ERROR);
   1527     }
   1528     if(length>=12) {
   1529         length-=12;
   1530         appendLineLengthNibble((uint8_t)((length>>4)|12));
   1531     }
   1532     appendLineLengthNibble((uint8_t)length);
   1533 }
   1534 
   1535 static void
   1536 appendLineLengthNibble(uint8_t nibble) {
   1537     if((lineLengthsTop&1)==0) {
   1538         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
   1539     } else {
   1540         lineLengths[lineLengthsTop/2]|=nibble&0xf;
   1541     }
   1542     ++lineLengthsTop;
   1543 }
   1544 
   1545 static uint8_t *
   1546 allocLine(int32_t length) {
   1547     uint32_t top=lineTop+length;
   1548     uint8_t *p;
   1549 
   1550     if(top>wordBottom) {
   1551         fprintf(stderr, "gennames: out of memory\n");
   1552         exit(U_MEMORY_ALLOCATION_ERROR);
   1553     }
   1554     p=stringStore+lineTop;
   1555     lineTop=top;
   1556     return p;
   1557 }
   1558 
   1559 static uint8_t *
   1560 allocWord(uint32_t length) {
   1561     uint32_t bottom=wordBottom-length;
   1562 
   1563     if(lineTop>bottom) {
   1564         fprintf(stderr, "gennames: out of memory\n");
   1565         exit(U_MEMORY_ALLOCATION_ERROR);
   1566     }
   1567     wordBottom=bottom;
   1568     return stringStore+bottom;
   1569 }
   1570 
   1571 /*
   1572  * Hey, Emacs, please set the following:
   1573  *
   1574  * Local Variables:
   1575  * indent-tabs-mode: nil
   1576  * End:
   1577  *
   1578  */
   1579