Home | History | Annotate | Download | only in genprops
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  props2.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002feb24
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Parse more Unicode Character Database files and store
     17 *   additional Unicode character properties in bit set vectors.
     18 */
     19 
     20 #include <stdio.h>
     21 #include "unicode/utypes.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/uscript.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "utrie.h"
     27 #include "uprops.h"
     28 #include "propsvec.h"
     29 #include "uparse.h"
     30 #include "writesrc.h"
     31 #include "genprops.h"
     32 
     33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     34 
     35 /* data --------------------------------------------------------------------- */
     36 
/* Build-time trie for the additional properties; assigned in generateAdditionalProperties(). */
static UNewTrie *newTrie;
/* Properties vectors; opened in initAdditionalProperties(). Note: not static, so it has external linkage. */
UPropsVectors *pv;
     39 
     40 /* miscellaneous ------------------------------------------------------------ */
     41 
     42 static char *
     43 trimTerminateField(char *s, char *limit) {
     44     /* trim leading whitespace */
     45     s=(char *)u_skipWhitespace(s);
     46 
     47     /* trim trailing whitespace */
     48     while(s<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
     49         --limit;
     50     }
     51     *limit=0;
     52 
     53     return s;
     54 }
     55 
     56 static void
     57 parseTwoFieldFile(char *filename, char *basename,
     58                   const char *ucdFile, const char *suffix,
     59                   UParseLineFn *lineFn,
     60                   UErrorCode *pErrorCode) {
     61     char *fields[2][2];
     62 
     63     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
     64         return;
     65     }
     66 
     67     writeUCDFilename(basename, ucdFile, suffix);
     68 
     69     u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
     70     if(U_FAILURE(*pErrorCode)) {
     71         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
     72     }
     73 }
     74 
/*
 * Forward declaration of the DerivedAge.txt line handler.
 * It is passed to parseTwoFieldFile() in generateAdditionalProperties()
 * before its definition further down in this file.
 */
static void U_CALLCONV
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode);
     79 
     80 static void
     81 parseMultiFieldFile(char *filename, char *basename,
     82                     const char *ucdFile, const char *suffix,
     83                     int32_t fieldCount,
     84                     UParseLineFn *lineFn,
     85                     UErrorCode *pErrorCode) {
     86     char *fields[20][2];
     87 
     88     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
     89         return;
     90     }
     91 
     92     writeUCDFilename(basename, ucdFile, suffix);
     93 
     94     u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
     95     if(U_FAILURE(*pErrorCode)) {
     96         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
     97     }
     98 }
     99 
/*
 * Forward declaration of the DerivedNumericValues.txt line handler.
 * It is passed to parseMultiFieldFile() in generateAdditionalProperties()
 * before its definition further down in this file.
 */
static void U_CALLCONV
numericLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode);
    104 
    105 /* parse files with single enumerated properties ---------------------------- */
    106 
    107 struct SingleEnum {
    108     const char *ucdFile, *propName;
    109     UProperty prop;
    110     int32_t vecWord, vecShift;
    111     uint32_t vecMask;
    112 };
    113 typedef struct SingleEnum SingleEnum;
    114 
/*
 * Forward declaration; the definition follows singleEnumLineFn() below.
 * Parses the UCD file described by sen with singleEnumLineFn().
 */
static void
parseSingleEnumFile(char *filename, char *basename, const char *suffix,
                    const SingleEnum *sen,
                    UErrorCode *pErrorCode);
    119 
/*
 * Descriptors for the single-enumerated-property files.
 * Initializers are positional, in SingleEnum member order:
 *   { ucdFile, propName, prop, vecWord, vecShift, vecMask }
 */

/* Scripts.txt: vector word 0, no shift */
static const SingleEnum scriptSingleEnum={
    "Scripts", "script",
    UCHAR_SCRIPT,
    0, 0, UPROPS_SCRIPT_MASK
};

/* Blocks.txt: vector word 0 */
static const SingleEnum blockSingleEnum={
    "Blocks", "block",
    UCHAR_BLOCK,
    0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
};

/* GraphemeBreakProperty.txt: vector word 2 */
static const SingleEnum graphemeClusterBreakSingleEnum={
    "GraphemeBreakProperty", "Grapheme_Cluster_Break",
    UCHAR_GRAPHEME_CLUSTER_BREAK,
    2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
};

/* WordBreakProperty.txt: vector word 2 */
static const SingleEnum wordBreakSingleEnum={
    "WordBreakProperty", "Word_Break",
    UCHAR_WORD_BREAK,
    2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
};

/* SentenceBreakProperty.txt: vector word 2 */
static const SingleEnum sentenceBreakSingleEnum={
    "SentenceBreakProperty", "Sentence_Break",
    UCHAR_SENTENCE_BREAK,
    2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
};

/* LineBreak.txt: vector word given by UPROPS_LB_VWORD */
static const SingleEnum lineBreakSingleEnum={
    "LineBreak", "line break",
    UCHAR_LINE_BREAK,
    UPROPS_LB_VWORD, UPROPS_LB_SHIFT, UPROPS_LB_MASK
};

/* EastAsianWidth.txt: vector word 0 */
static const SingleEnum eawSingleEnum={
    "EastAsianWidth", "east asian width",
    UCHAR_EAST_ASIAN_WIDTH,
    0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
};
    161 
    162 static void U_CALLCONV
    163 singleEnumLineFn(void *context,
    164                  char *fields[][2], int32_t fieldCount,
    165                  UErrorCode *pErrorCode) {
    166     const SingleEnum *sen;
    167     char *s;
    168     uint32_t start, end, uv;
    169     int32_t value;
    170 
    171     sen=(const SingleEnum *)context;
    172 
    173     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    174     if(U_FAILURE(*pErrorCode)) {
    175         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
    176         exit(*pErrorCode);
    177     }
    178 
    179     /* parse property alias */
    180     s=trimTerminateField(fields[1][0], fields[1][1]);
    181     value=u_getPropertyValueEnum(sen->prop, s);
    182     if(value<0) {
    183         if(sen->prop==UCHAR_BLOCK) {
    184             if(isToken("Greek", s)) {
    185                 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
    186             } else if(isToken("Combining Marks for Symbols", s)) {
    187                 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
    188             } else if(isToken("Private Use", s)) {
    189                 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
    190             }
    191         }
    192     }
    193     if(value<0) {
    194         fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
    195                         sen->propName, sen->ucdFile, s);
    196         exit(U_PARSE_ERROR);
    197     }
    198 
    199     uv=(uint32_t)(value<<sen->vecShift);
    200     if((uv&sen->vecMask)!=uv) {
    201         fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
    202                         sen->propName, (int)uv, s);
    203         exit(U_INTERNAL_PROGRAM_ERROR);
    204     }
    205 
    206     if(start==0 && end==0x10ffff) {
    207         /* Also set bits for initialValue and errorValue. */
    208         end=UPVEC_MAX_CP;
    209     }
    210     upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
    211     if(U_FAILURE(*pErrorCode)) {
    212         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
    213                         sen->propName, u_errorName(*pErrorCode));
    214         exit(*pErrorCode);
    215     }
    216 }
    217 
    218 static void
    219 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
    220                     const SingleEnum *sen,
    221                     UErrorCode *pErrorCode) {
    222     char *fields[2][2];
    223 
    224     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    225         return;
    226     }
    227 
    228     writeUCDFilename(basename, sen->ucdFile, suffix);
    229 
    230     u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
    231     if(U_FAILURE(*pErrorCode)) {
    232         fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
    233     }
    234 }
    235 
    236 /* parse files with multiple binary properties ------------------------------ */
    237 
    238 struct Binary {
    239     const char *propName;
    240     int32_t vecWord, vecShift;
    241 };
    242 typedef struct Binary Binary;
    243 
    244 struct Binaries {
    245     const char *ucdFile;
    246     const Binary *binaries;
    247     int32_t binariesCount;
    248 };
    249 typedef struct Binaries Binaries;
    250 
/*
 * Binary properties that are read from PropList.txt.
 * Each entry is { propName, vecWord, vecShift }; all of them live in vector word 1.
 * Properties in the file that are not listed here are ignored by binariesLineFn().
 */
static const Binary
propListNames[]={
    { "White_Space",                        1, UPROPS_WHITE_SPACE },
    { "Dash",                               1, UPROPS_DASH },
    { "Hyphen",                             1, UPROPS_HYPHEN },
    { "Quotation_Mark",                     1, UPROPS_QUOTATION_MARK },
    { "Terminal_Punctuation",               1, UPROPS_TERMINAL_PUNCTUATION },
    { "Hex_Digit",                          1, UPROPS_HEX_DIGIT },
    { "ASCII_Hex_Digit",                    1, UPROPS_ASCII_HEX_DIGIT },
    { "Ideographic",                        1, UPROPS_IDEOGRAPHIC },
    { "Diacritic",                          1, UPROPS_DIACRITIC },
    { "Extender",                           1, UPROPS_EXTENDER },
    { "Noncharacter_Code_Point",            1, UPROPS_NONCHARACTER_CODE_POINT },
    { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK },
    { "IDS_Binary_Operator",                1, UPROPS_IDS_BINARY_OPERATOR },
    { "IDS_Trinary_Operator",               1, UPROPS_IDS_TRINARY_OPERATOR },
    { "Radical",                            1, UPROPS_RADICAL },
    { "Unified_Ideograph",                  1, UPROPS_UNIFIED_IDEOGRAPH },
    { "Deprecated",                         1, UPROPS_DEPRECATED },
    { "Logical_Order_Exception",            1, UPROPS_LOGICAL_ORDER_EXCEPTION },

    /* new properties in Unicode 4.0.1 */
    { "STerm",                              1, UPROPS_S_TERM },
    { "Variation_Selector",                 1, UPROPS_VARIATION_SELECTOR },

    /* new properties in Unicode 4.1 */
    { "Pattern_Syntax",                     1, UPROPS_PATTERN_SYNTAX },
    { "Pattern_White_Space",                1, UPROPS_PATTERN_WHITE_SPACE }
};

/* File descriptor for PropList.txt, passed to parseBinariesFile(). */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
    285 
/*
 * Binary properties that are read from DerivedCoreProperties.txt.
 * Each entry is { propName, vecWord, vecShift }; all of them live in vector word 1.
 * Properties in the file that are not listed here are ignored by binariesLineFn().
 */
static const Binary
derCorePropsNames[]={
    { "XID_Start",                          1, UPROPS_XID_START },
    { "XID_Continue",                       1, UPROPS_XID_CONTINUE },

    /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
    { "Math",                               1, UPROPS_MATH },
    { "Alphabetic",                         1, UPROPS_ALPHABETIC },
    { "Grapheme_Extend",                    1, UPROPS_GRAPHEME_EXTEND },
    { "Default_Ignorable_Code_Point",       1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },

    /* new properties bits in ICU 2.6/format version 3.2 */
    { "ID_Start",                           1, UPROPS_ID_START },
    { "ID_Continue",                        1, UPROPS_ID_CONTINUE },
    { "Grapheme_Base",                      1, UPROPS_GRAPHEME_BASE },

    /*
     * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
     * to DerivedCoreProperties.txt and deprecates it.
     * The entry uses the same bit as the one in propListNames[].
     */
    { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK }
};

/* File descriptor for DerivedCoreProperties.txt, passed to parseBinariesFile(). */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
    313 
    314 static char ignoredProps[100][64];
    315 static int32_t ignoredPropsCount;
    316 
    317 static void
    318 addIgnoredProp(char *s, char *limit) {
    319     int32_t i;
    320 
    321     s=trimTerminateField(s, limit);
    322     for(i=0; i<ignoredPropsCount; ++i) {
    323         if(0==uprv_strcmp(ignoredProps[i], s)) {
    324             return;
    325         }
    326     }
    327     uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
    328 }
    329 
    330 static void U_CALLCONV
    331 binariesLineFn(void *context,
    332                char *fields[][2], int32_t fieldCount,
    333                UErrorCode *pErrorCode) {
    334     const Binaries *bin;
    335     char *s;
    336     uint32_t start, end, uv;
    337     int32_t i;
    338 
    339     bin=(const Binaries *)context;
    340 
    341     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    342     if(U_FAILURE(*pErrorCode)) {
    343         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
    344         exit(*pErrorCode);
    345     }
    346 
    347     /* parse binary property name */
    348     s=(char *)u_skipWhitespace(fields[1][0]);
    349     for(i=0;; ++i) {
    350         if(i==bin->binariesCount) {
    351             /* ignore unrecognized properties */
    352             if(beVerbose) {
    353                 addIgnoredProp(s, fields[1][1]);
    354             }
    355             return;
    356         }
    357         if(isToken(bin->binaries[i].propName, s)) {
    358             break;
    359         }
    360     }
    361 
    362     if(bin->binaries[i].vecShift>=32) {
    363         fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
    364                         (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
    365         exit(U_INTERNAL_PROGRAM_ERROR);
    366     }
    367     uv=U_MASK(bin->binaries[i].vecShift);
    368 
    369     if(start==0 && end==0x10ffff) {
    370         /* Also set bits for initialValue and errorValue. */
    371         end=UPVEC_MAX_CP;
    372     }
    373     upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
    374     if(U_FAILURE(*pErrorCode)) {
    375         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
    376                         bin->binaries[i].propName, u_errorName(*pErrorCode));
    377         exit(*pErrorCode);
    378     }
    379 }
    380 
    381 static void
    382 parseBinariesFile(char *filename, char *basename, const char *suffix,
    383                   const Binaries *bin,
    384                   UErrorCode *pErrorCode) {
    385     char *fields[2][2];
    386     int32_t i;
    387 
    388     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    389         return;
    390     }
    391 
    392     writeUCDFilename(basename, bin->ucdFile, suffix);
    393 
    394     ignoredPropsCount=0;
    395 
    396     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
    397     if(U_FAILURE(*pErrorCode)) {
    398         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
    399     }
    400 
    401     if(beVerbose) {
    402         for(i=0; i<ignoredPropsCount; ++i) {
    403             printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
    404         }
    405     }
    406 }
    407 
    408 /* -------------------------------------------------------------------------- */
    409 
    410 U_CFUNC void
    411 initAdditionalProperties() {
    412     UErrorCode errorCode=U_ZERO_ERROR;
    413     pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
    414     if(U_FAILURE(errorCode)) {
    415         fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
    416         exit(errorCode);
    417     }
    418 }
    419 
    420 U_CFUNC void
    421 exitAdditionalProperties() {
    422     utrie_close(newTrie);
    423     upvec_close(pv);
    424 }
    425 
    426 U_CFUNC void
    427 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    428     char *basename;
    429 
    430     basename=filename+uprv_strlen(filename);
    431 
    432     /* process various UCD .txt files */
    433 
    434     /* add Han numeric types & values */
    435     parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
    436 
    437     parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
    438 
    439     /*
    440      * UTR 24 says:
    441      * Section 2:
    442      *   "Common - For characters that may be used
    443      *             within multiple scripts,
    444      *             or any unassigned code points."
    445      *
    446      * Section 4:
    447      *   "The value COMMON is the default value,
    448      *    given to all code points that are not
    449      *    explicitly mentioned in the data file."
    450      *
    451      * COMMON==USCRIPT_COMMON==0 - nothing to do
    452      */
    453     parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
    454 
    455     parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
    456 
    457     parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
    458 
    459     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
    460 
    461     parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
    462 
    463     parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
    464 
    465     parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
    466 
    467     /*
    468      * LineBreak-4.0.0.txt:
    469      *  - All code points, assigned and unassigned, that are not listed
    470      *         explicitly are given the value "XX".
    471      *
    472      * XX==U_LB_UNKNOWN==0 - nothing to do
    473      */
    474     parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
    475 
    476     /*
    477      * Preset East Asian Width defaults:
    478      *
    479      * http://www.unicode.org/reports/tr11/#Unassigned
    480      * 7.1 Unassigned and Private Use characters
    481      *
    482      * All unassigned characters are by default classified as non-East Asian neutral,
    483      * except for the range U+20000 to U+2FFFD,
    484      * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
    485      * All Private use characters are by default classified as ambiguous,
    486      * since their definition depends on context.
    487      *
    488      * N for all ==0 - nothing to do
    489      * A for Private Use
    490      * W for plane 2
    491      */
    492     *pErrorCode=U_ZERO_ERROR;
    493     upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    494     upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    495     upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    496     upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    497     if(U_FAILURE(*pErrorCode)) {
    498         fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
    499         exit(*pErrorCode);
    500     }
    501 
    502     /* parse EastAsianWidth.txt */
    503     parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
    504 
    505     {
    506         UPVecToUTrieContext toUTrie={ NULL, 50000 /* capacity */, 0, TRUE /* latin1Linear */ };
    507         upvec_compact(pv, upvec_compactToUTrieHandler, &toUTrie, pErrorCode);
    508         if(U_FAILURE(*pErrorCode)) {
    509             fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
    510                     u_errorName(*pErrorCode));
    511             exit(*pErrorCode);
    512         }
    513         newTrie=toUTrie.newTrie;
    514     }
    515 }
    516 
    517 /* DerivedAge.txt ----------------------------------------------------------- */
    518 
    519 static void U_CALLCONV
    520 ageLineFn(void *context,
    521           char *fields[][2], int32_t fieldCount,
    522           UErrorCode *pErrorCode) {
    523     char *s, *numberLimit;
    524     uint32_t value, start, end, version;
    525 
    526     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    527     if(U_FAILURE(*pErrorCode)) {
    528         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
    529         exit(*pErrorCode);
    530     }
    531 
    532     /* ignore "unassigned" (the default is already set to 0.0) */
    533     s=(char *)u_skipWhitespace(fields[1][0]);
    534     if(0==uprv_strncmp(s, "unassigned", 10)) {
    535         return;
    536     }
    537 
    538     /* parse version number */
    539     value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
    540     if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
    541         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
    542         *pErrorCode=U_PARSE_ERROR;
    543         exit(U_PARSE_ERROR);
    544     }
    545     version=value<<4;
    546 
    547     /* parse minor version number */
    548     if(*numberLimit=='.') {
    549         s=(char *)u_skipWhitespace(numberLimit+1);
    550         value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
    551         if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
    552             fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
    553             *pErrorCode=U_PARSE_ERROR;
    554             exit(U_PARSE_ERROR);
    555         }
    556         version|=value;
    557     }
    558 
    559     if(start==0 && end==0x10ffff) {
    560         /* Also set bits for initialValue and errorValue. */
    561         end=UPVEC_MAX_CP;
    562     }
    563     upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
    564     if(U_FAILURE(*pErrorCode)) {
    565         fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
    566         exit(*pErrorCode);
    567     }
    568 }
    569 
    570 /* DerivedNumericValues.txt ------------------------------------------------- */
    571 
    572 static void U_CALLCONV
    573 numericLineFn(void *context,
    574               char *fields[][2], int32_t fieldCount,
    575               UErrorCode *pErrorCode) {
    576     Props newProps={ 0 };
    577     char *s, *numberLimit;
    578     uint32_t start, end, value, oldProps32;
    579     char c;
    580     UBool isFraction;
    581 
    582     /* get the code point range */
    583     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    584     if(U_FAILURE(*pErrorCode)) {
    585         fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
    586         exit(*pErrorCode);
    587     }
    588 
    589     /*
    590      * Ignore the
    591      * # @missing: 0000..10FFFF; NaN
    592      * line from Unicode 5.1's DerivedNumericValues.txt:
    593      * The following code cannot parse "NaN", and we don't want to overwrite
    594      * the numeric values for all characters after reading most
    595      * from UnicodeData.txt already.
    596      */
    597     if(start==0 && end==0x10ffff) {
    598         return;
    599     }
    600 
    601     /* check if the numeric value is a fraction (this code does not handle any) */
    602     isFraction=FALSE;
    603     s=uprv_strchr(fields[1][0], '.');
    604     if(s!=NULL) {
    605         numberLimit=s+1;
    606         while('0'<=(c=*numberLimit++) && c<='9') {
    607             if(c!='0') {
    608                 isFraction=TRUE;
    609                 break;
    610             }
    611         }
    612     }
    613 
    614     if(isFraction) {
    615         value=0;
    616     } else {
    617         /* parse numeric value */
    618         s=(char *)u_skipWhitespace(fields[1][0]);
    619 
    620         /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
    621         if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
    622             /* large integers are encoded in a special way, see store.c */
    623             uint8_t exp=0;
    624 
    625             value=s[0]-'0';
    626             numberLimit=s;
    627             while(*(++numberLimit)=='0') {
    628                 ++exp;
    629             }
    630             newProps.exponent=exp;
    631         } else {
    632             /* normal number parsing */
    633             value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
    634         }
    635         if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
    636             fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
    637             exit(U_PARSE_ERROR);
    638         }
    639     }
    640 
    641     /*
    642      * Unicode 4.0.1 removes the third column that used to list the numeric type.
    643      * Assume that either the data is the same as in UnicodeData.txt,
    644      * or else that the numeric type is "numeric".
    645      * This should work because we only expect to add numeric values for
    646      * Han characters; for those, UnicodeData.txt lists only ranges without
    647      * specific properties for single characters.
    648      */
    649 
    650     /* set the new numeric value */
    651     newProps.code=start;
    652     newProps.numericValue=(int32_t)value;       /* newly parsed numeric value */
    653     /* the exponent may have been set above */
    654 
    655     for(; start<=end; ++start) {
    656         uint32_t newProps32;
    657         int32_t oldNtv;
    658         oldProps32=getProps(start);
    659         oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);
    660 
    661         if(isFraction) {
    662             if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
    663                 /* this code point was already listed with its numeric value in UnicodeData.txt */
    664                 continue;
    665             } else {
    666                 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
    667                 exit(U_PARSE_ERROR);
    668             }
    669         }
    670 
    671         /*
    672          * For simplicity, and because we only expect to set numeric values for Han characters,
    673          * for now we only allow to set these values for Lo characters.
    674          */
    675         if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
    676             fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
    677             exit(U_PARSE_ERROR);
    678         }
    679 
    680         /* verify that we do not change an existing value (fractions were excluded above) */
    681         if(oldNtv!=UPROPS_NTV_NONE) {
    682             /* the code point already has a value stored */
    683             newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
    684             newProps32=makeProps(&newProps);
    685             if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
    686                 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
    687                 exit(U_PARSE_ERROR);
    688             }
    689             /* same value, continue */
    690         } else {
    691             /* the code point is getting a new numeric value */
    692             newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
    693             newProps32=makeProps(&newProps);
    694             if(beVerbose) {
    695                 printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
    696                        (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
    697             }
    698 
    699             addProps(start, newProps32|GET_CATEGORY(oldProps32));
    700         }
    701     }
    702 }
    703 
    704 /* data serialization ------------------------------------------------------- */
    705 
    706 U_CFUNC int32_t
    707 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
    708     const uint32_t *pvArray;
    709     int32_t pvRows, pvCount;
    710     int32_t length;
    711     UErrorCode errorCode;
    712 
    713     pvArray=upvec_getArray(pv, &pvRows, NULL);
    714     pvCount=pvRows*UPROPS_VECTOR_WORDS;
    715 
    716     errorCode=U_ZERO_ERROR;
    717     length=utrie_serialize(newTrie, p, capacity, NULL, TRUE, &errorCode);
    718     if(U_FAILURE(errorCode)) {
    719         fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
    720         exit(errorCode);
    721     }
    722     if(p!=NULL) {
    723         if(beVerbose) {
    724             printf("size in bytes of additional props trie:%5u\n", (int)length);
    725         }
    726         if(f!=NULL) {
    727             UTrie trie={ NULL };
    728             UTrie2 *trie2;
    729 
    730             utrie_unserialize(&trie, p, length, &errorCode);
    731             if(U_FAILURE(errorCode)) {
    732                 fprintf(
    733                     stderr,
    734                     "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
    735                     u_errorName(errorCode));
    736                 exit(errorCode);
    737             }
    738 
    739             /* use UTrie2 */
    740             trie2=utrie2_fromUTrie(&trie, trie.initialValue, &errorCode);
    741             if(U_FAILURE(errorCode)) {
    742                 fprintf(
    743                     stderr,
    744                     "genprops error: utrie2_fromUTrie() failed - %s\n",
    745                     u_errorName(errorCode));
    746                 exit(errorCode);
    747             }
    748             {
    749                 /* delete lead surrogate code unit values */
    750                 UChar lead;
    751                 trie2=utrie2_cloneAsThawed(trie2, &errorCode);
    752                 for(lead=0xd800; lead<0xdc00; ++lead) {
    753                     utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
    754                 }
    755                 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
    756                 if(U_FAILURE(errorCode)) {
    757                     fprintf(
    758                         stderr,
    759                         "genbidi error: deleting lead surrogate code unit values failed - %s\n",
    760                         u_errorName(errorCode));
    761                     exit(errorCode);
    762                 }
    763             }
    764 
    765             usrc_writeUTrie2Arrays(f,
    766                 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
    767                 trie2,
    768                 "\n};\n\n");
    769             usrc_writeUTrie2Struct(f,
    770                 "static const UTrie2 propsVectorsTrie={\n",
    771                 trie2, "propsVectorsTrie_index", NULL,
    772                 "};\n\n");
    773 
    774             utrie2_close(trie2);
    775         }
    776 
    777         p+=length;
    778         capacity-=length;
    779 
    780         /* set indexes */
    781         indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
    782             indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
    783         indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
    784         indexes[UPROPS_RESERVED_INDEX]=
    785             indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
    786 
    787         indexes[UPROPS_MAX_VALUES_INDEX]=
    788             (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
    789             (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
    790             (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
    791         indexes[UPROPS_MAX_VALUES_2_INDEX]=
    792             (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
    793             (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
    794             (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
    795             (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
    796             ((int32_t)U_DT_COUNT-1);
    797     }
    798 
    799     if(p!=NULL && (pvCount*4)<=capacity) {
    800         if(f!=NULL) {
    801             usrc_writeArray(f,
    802                 "static const uint32_t propsVectors[%ld]={\n",
    803                 pvArray, 32, pvCount,
    804                 "};\n\n");
    805             fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
    806             fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
    807         } else {
    808             uprv_memcpy(p, pvArray, pvCount*4);
    809         }
    810         if(beVerbose) {
    811             printf("number of additional props vectors:    %5u\n", (int)pvRows);
    812             printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
    813         }
    814     }
    815     length+=pvCount*4;
    816 
    817     return length;
    818 }
    819