Home | History | Annotate | Download | only in genprops
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  store.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999dec11
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store Unicode character properties efficiently for
     17 *   random access.
     18 */
     19 
     20 #include <stdio.h>
     21 #include "unicode/utypes.h"
     22 #include "unicode/uchar.h"
     23 #include "cmemory.h"
     24 #include "cstring.h"
     25 #include "utrie.h"
     26 #include "unicode/udata.h"
     27 #include "unewdata.h"
     28 #include "writesrc.h"
     29 #include "uprops.h"
     30 #include "genprops.h"
     31 
     32 #define DO_DEBUG_OUT 0
     33 
     34 /* Unicode character properties file format ------------------------------------
     35 
     36 The file format prepared and written here contains several data
     37 structures that store indexes or data.
     38 
     39 Before the data contents described below, there are the headers required by
     40 the udata API for loading ICU data. Especially, a UDataInfo structure
     41 precedes the actual data. It contains platform properties values and the
     42 file format version.
     43 
The following is a description of format version 5.
     45 
     46 The format changes between version 3 and 4 because the properties related to
     47 case mappings and bidi/shaping are pulled out into separate files
     48 for modularization.
     49 In order to reduce the need for code changes, some of the previous data
     50 structures are omitted, rather than rearranging everything.
     51 
     52 For details see "Changes in format version 4" below.
     53 
     54 Format version 5 became necessary because the bit field for script codes
     55 overflowed. Several bit fields got rearranged, and three (Script, Block,
     56 Word_Break) got widened by one bit each.
     57 
     58 Data contents:
     59 
The content is a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
     62 
     63 Any Unicode code point from 0 to 0x10ffff can be looked up to get
     64 the properties, if any, for that code point. This means that the input
     65 to the lookup are 21-bit unsigned integers, with not all of the
     66 21-bit range used.
     67 
     68 It is assumed that client code keeps a uint32_t pointer
     69 to the beginning of the data:
     70 
     71     const uint32_t *p32;
     72 
     73 Formally, the file contains the following structures:
     74 
     75     const int32_t indexes[16] with values i0..i15:
     76 
     77   i0 indicates the length of the main trie.
     78   i0..i3 all have the same value in format version 4.0;
     79          the related props32[] and exceptions[] and uchars[] were used in format version 3
     80 
     81     i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
     82     i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
     83     i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
     84 
     85     i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
     86     i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
     87     i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
     88 
     89     i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
     90     i7..i9 reservedIndexes; -- reserved values; 0 for now
     91 
     92     i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
     93     i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
     94     i12..i15 reservedIndexes; -- reserved values; 0 for now
     95 
     96     PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
     97 
     98   P, E, and U are not used (empty) in format version 4
     99 
    100     P  const uint32_t props32[i1-i0];
    101     E  const uint32_t exceptions[i2-i1];
    102     U  const UChar uchars[2*(i3-i2)];
    103 
    104     AT serialized trie for additional properties (byte size: 4*(i4-i3))
    105     PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
    106 
    107 Trie lookup and properties:
    108 
    109 In order to condense the data for the 21-bit code space, several properties of
    110 the Unicode code assignment are exploited:
    111 - The code space is sparse.
    112 - There are several 10k of consecutive codes with the same properties.
    113 - Characters and scripts are allocated in groups of 16 code points.
    114 - Inside blocks for scripts the properties are often repetitive.
    115 - The 21-bit space is not fully used for Unicode.
    116 
    117 The lookup of properties for a given code point is done with a trie lookup,
    118 using the UTrie implementation.
    119 The trie lookup result is a 16-bit properties word.
    120 
    121 With a given Unicode code point
    122 
    123     UChar32 c;
    124 
    125 and 0<=c<0x110000, the lookup is done like this:
    126 
    127     uint16_t props;
    128     UTRIE_GET16(trie, c, props);
    129 
    130 Each 16-bit properties word contains:
    131 
    132  0.. 4  general category
    133  5.. 7  numeric type
    134         non-digit numbers are stored with multiple types and pseudo-types
    135         in order to facilitate compact encoding:
    136         0 no numeric value (0)
    137         1 decimal digit value (0..9)
    138         2 digit value (0..9)
    139         3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
    140         4 (internal type UPROPS_NT_FRACTION) fraction
    141         5 (internal type UPROPS_NT_LARGE) large number >0xff
    142         6..7 reserved
    143 
    144         when returning the numeric type from a public API,
    145         internal types must be turned into U_NT_NUMERIC
    146 
    147  8..15  numeric value
    148         encoding of fractions and large numbers see below
    149 
    150 Fractions:
    151     // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
    152     int32_t num, den;
    153     num=n>>3;       // num=0..31
    154     den=(n&7)+2;    // den=2..9
    155     if(num==0) {
    156         num=-1;     // num=-1 or 1..31
    157     }
    158     double result=(double)num/(double)den;
    159 
    160 Large numbers:
    161     // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
    162     int32_t m, e;
    163     m=n>>4;         // m=0..15
    164     e=(n&0xf);
    165     if(m==0) {
    166         m=1;        // for large powers of 10
    167         e+=18;      // e=18..33
    168     } else {
    169         e+=2;       // e=2..17
    170     } // m==10..15 are reserved
    double result=(double)m*pow(10., e);    // m times 10 to the power of e (not XOR)
    172 
    173 --- Additional properties (new in format version 2.1) ---
    174 
    175 The second trie for additional properties (AT) is also a UTrie with 16-bit data.
    176 The data words consist of 32-bit unit indexes (not row indexes!) into the
    177 table of unique properties vectors (PV).
    178 Each vector contains a set of properties.
    179 The width of a vector (number of uint32_t per row) may change
    180 with the formatVersion, it is stored in i5.
    181 
    182 Current properties: see icu/source/common/uprops.h
    183 
    184 --- Changes in format version 3.1 ---
    185 
    186 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
    187 
    188 --- Changes in format version 3.2 ---
    189 
    190 - The tries use linear Latin-1 ranges.
    191 - The additional properties bits store full properties XYZ instead
    192   of partial Other_XYZ, so that changes in the derivation formulas
    193   need not be tracked in runtime library code.
    194 - Joining Type and Line Break are also stored completely, so that uprops.c
    195   needs no runtime formulas for enumerated properties either.
    196 - Store the case-sensitive flag in the main properties word.
    197 - i10 also contains U_LB_COUNT and U_EA_COUNT.
    198 - i11 contains maxValues2 for vector word 2.
    199 
    200 --- Changes in format version 4 ---
    201 
    202 The format changes between version 3 and 4 because the properties related to
    203 case mappings and bidi/shaping are pulled out into separate files
    204 for modularization.
    205 In order to reduce the need for code changes, some of the previous data
    206 structures are omitted, rather than rearranging everything.
    207 
    208 (The change to format version 4 is for ICU 3.4. The last CVS revision of
    209 genprops/store.c for format version 3.2 is 1.48.)
    210 
    211 The main trie's data is significantly simplified:
    212 - The trie's 16-bit data word is used directly instead of as an index
    213   into props32[].
    214 - The trie uses the default trie folding functions instead of custom ones.
    215 - Numeric values are stored directly in the trie data word, with special
    216   encodings.
    217 - No more exception data (the data that needed it was pulled out, or, in the
    218   case of numeric values, encoded differently).
    219 - No more string data (pulled out - was for case mappings).
    220 
    221 Also, some of the previously used properties vector bits are reserved again.
    222 
    223 The indexes[] values for the omitted structures are still filled in
    224 (indicating zero-length arrays) so that the swapper code remains unchanged.
    225 
    226 --- Changes in format version 5 ---
    227 
    228 Rearranged bit fields in the second trie (AT) because the script code field
    229 overflowed. Old code would have seen nonsensically low values for new, higher
    230 script codes.
    231 Modified bit fields in icu/source/common/uprops.h
    232 
    233 ----------------------------------------------------------------------------- */
    234 
/*
 * UDataInfo cf. udata.h
 *
 * Header descriptor that generateData() passes to udata_create() for the
 * binary output file.
 * - dataVersion is overwritten by setUnicodeVersion().
 * - formatVersion[0], [2] and [3] are overwritten by generateData() when it
 *   writes C source (the UTrie2-based variant) instead of a binary file.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    /* platform properties of the machine that builds the data */
    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
    { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    { 5, 1, 0, 0 }                              /* dataVersion */
};

/*
 * Build-time trie that maps code points to their properties words.
 * Created in initStore(), filled through addProps()/repeatProps(),
 * serialized in generateData() and released in exitStore().
 */
static UNewTrie *pTrie=NULL;
    251 
    252 /* -------------------------------------------------------------------------- */
    253 
    254 extern void
    255 setUnicodeVersion(const char *v) {
    256     UVersionInfo version;
    257     u_versionFromString(version, v);
    258     uprv_memcpy(dataInfo.dataVersion, version, 4);
    259 }
    260 
    261 extern void
    262 initStore() {
    263     pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
    264     if(pTrie==NULL) {
    265         fprintf(stderr, "error: unable to create a UNewTrie\n");
    266         exit(U_MEMORY_ALLOCATION_ERROR);
    267     }
    268 
    269     initAdditionalProperties();
    270 }
    271 
    272 extern void
    273 exitStore() {
    274     utrie_close(pTrie);
    275     exitAdditionalProperties();
    276 }
    277 
    278 static uint32_t printNumericTypeValueError(Props *p) {
    279     fprintf(stderr, "genprops error: unable to encode numeric type & value %d  %ld/%lu E%d\n",
    280             (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
    281     exit(U_ILLEGAL_ARGUMENT_ERROR);
    282     return 0;
    283 }
    284 
    285 /* store a character's properties ------------------------------------------- */
    286 
    287 extern uint32_t
    288 makeProps(Props *p) {
    289     uint32_t den;
    290     int32_t type, value, exp;
    291 
    292     /* encode numeric type & value */
    293     type=p->numericType;
    294     value=p->numericValue;
    295     den=p->denominator;
    296     exp=p->exponent;
    297 
    298     if(den!=0) {
    299         /* fraction */
    300         if( type!=U_NT_NUMERIC ||
    301             value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
    302             den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
    303             exp!=0
    304         ) {
    305             return printNumericTypeValueError(p);
    306         }
    307         type=UPROPS_NT_FRACTION;
    308 
    309         if(value==-1) {
    310             value=0;
    311         }
    312         den-=UPROPS_FRACTION_DEN_OFFSET;
    313         value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
    314     } else if(exp!=0) {
    315         /* very large value */
    316         if( type!=U_NT_NUMERIC ||
    317             value<1 || 9<value ||
    318             exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
    319         ) {
    320             return printNumericTypeValueError(p);
    321         }
    322         type=UPROPS_NT_LARGE;
    323 
    324         if(exp<=UPROPS_LARGE_MAX_EXP) {
    325             /* 1..9 * 10^(2..17) */
    326             exp-=UPROPS_LARGE_EXP_OFFSET;
    327         } else {
    328             /* 1 * 10^(18..33) */
    329             if(value!=1) {
    330                 return printNumericTypeValueError(p);
    331             }
    332             value=0;
    333             exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
    334         }
    335         value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
    336     } else if(value>UPROPS_MAX_SMALL_NUMBER) {
    337         /* large value */
    338         if(type!=U_NT_NUMERIC) {
    339             return printNumericTypeValueError(p);
    340         }
    341         type=UPROPS_NT_LARGE;
    342 
    343         /* split the value into mantissa and exponent, base 10 */
    344         while((value%10)==0) {
    345             value/=10;
    346             ++exp;
    347         }
    348         if(value>9) {
    349             return printNumericTypeValueError(p);
    350         }
    351 
    352         exp-=UPROPS_LARGE_EXP_OFFSET;
    353         value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
    354     } else if(value<0) {
    355         /* unable to encode negative values, other than fractions -1/x */
    356         return printNumericTypeValueError(p);
    357 
    358     /* } else normal value=0..0xff { */
    359     }
    360 
    361     /* encode the properties */
    362     return
    363         (uint32_t)p->generalCategory |
    364         ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
    365         ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
    366 }
    367 
    368 extern void
    369 addProps(uint32_t c, uint32_t x) {
    370     if(!utrie_set32(pTrie, (UChar32)c, x)) {
    371         fprintf(stderr, "error: too many entries for the properties trie\n");
    372         exit(U_BUFFER_OVERFLOW_ERROR);
    373     }
    374 }
    375 
    376 extern uint32_t
    377 getProps(uint32_t c) {
    378     return utrie_get32(pTrie, (UChar32)c, NULL);
    379 }
    380 
    381 /* areas of same properties ------------------------------------------------- */
    382 
    383 extern void
    384 repeatProps(uint32_t first, uint32_t last, uint32_t x) {
    385     if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
    386         fprintf(stderr, "error: too many entries for the properties trie\n");
    387         exit(U_BUFFER_OVERFLOW_ERROR);
    388     }
    389 }
    390 
    391 /* generate output data ----------------------------------------------------- */
    392 
    393 extern void
    394 generateData(const char *dataDir, UBool csource) {
    395     static int32_t indexes[UPROPS_INDEX_COUNT]={
    396         0, 0, 0, 0,
    397         0, 0, 0, 0,
    398         0, 0, 0, 0,
    399         0, 0, 0, 0
    400     };
    401     static uint8_t trieBlock[40000];
    402     static uint8_t additionalProps[120000];
    403 
    404     UNewDataMemory *pData;
    405     UErrorCode errorCode=U_ZERO_ERROR;
    406     uint32_t size = 0;
    407     int32_t trieSize, additionalPropsSize, offset;
    408     long dataLength;
    409 
    410     trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    411     if(U_FAILURE(errorCode)) {
    412         fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
    413         exit(errorCode);
    414     }
    415 
    416     offset=sizeof(indexes)/4;               /* uint32_t offset to the properties trie */
    417 
    418     /* round up trie size to 4-alignment */
    419     trieSize=(trieSize+3)&~3;
    420     offset+=trieSize>>2;
    421     indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
    422     indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
    423     indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
    424     indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
    425 
    426     if(beVerbose) {
    427         printf("trie size in bytes:                    %5u\n", (int)trieSize);
    428     }
    429 
    430     if(csource) {
    431         /* write .c file for hardcoded data */
    432         UTrie trie={ NULL };
    433         UTrie2 *trie2;
    434         FILE *f;
    435 
    436         utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
    437         if(U_FAILURE(errorCode)) {
    438             fprintf(
    439                 stderr,
    440                 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
    441                 u_errorName(errorCode));
    442             exit(errorCode);
    443         }
    444 
    445         /* use UTrie2 */
    446         dataInfo.formatVersion[0]=6;
    447         dataInfo.formatVersion[2]=0;
    448         dataInfo.formatVersion[3]=0;
    449         trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
    450         if(U_FAILURE(errorCode)) {
    451             fprintf(
    452                 stderr,
    453                 "genprops error: utrie2_fromUTrie() failed - %s\n",
    454                 u_errorName(errorCode));
    455             exit(errorCode);
    456         }
    457         {
    458             /* delete lead surrogate code unit values */
    459             UChar lead;
    460             trie2=utrie2_cloneAsThawed(trie2, &errorCode);
    461             for(lead=0xd800; lead<0xdc00; ++lead) {
    462                 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
    463             }
    464             utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
    465             if(U_FAILURE(errorCode)) {
    466                 fprintf(
    467                     stderr,
    468                     "genprops error: deleting lead surrogate code unit values failed - %s\n",
    469                     u_errorName(errorCode));
    470                 exit(errorCode);
    471             }
    472         }
    473 
    474         f=usrc_create(dataDir, "uchar_props_data.c");
    475         if(f!=NULL) {
    476             usrc_writeArray(f,
    477                 "static const UVersionInfo formatVersion={",
    478                 dataInfo.formatVersion, 8, 4,
    479                 "};\n\n");
    480             usrc_writeArray(f,
    481                 "static const UVersionInfo dataVersion={",
    482                 dataInfo.dataVersion, 8, 4,
    483                 "};\n\n");
    484             usrc_writeUTrie2Arrays(f,
    485                 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
    486                 trie2,
    487                 "\n};\n\n");
    488             usrc_writeUTrie2Struct(f,
    489                 "static const UTrie2 propsTrie={\n",
    490                 trie2, "propsTrie_index", NULL,
    491                 "};\n\n");
    492 
    493             additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
    494             size=4*offset+additionalPropsSize;      /* total size of data */
    495 
    496             usrc_writeArray(f,
    497                 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
    498                 indexes, 32, UPROPS_INDEX_COUNT,
    499                 "};\n\n");
    500             fclose(f);
    501         }
    502         utrie2_close(trie2);
    503     } else {
    504         /* write the data */
    505         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
    506                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    507         if(U_FAILURE(errorCode)) {
    508             fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
    509             exit(errorCode);
    510         }
    511 
    512         additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
    513         size=4*offset+additionalPropsSize;      /* total size of data */
    514 
    515         udata_writeBlock(pData, indexes, sizeof(indexes));
    516         udata_writeBlock(pData, trieBlock, trieSize);
    517         udata_writeBlock(pData, additionalProps, additionalPropsSize);
    518 
    519         /* finish up */
    520         dataLength=udata_finish(pData, &errorCode);
    521         if(U_FAILURE(errorCode)) {
    522             fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
    523             exit(errorCode);
    524         }
    525 
    526         if(dataLength!=(long)size) {
    527             fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
    528                 dataLength, (unsigned long)size);
    529             exit(U_INTERNAL_PROGRAM_ERROR);
    530         }
    531     }
    532 
    533     if(beVerbose) {
    534         printf("data size:                            %6lu\n", (unsigned long)size);
    535     }
    536 }
    537 
    538 /*
    539  * Hey, Emacs, please set the following:
    540  *
    541  * Local Variables:
    542  * indent-tabs-mode: nil
    543  * End:
    544  *
    545  */
    546