Home | History | Annotate | Download | only in genprops
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  store.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999dec11
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store Unicode character properties efficiently for
     17 *   random access.
     18 */
     19 
     20 #include <stdio.h>
     21 #include "unicode/utypes.h"
     22 #include "unicode/uchar.h"
     23 #include "cmemory.h"
     24 #include "cstring.h"
     25 #include "utrie.h"
     26 #include "unicode/udata.h"
     27 #include "unewdata.h"
     28 #include "writesrc.h"
     29 #include "uprops.h"
     30 #include "genprops.h"
     31 
     32 #define DO_DEBUG_OUT 0
     33 
     34 /* Unicode character properties file format ------------------------------------
     35 
     36 The file format prepared and written here contains several data
     37 structures that store indexes or data.
     38 
     39 Before the data contents described below, there are the headers required by
     40 the udata API for loading ICU data. Especially, a UDataInfo structure
     41 precedes the actual data. It contains platform properties values and the
     42 file format version.
     43 
The following is a description of format version 6.
     45 
     46 Data contents:
     47 
The contents are a parsed, binary form of several Unicode character
     49 database files, most prominently UnicodeData.txt.
     50 
     51 Any Unicode code point from 0 to 0x10ffff can be looked up to get
     52 the properties, if any, for that code point. This means that the input
     53 to the lookup are 21-bit unsigned integers, with not all of the
     54 21-bit range used.
     55 
     56 It is assumed that client code keeps a uint32_t pointer
     57 to the beginning of the data:
     58 
     59     const uint32_t *p32;
     60 
     61 Formally, the file contains the following structures:
     62 
     63     const int32_t indexes[16] with values i0..i15:
     64 
     65   i0 indicates the length of the main trie.
     66   i0..i3 all have the same value in format versions 4.0 and higher;
     67          the related props32[] and exceptions[] and uchars[] were used in format version 3
     68 
     69     i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
     70     i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
     71     i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
     72 
     73     i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
     74     i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
     75     i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
     76 
     77     i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
     78     i7..i9 reservedIndexes; -- reserved values; 0 for now
     79 
     80     i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
     81     i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
     82     i12..i15 reservedIndexes; -- reserved values; 0 for now
     83 
     84     PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
     85 
     86   P, E, and U are not used (empty) in format versions 4 and above
     87 
     88     P  const uint32_t props32[i1-i0];
     89     E  const uint32_t exceptions[i2-i1];
     90     U  const UChar uchars[2*(i3-i2)];
     91 
     92     AT serialized trie for additional properties (byte size: 4*(i4-i3))
     93     PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
     94 
     95 Trie lookup and properties:
     96 
     97 In order to condense the data for the 21-bit code space, several properties of
     98 the Unicode code assignment are exploited:
     99 - The code space is sparse.
- There are runs of tens of thousands of consecutive code points with the same properties.
    101 - Characters and scripts are allocated in groups of 16 code points.
    102 - Inside blocks for scripts the properties are often repetitive.
    103 - The 21-bit space is not fully used for Unicode.
    104 
    105 The lookup of properties for a given code point is done with a trie lookup,
    106 using the UTrie implementation.
    107 The trie lookup result is a 16-bit properties word.
    108 
    109 With a given Unicode code point
    110 
    111     UChar32 c;
    112 
    113 and 0<=c<0x110000, the lookup is done like this:
    114 
    115     uint16_t props;
    116     UTRIE_GET16(trie, c, props);
    117 
    118 Each 16-bit properties word contains:
    119 
    120  0.. 4  general category
    121      5  reserved
    122  6..15  numeric type and value (ntv)
    123 
    124 Encoding of numeric type and value in the 10-bit ntv field:
    125     ntv             type            value
    126     0               U_NT_NONE       0
    127     1..10           U_NT_DECIMAL    0..9
    128     11..20          U_NT_DIGIT      0..9
    129     21..0x2ff       U_NT_NUMERIC    see below
    130     0x300..0x3ff    reserved
    131 
    132     For U_NT_NUMERIC:
    133     ntv             value
    134     21..0xaf        integer     0..154
    135     0xb0..0x1df     fraction    ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
    136     0x1e0..0x2ff    large int   ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
    137                     (only one significant decimal digit)
    138 
    139 --- Additional properties (new in format version 2.1) ---
    140 
    141 The second trie for additional properties (AT) is also a UTrie with 16-bit data.
    142 The data words consist of 32-bit unit indexes (not row indexes!) into the
    143 table of unique properties vectors (PV).
    144 Each vector contains a set of properties.
    145 The width of a vector (number of uint32_t per row) may change
    146 with the formatVersion, it is stored in i5.
    147 
    148 Current properties: see icu/source/common/uprops.h
    149 
    150 --- Changes in format version 3.1 ---
    151 
    152 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
    153 
    154 --- Changes in format version 3.2 ---
    155 
    156 - The tries use linear Latin-1 ranges.
    157 - The additional properties bits store full properties XYZ instead
    158   of partial Other_XYZ, so that changes in the derivation formulas
    159   need not be tracked in runtime library code.
    160 - Joining Type and Line Break are also stored completely, so that uprops.c
    161   needs no runtime formulas for enumerated properties either.
    162 - Store the case-sensitive flag in the main properties word.
    163 - i10 also contains U_LB_COUNT and U_EA_COUNT.
    164 - i11 contains maxValues2 for vector word 2.
    165 
    166 --- Changes in format version 4 ---
    167 
    168 The format changes between version 3 and 4 because the properties related to
    169 case mappings and bidi/shaping are pulled out into separate files
    170 for modularization.
    171 In order to reduce the need for code changes, some of the previous data
    172 structures are omitted, rather than rearranging everything.
    173 
    174 (The change to format version 4 is for ICU 3.4. The last CVS revision of
    175 genprops/store.c for format version 3.2 is 1.48.)
    176 
    177 The main trie's data is significantly simplified:
    178 - The trie's 16-bit data word is used directly instead of as an index
    179   into props32[].
    180 - The trie uses the default trie folding functions instead of custom ones.
    181 - Numeric values are stored directly in the trie data word, with special
    182   encodings.
    183 - No more exception data (the data that needed it was pulled out, or, in the
    184   case of numeric values, encoded differently).
    185 - No more string data (pulled out - was for case mappings).
    186 
    187 Also, some of the previously used properties vector bits are reserved again.
    188 
    189 The indexes[] values for the omitted structures are still filled in
    190 (indicating zero-length arrays) so that the swapper code remains unchanged.
    191 
    192 --- Changes in format version 5 ---
    193 
    194 Format version 5 became necessary because the bit field for script codes
    195 overflowed. The changes are incompatible because
    196 old code would have seen nonsensically low values for new, higher script codes.
    197 
    198 Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
    199 Word_Break) by one bit each.
    200 
    201 Modified bit fields in icu/source/common/uprops.h
    202 
    203 --- Changes in format version 6 ---
    204 
    205 Format version 6 became necessary because Unicode 5.2 adds fractions with
    206 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
    207 types and values rather than add another variant to the previous format.
    208 
    209 ----------------------------------------------------------------------------- */
    210 
/*
 * UDataInfo cf. udata.h
 *
 * Descriptor handed to udata_create() in generateData() when the binary
 * data file is written.
 * The initializer is positional; the field order is the one declared for
 * UDataInfo in udata.h.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
    { 6, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    { 5, 1, 0, 0 }                              /* dataVersion */
    /* dataVersion is only a default: setUnicodeVersion() overwrites all 4 bytes */
};

/*
 * The main properties trie while it is being built:
 * created in initStore(), filled via addProps()/repeatProps(),
 * serialized in generateData() and released in exitStore().
 */
static UNewTrie *pTrie=NULL;
    227 
    228 /* -------------------------------------------------------------------------- */
    229 
    230 extern void
    231 setUnicodeVersion(const char *v) {
    232     UVersionInfo version;
    233     u_versionFromString(version, v);
    234     uprv_memcpy(dataInfo.dataVersion, version, 4);
    235 }
    236 
    237 extern void
    238 initStore() {
    239     pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
    240     if(pTrie==NULL) {
    241         fprintf(stderr, "error: unable to create a UNewTrie\n");
    242         exit(U_MEMORY_ALLOCATION_ERROR);
    243     }
    244 
    245     initAdditionalProperties();
    246 }
    247 
    248 extern void
    249 exitStore() {
    250     utrie_close(pTrie);
    251     exitAdditionalProperties();
    252 }
    253 
    254 /* store a character's properties ------------------------------------------- */
    255 
    256 extern uint32_t
    257 makeProps(Props *p) {
    258     uint32_t den;
    259     int32_t type, value, exp, ntv;
    260 
    261     /* encode numeric type & value */
    262     type=p->numericType;
    263     value=p->numericValue;
    264     den=p->denominator;
    265     exp=p->exponent;
    266 
    267     ntv=-1; /* the numeric type and value cannot be encoded if ntv remains -1 */
    268     switch(type) {
    269     case U_NT_NONE:
    270         if(value==0 && den==0 && exp==0) {
    271             ntv=UPROPS_NTV_NONE;
    272         }
    273         break;
    274     case U_NT_DECIMAL:
    275         if(0<=value && value<=9 && den==0 && exp==0) {
    276             ntv=UPROPS_NTV_DECIMAL_START+value;
    277         }
    278         break;
    279     case U_NT_DIGIT:
    280         if(0<=value && value<=9 && den==0 && exp==0) {
    281             ntv=UPROPS_NTV_DIGIT_START+value;
    282         }
    283         break;
    284     case U_NT_NUMERIC:
    285         if(den==0) {
    286             if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
    287                 /* small integer parsed like a large one */
    288                 ntv=UPROPS_NTV_NUMERIC_START+value*100;
    289             } else if(exp==0 && value>=0) {
    290                 if(value<=UPROPS_NTV_MAX_SMALL_INT) {
    291                     /* small integer */
    292                     ntv=UPROPS_NTV_NUMERIC_START+value;
    293                 } else {
    294                     /* large integer parsed like a small one */
    295                     /* split the value into mantissa and exponent, base 10 */
    296                     int32_t mant=value;
    297                     while((mant%10)==0) {
    298                         mant/=10;
    299                         ++exp;
    300                     }
    301                     if(mant<=9) {
    302                         ntv=((mant+14)<<5)+(exp-2);
    303                     }
    304                 }
    305             } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
    306                 /* large, single-significant-digit integer */
    307                 ntv=((value+14)<<5)+(exp-2);
    308             }
    309         } else if(exp==0) {
    310             if(-1<=value && value<=17 && 1<=den && den<=16) {
    311                 /* fraction */
    312                 ntv=((value+12)<<4)+(den-1);
    313             }
    314         }
    315     default:
    316         break;
    317     }
    318     if(ntv<0) {
    319         fprintf(stderr, "genprops error: unable to encode numeric type %d & value %ld/%lu E%d\n",
    320                 (int)type, (long)value, (unsigned long)den, exp);
    321         exit(U_ILLEGAL_ARGUMENT_ERROR);
    322     }
    323 
    324     /* encode the properties */
    325     return
    326         (uint32_t)p->generalCategory |
    327         (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
    328 }
    329 
    330 extern void
    331 addProps(uint32_t c, uint32_t x) {
    332     if(!utrie_set32(pTrie, (UChar32)c, x)) {
    333         fprintf(stderr, "error: too many entries for the properties trie\n");
    334         exit(U_BUFFER_OVERFLOW_ERROR);
    335     }
    336 }
    337 
    338 extern uint32_t
    339 getProps(uint32_t c) {
    340     return utrie_get32(pTrie, (UChar32)c, NULL);
    341 }
    342 
    343 /* areas of same properties ------------------------------------------------- */
    344 
    345 extern void
    346 repeatProps(uint32_t first, uint32_t last, uint32_t x) {
    347     if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
    348         fprintf(stderr, "error: too many entries for the properties trie\n");
    349         exit(U_BUFFER_OVERFLOW_ERROR);
    350     }
    351 }
    352 
    353 /* generate output data ----------------------------------------------------- */
    354 
    355 extern void
    356 generateData(const char *dataDir, UBool csource) {
    357     static int32_t indexes[UPROPS_INDEX_COUNT]={
    358         0, 0, 0, 0,
    359         0, 0, 0, 0,
    360         0, 0, 0, 0,
    361         0, 0, 0, 0
    362     };
    363     static uint8_t trieBlock[40000];
    364     static uint8_t additionalProps[120000];
    365 
    366     UNewDataMemory *pData;
    367     UErrorCode errorCode=U_ZERO_ERROR;
    368     uint32_t size = 0;
    369     int32_t trieSize, additionalPropsSize, offset;
    370     long dataLength;
    371 
    372     trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    373     if(U_FAILURE(errorCode)) {
    374         fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
    375         exit(errorCode);
    376     }
    377 
    378     offset=sizeof(indexes)/4;               /* uint32_t offset to the properties trie */
    379 
    380     /* round up trie size to 4-alignment */
    381     trieSize=(trieSize+3)&~3;
    382     offset+=trieSize>>2;
    383     indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
    384     indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
    385     indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
    386     indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
    387 
    388     if(beVerbose) {
    389         printf("trie size in bytes:                    %5u\n", (int)trieSize);
    390     }
    391 
    392     if(csource) {
    393         /* write .c file for hardcoded data */
    394         UTrie trie={ NULL };
    395         UTrie2 *trie2;
    396         FILE *f;
    397 
    398         utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
    399         if(U_FAILURE(errorCode)) {
    400             fprintf(
    401                 stderr,
    402                 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
    403                 u_errorName(errorCode));
    404             exit(errorCode);
    405         }
    406 
    407         /* use UTrie2 */
    408         trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
    409         if(U_FAILURE(errorCode)) {
    410             fprintf(
    411                 stderr,
    412                 "genprops error: utrie2_fromUTrie() failed - %s\n",
    413                 u_errorName(errorCode));
    414             exit(errorCode);
    415         }
    416         {
    417             /* delete lead surrogate code unit values */
    418             UChar lead;
    419             trie2=utrie2_cloneAsThawed(trie2, &errorCode);
    420             for(lead=0xd800; lead<0xdc00; ++lead) {
    421                 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
    422             }
    423             utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
    424             if(U_FAILURE(errorCode)) {
    425                 fprintf(
    426                     stderr,
    427                     "genprops error: deleting lead surrogate code unit values failed - %s\n",
    428                     u_errorName(errorCode));
    429                 exit(errorCode);
    430             }
    431         }
    432 
    433         f=usrc_create(dataDir, "uchar_props_data.c");
    434         if(f!=NULL) {
    435             /* unused
    436             usrc_writeArray(f,
    437                 "static const UVersionInfo formatVersion={",
    438                 dataInfo.formatVersion, 8, 4,
    439                 "};\n\n");
    440              */
    441             usrc_writeArray(f,
    442                 "static const UVersionInfo dataVersion={",
    443                 dataInfo.dataVersion, 8, 4,
    444                 "};\n\n");
    445             usrc_writeUTrie2Arrays(f,
    446                 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
    447                 trie2,
    448                 "\n};\n\n");
    449             usrc_writeUTrie2Struct(f,
    450                 "static const UTrie2 propsTrie={\n",
    451                 trie2, "propsTrie_index", NULL,
    452                 "};\n\n");
    453 
    454             additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
    455             size=4*offset+additionalPropsSize;      /* total size of data */
    456 
    457             usrc_writeArray(f,
    458                 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
    459                 indexes, 32, UPROPS_INDEX_COUNT,
    460                 "};\n\n");
    461             fclose(f);
    462         }
    463         utrie2_close(trie2);
    464     } else {
    465         /* write the data */
    466         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
    467                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    468         if(U_FAILURE(errorCode)) {
    469             fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
    470             exit(errorCode);
    471         }
    472 
    473         additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
    474         size=4*offset+additionalPropsSize;      /* total size of data */
    475 
    476         udata_writeBlock(pData, indexes, sizeof(indexes));
    477         udata_writeBlock(pData, trieBlock, trieSize);
    478         udata_writeBlock(pData, additionalProps, additionalPropsSize);
    479 
    480         /* finish up */
    481         dataLength=udata_finish(pData, &errorCode);
    482         if(U_FAILURE(errorCode)) {
    483             fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
    484             exit(errorCode);
    485         }
    486 
    487         if(dataLength!=(long)size) {
    488             fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
    489                 dataLength, (unsigned long)size);
    490             exit(U_INTERNAL_PROGRAM_ERROR);
    491         }
    492     }
    493 
    494     if(beVerbose) {
    495         printf("data size:                            %6lu\n", (unsigned long)size);
    496     }
    497 }
    498 
    499 /*
    500  * Hey, Emacs, please set the following:
    501  *
    502  * Local Variables:
    503  * indent-tabs-mode: nil
    504  * End:
    505  *
    506  */
    507