Home | History | Annotate | Download | only in genbidi
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  store.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004dec30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Store Unicode bidi/shaping properties efficiently for
     17 *   random access.
     18 */
     19 
     20 #include <stdio.h>
     21 #include <stdlib.h>
     22 #include "unicode/utypes.h"
     23 #include "unicode/uchar.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "utrie.h"
     27 #include "utrie2.h"
     28 #include "uarrsort.h"
     29 #include "unicode/udata.h"
     30 #include "unewdata.h"
     31 #include "propsvec.h"
     32 #include "writesrc.h"
     33 #include "ubidi_props.h"
     34 #include "genbidi.h"
     35 
     36 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
     37 
     38 /* Unicode bidi/shaping properties file format ---------------------------------
     39 
     40 The file format prepared and written here contains several data
     41 structures that store indexes or data.
     42 
     43 Before the data contents described below, there are the headers required by
     44 the udata API for loading ICU data. Especially, a UDataInfo structure
     45 precedes the actual data. It contains platform properties values and the
     46 file format version.
     47 
     48 The following is a description of format version 1.0 .
     49 
     50 The file contains the following structures:
     51 
     52     const int32_t indexes[i0] with values i0, i1, ...:
     53     (see UBIDI_IX_... constants for names of indexes)
     54 
     55     i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP)
     56     i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
     57     i2 trieSize; -- size in bytes of the bidi/shaping properties trie
     58     i3 mirrorLength; -- length in uint32_t of the bidi mirroring array
     59 
     60     i4 jgStart; -- first code point with Joining_Group data
     61     i5 jgLimit; -- limit code point for Joining_Group data
     62 
     63     i6..i14 reservedIndexes; -- reserved values; 0 for now
     64 
     65     i15 maxValues; -- maximum code values for enumerated properties
     66                       bits 23..16 contain the max value for Joining_Group,
     67                       otherwise the bits are used like enum fields in the trie word
     68 
     69     Serialized trie, see utrie.h;
     70 
     71     const uint32_t mirrors[mirrorLength];
     72 
     73     const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4
     74 
     75 Trie data word:
     76 Bits
     77 15..13  signed delta to bidi mirroring code point
     78         (add delta to input code point)
     79         0 no such code point (source maps to itself)
     80         -3..-1, 1..3 delta
     81         -4 look in mirrors table
     82     12  is mirrored
     83     11  Bidi_Control
     84     10  Join_Control
     85  9.. 8  reserved (set to 0)
     86  7.. 5  Joining_Type
     87  4.. 0  BiDi category
     88 
     89 
     90 Mirrors:
     91 Stores some of the bidi mirroring data, where each code point maps to
     92 at most one other.
     93 Most code points do not have a mirroring code point; most that do have a signed
     94 delta stored in the trie data value. Only those where the delta does not fit
     95 into the trie data are stored in this table.
     96 
     97 Logically, this is a two-column table with source and mirror code points.
     98 
     99 Physically, the table is compressed by taking advantage of the fact that each
    100 mirror code point is also a source code point
    101 (each of them is a mirror of the other).
    102 Therefore, both logical columns contain the same set of code points, which needs
    103 to be stored only once.
    104 
    105 The table stores source code points, and also for each the index of its mirror
    106 code point in the same table, in a simple array of uint32_t.
    107 Bits
    108 31..21  index to mirror code point (unsigned)
    109 20.. 0  source code point
    110 
    111 The table is sorted by source code points.
    112 
    113 
    114 Joining_Group array:
    115 The Joining_Group values do not fit into the 16-bit trie, but the data is also
    116 limited to a small range of code points (Arabic and Syriac) and not
    117 well compressible.
    118 
    119 The start and limit code points for the range are stored in the indexes[]
    120 array, and the jgArray[] stores a byte for each of these code points,
    121 containing the Joining_Group value.
    122 
    123 All code points outside of this range have No_Joining_Group (0).
    124 
    125 ----------------------------------------------------------------------------- */
    126 
    127 /* UDataInfo cf. udata.h */
    128 static UDataInfo dataInfo={
    129     sizeof(UDataInfo),
    130     0,
    131 
    132     U_IS_BIG_ENDIAN,
    133     U_CHARSET_FAMILY,
    134     U_SIZEOF_UCHAR,
    135     0,
    136 
    137     /* dataFormat="BiDi" */
    138     { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 },
    139     { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
    140     { 4, 0, 1, 0 }                              /* dataVersion */
    141 };
    142 
    143 /* exceptions values */
    144 static uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2];
    145 static uint16_t mirrorTop=0;
    146 
    147 /* -------------------------------------------------------------------------- */
    148 
    149 extern void
    150 setUnicodeVersion(const char *v) {
    151     UVersionInfo version;
    152     u_versionFromString(version, v);
    153     uprv_memcpy(dataInfo.dataVersion, version, 4);
    154 }
    155 
    156 /* bidi mirroring table ----------------------------------------------------- */
    157 
    158 extern void
    159 addMirror(UChar32 src, UChar32 mirror) {
    160     UErrorCode errorCode;
    161     int32_t delta;
    162 
    163     delta=mirror-src;
    164     if(delta==0) {
    165         return; /* mapping to self=no mapping */
    166     }
    167 
    168     if(delta<UBIDI_MIN_MIRROR_DELTA || UBIDI_MAX_MIRROR_DELTA<delta) {
    169         /* delta does not fit into the trie properties value, store in the mirrors[] table */
    170         if(mirrorTop==LENGTHOF(mirrors)) {
    171             fprintf(stderr, "genbidi error: too many long-distance mirroring mappings\n");
    172             exit(U_BUFFER_OVERFLOW_ERROR);
    173         }
    174 
    175         /* possible: search the table so far and see if src is already listed */
    176 
    177         mirrors[mirrorTop][0]=(uint32_t)src;
    178         mirrors[mirrorTop][1]=(uint32_t)mirror;
    179         ++mirrorTop;
    180 
    181         /* set an escape marker in src's properties */
    182         delta=UBIDI_ESC_MIRROR_DELTA;
    183     }
    184 
    185     errorCode=U_ZERO_ERROR;
    186     upvec_setValue(
    187             pv, src, src, 0,
    188             (uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT, (uint32_t)(-1)<<UBIDI_MIRROR_DELTA_SHIFT,
    189             &errorCode);
    190     if(U_FAILURE(errorCode)) {
    191         fprintf(stderr, "genbidi error: unable to set mirroring delta, code: %s\n",
    192                         u_errorName(errorCode));
    193         exit(errorCode);
    194     }
    195 }
    196 
    197 static int32_t U_CALLCONV
    198 compareMirror(const void *context, const void *left, const void *right) {
    199     UChar32 l, r;
    200 
    201     l=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)left)[0]);
    202     r=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)right)[0]);
    203     return l-r;
    204 }
    205 
    206 static void
    207 makeMirror() {
    208     uint32_t *reducedMirror;
    209     UErrorCode errorCode;
    210     int32_t i, j, start, limit, step;
    211     uint32_t c;
    212 
    213     /* sort the mirroring table by source code points */
    214     errorCode=U_ZERO_ERROR;
    215     uprv_sortArray(mirrors, mirrorTop, 8,
    216                    compareMirror, NULL, FALSE, &errorCode);
    217 
    218     /*
    219      * reduce the 2-column table to a single column
    220      * by putting the index to the mirror entry into the source entry
    221      *
    222      * first:
    223      * find each mirror code point in the source column and set each other's indexes
    224      *
    225      * second:
    226      * reduce the table, combine the source code points with their indexes
    227      * and store as a simple array of uint32_t
    228      */
    229     for(i=0; i<mirrorTop; ++i) {
    230         c=mirrors[i][1]; /* mirror code point */
    231         if(c>0x1fffff) {
    232             continue; /* this entry already has an index */
    233         }
    234 
    235         /* search for the mirror code point in the source column */
    236         if(c<mirrors[i][0]) {
    237             /* search before i */
    238             start=i-1;
    239             limit=-1;
    240             step=-1;
    241         } else {
    242             start=i+1;
    243             limit=mirrorTop;
    244             step=1;
    245         }
    246 
    247         for(j=start;; j+=step) {
    248             if(j==limit) {
    249                 fprintf(stderr,
    250                         "genbidi error: bidi mirror does not roundtrip - %04lx->%04lx->?\n",
    251                         (long)mirrors[i][0], (long)mirrors[i][1]);
    252                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    253             }
    254             if(c==mirrors[j][0]) {
    255                 /*
    256                  * found the mirror code point c in the source column,
    257                  * set both entries' indexes to each other
    258                  */
    259                 if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) {
    260                     /* roundtrip check fails */
    261                     fprintf(stderr,
    262                             "genbidi error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n",
    263                             (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]);
    264                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    265                 } else {
    266                     mirrors[i][1]|=(uint32_t)j<<UBIDI_MIRROR_INDEX_SHIFT;
    267                     mirrors[j][1]|=(uint32_t)i<<UBIDI_MIRROR_INDEX_SHIFT;
    268                 }
    269                 break;
    270             }
    271         }
    272     }
    273 
    274     /* now the second step, the actual reduction of the table */
    275     reducedMirror=mirrors[0];
    276     for(i=0; i<mirrorTop; ++i) {
    277         reducedMirror[i]=mirrors[i][0]|(mirrors[i][1]&~0x1fffff);
    278     }
    279 
    280     if(U_FAILURE(errorCode)) {
    281         exit(errorCode);
    282     }
    283 }
    284 
    285 /* generate output data ----------------------------------------------------- */
    286 
    287 extern void
    288 generateData(const char *dataDir, UBool csource) {
    289     static int32_t indexes[UBIDI_IX_TOP]={
    290         UBIDI_IX_TOP
    291     };
    292     static uint8_t trieBlock[40000];
    293     static uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */
    294 
    295     const uint32_t *row;
    296     UChar32 start, end, prev, jgStart;
    297     int32_t i;
    298 
    299     UNewDataMemory *pData;
    300     UNewTrie *pTrie;
    301     UErrorCode errorCode=U_ZERO_ERROR;
    302     int32_t trieSize;
    303     long dataLength;
    304 
    305     makeMirror();
    306 
    307     pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
    308     if(pTrie==NULL) {
    309         fprintf(stderr, "genbidi error: unable to create a UNewTrie\n");
    310         exit(U_MEMORY_ALLOCATION_ERROR);
    311     }
    312 
    313     prev=jgStart=0;
    314     for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP; ++i) {
    315         /* store most values from vector column 0 in the trie */
    316         if(!utrie_setRange32(pTrie, start, end+1, *row, TRUE)) {
    317             fprintf(stderr, "genbidi error: unable to set trie value (overflow)\n");
    318             exit(U_BUFFER_OVERFLOW_ERROR);
    319         }
    320 
    321         /* store Joining_Group values from vector column 1 in a simple byte array */
    322         if(row[1]!=0) {
    323             if(start<0x600 || 0x8ff<end) {
    324                 fprintf(stderr, "genbidi error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
    325                         (long)start, (long)end);
    326                 exit(U_ILLEGAL_ARGUMENT_ERROR);
    327             }
    328 
    329             if(prev==0) {
    330                 /* first code point with any value */
    331                 prev=jgStart=start;
    332             } else {
    333                 /* add No_Joining_Group for code points between prev and start */
    334                 while(prev<start) {
    335                     jgArray[prev++ -jgStart]=0;
    336                 }
    337             }
    338 
    339             /* set Joining_Group value for start..end */
    340             while(prev<=end) {
    341                 jgArray[prev++ -jgStart]=(uint8_t)row[1];
    342             }
    343         }
    344     }
    345 
    346     /* finish jgArray, pad to multiple of 4 */
    347     while((prev-jgStart)&3) {
    348         jgArray[prev++ -jgStart]=0;
    349     }
    350     indexes[UBIDI_IX_JG_START]=jgStart;
    351     indexes[UBIDI_IX_JG_LIMIT]=prev;
    352 
    353     trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    354     if(U_FAILURE(errorCode)) {
    355         fprintf(stderr, "genbidi error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
    356         exit(errorCode);
    357     }
    358 
    359     indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
    360     indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
    361     indexes[UBIDI_IX_LENGTH]=
    362         (int32_t)sizeof(indexes)+
    363         trieSize+
    364         4*mirrorTop+
    365         (prev-jgStart);
    366 
    367     if(beVerbose) {
    368         printf("trie size in bytes:                    %5d\n", (int)trieSize);
    369         printf("size in bytes of mirroring table:      %5d\n", (int)(4*mirrorTop));
    370         printf("length of Joining_Group array:         %5d (U+%04x..U+%04x)\n", (int)(prev-jgStart), (int)jgStart, (int)(prev-1));
    371         printf("data size:                             %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
    372     }
    373 
    374     indexes[UBIDI_MAX_VALUES_INDEX]=
    375         ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
    376         (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
    377         (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);
    378 
    379     if(csource) {
    380         /* write .c file for hardcoded data */
    381         UTrie trie={ NULL };
    382         UTrie2 *trie2;
    383         FILE *f;
    384 
    385         utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
    386         if(U_FAILURE(errorCode)) {
    387             fprintf(
    388                 stderr,
    389                 "genbidi error: failed to utrie_unserialize(ubidi.icu trie) - %s\n",
    390                 u_errorName(errorCode));
    391             exit(errorCode);
    392         }
    393 
    394         /* use UTrie2 */
    395         dataInfo.formatVersion[0]=2;
    396         dataInfo.formatVersion[2]=0;
    397         dataInfo.formatVersion[3]=0;
    398         trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
    399         if(U_FAILURE(errorCode)) {
    400             fprintf(
    401                 stderr,
    402                 "genbidi error: utrie2_fromUTrie() failed - %s\n",
    403                 u_errorName(errorCode));
    404             exit(errorCode);
    405         }
    406         {
    407             /* delete lead surrogate code unit values */
    408             UChar lead;
    409             trie2=utrie2_cloneAsThawed(trie2, &errorCode);
    410             for(lead=0xd800; lead<0xdc00; ++lead) {
    411                 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
    412             }
    413             utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
    414             if(U_FAILURE(errorCode)) {
    415                 fprintf(
    416                     stderr,
    417                     "genbidi error: deleting lead surrogate code unit values failed - %s\n",
    418                     u_errorName(errorCode));
    419                 exit(errorCode);
    420             }
    421         }
    422 
    423         f=usrc_create(dataDir, "ubidi_props_data.c");
    424         if(f!=NULL) {
    425             usrc_writeArray(f,
    426                 "static const UVersionInfo ubidi_props_dataVersion={",
    427                 dataInfo.dataVersion, 8, 4,
    428                 "};\n\n");
    429             usrc_writeArray(f,
    430                 "static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={",
    431                 indexes, 32, UBIDI_IX_TOP,
    432                 "};\n\n");
    433             usrc_writeUTrie2Arrays(f,
    434                 "static const uint16_t ubidi_props_trieIndex[%ld]={\n", NULL,
    435                 trie2,
    436                 "\n};\n\n");
    437             usrc_writeArray(f,
    438                 "static const uint32_t ubidi_props_mirrors[%ld]={\n",
    439                 mirrors, 32, mirrorTop,
    440                 "\n};\n\n");
    441             usrc_writeArray(f,
    442                 "static const uint8_t ubidi_props_jgArray[%ld]={\n",
    443                 jgArray, 8, prev-jgStart,
    444                 "\n};\n\n");
    445             fputs(
    446                 "static const UBiDiProps ubidi_props_singleton={\n"
    447                 "  NULL,\n"
    448                 "  ubidi_props_indexes,\n"
    449                 "  ubidi_props_mirrors,\n"
    450                 "  ubidi_props_jgArray,\n",
    451                 f);
    452             usrc_writeUTrie2Struct(f,
    453                 "  {\n",
    454                 trie2, "ubidi_props_trieIndex", NULL,
    455                 "  },\n");
    456             usrc_writeArray(f, "  { ", dataInfo.formatVersion, 8, 4, " }\n");
    457             fputs("};\n", f);
    458             fclose(f);
    459         }
    460         utrie2_close(trie2);
    461     } else {
    462         /* write the data */
    463         pData=udata_create(dataDir, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &dataInfo,
    464                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
    465         if(U_FAILURE(errorCode)) {
    466             fprintf(stderr, "genbidi: unable to create data memory, %s\n", u_errorName(errorCode));
    467             exit(errorCode);
    468         }
    469 
    470         udata_writeBlock(pData, indexes, sizeof(indexes));
    471         udata_writeBlock(pData, trieBlock, trieSize);
    472         udata_writeBlock(pData, mirrors, 4*mirrorTop);
    473         udata_writeBlock(pData, jgArray, prev-jgStart);
    474 
    475         /* finish up */
    476         dataLength=udata_finish(pData, &errorCode);
    477         if(U_FAILURE(errorCode)) {
    478             fprintf(stderr, "genbidi: error %d writing the output file\n", errorCode);
    479             exit(errorCode);
    480         }
    481 
    482         if(dataLength!=indexes[UBIDI_IX_LENGTH]) {
    483             fprintf(stderr, "genbidi: data length %ld != calculated size %d\n",
    484                 dataLength, (int)indexes[UBIDI_IX_LENGTH]);
    485             exit(U_INTERNAL_PROGRAM_ERROR);
    486         }
    487     }
    488 
    489     utrie_close(pTrie);
    490     upvec_close(pv);
    491 }
    492 
    493 /*
    494  * Hey, Emacs, please set the following:
    495  *
    496  * Local Variables:
    497  * indent-tabs-mode: nil
    498  * End:
    499  *
    500  */
    501