Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  utrie.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2001oct20
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is a common implementation of a "folded" trie.
     17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     18 *   Unicode code points (0..0x10ffff).
     19 */
     20 
     21 #ifdef UTRIE_DEBUG
     22 #   include <stdio.h>
     23 #endif
     24 
     25 #include "unicode/utypes.h"
     26 #include "cmemory.h"
     27 #include "utrie.h"
     28 
     29 /* miscellaneous ------------------------------------------------------------ */
     30 
     31 #undef ABS
     32 #define ABS(x) ((x)>=0 ? (x) : -(x))
     33 
     34 static inline UBool
     35 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
     36     while(length>0 && *s==*t) {
     37         ++s;
     38         ++t;
     39         --length;
     40     }
     41     return (UBool)(length==0);
     42 }
     43 
     44 /* Building a trie ----------------------------------------------------------*/
     45 
     46 U_CAPI UNewTrie * U_EXPORT2
     47 utrie_open(UNewTrie *fillIn,
     48            uint32_t *aliasData, int32_t maxDataLength,
     49            uint32_t initialValue, uint32_t leadUnitValue,
     50            UBool latin1Linear) {
     51     UNewTrie *trie;
     52     int32_t i, j;
     53 
     54     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
     55         (latin1Linear && maxDataLength<1024)
     56     ) {
     57         return NULL;
     58     }
     59 
     60     if(fillIn!=NULL) {
     61         trie=fillIn;
     62     } else {
     63         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
     64         if(trie==NULL) {
     65             return NULL;
     66         }
     67     }
     68     uprv_memset(trie, 0, sizeof(UNewTrie));
     69     trie->isAllocated= (UBool)(fillIn==NULL);
     70 
     71     if(aliasData!=NULL) {
     72         trie->data=aliasData;
     73         trie->isDataAllocated=FALSE;
     74     } else {
     75         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
     76         if(trie->data==NULL) {
     77             uprv_free(trie);
     78             return NULL;
     79         }
     80         trie->isDataAllocated=TRUE;
     81     }
     82 
     83     /* preallocate and reset the first data block (block index 0) */
     84     j=UTRIE_DATA_BLOCK_LENGTH;
     85 
     86     if(latin1Linear) {
     87         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
     88         /* made sure above that maxDataLength>=1024 */
     89 
     90         /* set indexes to point to consecutive data blocks */
     91         i=0;
     92         do {
     93             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
     94             trie->index[i++]=j;
     95             j+=UTRIE_DATA_BLOCK_LENGTH;
     96         } while(i<(256>>UTRIE_SHIFT));
     97     }
     98 
     99     /* reset the initially allocated blocks to the initial value */
    100     trie->dataLength=j;
    101     while(j>0) {
    102         trie->data[--j]=initialValue;
    103     }
    104 
    105     trie->leadUnitValue=leadUnitValue;
    106     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
    107     trie->dataCapacity=maxDataLength;
    108     trie->isLatin1Linear=latin1Linear;
    109     trie->isCompacted=FALSE;
    110     return trie;
    111 }
    112 
    113 U_CAPI UNewTrie * U_EXPORT2
    114 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
    115     UNewTrie *trie;
    116     UBool isDataAllocated;
    117 
    118     /* do not clone if other is not valid or already compacted */
    119     if(other==NULL || other->data==NULL || other->isCompacted) {
    120         return NULL;
    121     }
    122 
    123     /* clone data */
    124     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
    125         isDataAllocated=FALSE;
    126     } else {
    127         aliasDataCapacity=other->dataCapacity;
    128         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
    129         if(aliasData==NULL) {
    130             return NULL;
    131         }
    132         isDataAllocated=TRUE;
    133     }
    134 
    135     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
    136                     other->data[0], other->leadUnitValue,
    137                     other->isLatin1Linear);
    138     if(trie==NULL) {
    139         uprv_free(aliasData);
    140     } else {
    141         uprv_memcpy(trie->index, other->index, sizeof(trie->index));
    142         uprv_memcpy(trie->data, other->data, other->dataLength*4);
    143         trie->dataLength=other->dataLength;
    144         trie->isDataAllocated=isDataAllocated;
    145     }
    146 
    147     return trie;
    148 }
    149 
    150 U_CAPI void U_EXPORT2
    151 utrie_close(UNewTrie *trie) {
    152     if(trie!=NULL) {
    153         if(trie->isDataAllocated) {
    154             uprv_free(trie->data);
    155             trie->data=NULL;
    156         }
    157         if(trie->isAllocated) {
    158             uprv_free(trie);
    159         }
    160     }
    161 }
    162 
    163 U_CAPI uint32_t * U_EXPORT2
    164 utrie_getData(UNewTrie *trie, int32_t *pLength) {
    165     if(trie==NULL || pLength==NULL) {
    166         return NULL;
    167     }
    168 
    169     *pLength=trie->dataLength;
    170     return trie->data;
    171 }
    172 
    173 static int32_t
    174 utrie_allocDataBlock(UNewTrie *trie) {
    175     int32_t newBlock, newTop;
    176 
    177     newBlock=trie->dataLength;
    178     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
    179     if(newTop>trie->dataCapacity) {
    180         /* out of memory in the data array */
    181         return -1;
    182     }
    183     trie->dataLength=newTop;
    184     return newBlock;
    185 }
    186 
    187 /**
    188  * No error checking for illegal arguments.
    189  *
    190  * @return -1 if no new data block available (out of memory in data array)
    191  * @internal
    192  */
    193 static int32_t
    194 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
    195     int32_t indexValue, newBlock;
    196 
    197     c>>=UTRIE_SHIFT;
    198     indexValue=trie->index[c];
    199     if(indexValue>0) {
    200         return indexValue;
    201     }
    202 
    203     /* allocate a new data block */
    204     newBlock=utrie_allocDataBlock(trie);
    205     if(newBlock<0) {
    206         /* out of memory in the data array */
    207         return -1;
    208     }
    209     trie->index[c]=newBlock;
    210 
    211     /* copy-on-write for a block from a setRange() */
    212     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
    213     return newBlock;
    214 }
    215 
    216 /**
    217  * @return TRUE if the value was successfully set
    218  */
    219 U_CAPI UBool U_EXPORT2
    220 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
    221     int32_t block;
    222 
    223     /* valid, uncompacted trie and valid c? */
    224     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
    225         return FALSE;
    226     }
    227 
    228     block=utrie_getDataBlock(trie, c);
    229     if(block<0) {
    230         return FALSE;
    231     }
    232 
    233     trie->data[block+(c&UTRIE_MASK)]=value;
    234     return TRUE;
    235 }
    236 
    237 U_CAPI uint32_t U_EXPORT2
    238 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
    239     int32_t block;
    240 
    241     /* valid, uncompacted trie and valid c? */
    242     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
    243         if(pInBlockZero!=NULL) {
    244             *pInBlockZero=TRUE;
    245         }
    246         return 0;
    247     }
    248 
    249     block=trie->index[c>>UTRIE_SHIFT];
    250     if(pInBlockZero!=NULL) {
    251         *pInBlockZero= (UBool)(block==0);
    252     }
    253 
    254     return trie->data[ABS(block)+(c&UTRIE_MASK)];
    255 }
    256 
    257 /**
    258  * @internal
    259  */
    260 static void
    261 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
    262                 uint32_t value, uint32_t initialValue, UBool overwrite) {
    263     uint32_t *pLimit;
    264 
    265     pLimit=block+limit;
    266     block+=start;
    267     if(overwrite) {
    268         while(block<pLimit) {
    269             *block++=value;
    270         }
    271     } else {
    272         while(block<pLimit) {
    273             if(*block==initialValue) {
    274                 *block=value;
    275             }
    276             ++block;
    277         }
    278     }
    279 }
    280 
    281 U_CAPI UBool U_EXPORT2
    282 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
    283     /*
    284      * repeat value in [start..limit[
    285      * mark index values for repeat-data blocks by setting bit 31 of the index values
    286      * fill around existing values if any, if(overwrite)
    287      */
    288     uint32_t initialValue;
    289     int32_t block, rest, repeatBlock;
    290 
    291     /* valid, uncompacted trie and valid indexes? */
    292     if( trie==NULL || trie->isCompacted ||
    293         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
    294     ) {
    295         return FALSE;
    296     }
    297     if(start==limit) {
    298         return TRUE; /* nothing to do */
    299     }
    300 
    301     initialValue=trie->data[0];
    302     if(start&UTRIE_MASK) {
    303         UChar32 nextStart;
    304 
    305         /* set partial block at [start..following block boundary[ */
    306         block=utrie_getDataBlock(trie, start);
    307         if(block<0) {
    308             return FALSE;
    309         }
    310 
    311         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
    312         if(nextStart<=limit) {
    313             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
    314                             value, initialValue, overwrite);
    315             start=nextStart;
    316         } else {
    317             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
    318                             value, initialValue, overwrite);
    319             return TRUE;
    320         }
    321     }
    322 
    323     /* number of positions in the last, partial block */
    324     rest=limit&UTRIE_MASK;
    325 
    326     /* round down limit to a block boundary */
    327     limit&=~UTRIE_MASK;
    328 
    329     /* iterate over all-value blocks */
    330     if(value==initialValue) {
    331         repeatBlock=0;
    332     } else {
    333         repeatBlock=-1;
    334     }
    335     while(start<limit) {
    336         /* get index value */
    337         block=trie->index[start>>UTRIE_SHIFT];
    338         if(block>0) {
    339             /* already allocated, fill in value */
    340             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
    341         } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
    342             /* set the repeatBlock instead of the current block 0 or range block */
    343             if(repeatBlock>=0) {
    344                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    345             } else {
    346                 /* create and set and fill the repeatBlock */
    347                 repeatBlock=utrie_getDataBlock(trie, start);
    348                 if(repeatBlock<0) {
    349                     return FALSE;
    350                 }
    351 
    352                 /* set the negative block number to indicate that it is a repeat block */
    353                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    354                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
    355             }
    356         }
    357 
    358         start+=UTRIE_DATA_BLOCK_LENGTH;
    359     }
    360 
    361     if(rest>0) {
    362         /* set partial block at [last block boundary..limit[ */
    363         block=utrie_getDataBlock(trie, start);
    364         if(block<0) {
    365             return FALSE;
    366         }
    367 
    368         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
    369     }
    370 
    371     return TRUE;
    372 }
    373 
    374 static int32_t
    375 _findSameIndexBlock(const int32_t *idx, int32_t indexLength,
    376                     int32_t otherBlock) {
    377     int32_t block, i;
    378 
    379     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
    380         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
    381             if(idx[block+i]!=idx[otherBlock+i]) {
    382                 break;
    383             }
    384         }
    385         if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
    386             return block;
    387         }
    388     }
    389     return indexLength;
    390 }
    391 
    392 /*
    393  * Fold the normalization data for supplementary code points into
    394  * a compact area on top of the BMP-part of the trie index,
    395  * with the lead surrogates indexing this compact area.
    396  *
    397  * Duplicate the index values for lead surrogates:
    398  * From inside the BMP area, where some may be overridden with folded values,
    399  * to just after the BMP area, where they can be retrieved for
    400  * code point lookups.
    401  */
    402 static void
    403 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
    404     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
    405     int32_t *idx;
    406     uint32_t value;
    407     UChar32 c;
    408     int32_t indexLength, block;
    409 #ifdef UTRIE_DEBUG
    410     int countLeadCUWithData=0;
    411 #endif
    412 
    413     idx=trie->index;
    414 
    415     /* copy the lead surrogate indexes into a temporary array */
    416     uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
    417 
    418     /*
    419      * set all values for lead surrogate code *units* to leadUnitValue
    420      * so that, by default, runtime lookups will find no data for associated
    421      * supplementary code points, unless there is data for such code points
    422      * which will result in a non-zero folding value below that is set for
    423      * the respective lead units
    424      *
    425      * the above saved the indexes for surrogate code *points*
    426      * fill the indexes with simplified code from utrie_setRange32()
    427      */
    428     if(trie->leadUnitValue==trie->data[0]) {
    429         block=0; /* leadUnitValue==initialValue, use all-initial-value block */
    430     } else {
    431         /* create and fill the repeatBlock */
    432         block=utrie_allocDataBlock(trie);
    433         if(block<0) {
    434             /* data table overflow */
    435             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    436             return;
    437         }
    438         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
    439         block=-block; /* negative block number to indicate that it is a repeat block */
    440     }
    441     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
    442         trie->index[c]=block;
    443     }
    444 
    445     /*
    446      * Fold significant index values into the area just after the BMP indexes.
    447      * In case the first lead surrogate has significant data,
    448      * its index block must be used first (in which case the folding is a no-op).
    449      * Later all folded index blocks are moved up one to insert the copied
    450      * lead surrogate indexes.
    451      */
    452     indexLength=UTRIE_BMP_INDEX_LENGTH;
    453 
    454     /* search for any index (stage 1) entries for supplementary code points */
    455     for(c=0x10000; c<0x110000;) {
    456         if(idx[c>>UTRIE_SHIFT]!=0) {
    457             /* there is data, treat the full block for a lead surrogate */
    458             c&=~0x3ff;
    459 
    460 #ifdef UTRIE_DEBUG
    461             ++countLeadCUWithData;
    462             /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
    463 #endif
    464 
    465             /* is there an identical index block? */
    466             block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
    467 
    468             /*
    469              * get a folded value for [c..c+0x400[ and,
    470              * if different from the value for the lead surrogate code point,
    471              * set it for the lead surrogate code unit
    472              */
    473             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
    474             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
    475                 if(!utrie_set32(trie, U16_LEAD(c), value)) {
    476                     /* data table overflow */
    477                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    478                     return;
    479                 }
    480 
    481                 /* if we did not find an identical index block... */
    482                 if(block==indexLength) {
    483                     /* move the actual index (stage 1) entries from the supplementary position to the new one */
    484                     uprv_memmove(idx+indexLength,
    485                                  idx+(c>>UTRIE_SHIFT),
    486                                  4*UTRIE_SURROGATE_BLOCK_COUNT);
    487                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    488                 }
    489             }
    490             c+=0x400;
    491         } else {
    492             c+=UTRIE_DATA_BLOCK_LENGTH;
    493         }
    494     }
    495 #ifdef UTRIE_DEBUG
    496     if(countLeadCUWithData>0) {
    497         printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
    498     }
    499 #endif
    500 
    501     /*
    502      * index array overflow?
    503      * This is to guarantee that a folding offset is of the form
    504      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
    505      * If the index is too large, then n>=1024 and more than 10 bits are necessary.
    506      *
    507      * In fact, it can only ever become n==1024 with completely unfoldable data and
    508      * the additional block of duplicated values for lead surrogates.
    509      */
    510     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
    511         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    512         return;
    513     }
    514 
    515     /*
    516      * make space for the lead surrogate index block and
    517      * insert it between the BMP indexes and the folded ones
    518      */
    519     uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
    520                  idx+UTRIE_BMP_INDEX_LENGTH,
    521                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
    522     uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
    523                 leadIndexes,
    524                 4*UTRIE_SURROGATE_BLOCK_COUNT);
    525     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    526 
    527 #ifdef UTRIE_DEBUG
    528     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
    529            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
    530 #endif
    531 
    532     trie->indexLength=indexLength;
    533 }
    534 
    535 /*
    536  * Set a value in the trie index map to indicate which data block
    537  * is referenced and which one is not.
    538  * utrie_compact() will remove data blocks that are not used at all.
    539  * Set
    540  * - 0 if it is used
    541  * - -1 if it is not used
    542  */
    543 static void
    544 _findUnusedBlocks(UNewTrie *trie) {
    545     int32_t i;
    546 
    547     /* fill the entire map with "not used" */
    548     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
    549 
    550     /* mark each block that _is_ used with 0 */
    551     for(i=0; i<trie->indexLength; ++i) {
    552         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
    553     }
    554 
    555     /* never move the all-initial-value block 0 */
    556     trie->map[0]=0;
    557 }
    558 
    559 static int32_t
    560 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
    561                    int32_t otherBlock, int32_t step) {
    562     int32_t block;
    563 
    564     /* ensure that we do not even partially get past dataLength */
    565     dataLength-=UTRIE_DATA_BLOCK_LENGTH;
    566 
    567     for(block=0; block<=dataLength; block+=step) {
    568         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
    569             return block;
    570         }
    571     }
    572     return -1;
    573 }
    574 
    575 /*
    576  * Compact a folded build-time trie.
    577  *
    578  * The compaction
    579  * - removes blocks that are identical with earlier ones
    580  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
    581  * - moves blocks in steps of the data granularity
    582  * - moves and overlaps blocks that overlap with multiple values in the overlap region
    583  *
    584  * It does not
    585  * - try to move and overlap blocks that are not already adjacent
    586  */
    587 static void
    588 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
    589     int32_t i, start, newStart, overlapStart;
    590 
    591     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    592         return;
    593     }
    594 
    595     /* valid, uncompacted trie? */
    596     if(trie==NULL) {
    597         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    598         return;
    599     }
    600     if(trie->isCompacted) {
    601         return; /* nothing left to do */
    602     }
    603 
    604     /* compaction */
    605 
    606     /* initialize the index map with "block is used/unused" flags */
    607     _findUnusedBlocks(trie);
    608 
    609     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
    610     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
    611         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
    612     } else {
    613         overlapStart=UTRIE_DATA_BLOCK_LENGTH;
    614     }
    615 
    616     newStart=UTRIE_DATA_BLOCK_LENGTH;
    617     for(start=newStart; start<trie->dataLength;) {
    618         /*
    619          * start: index of first entry of current block
    620          * newStart: index where the current block is to be moved
    621          *           (right after current end of already-compacted data)
    622          */
    623 
    624         /* skip blocks that are not used */
    625         if(trie->map[start>>UTRIE_SHIFT]<0) {
    626             /* advance start to the next block */
    627             start+=UTRIE_DATA_BLOCK_LENGTH;
    628 
    629             /* leave newStart with the previous block! */
    630             continue;
    631         }
    632 
    633         /* search for an identical block */
    634         if( start>=overlapStart &&
    635             (i=_findSameDataBlock(trie->data, newStart, start,
    636                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
    637              >=0
    638         ) {
    639             /* found an identical block, set the other block's index value for the current block */
    640             trie->map[start>>UTRIE_SHIFT]=i;
    641 
    642             /* advance start to the next block */
    643             start+=UTRIE_DATA_BLOCK_LENGTH;
    644 
    645             /* leave newStart with the previous block! */
    646             continue;
    647         }
    648 
    649         /* see if the beginning of this block can be overlapped with the end of the previous block */
    650         if(overlap && start>=overlapStart) {
    651             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
    652             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
    653                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
    654                 i-=UTRIE_DATA_GRANULARITY) {}
    655         } else {
    656             i=0;
    657         }
    658 
    659         if(i>0) {
    660             /* some overlap */
    661             trie->map[start>>UTRIE_SHIFT]=newStart-i;
    662 
    663             /* move the non-overlapping indexes to their new positions */
    664             start+=i;
    665             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
    666                 trie->data[newStart++]=trie->data[start++];
    667             }
    668         } else if(newStart<start) {
    669             /* no overlap, just move the indexes to their new positions */
    670             trie->map[start>>UTRIE_SHIFT]=newStart;
    671             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
    672                 trie->data[newStart++]=trie->data[start++];
    673             }
    674         } else /* no overlap && newStart==start */ {
    675             trie->map[start>>UTRIE_SHIFT]=start;
    676             newStart+=UTRIE_DATA_BLOCK_LENGTH;
    677             start=newStart;
    678         }
    679     }
    680 
    681     /* now adjust the index (stage 1) table */
    682     for(i=0; i<trie->indexLength; ++i) {
    683         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
    684     }
    685 
    686 #ifdef UTRIE_DEBUG
    687     /* we saved some space */
    688     printf("compacting trie: count of 32-bit words %lu->%lu\n",
    689             (long)trie->dataLength, (long)newStart);
    690 #endif
    691 
    692     trie->dataLength=newStart;
    693 }
    694 
    695 /* serialization ------------------------------------------------------------ */
    696 
    697 /*
    698  * Default function for the folding value:
    699  * Just store the offset (16 bits) if there is any non-initial-value entry.
    700  *
    701  * The offset parameter is never 0.
    702  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
    703  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
    704  * which fits into 16-bit trie values;
    705  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
    706  *
    707  * Theoretically, it would be safer for all possible UTRIE_SHIFT including
    708  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
    709  * which would always result in a value of 0x40..0x43f
    710  * (start/end 1k blocks of supplementary Unicode code points).
    711  * However, this would be uglier, and would not work for some existing
    712  * binary data file formats.
    713  *
    714  * Also, we do not plan to change UTRIE_SHIFT because it would change binary
    715  * data file formats, and we would probably not make it smaller because of
    716  * the then even larger BMP index length even for empty tries.
    717  */
    718 static uint32_t U_CALLCONV
    719 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    720     uint32_t value, initialValue;
    721     UChar32 limit;
    722     UBool inBlockZero;
    723 
    724     initialValue=trie->data[0];
    725     limit=start+0x400;
    726     while(start<limit) {
    727         value=utrie_get32(trie, start, &inBlockZero);
    728         if(inBlockZero) {
    729             start+=UTRIE_DATA_BLOCK_LENGTH;
    730         } else if(value!=initialValue) {
    731             return (uint32_t)offset;
    732         } else {
    733             ++start;
    734         }
    735     }
    736     return 0;
    737 }
    738 
    739 U_CAPI int32_t U_EXPORT2
    740 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
    741                 UNewTrieGetFoldedValue *getFoldedValue,
    742                 UBool reduceTo16Bits,
    743                 UErrorCode *pErrorCode) {
    744     UTrieHeader *header;
    745     uint32_t *p;
    746     uint16_t *dest16;
    747     int32_t i, length;
    748     uint8_t* data = NULL;
    749 
    750     /* argument check */
    751     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    752         return 0;
    753     }
    754 
    755     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
    756         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    757         return 0;
    758     }
    759     if(getFoldedValue==NULL) {
    760         getFoldedValue=defaultGetFoldedValue;
    761     }
    762 
    763     data = (uint8_t*)dt;
    764     /* fold and compact if necessary, also checks that indexLength is within limits */
    765     if(!trie->isCompacted) {
    766         /* compact once without overlap to improve folding */
    767         utrie_compact(trie, FALSE, pErrorCode);
    768 
    769         /* fold the supplementary part of the index array */
    770         utrie_fold(trie, getFoldedValue, pErrorCode);
    771 
    772         /* compact again with overlap for minimum data array length */
    773         utrie_compact(trie, TRUE, pErrorCode);
    774 
    775         trie->isCompacted=TRUE;
    776         if(U_FAILURE(*pErrorCode)) {
    777             return 0;
    778         }
    779     }
    780 
    781     /* is dataLength within limits? */
    782     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
    783         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    784     }
    785 
    786     length=sizeof(UTrieHeader)+2*trie->indexLength;
    787     if(reduceTo16Bits) {
    788         length+=2*trie->dataLength;
    789     } else {
    790         length+=4*trie->dataLength;
    791     }
    792 
    793     if(length>capacity) {
    794         return length; /* preflighting */
    795     }
    796 
    797 #ifdef UTRIE_DEBUG
    798     printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n",
    799            (long)trie->indexLength, (long)trie->dataLength, (long)length);
    800 #endif
    801 
    802     /* set the header fields */
    803     header=(UTrieHeader *)data;
    804     data+=sizeof(UTrieHeader);
    805 
    806     header->signature=0x54726965; /* "Trie" */
    807     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
    808 
    809     if(!reduceTo16Bits) {
    810         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
    811     }
    812     if(trie->isLatin1Linear) {
    813         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
    814     }
    815 
    816     header->indexLength=trie->indexLength;
    817     header->dataLength=trie->dataLength;
    818 
    819     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
    820     if(reduceTo16Bits) {
    821         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
    822         p=(uint32_t *)trie->index;
    823         dest16=(uint16_t *)data;
    824         for(i=trie->indexLength; i>0; --i) {
    825             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
    826         }
    827 
    828         /* write 16-bit data values */
    829         p=trie->data;
    830         for(i=trie->dataLength; i>0; --i) {
    831             *dest16++=(uint16_t)*p++;
    832         }
    833     } else {
    834         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
    835         p=(uint32_t *)trie->index;
    836         dest16=(uint16_t *)data;
    837         for(i=trie->indexLength; i>0; --i) {
    838             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
    839         }
    840 
    841         /* write 32-bit data values */
    842         uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
    843     }
    844 
    845     return length;
    846 }
    847 
    848 /* inverse to defaultGetFoldedValue() */
    849 U_CAPI int32_t U_EXPORT2
    850 utrie_defaultGetFoldingOffset(uint32_t data) {
    851     return (int32_t)data;
    852 }
    853 
    854 U_CAPI int32_t U_EXPORT2
    855 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
    856     const UTrieHeader *header;
    857     const uint16_t *p16;
    858     uint32_t options;
    859 
    860     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    861         return -1;
    862     }
    863 
    864     /* enough data for a trie header? */
    865     if(length<(int32_t)sizeof(UTrieHeader)) {
    866         *pErrorCode=U_INVALID_FORMAT_ERROR;
    867         return -1;
    868     }
    869 
    870     /* check the signature */
    871     header=(const UTrieHeader *)data;
    872     if(header->signature!=0x54726965) {
    873         *pErrorCode=U_INVALID_FORMAT_ERROR;
    874         return -1;
    875     }
    876 
    877     /* get the options and check the shift values */
    878     options=header->options;
    879     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
    880         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
    881     ) {
    882         *pErrorCode=U_INVALID_FORMAT_ERROR;
    883         return -1;
    884     }
    885     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
    886 
    887     /* get the length values */
    888     trie->indexLength=header->indexLength;
    889     trie->dataLength=header->dataLength;
    890 
    891     length-=(int32_t)sizeof(UTrieHeader);
    892 
    893     /* enough data for the index? */
    894     if(length<2*trie->indexLength) {
    895         *pErrorCode=U_INVALID_FORMAT_ERROR;
    896         return -1;
    897     }
    898     p16=(const uint16_t *)(header+1);
    899     trie->index=p16;
    900     p16+=trie->indexLength;
    901     length-=2*trie->indexLength;
    902 
    903     /* get the data */
    904     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
    905         if(length<4*trie->dataLength) {
    906             *pErrorCode=U_INVALID_FORMAT_ERROR;
    907             return -1;
    908         }
    909         trie->data32=(const uint32_t *)p16;
    910         trie->initialValue=trie->data32[0];
    911         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
    912     } else {
    913         if(length<2*trie->dataLength) {
    914             *pErrorCode=U_INVALID_FORMAT_ERROR;
    915             return -1;
    916         }
    917 
    918         /* the "data16" data is used via the index pointer */
    919         trie->data32=NULL;
    920         trie->initialValue=trie->index[trie->indexLength];
    921         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
    922     }
    923 
    924     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
    925 
    926     return length;
    927 }
    928 
    929 U_CAPI int32_t U_EXPORT2
    930 utrie_unserializeDummy(UTrie *trie,
    931                        void *data, int32_t length,
    932                        uint32_t initialValue, uint32_t leadUnitValue,
    933                        UBool make16BitTrie,
    934                        UErrorCode *pErrorCode) {
    935     uint16_t *p16;
    936     int32_t actualLength, latin1Length, i, limit;
    937     uint16_t block;
    938 
    939     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    940         return -1;
    941     }
    942 
    943     /* calculate the actual size of the dummy trie data */
    944 
    945     /* max(Latin-1, block 0) */
    946     latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/
    947 
    948     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
    949     trie->dataLength=latin1Length;
    950     if(leadUnitValue!=initialValue) {
    951         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
    952     }
    953 
    954     actualLength=trie->indexLength*2;
    955     if(make16BitTrie) {
    956         actualLength+=trie->dataLength*2;
    957     } else {
    958         actualLength+=trie->dataLength*4;
    959     }
    960 
    961     /* enough space for the dummy trie? */
    962     if(length<actualLength) {
    963         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    964         return actualLength;
    965     }
    966 
    967     trie->isLatin1Linear=TRUE;
    968     trie->initialValue=initialValue;
    969 
    970     /* fill the index and data arrays */
    971     p16=(uint16_t *)data;
    972     trie->index=p16;
    973 
    974     if(make16BitTrie) {
    975         /* indexes to block 0 */
    976         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
    977         limit=trie->indexLength;
    978         for(i=0; i<limit; ++i) {
    979             p16[i]=block;
    980         }
    981 
    982         if(leadUnitValue!=initialValue) {
    983             /* indexes for lead surrogate code units to the block after Latin-1 */
    984             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
    985             i=0xd800>>UTRIE_SHIFT;
    986             limit=0xdc00>>UTRIE_SHIFT;
    987             for(; i<limit; ++i) {
    988                 p16[i]=block;
    989             }
    990         }
    991 
    992         trie->data32=NULL;
    993 
    994         /* Latin-1 data */
    995         p16+=trie->indexLength;
    996         for(i=0; i<latin1Length; ++i) {
    997             p16[i]=(uint16_t)initialValue;
    998         }
    999 
   1000         /* data for lead surrogate code units */
   1001         if(leadUnitValue!=initialValue) {
   1002             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1003             for(/* i=latin1Length */; i<limit; ++i) {
   1004                 p16[i]=(uint16_t)leadUnitValue;
   1005             }
   1006         }
   1007     } else {
   1008         uint32_t *p32;
   1009 
   1010         /* indexes to block 0 */
   1011         uprv_memset(p16, 0, trie->indexLength*2);
   1012 
   1013         if(leadUnitValue!=initialValue) {
   1014             /* indexes for lead surrogate code units to the block after Latin-1 */
   1015             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
   1016             i=0xd800>>UTRIE_SHIFT;
   1017             limit=0xdc00>>UTRIE_SHIFT;
   1018             for(; i<limit; ++i) {
   1019                 p16[i]=block;
   1020             }
   1021         }
   1022 
   1023         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
   1024 
   1025         /* Latin-1 data */
   1026         for(i=0; i<latin1Length; ++i) {
   1027             p32[i]=initialValue;
   1028         }
   1029 
   1030         /* data for lead surrogate code units */
   1031         if(leadUnitValue!=initialValue) {
   1032             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1033             for(/* i=latin1Length */; i<limit; ++i) {
   1034                 p32[i]=leadUnitValue;
   1035             }
   1036         }
   1037     }
   1038 
   1039     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
   1040 
   1041     return actualLength;
   1042 }
   1043 
   1044 /* enumeration -------------------------------------------------------------- */
   1045 
   1046 /* default UTrieEnumValue() returns the input value itself */
   1047 static uint32_t U_CALLCONV
   1048 enumSameValue(const void * /*context*/, uint32_t value) {
   1049     return value;
   1050 }
   1051 
   1052 /**
   1053  * Enumerate all ranges of code points with the same relevant values.
   1054  * The values are transformed from the raw trie entries by the enumValue function.
   1055  */
   1056 U_CAPI void U_EXPORT2
   1057 utrie_enum(const UTrie *trie,
   1058            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
   1059     const uint32_t *data32;
   1060     const uint16_t *idx;
   1061 
   1062     uint32_t value, prevValue, initialValue;
   1063     UChar32 c, prev;
   1064     int32_t l, i, j, block, prevBlock, nullBlock, offset;
   1065 
   1066     /* check arguments */
   1067     if(trie==NULL || trie->index==NULL || enumRange==NULL) {
   1068         return;
   1069     }
   1070     if(enumValue==NULL) {
   1071         enumValue=enumSameValue;
   1072     }
   1073 
   1074     idx=trie->index;
   1075     data32=trie->data32;
   1076 
   1077     /* get the enumeration value that corresponds to an initial-value trie data entry */
   1078     initialValue=enumValue(context, trie->initialValue);
   1079 
   1080     if(data32==NULL) {
   1081         nullBlock=trie->indexLength;
   1082     } else {
   1083         nullBlock=0;
   1084     }
   1085 
   1086     /* set variables for previous range */
   1087     prevBlock=nullBlock;
   1088     prev=0;
   1089     prevValue=initialValue;
   1090 
   1091     /* enumerate BMP - the main loop enumerates data blocks */
   1092     for(i=0, c=0; c<=0xffff; ++i) {
   1093         if(c==0xd800) {
   1094             /* skip lead surrogate code _units_, go to lead surr. code _points_ */
   1095             i=UTRIE_BMP_INDEX_LENGTH;
   1096         } else if(c==0xdc00) {
   1097             /* go back to regular BMP code points */
   1098             i=c>>UTRIE_SHIFT;
   1099         }
   1100 
   1101         block=idx[i]<<UTRIE_INDEX_SHIFT;
   1102         if(block==prevBlock) {
   1103             /* the block is the same as the previous one, and filled with value */
   1104             c+=UTRIE_DATA_BLOCK_LENGTH;
   1105         } else if(block==nullBlock) {
   1106             /* this is the all-initial-value block */
   1107             if(prevValue!=initialValue) {
   1108                 if(prev<c) {
   1109                     if(!enumRange(context, prev, c, prevValue)) {
   1110                         return;
   1111                     }
   1112                 }
   1113                 prevBlock=nullBlock;
   1114                 prev=c;
   1115                 prevValue=initialValue;
   1116             }
   1117             c+=UTRIE_DATA_BLOCK_LENGTH;
   1118         } else {
   1119             prevBlock=block;
   1120             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1121                 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
   1122                 if(value!=prevValue) {
   1123                     if(prev<c) {
   1124                         if(!enumRange(context, prev, c, prevValue)) {
   1125                             return;
   1126                         }
   1127                     }
   1128                     if(j>0) {
   1129                         /* the block is not filled with all the same value */
   1130                         prevBlock=-1;
   1131                     }
   1132                     prev=c;
   1133                     prevValue=value;
   1134                 }
   1135                 ++c;
   1136             }
   1137         }
   1138     }
   1139 
   1140     /* enumerate supplementary code points */
   1141     for(l=0xd800; l<0xdc00;) {
   1142         /* lead surrogate access */
   1143         offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
   1144         if(offset==nullBlock) {
   1145             /* no entries for a whole block of lead surrogates */
   1146             if(prevValue!=initialValue) {
   1147                 if(prev<c) {
   1148                     if(!enumRange(context, prev, c, prevValue)) {
   1149                         return;
   1150                     }
   1151                 }
   1152                 prevBlock=nullBlock;
   1153                 prev=c;
   1154                 prevValue=initialValue;
   1155             }
   1156 
   1157             l+=UTRIE_DATA_BLOCK_LENGTH;
   1158             c+=UTRIE_DATA_BLOCK_LENGTH<<10;
   1159             continue;
   1160         }
   1161 
   1162         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
   1163 
   1164         /* enumerate trail surrogates for this lead surrogate */
   1165         offset=trie->getFoldingOffset(value);
   1166         if(offset<=0) {
   1167             /* no data for this lead surrogate */
   1168             if(prevValue!=initialValue) {
   1169                 if(prev<c) {
   1170                     if(!enumRange(context, prev, c, prevValue)) {
   1171                         return;
   1172                     }
   1173                 }
   1174                 prevBlock=nullBlock;
   1175                 prev=c;
   1176                 prevValue=initialValue;
   1177             }
   1178 
   1179             /* nothing else to do for the supplementary code points for this lead surrogate */
   1180             c+=0x400;
   1181         } else {
   1182             /* enumerate code points for this lead surrogate */
   1183             i=offset;
   1184             offset+=UTRIE_SURROGATE_BLOCK_COUNT;
   1185             do {
   1186                 /* copy of most of the body of the BMP loop */
   1187                 block=idx[i]<<UTRIE_INDEX_SHIFT;
   1188                 if(block==prevBlock) {
   1189                     /* the block is the same as the previous one, and filled with value */
   1190                     c+=UTRIE_DATA_BLOCK_LENGTH;
   1191                 } else if(block==nullBlock) {
   1192                     /* this is the all-initial-value block */
   1193                     if(prevValue!=initialValue) {
   1194                         if(prev<c) {
   1195                             if(!enumRange(context, prev, c, prevValue)) {
   1196                                 return;
   1197                             }
   1198                         }
   1199                         prevBlock=nullBlock;
   1200                         prev=c;
   1201                         prevValue=initialValue;
   1202                     }
   1203                     c+=UTRIE_DATA_BLOCK_LENGTH;
   1204                 } else {
   1205                     prevBlock=block;
   1206                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1207                         value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
   1208                         if(value!=prevValue) {
   1209                             if(prev<c) {
   1210                                 if(!enumRange(context, prev, c, prevValue)) {
   1211                                     return;
   1212                                 }
   1213                             }
   1214                             if(j>0) {
   1215                                 /* the block is not filled with all the same value */
   1216                                 prevBlock=-1;
   1217                             }
   1218                             prev=c;
   1219                             prevValue=value;
   1220                         }
   1221                         ++c;
   1222                     }
   1223                 }
   1224             } while(++i<offset);
   1225         }
   1226 
   1227         ++l;
   1228     }
   1229 
   1230     /* deliver last range */
   1231     enumRange(context, prev, c, prevValue);
   1232 }
   1233