Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2001oct20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is a common implementation of a "folded" trie.
     19 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     20 *   Unicode code points (0..0x10ffff).
     21 */
     22 
     23 #ifdef UTRIE_DEBUG
     24 #   include <stdio.h>
     25 #endif
     26 
     27 #include "unicode/utypes.h"
     28 #include "cmemory.h"
     29 #include "utrie.h"
     30 
     31 /* miscellaneous ------------------------------------------------------------ */
     32 
     33 #undef ABS
     34 #define ABS(x) ((x)>=0 ? (x) : -(x))
     35 
     36 static inline UBool
     37 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
     38     while(length>0 && *s==*t) {
     39         ++s;
     40         ++t;
     41         --length;
     42     }
     43     return (UBool)(length==0);
     44 }
     45 
     46 /* Building a trie ----------------------------------------------------------*/
     47 
     48 U_CAPI UNewTrie * U_EXPORT2
     49 utrie_open(UNewTrie *fillIn,
     50            uint32_t *aliasData, int32_t maxDataLength,
     51            uint32_t initialValue, uint32_t leadUnitValue,
     52            UBool latin1Linear) {
     53     UNewTrie *trie;
     54     int32_t i, j;
     55 
     56     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
     57         (latin1Linear && maxDataLength<1024)
     58     ) {
     59         return NULL;
     60     }
     61 
     62     if(fillIn!=NULL) {
     63         trie=fillIn;
     64     } else {
     65         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
     66         if(trie==NULL) {
     67             return NULL;
     68         }
     69     }
     70     uprv_memset(trie, 0, sizeof(UNewTrie));
     71     trie->isAllocated= (UBool)(fillIn==NULL);
     72 
     73     if(aliasData!=NULL) {
     74         trie->data=aliasData;
     75         trie->isDataAllocated=FALSE;
     76     } else {
     77         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
     78         if(trie->data==NULL) {
     79             uprv_free(trie);
     80             return NULL;
     81         }
     82         trie->isDataAllocated=TRUE;
     83     }
     84 
     85     /* preallocate and reset the first data block (block index 0) */
     86     j=UTRIE_DATA_BLOCK_LENGTH;
     87 
     88     if(latin1Linear) {
     89         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
     90         /* made sure above that maxDataLength>=1024 */
     91 
     92         /* set indexes to point to consecutive data blocks */
     93         i=0;
     94         do {
     95             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
     96             trie->index[i++]=j;
     97             j+=UTRIE_DATA_BLOCK_LENGTH;
     98         } while(i<(256>>UTRIE_SHIFT));
     99     }
    100 
    101     /* reset the initially allocated blocks to the initial value */
    102     trie->dataLength=j;
    103     while(j>0) {
    104         trie->data[--j]=initialValue;
    105     }
    106 
    107     trie->leadUnitValue=leadUnitValue;
    108     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
    109     trie->dataCapacity=maxDataLength;
    110     trie->isLatin1Linear=latin1Linear;
    111     trie->isCompacted=FALSE;
    112     return trie;
    113 }
    114 
    115 U_CAPI UNewTrie * U_EXPORT2
    116 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
    117     UNewTrie *trie;
    118     UBool isDataAllocated;
    119 
    120     /* do not clone if other is not valid or already compacted */
    121     if(other==NULL || other->data==NULL || other->isCompacted) {
    122         return NULL;
    123     }
    124 
    125     /* clone data */
    126     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
    127         isDataAllocated=FALSE;
    128     } else {
    129         aliasDataCapacity=other->dataCapacity;
    130         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
    131         if(aliasData==NULL) {
    132             return NULL;
    133         }
    134         isDataAllocated=TRUE;
    135     }
    136 
    137     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
    138                     other->data[0], other->leadUnitValue,
    139                     other->isLatin1Linear);
    140     if(trie==NULL) {
    141         uprv_free(aliasData);
    142     } else {
    143         uprv_memcpy(trie->index, other->index, sizeof(trie->index));
    144         uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
    145         trie->dataLength=other->dataLength;
    146         trie->isDataAllocated=isDataAllocated;
    147     }
    148 
    149     return trie;
    150 }
    151 
    152 U_CAPI void U_EXPORT2
    153 utrie_close(UNewTrie *trie) {
    154     if(trie!=NULL) {
    155         if(trie->isDataAllocated) {
    156             uprv_free(trie->data);
    157             trie->data=NULL;
    158         }
    159         if(trie->isAllocated) {
    160             uprv_free(trie);
    161         }
    162     }
    163 }
    164 
    165 U_CAPI uint32_t * U_EXPORT2
    166 utrie_getData(UNewTrie *trie, int32_t *pLength) {
    167     if(trie==NULL || pLength==NULL) {
    168         return NULL;
    169     }
    170 
    171     *pLength=trie->dataLength;
    172     return trie->data;
    173 }
    174 
    175 static int32_t
    176 utrie_allocDataBlock(UNewTrie *trie) {
    177     int32_t newBlock, newTop;
    178 
    179     newBlock=trie->dataLength;
    180     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
    181     if(newTop>trie->dataCapacity) {
    182         /* out of memory in the data array */
    183         return -1;
    184     }
    185     trie->dataLength=newTop;
    186     return newBlock;
    187 }
    188 
    189 /**
    190  * No error checking for illegal arguments.
    191  *
    192  * @return -1 if no new data block available (out of memory in data array)
    193  * @internal
    194  */
    195 static int32_t
    196 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
    197     int32_t indexValue, newBlock;
    198 
    199     c>>=UTRIE_SHIFT;
    200     indexValue=trie->index[c];
    201     if(indexValue>0) {
    202         return indexValue;
    203     }
    204 
    205     /* allocate a new data block */
    206     newBlock=utrie_allocDataBlock(trie);
    207     if(newBlock<0) {
    208         /* out of memory in the data array */
    209         return -1;
    210     }
    211     trie->index[c]=newBlock;
    212 
    213     /* copy-on-write for a block from a setRange() */
    214     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
    215     return newBlock;
    216 }
    217 
    218 /**
    219  * @return TRUE if the value was successfully set
    220  */
    221 U_CAPI UBool U_EXPORT2
    222 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
    223     int32_t block;
    224 
    225     /* valid, uncompacted trie and valid c? */
    226     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
    227         return FALSE;
    228     }
    229 
    230     block=utrie_getDataBlock(trie, c);
    231     if(block<0) {
    232         return FALSE;
    233     }
    234 
    235     trie->data[block+(c&UTRIE_MASK)]=value;
    236     return TRUE;
    237 }
    238 
    239 U_CAPI uint32_t U_EXPORT2
    240 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
    241     int32_t block;
    242 
    243     /* valid, uncompacted trie and valid c? */
    244     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
    245         if(pInBlockZero!=NULL) {
    246             *pInBlockZero=TRUE;
    247         }
    248         return 0;
    249     }
    250 
    251     block=trie->index[c>>UTRIE_SHIFT];
    252     if(pInBlockZero!=NULL) {
    253         *pInBlockZero= (UBool)(block==0);
    254     }
    255 
    256     return trie->data[ABS(block)+(c&UTRIE_MASK)];
    257 }
    258 
    259 /**
    260  * @internal
    261  */
    262 static void
    263 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
    264                 uint32_t value, uint32_t initialValue, UBool overwrite) {
    265     uint32_t *pLimit;
    266 
    267     pLimit=block+limit;
    268     block+=start;
    269     if(overwrite) {
    270         while(block<pLimit) {
    271             *block++=value;
    272         }
    273     } else {
    274         while(block<pLimit) {
    275             if(*block==initialValue) {
    276                 *block=value;
    277             }
    278             ++block;
    279         }
    280     }
    281 }
    282 
    283 U_CAPI UBool U_EXPORT2
    284 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
    285     /*
    286      * repeat value in [start..limit[
    287      * mark index values for repeat-data blocks by setting bit 31 of the index values
    288      * fill around existing values if any, if(overwrite)
    289      */
    290     uint32_t initialValue;
    291     int32_t block, rest, repeatBlock;
    292 
    293     /* valid, uncompacted trie and valid indexes? */
    294     if( trie==NULL || trie->isCompacted ||
    295         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
    296     ) {
    297         return FALSE;
    298     }
    299     if(start==limit) {
    300         return TRUE; /* nothing to do */
    301     }
    302 
    303     initialValue=trie->data[0];
    304     if(start&UTRIE_MASK) {
    305         UChar32 nextStart;
    306 
    307         /* set partial block at [start..following block boundary[ */
    308         block=utrie_getDataBlock(trie, start);
    309         if(block<0) {
    310             return FALSE;
    311         }
    312 
    313         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
    314         if(nextStart<=limit) {
    315             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
    316                             value, initialValue, overwrite);
    317             start=nextStart;
    318         } else {
    319             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
    320                             value, initialValue, overwrite);
    321             return TRUE;
    322         }
    323     }
    324 
    325     /* number of positions in the last, partial block */
    326     rest=limit&UTRIE_MASK;
    327 
    328     /* round down limit to a block boundary */
    329     limit&=~UTRIE_MASK;
    330 
    331     /* iterate over all-value blocks */
    332     if(value==initialValue) {
    333         repeatBlock=0;
    334     } else {
    335         repeatBlock=-1;
    336     }
    337     while(start<limit) {
    338         /* get index value */
    339         block=trie->index[start>>UTRIE_SHIFT];
    340         if(block>0) {
    341             /* already allocated, fill in value */
    342             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
    343         } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
    344             /* set the repeatBlock instead of the current block 0 or range block */
    345             if(repeatBlock>=0) {
    346                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    347             } else {
    348                 /* create and set and fill the repeatBlock */
    349                 repeatBlock=utrie_getDataBlock(trie, start);
    350                 if(repeatBlock<0) {
    351                     return FALSE;
    352                 }
    353 
    354                 /* set the negative block number to indicate that it is a repeat block */
    355                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    356                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
    357             }
    358         }
    359 
    360         start+=UTRIE_DATA_BLOCK_LENGTH;
    361     }
    362 
    363     if(rest>0) {
    364         /* set partial block at [last block boundary..limit[ */
    365         block=utrie_getDataBlock(trie, start);
    366         if(block<0) {
    367             return FALSE;
    368         }
    369 
    370         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
    371     }
    372 
    373     return TRUE;
    374 }
    375 
    376 static int32_t
    377 _findSameIndexBlock(const int32_t *idx, int32_t indexLength,
    378                     int32_t otherBlock) {
    379     int32_t block, i;
    380 
    381     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
    382         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
    383             if(idx[block+i]!=idx[otherBlock+i]) {
    384                 break;
    385             }
    386         }
    387         if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
    388             return block;
    389         }
    390     }
    391     return indexLength;
    392 }
    393 
    394 /*
    395  * Fold the normalization data for supplementary code points into
    396  * a compact area on top of the BMP-part of the trie index,
    397  * with the lead surrogates indexing this compact area.
    398  *
    399  * Duplicate the index values for lead surrogates:
    400  * From inside the BMP area, where some may be overridden with folded values,
    401  * to just after the BMP area, where they can be retrieved for
    402  * code point lookups.
    403  */
    404 static void
    405 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
    406     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
    407     int32_t *idx;
    408     uint32_t value;
    409     UChar32 c;
    410     int32_t indexLength, block;
    411 #ifdef UTRIE_DEBUG
    412     int countLeadCUWithData=0;
    413 #endif
    414 
    415     idx=trie->index;
    416 
    417     /* copy the lead surrogate indexes into a temporary array */
    418     uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
    419 
    420     /*
    421      * set all values for lead surrogate code *units* to leadUnitValue
    422      * so that, by default, runtime lookups will find no data for associated
    423      * supplementary code points, unless there is data for such code points
    424      * which will result in a non-zero folding value below that is set for
    425      * the respective lead units
    426      *
    427      * the above saved the indexes for surrogate code *points*
    428      * fill the indexes with simplified code from utrie_setRange32()
    429      */
    430     if(trie->leadUnitValue==trie->data[0]) {
    431         block=0; /* leadUnitValue==initialValue, use all-initial-value block */
    432     } else {
    433         /* create and fill the repeatBlock */
    434         block=utrie_allocDataBlock(trie);
    435         if(block<0) {
    436             /* data table overflow */
    437             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    438             return;
    439         }
    440         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
    441         block=-block; /* negative block number to indicate that it is a repeat block */
    442     }
    443     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
    444         trie->index[c]=block;
    445     }
    446 
    447     /*
    448      * Fold significant index values into the area just after the BMP indexes.
    449      * In case the first lead surrogate has significant data,
    450      * its index block must be used first (in which case the folding is a no-op).
    451      * Later all folded index blocks are moved up one to insert the copied
    452      * lead surrogate indexes.
    453      */
    454     indexLength=UTRIE_BMP_INDEX_LENGTH;
    455 
    456     /* search for any index (stage 1) entries for supplementary code points */
    457     for(c=0x10000; c<0x110000;) {
    458         if(idx[c>>UTRIE_SHIFT]!=0) {
    459             /* there is data, treat the full block for a lead surrogate */
    460             c&=~0x3ff;
    461 
    462 #ifdef UTRIE_DEBUG
    463             ++countLeadCUWithData;
    464             /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
    465 #endif
    466 
    467             /* is there an identical index block? */
    468             block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
    469 
    470             /*
    471              * get a folded value for [c..c+0x400[ and,
    472              * if different from the value for the lead surrogate code point,
    473              * set it for the lead surrogate code unit
    474              */
    475             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
    476             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
    477                 if(!utrie_set32(trie, U16_LEAD(c), value)) {
    478                     /* data table overflow */
    479                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    480                     return;
    481                 }
    482 
    483                 /* if we did not find an identical index block... */
    484                 if(block==indexLength) {
    485                     /* move the actual index (stage 1) entries from the supplementary position to the new one */
    486                     uprv_memmove(idx+indexLength,
    487                                  idx+(c>>UTRIE_SHIFT),
    488                                  4*UTRIE_SURROGATE_BLOCK_COUNT);
    489                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    490                 }
    491             }
    492             c+=0x400;
    493         } else {
    494             c+=UTRIE_DATA_BLOCK_LENGTH;
    495         }
    496     }
    497 #ifdef UTRIE_DEBUG
    498     if(countLeadCUWithData>0) {
    499         printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
    500     }
    501 #endif
    502 
    503     /*
    504      * index array overflow?
    505      * This is to guarantee that a folding offset is of the form
    506      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
    507      * If the index is too large, then n>=1024 and more than 10 bits are necessary.
    508      *
    509      * In fact, it can only ever become n==1024 with completely unfoldable data and
    510      * the additional block of duplicated values for lead surrogates.
    511      */
    512     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
    513         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    514         return;
    515     }
    516 
    517     /*
    518      * make space for the lead surrogate index block and
    519      * insert it between the BMP indexes and the folded ones
    520      */
    521     uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
    522                  idx+UTRIE_BMP_INDEX_LENGTH,
    523                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
    524     uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
    525                 leadIndexes,
    526                 4*UTRIE_SURROGATE_BLOCK_COUNT);
    527     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    528 
    529 #ifdef UTRIE_DEBUG
    530     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
    531            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
    532 #endif
    533 
    534     trie->indexLength=indexLength;
    535 }
    536 
    537 /*
    538  * Set a value in the trie index map to indicate which data block
    539  * is referenced and which one is not.
    540  * utrie_compact() will remove data blocks that are not used at all.
    541  * Set
    542  * - 0 if it is used
    543  * - -1 if it is not used
    544  */
    545 static void
    546 _findUnusedBlocks(UNewTrie *trie) {
    547     int32_t i;
    548 
    549     /* fill the entire map with "not used" */
    550     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
    551 
    552     /* mark each block that _is_ used with 0 */
    553     for(i=0; i<trie->indexLength; ++i) {
    554         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
    555     }
    556 
    557     /* never move the all-initial-value block 0 */
    558     trie->map[0]=0;
    559 }
    560 
    561 static int32_t
    562 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
    563                    int32_t otherBlock, int32_t step) {
    564     int32_t block;
    565 
    566     /* ensure that we do not even partially get past dataLength */
    567     dataLength-=UTRIE_DATA_BLOCK_LENGTH;
    568 
    569     for(block=0; block<=dataLength; block+=step) {
    570         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
    571             return block;
    572         }
    573     }
    574     return -1;
    575 }
    576 
    577 /*
    578  * Compact a folded build-time trie.
    579  *
    580  * The compaction
    581  * - removes blocks that are identical with earlier ones
    582  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
    583  * - moves blocks in steps of the data granularity
    584  * - moves and overlaps blocks that overlap with multiple values in the overlap region
    585  *
    586  * It does not
    587  * - try to move and overlap blocks that are not already adjacent
    588  */
    589 static void
    590 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
    591     int32_t i, start, newStart, overlapStart;
    592 
    593     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    594         return;
    595     }
    596 
    597     /* valid, uncompacted trie? */
    598     if(trie==NULL) {
    599         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    600         return;
    601     }
    602     if(trie->isCompacted) {
    603         return; /* nothing left to do */
    604     }
    605 
    606     /* compaction */
    607 
    608     /* initialize the index map with "block is used/unused" flags */
    609     _findUnusedBlocks(trie);
    610 
    611     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
    612     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
    613         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
    614     } else {
    615         overlapStart=UTRIE_DATA_BLOCK_LENGTH;
    616     }
    617 
    618     newStart=UTRIE_DATA_BLOCK_LENGTH;
    619     for(start=newStart; start<trie->dataLength;) {
    620         /*
    621          * start: index of first entry of current block
    622          * newStart: index where the current block is to be moved
    623          *           (right after current end of already-compacted data)
    624          */
    625 
    626         /* skip blocks that are not used */
    627         if(trie->map[start>>UTRIE_SHIFT]<0) {
    628             /* advance start to the next block */
    629             start+=UTRIE_DATA_BLOCK_LENGTH;
    630 
    631             /* leave newStart with the previous block! */
    632             continue;
    633         }
    634 
    635         /* search for an identical block */
    636         if( start>=overlapStart &&
    637             (i=_findSameDataBlock(trie->data, newStart, start,
    638                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
    639              >=0
    640         ) {
    641             /* found an identical block, set the other block's index value for the current block */
    642             trie->map[start>>UTRIE_SHIFT]=i;
    643 
    644             /* advance start to the next block */
    645             start+=UTRIE_DATA_BLOCK_LENGTH;
    646 
    647             /* leave newStart with the previous block! */
    648             continue;
    649         }
    650 
    651         /* see if the beginning of this block can be overlapped with the end of the previous block */
    652         if(overlap && start>=overlapStart) {
    653             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
    654             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
    655                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
    656                 i-=UTRIE_DATA_GRANULARITY) {}
    657         } else {
    658             i=0;
    659         }
    660 
    661         if(i>0) {
    662             /* some overlap */
    663             trie->map[start>>UTRIE_SHIFT]=newStart-i;
    664 
    665             /* move the non-overlapping indexes to their new positions */
    666             start+=i;
    667             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
    668                 trie->data[newStart++]=trie->data[start++];
    669             }
    670         } else if(newStart<start) {
    671             /* no overlap, just move the indexes to their new positions */
    672             trie->map[start>>UTRIE_SHIFT]=newStart;
    673             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
    674                 trie->data[newStart++]=trie->data[start++];
    675             }
    676         } else /* no overlap && newStart==start */ {
    677             trie->map[start>>UTRIE_SHIFT]=start;
    678             newStart+=UTRIE_DATA_BLOCK_LENGTH;
    679             start=newStart;
    680         }
    681     }
    682 
    683     /* now adjust the index (stage 1) table */
    684     for(i=0; i<trie->indexLength; ++i) {
    685         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
    686     }
    687 
    688 #ifdef UTRIE_DEBUG
    689     /* we saved some space */
    690     printf("compacting trie: count of 32-bit words %lu->%lu\n",
    691             (long)trie->dataLength, (long)newStart);
    692 #endif
    693 
    694     trie->dataLength=newStart;
    695 }
    696 
    697 /* serialization ------------------------------------------------------------ */
    698 
    699 /*
    700  * Default function for the folding value:
    701  * Just store the offset (16 bits) if there is any non-initial-value entry.
    702  *
    703  * The offset parameter is never 0.
    704  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
    705  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
    706  * which fits into 16-bit trie values;
    707  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
    708  *
    709  * Theoretically, it would be safer for all possible UTRIE_SHIFT including
    710  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
    711  * which would always result in a value of 0x40..0x43f
    712  * (start/end 1k blocks of supplementary Unicode code points).
    713  * However, this would be uglier, and would not work for some existing
    714  * binary data file formats.
    715  *
    716  * Also, we do not plan to change UTRIE_SHIFT because it would change binary
    717  * data file formats, and we would probably not make it smaller because of
    718  * the then even larger BMP index length even for empty tries.
    719  */
    720 static uint32_t U_CALLCONV
    721 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    722     uint32_t value, initialValue;
    723     UChar32 limit;
    724     UBool inBlockZero;
    725 
    726     initialValue=trie->data[0];
    727     limit=start+0x400;
    728     while(start<limit) {
    729         value=utrie_get32(trie, start, &inBlockZero);
    730         if(inBlockZero) {
    731             start+=UTRIE_DATA_BLOCK_LENGTH;
    732         } else if(value!=initialValue) {
    733             return (uint32_t)offset;
    734         } else {
    735             ++start;
    736         }
    737     }
    738     return 0;
    739 }
    740 
    741 U_CAPI int32_t U_EXPORT2
    742 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
    743                 UNewTrieGetFoldedValue *getFoldedValue,
    744                 UBool reduceTo16Bits,
    745                 UErrorCode *pErrorCode) {
    746     UTrieHeader *header;
    747     uint32_t *p;
    748     uint16_t *dest16;
    749     int32_t i, length;
    750     uint8_t* data = NULL;
    751 
    752     /* argument check */
    753     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    754         return 0;
    755     }
    756 
    757     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
    758         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    759         return 0;
    760     }
    761     if(getFoldedValue==NULL) {
    762         getFoldedValue=defaultGetFoldedValue;
    763     }
    764 
    765     data = (uint8_t*)dt;
    766     /* fold and compact if necessary, also checks that indexLength is within limits */
    767     if(!trie->isCompacted) {
    768         /* compact once without overlap to improve folding */
    769         utrie_compact(trie, FALSE, pErrorCode);
    770 
    771         /* fold the supplementary part of the index array */
    772         utrie_fold(trie, getFoldedValue, pErrorCode);
    773 
    774         /* compact again with overlap for minimum data array length */
    775         utrie_compact(trie, TRUE, pErrorCode);
    776 
    777         trie->isCompacted=TRUE;
    778         if(U_FAILURE(*pErrorCode)) {
    779             return 0;
    780         }
    781     }
    782 
    783     /* is dataLength within limits? */
    784     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
    785         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    786     }
    787 
    788     length=sizeof(UTrieHeader)+2*trie->indexLength;
    789     if(reduceTo16Bits) {
    790         length+=2*trie->dataLength;
    791     } else {
    792         length+=4*trie->dataLength;
    793     }
    794 
    795     if(length>capacity) {
    796         return length; /* preflighting */
    797     }
    798 
    799 #ifdef UTRIE_DEBUG
    800     printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n",
    801            (long)trie->indexLength, (long)trie->dataLength, (long)length);
    802 #endif
    803 
    804     /* set the header fields */
    805     header=(UTrieHeader *)data;
    806     data+=sizeof(UTrieHeader);
    807 
    808     header->signature=0x54726965; /* "Trie" */
    809     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
    810 
    811     if(!reduceTo16Bits) {
    812         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
    813     }
    814     if(trie->isLatin1Linear) {
    815         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
    816     }
    817 
    818     header->indexLength=trie->indexLength;
    819     header->dataLength=trie->dataLength;
    820 
    821     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
    822     if(reduceTo16Bits) {
    823         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
    824         p=(uint32_t *)trie->index;
    825         dest16=(uint16_t *)data;
    826         for(i=trie->indexLength; i>0; --i) {
    827             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
    828         }
    829 
    830         /* write 16-bit data values */
    831         p=trie->data;
    832         for(i=trie->dataLength; i>0; --i) {
    833             *dest16++=(uint16_t)*p++;
    834         }
    835     } else {
    836         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
    837         p=(uint32_t *)trie->index;
    838         dest16=(uint16_t *)data;
    839         for(i=trie->indexLength; i>0; --i) {
    840             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
    841         }
    842 
    843         /* write 32-bit data values */
    844         uprv_memcpy(dest16, trie->data, 4*(size_t)trie->dataLength);
    845     }
    846 
    847     return length;
    848 }
    849 
    850 /* inverse to defaultGetFoldedValue() */
    851 U_CAPI int32_t U_EXPORT2
    852 utrie_defaultGetFoldingOffset(uint32_t data) {
    853     return (int32_t)data;
    854 }
    855 
    856 U_CAPI int32_t U_EXPORT2
    857 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
    858     const UTrieHeader *header;
    859     const uint16_t *p16;
    860     uint32_t options;
    861 
    862     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    863         return -1;
    864     }
    865 
    866     /* enough data for a trie header? */
    867     if(length<(int32_t)sizeof(UTrieHeader)) {
    868         *pErrorCode=U_INVALID_FORMAT_ERROR;
    869         return -1;
    870     }
    871 
    872     /* check the signature */
    873     header=(const UTrieHeader *)data;
    874     if(header->signature!=0x54726965) {
    875         *pErrorCode=U_INVALID_FORMAT_ERROR;
    876         return -1;
    877     }
    878 
    879     /* get the options and check the shift values */
    880     options=header->options;
    881     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
    882         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
    883     ) {
    884         *pErrorCode=U_INVALID_FORMAT_ERROR;
    885         return -1;
    886     }
    887     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
    888 
    889     /* get the length values */
    890     trie->indexLength=header->indexLength;
    891     trie->dataLength=header->dataLength;
    892 
    893     length-=(int32_t)sizeof(UTrieHeader);
    894 
    895     /* enough data for the index? */
    896     if(length<2*trie->indexLength) {
    897         *pErrorCode=U_INVALID_FORMAT_ERROR;
    898         return -1;
    899     }
    900     p16=(const uint16_t *)(header+1);
    901     trie->index=p16;
    902     p16+=trie->indexLength;
    903     length-=2*trie->indexLength;
    904 
    905     /* get the data */
    906     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
    907         if(length<4*trie->dataLength) {
    908             *pErrorCode=U_INVALID_FORMAT_ERROR;
    909             return -1;
    910         }
    911         trie->data32=(const uint32_t *)p16;
    912         trie->initialValue=trie->data32[0];
    913         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
    914     } else {
    915         if(length<2*trie->dataLength) {
    916             *pErrorCode=U_INVALID_FORMAT_ERROR;
    917             return -1;
    918         }
    919 
    920         /* the "data16" data is used via the index pointer */
    921         trie->data32=NULL;
    922         trie->initialValue=trie->index[trie->indexLength];
    923         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
    924     }
    925 
    926     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
    927 
    928     return length;
    929 }
    930 
    931 U_CAPI int32_t U_EXPORT2
    932 utrie_unserializeDummy(UTrie *trie,
    933                        void *data, int32_t length,
    934                        uint32_t initialValue, uint32_t leadUnitValue,
    935                        UBool make16BitTrie,
    936                        UErrorCode *pErrorCode) {
    937     uint16_t *p16;
    938     int32_t actualLength, latin1Length, i, limit;
    939     uint16_t block;
    940 
    941     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    942         return -1;
    943     }
    944 
    945     /* calculate the actual size of the dummy trie data */
    946 
    947     /* max(Latin-1, block 0) */
    948     latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/
    949 
    950     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
    951     trie->dataLength=latin1Length;
    952     if(leadUnitValue!=initialValue) {
    953         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
    954     }
    955 
    956     actualLength=trie->indexLength*2;
    957     if(make16BitTrie) {
    958         actualLength+=trie->dataLength*2;
    959     } else {
    960         actualLength+=trie->dataLength*4;
    961     }
    962 
    963     /* enough space for the dummy trie? */
    964     if(length<actualLength) {
    965         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    966         return actualLength;
    967     }
    968 
    969     trie->isLatin1Linear=TRUE;
    970     trie->initialValue=initialValue;
    971 
    972     /* fill the index and data arrays */
    973     p16=(uint16_t *)data;
    974     trie->index=p16;
    975 
    976     if(make16BitTrie) {
    977         /* indexes to block 0 */
    978         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
    979         limit=trie->indexLength;
    980         for(i=0; i<limit; ++i) {
    981             p16[i]=block;
    982         }
    983 
    984         if(leadUnitValue!=initialValue) {
    985             /* indexes for lead surrogate code units to the block after Latin-1 */
    986             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
    987             i=0xd800>>UTRIE_SHIFT;
    988             limit=0xdc00>>UTRIE_SHIFT;
    989             for(; i<limit; ++i) {
    990                 p16[i]=block;
    991             }
    992         }
    993 
    994         trie->data32=NULL;
    995 
    996         /* Latin-1 data */
    997         p16+=trie->indexLength;
    998         for(i=0; i<latin1Length; ++i) {
    999             p16[i]=(uint16_t)initialValue;
   1000         }
   1001 
   1002         /* data for lead surrogate code units */
   1003         if(leadUnitValue!=initialValue) {
   1004             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1005             for(/* i=latin1Length */; i<limit; ++i) {
   1006                 p16[i]=(uint16_t)leadUnitValue;
   1007             }
   1008         }
   1009     } else {
   1010         uint32_t *p32;
   1011 
   1012         /* indexes to block 0 */
   1013         uprv_memset(p16, 0, trie->indexLength*2);
   1014 
   1015         if(leadUnitValue!=initialValue) {
   1016             /* indexes for lead surrogate code units to the block after Latin-1 */
   1017             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
   1018             i=0xd800>>UTRIE_SHIFT;
   1019             limit=0xdc00>>UTRIE_SHIFT;
   1020             for(; i<limit; ++i) {
   1021                 p16[i]=block;
   1022             }
   1023         }
   1024 
   1025         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
   1026 
   1027         /* Latin-1 data */
   1028         for(i=0; i<latin1Length; ++i) {
   1029             p32[i]=initialValue;
   1030         }
   1031 
   1032         /* data for lead surrogate code units */
   1033         if(leadUnitValue!=initialValue) {
   1034             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1035             for(/* i=latin1Length */; i<limit; ++i) {
   1036                 p32[i]=leadUnitValue;
   1037             }
   1038         }
   1039     }
   1040 
   1041     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
   1042 
   1043     return actualLength;
   1044 }
   1045 
   1046 /* enumeration -------------------------------------------------------------- */
   1047 
   1048 /* default UTrieEnumValue() returns the input value itself */
   1049 static uint32_t U_CALLCONV
   1050 enumSameValue(const void * /*context*/, uint32_t value) {
   1051     return value;
   1052 }
   1053 
   1054 /**
   1055  * Enumerate all ranges of code points with the same relevant values.
   1056  * The values are transformed from the raw trie entries by the enumValue function.
   1057  */
   1058 U_CAPI void U_EXPORT2
   1059 utrie_enum(const UTrie *trie,
   1060            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
   1061     const uint32_t *data32;
   1062     const uint16_t *idx;
   1063 
   1064     uint32_t value, prevValue, initialValue;
   1065     UChar32 c, prev;
   1066     int32_t l, i, j, block, prevBlock, nullBlock, offset;
   1067 
   1068     /* check arguments */
   1069     if(trie==NULL || trie->index==NULL || enumRange==NULL) {
   1070         return;
   1071     }
   1072     if(enumValue==NULL) {
   1073         enumValue=enumSameValue;
   1074     }
   1075 
   1076     idx=trie->index;
   1077     data32=trie->data32;
   1078 
   1079     /* get the enumeration value that corresponds to an initial-value trie data entry */
   1080     initialValue=enumValue(context, trie->initialValue);
   1081 
   1082     if(data32==NULL) {
   1083         nullBlock=trie->indexLength;
   1084     } else {
   1085         nullBlock=0;
   1086     }
   1087 
   1088     /* set variables for previous range */
   1089     prevBlock=nullBlock;
   1090     prev=0;
   1091     prevValue=initialValue;
   1092 
   1093     /* enumerate BMP - the main loop enumerates data blocks */
   1094     for(i=0, c=0; c<=0xffff; ++i) {
   1095         if(c==0xd800) {
   1096             /* skip lead surrogate code _units_, go to lead surr. code _points_ */
   1097             i=UTRIE_BMP_INDEX_LENGTH;
   1098         } else if(c==0xdc00) {
   1099             /* go back to regular BMP code points */
   1100             i=c>>UTRIE_SHIFT;
   1101         }
   1102 
   1103         block=idx[i]<<UTRIE_INDEX_SHIFT;
   1104         if(block==prevBlock) {
   1105             /* the block is the same as the previous one, and filled with value */
   1106             c+=UTRIE_DATA_BLOCK_LENGTH;
   1107         } else if(block==nullBlock) {
   1108             /* this is the all-initial-value block */
   1109             if(prevValue!=initialValue) {
   1110                 if(prev<c) {
   1111                     if(!enumRange(context, prev, c, prevValue)) {
   1112                         return;
   1113                     }
   1114                 }
   1115                 prevBlock=nullBlock;
   1116                 prev=c;
   1117                 prevValue=initialValue;
   1118             }
   1119             c+=UTRIE_DATA_BLOCK_LENGTH;
   1120         } else {
   1121             prevBlock=block;
   1122             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1123                 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
   1124                 if(value!=prevValue) {
   1125                     if(prev<c) {
   1126                         if(!enumRange(context, prev, c, prevValue)) {
   1127                             return;
   1128                         }
   1129                     }
   1130                     if(j>0) {
   1131                         /* the block is not filled with all the same value */
   1132                         prevBlock=-1;
   1133                     }
   1134                     prev=c;
   1135                     prevValue=value;
   1136                 }
   1137                 ++c;
   1138             }
   1139         }
   1140     }
   1141 
   1142     /* enumerate supplementary code points */
   1143     for(l=0xd800; l<0xdc00;) {
   1144         /* lead surrogate access */
   1145         offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
   1146         if(offset==nullBlock) {
   1147             /* no entries for a whole block of lead surrogates */
   1148             if(prevValue!=initialValue) {
   1149                 if(prev<c) {
   1150                     if(!enumRange(context, prev, c, prevValue)) {
   1151                         return;
   1152                     }
   1153                 }
   1154                 prevBlock=nullBlock;
   1155                 prev=c;
   1156                 prevValue=initialValue;
   1157             }
   1158 
   1159             l+=UTRIE_DATA_BLOCK_LENGTH;
   1160             c+=UTRIE_DATA_BLOCK_LENGTH<<10;
   1161             continue;
   1162         }
   1163 
   1164         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
   1165 
   1166         /* enumerate trail surrogates for this lead surrogate */
   1167         offset=trie->getFoldingOffset(value);
   1168         if(offset<=0) {
   1169             /* no data for this lead surrogate */
   1170             if(prevValue!=initialValue) {
   1171                 if(prev<c) {
   1172                     if(!enumRange(context, prev, c, prevValue)) {
   1173                         return;
   1174                     }
   1175                 }
   1176                 prevBlock=nullBlock;
   1177                 prev=c;
   1178                 prevValue=initialValue;
   1179             }
   1180 
   1181             /* nothing else to do for the supplementary code points for this lead surrogate */
   1182             c+=0x400;
   1183         } else {
   1184             /* enumerate code points for this lead surrogate */
   1185             i=offset;
   1186             offset+=UTRIE_SURROGATE_BLOCK_COUNT;
   1187             do {
   1188                 /* copy of most of the body of the BMP loop */
   1189                 block=idx[i]<<UTRIE_INDEX_SHIFT;
   1190                 if(block==prevBlock) {
   1191                     /* the block is the same as the previous one, and filled with value */
   1192                     c+=UTRIE_DATA_BLOCK_LENGTH;
   1193                 } else if(block==nullBlock) {
   1194                     /* this is the all-initial-value block */
   1195                     if(prevValue!=initialValue) {
   1196                         if(prev<c) {
   1197                             if(!enumRange(context, prev, c, prevValue)) {
   1198                                 return;
   1199                             }
   1200                         }
   1201                         prevBlock=nullBlock;
   1202                         prev=c;
   1203                         prevValue=initialValue;
   1204                     }
   1205                     c+=UTRIE_DATA_BLOCK_LENGTH;
   1206                 } else {
   1207                     prevBlock=block;
   1208                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1209                         value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
   1210                         if(value!=prevValue) {
   1211                             if(prev<c) {
   1212                                 if(!enumRange(context, prev, c, prevValue)) {
   1213                                     return;
   1214                                 }
   1215                             }
   1216                             if(j>0) {
   1217                                 /* the block is not filled with all the same value */
   1218                                 prevBlock=-1;
   1219                             }
   1220                             prev=c;
   1221                             prevValue=value;
   1222                         }
   1223                         ++c;
   1224                     }
   1225                 }
   1226             } while(++i<offset);
   1227         }
   1228 
   1229         ++l;
   1230     }
   1231 
   1232     /* deliver last range */
   1233     enumRange(context, prev, c, prevValue);
   1234 }
   1235