Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie2_builder.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2008sep26 (split off from utrie2.c)
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is a common implementation of a Unicode trie.
     19 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     20 *   Unicode code points (0..0x10ffff).
     21 *   This is the second common version of a Unicode trie (hence the name UTrie2).
     22 *   See utrie2.h for a comparison.
     23 *
     24 *   This file contains only the builder code.
     25 *   See utrie2.c for the runtime and enumeration code.
     26 */
     27 #ifdef UTRIE2_DEBUG
     28 #   include <stdio.h>
     29 #endif
     30 
     31 #include "unicode/utypes.h"
     32 #include "cmemory.h"
     33 #include "utrie2.h"
     34 #include "utrie2_impl.h"
     35 
     36 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
     37 
     38 /* Implementation notes ----------------------------------------------------- */
     39 
     40 /*
     41  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
     42  * have been chosen to minimize trie sizes overall.
     43  * Most of the code is flexible enough to work with a range of values,
     44  * within certain limits.
     45  *
     46  * Exception: Support for separate values for lead surrogate code _units_
     47  * vs. code _points_ was added after the constants were fixed,
     48  * and has not been tested nor particularly designed for different constant values.
     49  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
     50  * part and back.)
     51  *
     52  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
     53  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
     54  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
     55  * remapping) stops working.
     56  *
     57  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
     58  * assumes that a single index-2 block is used for 0x400 code points
     59  * corresponding to one lead surrogate.
     60  *
     61  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
     62  * more than one Unicode plane, and the split of the index-2 table into a BMP
     63  * part and a supplementary part, with a gap in between, would not work.
     64  *
     65  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
     66  * there is data with more than 64k distinct values,
     67  * for example for Unihan collation with a separate collation weight per
     68  * Han character.
     69  */
     70 
     71 /* Building a trie ----------------------------------------------------------*/
     72 
     73 enum {
     74     /** The null index-2 block, following the gap in the index-2 table. */
     75     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
     76 
     77     /** The start of allocated index-2 blocks. */
     78     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
     79 
     80     /**
     81      * The null data block.
     82      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
     83      * to work with 6-bit trail bytes from 2-byte UTF-8.
     84      */
     85     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
     86 
     87     /** The start of allocated data blocks. */
     88     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
     89 
     90     /**
     91      * The start of data blocks for U+0800 and above.
     92      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
     93      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
     94      * Data values for 0x780 code points beyond ASCII.
     95      */
     96     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
     97 };
     98 
     99 /* Start with allocation of 16k data entries. */
    100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
    101 
    102 /* Grow about 8x each time. */
    103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
    104 
    105 static int32_t
    106 allocIndex2Block(UNewTrie2 *trie);
    107 
    108 U_CAPI UTrie2 * U_EXPORT2
    109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
    110     UTrie2 *trie;
    111     UNewTrie2 *newTrie;
    112     uint32_t *data;
    113     int32_t i, j;
    114 
    115     if(U_FAILURE(*pErrorCode)) {
    116         return NULL;
    117     }
    118 
    119     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
    120     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
    121     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
    122     if(trie==NULL || newTrie==NULL || data==NULL) {
    123         uprv_free(trie);
    124         uprv_free(newTrie);
    125         uprv_free(data);
    126         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    127         return 0;
    128     }
    129 
    130     uprv_memset(trie, 0, sizeof(UTrie2));
    131     trie->initialValue=initialValue;
    132     trie->errorValue=errorValue;
    133     trie->highStart=0x110000;
    134     trie->newTrie=newTrie;
    135 
    136     newTrie->data=data;
    137     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
    138     newTrie->initialValue=initialValue;
    139     newTrie->errorValue=errorValue;
    140     newTrie->highStart=0x110000;
    141     newTrie->firstFreeBlock=0;  /* no free block in the list */
    142     newTrie->isCompacted=FALSE;
    143 
    144     /*
    145      * preallocate and reset
    146      * - ASCII
    147      * - the bad-UTF-8-data block
    148      * - the null data block
    149      */
    150     for(i=0; i<0x80; ++i) {
    151         newTrie->data[i]=initialValue;
    152     }
    153     for(; i<0xc0; ++i) {
    154         newTrie->data[i]=errorValue;
    155     }
    156     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
    157         newTrie->data[i]=initialValue;
    158     }
    159     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
    160     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
    161 
    162     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
    163     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    164         newTrie->index2[i]=j;
    165         newTrie->map[i]=1;
    166     }
    167     /* reference counts for the bad-UTF-8-data block */
    168     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    169         newTrie->map[i]=0;
    170     }
    171     /*
    172      * Reference counts for the null data block: all blocks except for the ASCII blocks.
    173      * Plus 1 so that we don't drop this block during compaction.
    174      * Plus as many as needed for lead surrogate code points.
    175      */
    176     /* i==newTrie->dataNullOffset */
    177     newTrie->map[i++]=
    178         (0x110000>>UTRIE2_SHIFT_2)-
    179         (0x80>>UTRIE2_SHIFT_2)+
    180         1+
    181         UTRIE2_LSCP_INDEX_2_LENGTH;
    182     j+=UTRIE2_DATA_BLOCK_LENGTH;
    183     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    184         newTrie->map[i]=0;
    185     }
    186 
    187     /*
    188      * set the remaining indexes in the BMP index-2 block
    189      * to the null data block
    190      */
    191     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
    192         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
    193     }
    194 
    195     /*
    196      * Fill the index gap with impossible values so that compaction
    197      * does not overlap other index-2 blocks with the gap.
    198      */
    199     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
    200         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
    201     }
    202 
    203     /* set the indexes in the null index-2 block */
    204     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
    205         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
    206     }
    207     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
    208     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
    209 
    210     /* set the index-1 indexes for the linear index-2 block */
    211     for(i=0, j=0;
    212         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
    213         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
    214     ) {
    215         newTrie->index1[i]=j;
    216     }
    217 
    218     /* set the remaining index-1 indexes to the null index-2 block */
    219     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
    220         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
    221     }
    222 
    223     /*
    224      * Preallocate and reset data for U+0080..U+07ff,
    225      * for 2-byte UTF-8 which will be compacted in 64-blocks
    226      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
    227      */
    228     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
    229         utrie2_set32(trie, i, initialValue, pErrorCode);
    230     }
    231 
    232     return trie;
    233 }
    234 
    235 static UNewTrie2 *
    236 cloneBuilder(const UNewTrie2 *other) {
    237     UNewTrie2 *trie;
    238 
    239     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
    240     if(trie==NULL) {
    241         return NULL;
    242     }
    243 
    244     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
    245     if(trie->data==NULL) {
    246         uprv_free(trie);
    247         return NULL;
    248     }
    249     trie->dataCapacity=other->dataCapacity;
    250 
    251     /* clone data */
    252     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
    253     uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
    254     trie->index2NullOffset=other->index2NullOffset;
    255     trie->index2Length=other->index2Length;
    256 
    257     uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
    258     trie->dataNullOffset=other->dataNullOffset;
    259     trie->dataLength=other->dataLength;
    260 
    261     /* reference counters */
    262     if(other->isCompacted) {
    263         trie->firstFreeBlock=0;
    264     } else {
    265         uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
    266         trie->firstFreeBlock=other->firstFreeBlock;
    267     }
    268 
    269     trie->initialValue=other->initialValue;
    270     trie->errorValue=other->errorValue;
    271     trie->highStart=other->highStart;
    272     trie->isCompacted=other->isCompacted;
    273 
    274     return trie;
    275 }
    276 
    277 U_CAPI UTrie2 * U_EXPORT2
    278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
    279     UTrie2 *trie;
    280 
    281     if(U_FAILURE(*pErrorCode)) {
    282         return NULL;
    283     }
    284     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
    285         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    286         return NULL;
    287     }
    288 
    289     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
    290     if(trie==NULL) {
    291         return NULL;
    292     }
    293     uprv_memcpy(trie, other, sizeof(UTrie2));
    294 
    295     if(other->memory!=NULL) {
    296         trie->memory=uprv_malloc(other->length);
    297         if(trie->memory!=NULL) {
    298             trie->isMemoryOwned=TRUE;
    299             uprv_memcpy(trie->memory, other->memory, other->length);
    300 
    301             /* make the clone's pointers point to its own memory */
    302             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
    303             if(other->data16!=NULL) {
    304                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
    305             }
    306             if(other->data32!=NULL) {
    307                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
    308             }
    309         }
    310     } else /* other->newTrie!=NULL */ {
    311         trie->newTrie=cloneBuilder(other->newTrie);
    312     }
    313 
    314     if(trie->memory==NULL && trie->newTrie==NULL) {
    315         uprv_free(trie);
    316         trie=NULL;
    317     }
    318     return trie;
    319 }
    320 
    321 typedef struct NewTrieAndStatus {
    322     UTrie2 *trie;
    323     UErrorCode errorCode;
    324     UBool exclusiveLimit;  /* rather than inclusive range end */
    325 } NewTrieAndStatus;
    326 
    327 static UBool U_CALLCONV
    328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    329     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
    330     if(value!=nt->trie->initialValue) {
    331         if(nt->exclusiveLimit) {
    332             --end;
    333         }
    334         if(start==end) {
    335             utrie2_set32(nt->trie, start, value, &nt->errorCode);
    336         } else {
    337             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
    338         }
    339         return U_SUCCESS(nt->errorCode);
    340     } else {
    341         return TRUE;
    342     }
    343 }
    344 
    345 #ifdef UTRIE2_DEBUG
    346 static void
    347 utrie_printLengths(const UTrie *trie) {
    348     long indexLength=trie->indexLength;
    349     long dataLength=(long)trie->dataLength;
    350     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
    351     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
    352            indexLength, dataLength, totalLength);
    353 }
    354 
    355 static void
    356 utrie2_printLengths(const UTrie2 *trie, const char *which) {
    357     long indexLength=trie->indexLength;
    358     long dataLength=(long)trie->dataLength;
    359     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
    360     printf("**UTrie2Lengths(%s)** index:%6ld  data:%6ld  serialized:%6ld\n",
    361            which, indexLength, dataLength, totalLength);
    362 }
    363 #endif
    364 
    365 U_CAPI UTrie2 * U_EXPORT2
    366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
    367     NewTrieAndStatus context;
    368     UChar lead;
    369 
    370     if(U_FAILURE(*pErrorCode)) {
    371         return NULL;
    372     }
    373     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
    374         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    375         return NULL;
    376     }
    377     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
    378         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
    379     }
    380 
    381     /* Clone the frozen trie by enumerating it and building a new one. */
    382     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
    383     if(U_FAILURE(*pErrorCode)) {
    384         return NULL;
    385     }
    386     context.exclusiveLimit=FALSE;
    387     context.errorCode=*pErrorCode;
    388     utrie2_enum(other, NULL, copyEnumRange, &context);
    389     *pErrorCode=context.errorCode;
    390     for(lead=0xd800; lead<0xdc00; ++lead) {
    391         uint32_t value;
    392         if(other->data32==NULL) {
    393             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
    394         } else {
    395             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
    396         }
    397         if(value!=other->initialValue) {
    398             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
    399         }
    400     }
    401     if(U_FAILURE(*pErrorCode)) {
    402         utrie2_close(context.trie);
    403         context.trie=NULL;
    404     }
    405     return context.trie;
    406 }
    407 
    408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
    409 U_CAPI UTrie2 * U_EXPORT2
    410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
    411     NewTrieAndStatus context;
    412     UChar lead;
    413 
    414     if(U_FAILURE(*pErrorCode)) {
    415         return NULL;
    416     }
    417     if(trie1==NULL) {
    418         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    419         return NULL;
    420     }
    421     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
    422     if(U_FAILURE(*pErrorCode)) {
    423         return NULL;
    424     }
    425     context.exclusiveLimit=TRUE;
    426     context.errorCode=*pErrorCode;
    427     utrie_enum(trie1, NULL, copyEnumRange, &context);
    428     *pErrorCode=context.errorCode;
    429     for(lead=0xd800; lead<0xdc00; ++lead) {
    430         uint32_t value;
    431         if(trie1->data32==NULL) {
    432             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
    433         } else {
    434             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
    435         }
    436         if(value!=trie1->initialValue) {
    437             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
    438         }
    439     }
    440     if(U_SUCCESS(*pErrorCode)) {
    441         utrie2_freeze(context.trie,
    442                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
    443                       pErrorCode);
    444     }
    445 #ifdef UTRIE2_DEBUG
    446     if(U_SUCCESS(*pErrorCode)) {
    447         utrie_printLengths(trie1);
    448         utrie2_printLengths(context.trie, "fromUTrie");
    449     }
    450 #endif
    451     if(U_FAILURE(*pErrorCode)) {
    452         utrie2_close(context.trie);
    453         context.trie=NULL;
    454     }
    455     return context.trie;
    456 }
    457 
    458 static inline UBool
    459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    460     int32_t i2, block;
    461 
    462     if(U_IS_LEAD(c) && forLSCP) {
    463         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
    464             (c>>UTRIE2_SHIFT_2);
    465     } else {
    466         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
    467             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
    468     }
    469     block=trie->index2[i2];
    470     return (UBool)(block==trie->dataNullOffset);
    471 }
    472 
    473 static int32_t
    474 allocIndex2Block(UNewTrie2 *trie) {
    475     int32_t newBlock, newTop;
    476 
    477     newBlock=trie->index2Length;
    478     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
    479     if(newTop>UPRV_LENGTHOF(trie->index2)) {
    480         /*
    481          * Should never occur.
    482          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
    483          * or the code writes more values than should be possible.
    484          */
    485         return -1;
    486     }
    487     trie->index2Length=newTop;
    488     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
    489     return newBlock;
    490 }
    491 
    492 static int32_t
    493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    494     int32_t i1, i2;
    495 
    496     if(U_IS_LEAD(c) && forLSCP) {
    497         return UTRIE2_LSCP_INDEX_2_OFFSET;
    498     }
    499 
    500     i1=c>>UTRIE2_SHIFT_1;
    501     i2=trie->index1[i1];
    502     if(i2==trie->index2NullOffset) {
    503         i2=allocIndex2Block(trie);
    504         if(i2<0) {
    505             return -1;  /* program error */
    506         }
    507         trie->index1[i1]=i2;
    508     }
    509     return i2;
    510 }
    511 
    512 static int32_t
    513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
    514     int32_t newBlock, newTop;
    515 
    516     if(trie->firstFreeBlock!=0) {
    517         /* get the first free block */
    518         newBlock=trie->firstFreeBlock;
    519         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
    520     } else {
    521         /* get a new block from the high end */
    522         newBlock=trie->dataLength;
    523         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
    524         if(newTop>trie->dataCapacity) {
    525             /* out of memory in the data array */
    526             int32_t capacity;
    527             uint32_t *data;
    528 
    529             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
    530                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
    531             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
    532                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
    533             } else {
    534                 /*
    535                  * Should never occur.
    536                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
    537                  * or the code writes more values than should be possible.
    538                  */
    539                 return -1;
    540             }
    541             data=(uint32_t *)uprv_malloc(capacity*4);
    542             if(data==NULL) {
    543                 return -1;
    544             }
    545             uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
    546             uprv_free(trie->data);
    547             trie->data=data;
    548             trie->dataCapacity=capacity;
    549         }
    550         trie->dataLength=newTop;
    551     }
    552     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
    553     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
    554     return newBlock;
    555 }
    556 
    557 /* call when the block's reference counter reaches 0 */
    558 static void
    559 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
    560     /* put this block at the front of the free-block chain */
    561     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
    562     trie->firstFreeBlock=block;
    563 }
    564 
    565 static inline UBool
    566 isWritableBlock(UNewTrie2 *trie, int32_t block) {
    567     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
    568 }
    569 
    570 static inline void
    571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
    572     int32_t oldBlock;
    573     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
    574     oldBlock=trie->index2[i2];
    575     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
    576         releaseDataBlock(trie, oldBlock);
    577     }
    578     trie->index2[i2]=block;
    579 }
    580 
    581 /**
    582  * No error checking for illegal arguments.
    583  *
    584  * @return -1 if no new data block available (out of memory in data array)
    585  * @internal
    586  */
    587 static int32_t
    588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    589     int32_t i2, oldBlock, newBlock;
    590 
    591     i2=getIndex2Block(trie, c, forLSCP);
    592     if(i2<0) {
    593         return -1;  /* program error */
    594     }
    595 
    596     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
    597     oldBlock=trie->index2[i2];
    598     if(isWritableBlock(trie, oldBlock)) {
    599         return oldBlock;
    600     }
    601 
    602     /* allocate a new data block */
    603     newBlock=allocDataBlock(trie, oldBlock);
    604     if(newBlock<0) {
    605         /* out of memory in the data array */
    606         return -1;
    607     }
    608     setIndex2Entry(trie, i2, newBlock);
    609     return newBlock;
    610 }
    611 
    612 /**
    613  * @return TRUE if the value was successfully set
    614  */
    615 static void
    616 set32(UNewTrie2 *trie,
    617       UChar32 c, UBool forLSCP, uint32_t value,
    618       UErrorCode *pErrorCode) {
    619     int32_t block;
    620 
    621     if(trie==NULL || trie->isCompacted) {
    622         *pErrorCode=U_NO_WRITE_PERMISSION;
    623         return;
    624     }
    625 
    626     block=getDataBlock(trie, c, forLSCP);
    627     if(block<0) {
    628         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    629         return;
    630     }
    631 
    632     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
    633 }
    634 
    635 U_CAPI void U_EXPORT2
    636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
    637     if(U_FAILURE(*pErrorCode)) {
    638         return;
    639     }
    640     if((uint32_t)c>0x10ffff) {
    641         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    642         return;
    643     }
    644     set32(trie->newTrie, c, TRUE, value, pErrorCode);
    645 }
    646 
    647 U_CAPI void U_EXPORT2
    648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
    649                                      UChar32 c, uint32_t value,
    650                                      UErrorCode *pErrorCode) {
    651     if(U_FAILURE(*pErrorCode)) {
    652         return;
    653     }
    654     if(!U_IS_LEAD(c)) {
    655         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    656         return;
    657     }
    658     set32(trie->newTrie, c, FALSE, value, pErrorCode);
    659 }
    660 
    661 static void
    662 writeBlock(uint32_t *block, uint32_t value) {
    663     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
    664     while(block<limit) {
    665         *block++=value;
    666     }
    667 }
    668 
    669 /**
    670  * initialValue is ignored if overwrite=TRUE
    671  * @internal
    672  */
    673 static void
    674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
    675           uint32_t value, uint32_t initialValue, UBool overwrite) {
    676     uint32_t *pLimit;
    677 
    678     pLimit=block+limit;
    679     block+=start;
    680     if(overwrite) {
    681         while(block<pLimit) {
    682             *block++=value;
    683         }
    684     } else {
    685         while(block<pLimit) {
    686             if(*block==initialValue) {
    687                 *block=value;
    688             }
    689             ++block;
    690         }
    691     }
    692 }
    693 
    694 U_CAPI void U_EXPORT2
    695 utrie2_setRange32(UTrie2 *trie,
    696                   UChar32 start, UChar32 end,
    697                   uint32_t value, UBool overwrite,
    698                   UErrorCode *pErrorCode) {
    699     /*
    700      * repeat value in [start..end]
    701      * mark index values for repeat-data blocks by setting bit 31 of the index values
    702      * fill around existing values if any, if(overwrite)
    703      */
    704     UNewTrie2 *newTrie;
    705     int32_t block, rest, repeatBlock;
    706     UChar32 limit;
    707 
    708     if(U_FAILURE(*pErrorCode)) {
    709         return;
    710     }
    711     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
    712         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    713         return;
    714     }
    715     newTrie=trie->newTrie;
    716     if(newTrie==NULL || newTrie->isCompacted) {
    717         *pErrorCode=U_NO_WRITE_PERMISSION;
    718         return;
    719     }
    720     if(!overwrite && value==newTrie->initialValue) {
    721         return; /* nothing to do */
    722     }
    723 
    724     limit=end+1;
    725     if(start&UTRIE2_DATA_MASK) {
    726         UChar32 nextStart;
    727 
    728         /* set partial block at [start..following block boundary[ */
    729         block=getDataBlock(newTrie, start, TRUE);
    730         if(block<0) {
    731             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    732             return;
    733         }
    734 
    735         nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
    736         if(nextStart<=limit) {
    737             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
    738                       value, newTrie->initialValue, overwrite);
    739             start=nextStart;
    740         } else {
    741             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
    742                       value, newTrie->initialValue, overwrite);
    743             return;
    744         }
    745     }
    746 
    747     /* number of positions in the last, partial block */
    748     rest=limit&UTRIE2_DATA_MASK;
    749 
    750     /* round down limit to a block boundary */
    751     limit&=~UTRIE2_DATA_MASK;
    752 
    753     /* iterate over all-value blocks */
    754     if(value==newTrie->initialValue) {
    755         repeatBlock=newTrie->dataNullOffset;
    756     } else {
    757         repeatBlock=-1;
    758     }
    759 
    760     while(start<limit) {
    761         int32_t i2;
    762         UBool setRepeatBlock=FALSE;
    763 
    764         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
    765             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
    766             continue;
    767         }
    768 
    769         /* get index value */
    770         i2=getIndex2Block(newTrie, start, TRUE);
    771         if(i2<0) {
    772             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    773             return;
    774         }
    775         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
    776         block=newTrie->index2[i2];
    777         if(isWritableBlock(newTrie, block)) {
    778             /* already allocated */
    779             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
    780                 /*
    781                  * We overwrite all values, and it's not a
    782                  * protected (ASCII-linear or 2-byte UTF-8) block:
    783                  * replace with the repeatBlock.
    784                  */
    785                 setRepeatBlock=TRUE;
    786             } else {
    787                 /* !overwrite, or protected block: just write the values into this block */
    788                 fillBlock(newTrie->data+block,
    789                           0, UTRIE2_DATA_BLOCK_LENGTH,
    790                           value, newTrie->initialValue, overwrite);
    791             }
    792         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
    793             /*
    794              * Set the repeatBlock instead of the null block or previous repeat block:
    795              *
    796              * If !isWritableBlock() then all entries in the block have the same value
    797              * because it's the null block or a range block (the repeatBlock from a previous
    798              * call to utrie2_setRange32()).
    799              * No other blocks are used multiple times before compacting.
    800              *
    801              * The null block is the only non-writable block with the initialValue because
    802              * of the repeatBlock initialization above. (If value==initialValue, then
    803              * the repeatBlock will be the null data block.)
    804              *
    805              * We set our repeatBlock if the desired value differs from the block's value,
    806              * and if we overwrite any data or if the data is all initial values
    807              * (which is the same as the block being the null block, see above).
    808              */
    809             setRepeatBlock=TRUE;
    810         }
    811         if(setRepeatBlock) {
    812             if(repeatBlock>=0) {
    813                 setIndex2Entry(newTrie, i2, repeatBlock);
    814             } else {
    815                 /* create and set and fill the repeatBlock */
    816                 repeatBlock=getDataBlock(newTrie, start, TRUE);
    817                 if(repeatBlock<0) {
    818                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    819                     return;
    820                 }
    821                 writeBlock(newTrie->data+repeatBlock, value);
    822             }
    823         }
    824 
    825         start+=UTRIE2_DATA_BLOCK_LENGTH;
    826     }
    827 
    828     if(rest>0) {
    829         /* set partial block at [last block boundary..limit[ */
    830         block=getDataBlock(newTrie, start, TRUE);
    831         if(block<0) {
    832             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    833             return;
    834         }
    835 
    836         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
    837     }
    838 
    839     return;
    840 }
    841 
    842 /* compaction --------------------------------------------------------------- */
    843 
    844 static inline UBool
    845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
    846     while(length>0 && *s==*t) {
    847         ++s;
    848         ++t;
    849         --length;
    850     }
    851     return (UBool)(length==0);
    852 }
    853 
    854 static inline UBool
    855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
    856     while(length>0 && *s==*t) {
    857         ++s;
    858         ++t;
    859         --length;
    860     }
    861     return (UBool)(length==0);
    862 }
    863 
    864 static int32_t
    865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
    866     int32_t block;
    867 
    868     /* ensure that we do not even partially get past index2Length */
    869     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
    870 
    871     for(block=0; block<=index2Length; ++block) {
    872         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
    873             return block;
    874         }
    875     }
    876     return -1;
    877 }
    878 
    879 static int32_t
    880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
    881     int32_t block;
    882 
    883     /* ensure that we do not even partially get past dataLength */
    884     dataLength-=blockLength;
    885 
    886     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
    887         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
    888             return block;
    889         }
    890     }
    891     return -1;
    892 }
    893 
    894 /*
    895  * Find the start of the last range in the trie by enumerating backward.
    896  * Indexes for supplementary code points higher than this will be omitted.
    897  */
    898 static UChar32
    899 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
    900     const uint32_t *data32;
    901 
    902     uint32_t value, initialValue;
    903     UChar32 c, prev;
    904     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
    905 
    906     data32=trie->data;
    907     initialValue=trie->initialValue;
    908 
    909     index2NullOffset=trie->index2NullOffset;
    910     nullBlock=trie->dataNullOffset;
    911 
    912     /* set variables for previous range */
    913     if(highValue==initialValue) {
    914         prevI2Block=index2NullOffset;
    915         prevBlock=nullBlock;
    916     } else {
    917         prevI2Block=-1;
    918         prevBlock=-1;
    919     }
    920     prev=0x110000;
    921 
    922     /* enumerate index-2 blocks */
    923     i1=UNEWTRIE2_INDEX_1_LENGTH;
    924     c=prev;
    925     while(c>0) {
    926         i2Block=trie->index1[--i1];
    927         if(i2Block==prevI2Block) {
    928             /* the index-2 block is the same as the previous one, and filled with highValue */
    929             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
    930             continue;
    931         }
    932         prevI2Block=i2Block;
    933         if(i2Block==index2NullOffset) {
    934             /* this is the null index-2 block */
    935             if(highValue!=initialValue) {
    936                 return c;
    937             }
    938             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
    939         } else {
    940             /* enumerate data blocks for one index-2 block */
    941             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
    942                 block=trie->index2[i2Block+ --i2];
    943                 if(block==prevBlock) {
    944                     /* the block is the same as the previous one, and filled with highValue */
    945                     c-=UTRIE2_DATA_BLOCK_LENGTH;
    946                     continue;
    947                 }
    948                 prevBlock=block;
    949                 if(block==nullBlock) {
    950                     /* this is the null data block */
    951                     if(highValue!=initialValue) {
    952                         return c;
    953                     }
    954                     c-=UTRIE2_DATA_BLOCK_LENGTH;
    955                 } else {
    956                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
    957                         value=data32[block+ --j];
    958                         if(value!=highValue) {
    959                             return c;
    960                         }
    961                         --c;
    962                     }
    963                 }
    964             }
    965         }
    966     }
    967 
    968     /* deliver last range */
    969     return 0;
    970 }
    971 
    972 /*
    973  * Compact a build-time trie.
    974  *
    975  * The compaction
    976  * - removes blocks that are identical with earlier ones
    977  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
    978  * - moves blocks in steps of the data granularity
    979  * - moves and overlaps blocks that overlap with multiple values in the overlap region
    980  *
    981  * It does not
    982  * - try to move and overlap blocks that are not already adjacent
    983  */
    984 static void
    985 compactData(UNewTrie2 *trie) {
    986     int32_t start, newStart, movedStart;
    987     int32_t blockLength, overlap;
    988     int32_t i, mapIndex, blockCount;
    989 
    990     /* do not compact linear-ASCII data */
    991     newStart=UTRIE2_DATA_START_OFFSET;
    992     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
    993         trie->map[i]=start;
    994     }
    995 
    996     /*
    997      * Start with a block length of 64 for 2-byte UTF-8,
    998      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
    999      */
   1000     blockLength=64;
   1001     blockCount=blockLength>>UTRIE2_SHIFT_2;
   1002     for(start=newStart; start<trie->dataLength;) {
   1003         /*
   1004          * start: index of first entry of current block
   1005          * newStart: index where the current block is to be moved
   1006          *           (right after current end of already-compacted data)
   1007          */
   1008         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
   1009             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
   1010             blockCount=1;
   1011         }
   1012 
   1013         /* skip blocks that are not used */
   1014         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
   1015             /* advance start to the next block */
   1016             start+=blockLength;
   1017 
   1018             /* leave newStart with the previous block! */
   1019             continue;
   1020         }
   1021 
   1022         /* search for an identical block */
   1023         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
   1024              >=0
   1025         ) {
   1026             /* found an identical block, set the other block's index value for the current block */
   1027             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1028                 trie->map[mapIndex++]=movedStart;
   1029                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
   1030             }
   1031 
   1032             /* advance start to the next block */
   1033             start+=blockLength;
   1034 
   1035             /* leave newStart with the previous block! */
   1036             continue;
   1037         }
   1038 
   1039         /* see if the beginning of this block can be overlapped with the end of the previous block */
   1040         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
   1041         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
   1042             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
   1043             overlap-=UTRIE2_DATA_GRANULARITY) {}
   1044 
   1045         if(overlap>0 || newStart<start) {
   1046             /* some overlap, or just move the whole block */
   1047             movedStart=newStart-overlap;
   1048             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1049                 trie->map[mapIndex++]=movedStart;
   1050                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
   1051             }
   1052 
   1053             /* move the non-overlapping indexes to their new positions */
   1054             start+=overlap;
   1055             for(i=blockLength-overlap; i>0; --i) {
   1056                 trie->data[newStart++]=trie->data[start++];
   1057             }
   1058         } else /* no overlap && newStart==start */ {
   1059             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1060                 trie->map[mapIndex++]=start;
   1061                 start+=UTRIE2_DATA_BLOCK_LENGTH;
   1062             }
   1063             newStart=start;
   1064         }
   1065     }
   1066 
   1067     /* now adjust the index-2 table */
   1068     for(i=0; i<trie->index2Length; ++i) {
   1069         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
   1070             /* Gap indexes are invalid (-1). Skip over the gap. */
   1071             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
   1072         }
   1073         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
   1074     }
   1075     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
   1076 
   1077     /* ensure dataLength alignment */
   1078     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
   1079         trie->data[newStart++]=trie->initialValue;
   1080     }
   1081 
   1082 #ifdef UTRIE2_DEBUG
   1083     /* we saved some space */
   1084     printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
   1085             (long)trie->dataLength, (long)newStart);
   1086 #endif
   1087 
   1088     trie->dataLength=newStart;
   1089 }
   1090 
   1091 static void
   1092 compactIndex2(UNewTrie2 *trie) {
   1093     int32_t i, start, newStart, movedStart, overlap;
   1094 
   1095     /* do not compact linear-BMP index-2 blocks */
   1096     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
   1097     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
   1098         trie->map[i]=start;
   1099     }
   1100 
   1101     /* Reduce the index table gap to what will be needed at runtime. */
   1102     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
   1103 
   1104     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
   1105         /*
   1106          * start: index of first entry of current block
   1107          * newStart: index where the current block is to be moved
   1108          *           (right after current end of already-compacted data)
   1109          */
   1110 
   1111         /* search for an identical block */
   1112         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
   1113              >=0
   1114         ) {
   1115             /* found an identical block, set the other block's index value for the current block */
   1116             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
   1117 
   1118             /* advance start to the next block */
   1119             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
   1120 
   1121             /* leave newStart with the previous block! */
   1122             continue;
   1123         }
   1124 
   1125         /* see if the beginning of this block can be overlapped with the end of the previous block */
   1126         /* look for maximum overlap with the previous, adjacent block */
   1127         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
   1128             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
   1129             --overlap) {}
   1130 
   1131         if(overlap>0 || newStart<start) {
   1132             /* some overlap, or just move the whole block */
   1133             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
   1134 
   1135             /* move the non-overlapping indexes to their new positions */
   1136             start+=overlap;
   1137             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
   1138                 trie->index2[newStart++]=trie->index2[start++];
   1139             }
   1140         } else /* no overlap && newStart==start */ {
   1141             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
   1142             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
   1143             newStart=start;
   1144         }
   1145     }
   1146 
   1147     /* now adjust the index-1 table */
   1148     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
   1149         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
   1150     }
   1151     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
   1152 
   1153     /*
   1154      * Ensure data table alignment:
   1155      * Needs to be granularity-aligned for 16-bit trie
   1156      * (so that dataMove will be down-shiftable),
   1157      * and 2-aligned for uint32_t data.
   1158      */
   1159     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
   1160         /* Arbitrary value: 0x3fffc not possible for real data. */
   1161         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
   1162     }
   1163 
   1164 #ifdef UTRIE2_DEBUG
   1165     /* we saved some space */
   1166     printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
   1167             (long)trie->index2Length, (long)newStart);
   1168 #endif
   1169 
   1170     trie->index2Length=newStart;
   1171 }
   1172 
   1173 static void
   1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
   1175     UNewTrie2 *newTrie;
   1176     UChar32 highStart, suppHighStart;
   1177     uint32_t highValue;
   1178 
   1179     newTrie=trie->newTrie;
   1180 
   1181     /* find highStart and round it up */
   1182     highValue=utrie2_get32(trie, 0x10ffff);
   1183     highStart=findHighStart(newTrie, highValue);
   1184     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
   1185     if(highStart==0x110000) {
   1186         highValue=trie->errorValue;
   1187     }
   1188 
   1189     /*
   1190      * Set trie->highStart only after utrie2_get32(trie, highStart).
   1191      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
   1192      */
   1193     trie->highStart=newTrie->highStart=highStart;
   1194 
   1195 #ifdef UTRIE2_DEBUG
   1196     printf("UTrie2: highStart U+%04lx  highValue 0x%lx  initialValue 0x%lx\n",
   1197             (long)highStart, (long)highValue, (long)trie->initialValue);
   1198 #endif
   1199 
   1200     if(highStart<0x110000) {
   1201         /* Blank out [highStart..10ffff] to release associated data blocks. */
   1202         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
   1203         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
   1204         if(U_FAILURE(*pErrorCode)) {
   1205             return;
   1206         }
   1207     }
   1208 
   1209     compactData(newTrie);
   1210     if(highStart>0x10000) {
   1211         compactIndex2(newTrie);
   1212 #ifdef UTRIE2_DEBUG
   1213     } else {
   1214         printf("UTrie2: highStart U+%04lx  count of 16-bit index-2 words %lu->%lu\n",
   1215                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
   1216 #endif
   1217     }
   1218 
   1219     /*
   1220      * Store the highValue in the data array and round up the dataLength.
   1221      * Must be done after compactData() because that assumes that dataLength
   1222      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
   1223      */
   1224     newTrie->data[newTrie->dataLength++]=highValue;
   1225     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
   1226         newTrie->data[newTrie->dataLength++]=trie->initialValue;
   1227     }
   1228 
   1229     newTrie->isCompacted=TRUE;
   1230 }
   1231 
   1232 /* serialization ------------------------------------------------------------ */
   1233 
   1234 /**
   1235  * Maximum length of the runtime index array.
   1236  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
   1237  * (The actual maximum length is lower,
   1238  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
   1239  */
   1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
   1241 
   1242 /**
   1243  * Maximum length of the runtime data array.
   1244  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
   1245  * and by uint16_t UTrie2Header.shiftedDataLength.
   1246  */
   1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
   1248 
   1249 /* Compact and internally serialize the trie. */
   1250 U_CAPI void U_EXPORT2
   1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
   1252     UNewTrie2 *newTrie;
   1253     UTrie2Header *header;
   1254     uint32_t *p;
   1255     uint16_t *dest16;
   1256     int32_t i, length;
   1257     int32_t allIndexesLength;
   1258     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
   1259     UChar32 highStart;
   1260 
   1261     /* argument check */
   1262     if(U_FAILURE(*pErrorCode)) {
   1263         return;
   1264     }
   1265     if( trie==NULL ||
   1266         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
   1267     ) {
   1268         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1269         return;
   1270     }
   1271     newTrie=trie->newTrie;
   1272     if(newTrie==NULL) {
   1273         /* already frozen */
   1274         UTrie2ValueBits frozenValueBits=
   1275             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
   1276         if(valueBits!=frozenValueBits) {
   1277             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1278         }
   1279         return;
   1280     }
   1281 
   1282     /* compact if necessary */
   1283     if(!newTrie->isCompacted) {
   1284         compactTrie(trie, pErrorCode);
   1285         if(U_FAILURE(*pErrorCode)) {
   1286             return;
   1287         }
   1288     }
   1289     highStart=trie->highStart;
   1290 
   1291     if(highStart<=0x10000) {
   1292         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
   1293     } else {
   1294         allIndexesLength=newTrie->index2Length;
   1295     }
   1296     if(valueBits==UTRIE2_16_VALUE_BITS) {
   1297         dataMove=allIndexesLength;
   1298     } else {
   1299         dataMove=0;
   1300     }
   1301 
   1302     /* are indexLength and dataLength within limits? */
   1303     if( /* for unshifted indexLength */
   1304         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
   1305         /* for unshifted dataNullOffset */
   1306         (dataMove+newTrie->dataNullOffset)>0xffff ||
   1307         /* for unshifted 2-byte UTF-8 index-2 values */
   1308         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
   1309         /* for shiftedDataLength */
   1310         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
   1311     ) {
   1312         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1313         return;
   1314     }
   1315 
   1316     /* calculate the total serialized length */
   1317     length=sizeof(UTrie2Header)+allIndexesLength*2;
   1318     if(valueBits==UTRIE2_16_VALUE_BITS) {
   1319         length+=newTrie->dataLength*2;
   1320     } else {
   1321         length+=newTrie->dataLength*4;
   1322     }
   1323 
   1324     trie->memory=uprv_malloc(length);
   1325     if(trie->memory==NULL) {
   1326         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1327         return;
   1328     }
   1329     trie->length=length;
   1330     trie->isMemoryOwned=TRUE;
   1331 
   1332     trie->indexLength=allIndexesLength;
   1333     trie->dataLength=newTrie->dataLength;
   1334     if(highStart<=0x10000) {
   1335         trie->index2NullOffset=0xffff;
   1336     } else {
   1337         trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
   1338     }
   1339     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
   1340     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
   1341 
   1342     /* set the header fields */
   1343     header=(UTrie2Header *)trie->memory;
   1344 
   1345     header->signature=UTRIE2_SIG; /* "Tri2" */
   1346     header->options=(uint16_t)valueBits;
   1347 
   1348     header->indexLength=(uint16_t)trie->indexLength;
   1349     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
   1350     header->index2NullOffset=trie->index2NullOffset;
   1351     header->dataNullOffset=trie->dataNullOffset;
   1352     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
   1353 
   1354     /* fill the index and data arrays */
   1355     dest16=(uint16_t *)(header+1);
   1356     trie->index=dest16;
   1357 
   1358     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
   1359     p=(uint32_t *)newTrie->index2;
   1360     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
   1361         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
   1362     }
   1363 
   1364     /* write UTF-8 2-byte index-2 values, not right-shifted */
   1365     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
   1366         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
   1367     }
   1368     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
   1369         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
   1370     }
   1371 
   1372     if(highStart>0x10000) {
   1373         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
   1374         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
   1375 
   1376         /* write 16-bit index-1 values for supplementary code points */
   1377         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
   1378         for(i=index1Length; i>0; --i) {
   1379             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
   1380         }
   1381 
   1382         /*
   1383          * write the index-2 array values for supplementary code points,
   1384          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
   1385          */
   1386         p=(uint32_t *)newTrie->index2+index2Offset;
   1387         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
   1388             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
   1389         }
   1390     }
   1391 
   1392     /* write the 16/32-bit data array */
   1393     switch(valueBits) {
   1394     case UTRIE2_16_VALUE_BITS:
   1395         /* write 16-bit data values */
   1396         trie->data16=dest16;
   1397         trie->data32=NULL;
   1398         p=newTrie->data;
   1399         for(i=newTrie->dataLength; i>0; --i) {
   1400             *dest16++=(uint16_t)*p++;
   1401         }
   1402         break;
   1403     case UTRIE2_32_VALUE_BITS:
   1404         /* write 32-bit data values */
   1405         trie->data16=NULL;
   1406         trie->data32=(uint32_t *)dest16;
   1407         uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
   1408         break;
   1409     default:
   1410         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1411         return;
   1412     }
   1413 
   1414     /* Delete the UNewTrie2. */
   1415     uprv_free(newTrie->data);
   1416     uprv_free(newTrie);
   1417     trie->newTrie=NULL;
   1418 }
   1419 
   1420 /*
   1421  * This is here to avoid a dependency from utrie2.cpp on utrie.c.
   1422  * This file already depends on utrie.c.
   1423  * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
   1424  */
   1425 U_CAPI int32_t U_EXPORT2
   1426 utrie2_swapAnyVersion(const UDataSwapper *ds,
   1427                       const void *inData, int32_t length, void *outData,
   1428                       UErrorCode *pErrorCode) {
   1429     if(U_SUCCESS(*pErrorCode)) {
   1430         switch(utrie2_getVersion(inData, length, TRUE)) {
   1431         case 1:
   1432             return utrie_swap(ds, inData, length, outData, pErrorCode);
   1433         case 2:
   1434             return utrie2_swap(ds, inData, length, outData, pErrorCode);
   1435         default:
   1436             *pErrorCode=U_INVALID_FORMAT_ERROR;
   1437             return 0;
   1438         }
   1439     }
   1440     return 0;
   1441 }
   1442