Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie2_builder.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2008sep26 (split off from utrie2.c)
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is a common implementation of a Unicode trie.
     19 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     20 *   Unicode code points (0..0x10ffff).
     21 *   This is the second common version of a Unicode trie (hence the name UTrie2).
     22 *   See utrie2.h for a comparison.
     23 *
     24 *   This file contains only the builder code.
     25 *   See utrie2.c for the runtime and enumeration code.
     26 */
     27 // #define UTRIE2_DEBUG
     28 #ifdef UTRIE2_DEBUG
     29 #   include <stdio.h>
     30 #endif
     31 // #define UCPTRIE_DEBUG
     32 
     33 #include "unicode/utypes.h"
     34 #ifdef UCPTRIE_DEBUG
     35 #include "unicode/ucptrie.h"
     36 #include "unicode/umutablecptrie.h"
     37 #include "ucptrie_impl.h"
     38 #endif
     39 #include "cmemory.h"
     40 #include "utrie2.h"
     41 #include "utrie2_impl.h"
     42 
     43 #include "utrie.h"  // for utrie2_fromUTrie()
     44 
     45 /* Implementation notes ----------------------------------------------------- */
     46 
     47 /*
     48  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
     49  * have been chosen to minimize trie sizes overall.
     50  * Most of the code is flexible enough to work with a range of values,
     51  * within certain limits.
     52  *
     53  * Exception: Support for separate values for lead surrogate code _units_
     54  * vs. code _points_ was added after the constants were fixed,
     55  * and has not been tested nor particularly designed for different constant values.
     56  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
     57  * part and back.)
     58  *
     59  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
     60  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
     61  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
     62  * remapping) stops working.
     63  *
     64  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
     65  * assumes that a single index-2 block is used for 0x400 code points
     66  * corresponding to one lead surrogate.
     67  *
     68  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
     69  * more than one Unicode plane, and the split of the index-2 table into a BMP
     70  * part and a supplementary part, with a gap in between, would not work.
     71  *
     72  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
     73  * there is data with more than 64k distinct values,
     74  * for example for Unihan collation with a separate collation weight per
     75  * Han character.
     76  */
     77 
     78 /* Building a trie ----------------------------------------------------------*/
     79 
     80 enum {
     81     /** The null index-2 block, following the gap in the index-2 table. */
     82     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
     83 
     84     /** The start of allocated index-2 blocks. */
     85     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
     86 
     87     /**
     88      * The null data block.
     89      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
     90      * to work with 6-bit trail bytes from 2-byte UTF-8.
     91      */
     92     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
     93 
     94     /** The start of allocated data blocks. */
     95     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
     96 
     97     /**
     98      * The start of data blocks for U+0800 and above.
     99      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
    100      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
    101      * Data values for 0x780 code points beyond ASCII.
    102      */
    103     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
    104 };
    105 
    106 /* Start with allocation of 16k data entries. */
    107 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
    108 
    109 /* Grow about 8x each time. */
    110 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
    111 
    112 static int32_t
    113 allocIndex2Block(UNewTrie2 *trie);
    114 
    115 U_CAPI UTrie2 * U_EXPORT2
    116 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
    117     UTrie2 *trie;
    118     UNewTrie2 *newTrie;
    119     uint32_t *data;
    120     int32_t i, j;
    121 
    122     if(U_FAILURE(*pErrorCode)) {
    123         return NULL;
    124     }
    125 
    126     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
    127     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
    128     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
    129     if(trie==NULL || newTrie==NULL || data==NULL) {
    130         uprv_free(trie);
    131         uprv_free(newTrie);
    132         uprv_free(data);
    133         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    134         return 0;
    135     }
    136 
    137     uprv_memset(trie, 0, sizeof(UTrie2));
    138     trie->initialValue=initialValue;
    139     trie->errorValue=errorValue;
    140     trie->highStart=0x110000;
    141     trie->newTrie=newTrie;
    142 #ifdef UTRIE2_DEBUG
    143     trie->name="open";
    144 #endif
    145 
    146     newTrie->data=data;
    147 #ifdef UCPTRIE_DEBUG
    148     newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode);
    149 #endif
    150     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
    151     newTrie->initialValue=initialValue;
    152     newTrie->errorValue=errorValue;
    153     newTrie->highStart=0x110000;
    154     newTrie->firstFreeBlock=0;  /* no free block in the list */
    155     newTrie->isCompacted=FALSE;
    156 
    157     /*
    158      * preallocate and reset
    159      * - ASCII
    160      * - the bad-UTF-8-data block
    161      * - the null data block
    162      */
    163     for(i=0; i<0x80; ++i) {
    164         newTrie->data[i]=initialValue;
    165     }
    166     for(; i<0xc0; ++i) {
    167         newTrie->data[i]=errorValue;
    168     }
    169     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
    170         newTrie->data[i]=initialValue;
    171     }
    172     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
    173     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
    174 
    175     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
    176     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    177         newTrie->index2[i]=j;
    178         newTrie->map[i]=1;
    179     }
    180     /* reference counts for the bad-UTF-8-data block */
    181     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    182         newTrie->map[i]=0;
    183     }
    184     /*
    185      * Reference counts for the null data block: all blocks except for the ASCII blocks.
    186      * Plus 1 so that we don't drop this block during compaction.
    187      * Plus as many as needed for lead surrogate code points.
    188      */
    189     /* i==newTrie->dataNullOffset */
    190     newTrie->map[i++]=
    191         (0x110000>>UTRIE2_SHIFT_2)-
    192         (0x80>>UTRIE2_SHIFT_2)+
    193         1+
    194         UTRIE2_LSCP_INDEX_2_LENGTH;
    195     j+=UTRIE2_DATA_BLOCK_LENGTH;
    196     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
    197         newTrie->map[i]=0;
    198     }
    199 
    200     /*
    201      * set the remaining indexes in the BMP index-2 block
    202      * to the null data block
    203      */
    204     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
    205         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
    206     }
    207 
    208     /*
    209      * Fill the index gap with impossible values so that compaction
    210      * does not overlap other index-2 blocks with the gap.
    211      */
    212     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
    213         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
    214     }
    215 
    216     /* set the indexes in the null index-2 block */
    217     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
    218         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
    219     }
    220     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
    221     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
    222 
    223     /* set the index-1 indexes for the linear index-2 block */
    224     for(i=0, j=0;
    225         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
    226         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
    227     ) {
    228         newTrie->index1[i]=j;
    229     }
    230 
    231     /* set the remaining index-1 indexes to the null index-2 block */
    232     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
    233         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
    234     }
    235 
    236     /*
    237      * Preallocate and reset data for U+0080..U+07ff,
    238      * for 2-byte UTF-8 which will be compacted in 64-blocks
    239      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
    240      */
    241     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
    242         utrie2_set32(trie, i, initialValue, pErrorCode);
    243     }
    244 
    245     return trie;
    246 }
    247 
    248 static UNewTrie2 *
    249 cloneBuilder(const UNewTrie2 *other) {
    250     UNewTrie2 *trie;
    251 
    252     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
    253     if(trie==NULL) {
    254         return NULL;
    255     }
    256 
    257     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
    258     if(trie->data==NULL) {
    259         uprv_free(trie);
    260         return NULL;
    261     }
    262 #ifdef UCPTRIE_DEBUG
    263     if(other->t3==nullptr) {
    264         trie->t3=nullptr;
    265     } else {
    266         UErrorCode errorCode=U_ZERO_ERROR;
    267         trie->t3=umutablecptrie_clone(other->t3, &errorCode);
    268     }
    269 #endif
    270     trie->dataCapacity=other->dataCapacity;
    271 
    272     /* clone data */
    273     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
    274     uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
    275     trie->index2NullOffset=other->index2NullOffset;
    276     trie->index2Length=other->index2Length;
    277 
    278     uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
    279     trie->dataNullOffset=other->dataNullOffset;
    280     trie->dataLength=other->dataLength;
    281 
    282     /* reference counters */
    283     if(other->isCompacted) {
    284         trie->firstFreeBlock=0;
    285     } else {
    286         uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
    287         trie->firstFreeBlock=other->firstFreeBlock;
    288     }
    289 
    290     trie->initialValue=other->initialValue;
    291     trie->errorValue=other->errorValue;
    292     trie->highStart=other->highStart;
    293     trie->isCompacted=other->isCompacted;
    294 
    295     return trie;
    296 }
    297 
    298 U_CAPI UTrie2 * U_EXPORT2
    299 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
    300     UTrie2 *trie;
    301 
    302     if(U_FAILURE(*pErrorCode)) {
    303         return NULL;
    304     }
    305     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
    306         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    307         return NULL;
    308     }
    309 
    310     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
    311     if(trie==NULL) {
    312         return NULL;
    313     }
    314     uprv_memcpy(trie, other, sizeof(UTrie2));
    315 
    316     if(other->memory!=NULL) {
    317         trie->memory=uprv_malloc(other->length);
    318         if(trie->memory!=NULL) {
    319             trie->isMemoryOwned=TRUE;
    320             uprv_memcpy(trie->memory, other->memory, other->length);
    321 
    322             /* make the clone's pointers point to its own memory */
    323             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
    324             if(other->data16!=NULL) {
    325                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
    326             }
    327             if(other->data32!=NULL) {
    328                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
    329             }
    330         }
    331     } else /* other->newTrie!=NULL */ {
    332         trie->newTrie=cloneBuilder(other->newTrie);
    333     }
    334 
    335     if(trie->memory==NULL && trie->newTrie==NULL) {
    336         uprv_free(trie);
    337         trie=NULL;
    338     }
    339     return trie;
    340 }
    341 
    342 typedef struct NewTrieAndStatus {
    343     UTrie2 *trie;
    344     UErrorCode errorCode;
    345     UBool exclusiveLimit;  /* rather than inclusive range end */
    346 } NewTrieAndStatus;
    347 
    348 static UBool U_CALLCONV
    349 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    350     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
    351     if(value!=nt->trie->initialValue) {
    352         if(nt->exclusiveLimit) {
    353             --end;
    354         }
    355         if(start==end) {
    356             utrie2_set32(nt->trie, start, value, &nt->errorCode);
    357         } else {
    358             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
    359         }
    360         return U_SUCCESS(nt->errorCode);
    361     } else {
    362         return TRUE;
    363     }
    364 }
    365 
    366 #ifdef UTRIE2_DEBUG
    367 static long countInitial(const UTrie2 *trie) {
    368     uint32_t initialValue=trie->initialValue;
    369     int32_t length=trie->dataLength;
    370     long count=0;
    371     if(trie->data16!=nullptr) {
    372         for(int32_t i=0; i<length; ++i) {
    373             if(trie->data16[i]==initialValue) { ++count; }
    374         }
    375     } else {
    376         for(int32_t i=0; i<length; ++i) {
    377             if(trie->data32[i]==initialValue) { ++count; }
    378         }
    379     }
    380     return count;
    381 }
    382 
    383 static void
    384 utrie_printLengths(const UTrie *trie) {
    385     long indexLength=trie->indexLength;
    386     long dataLength=(long)trie->dataLength;
    387     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
    388     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
    389            indexLength, dataLength, totalLength);
    390 }
    391 
    392 static void
    393 utrie2_printLengths(const UTrie2 *trie, const char *which) {
    394     long indexLength=trie->indexLength;
    395     long dataLength=(long)trie->dataLength;
    396     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
    397     printf("**UTrie2Lengths(%s %s)** index:%6ld  data:%6ld  countInitial:%6ld  serialized:%6ld\n",
    398            which, trie->name, indexLength, dataLength, countInitial(trie), totalLength);
    399 }
    400 #endif
    401 
    402 U_CAPI UTrie2 * U_EXPORT2
    403 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
    404     NewTrieAndStatus context;
    405     UChar lead;
    406 
    407     if(U_FAILURE(*pErrorCode)) {
    408         return NULL;
    409     }
    410     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
    411         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    412         return NULL;
    413     }
    414     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
    415         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
    416     }
    417 
    418     /* Clone the frozen trie by enumerating it and building a new one. */
    419     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
    420     if(U_FAILURE(*pErrorCode)) {
    421         return NULL;
    422     }
    423     context.exclusiveLimit=FALSE;
    424     context.errorCode=*pErrorCode;
    425     utrie2_enum(other, NULL, copyEnumRange, &context);
    426     *pErrorCode=context.errorCode;
    427     for(lead=0xd800; lead<0xdc00; ++lead) {
    428         uint32_t value;
    429         if(other->data32==NULL) {
    430             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
    431         } else {
    432             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
    433         }
    434         if(value!=other->initialValue) {
    435             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
    436         }
    437     }
    438     if(U_FAILURE(*pErrorCode)) {
    439         utrie2_close(context.trie);
    440         context.trie=NULL;
    441     }
    442     return context.trie;
    443 }
    444 
    445 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
    446 U_CAPI UTrie2 * U_EXPORT2
    447 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
    448     NewTrieAndStatus context;
    449     UChar lead;
    450 
    451     if(U_FAILURE(*pErrorCode)) {
    452         return NULL;
    453     }
    454     if(trie1==NULL) {
    455         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    456         return NULL;
    457     }
    458     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
    459     if(U_FAILURE(*pErrorCode)) {
    460         return NULL;
    461     }
    462     context.exclusiveLimit=TRUE;
    463     context.errorCode=*pErrorCode;
    464     utrie_enum(trie1, NULL, copyEnumRange, &context);
    465     *pErrorCode=context.errorCode;
    466     for(lead=0xd800; lead<0xdc00; ++lead) {
    467         uint32_t value;
    468         if(trie1->data32==NULL) {
    469             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
    470         } else {
    471             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
    472         }
    473         if(value!=trie1->initialValue) {
    474             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
    475         }
    476     }
    477     if(U_SUCCESS(*pErrorCode)) {
    478         utrie2_freeze(context.trie,
    479                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
    480                       pErrorCode);
    481     }
    482 #ifdef UTRIE2_DEBUG
    483     if(U_SUCCESS(*pErrorCode)) {
    484         utrie_printLengths(trie1);
    485         utrie2_printLengths(context.trie, "fromUTrie");
    486     }
    487 #endif
    488     if(U_FAILURE(*pErrorCode)) {
    489         utrie2_close(context.trie);
    490         context.trie=NULL;
    491     }
    492     return context.trie;
    493 }
    494 
    495 static inline UBool
    496 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    497     int32_t i2, block;
    498 
    499     if(U_IS_LEAD(c) && forLSCP) {
    500         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
    501             (c>>UTRIE2_SHIFT_2);
    502     } else {
    503         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
    504             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
    505     }
    506     block=trie->index2[i2];
    507     return (UBool)(block==trie->dataNullOffset);
    508 }
    509 
    510 static int32_t
    511 allocIndex2Block(UNewTrie2 *trie) {
    512     int32_t newBlock, newTop;
    513 
    514     newBlock=trie->index2Length;
    515     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
    516     if(newTop>UPRV_LENGTHOF(trie->index2)) {
    517         /*
    518          * Should never occur.
    519          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
    520          * or the code writes more values than should be possible.
    521          */
    522         return -1;
    523     }
    524     trie->index2Length=newTop;
    525     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
    526     return newBlock;
    527 }
    528 
    529 static int32_t
    530 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    531     int32_t i1, i2;
    532 
    533     if(U_IS_LEAD(c) && forLSCP) {
    534         return UTRIE2_LSCP_INDEX_2_OFFSET;
    535     }
    536 
    537     i1=c>>UTRIE2_SHIFT_1;
    538     i2=trie->index1[i1];
    539     if(i2==trie->index2NullOffset) {
    540         i2=allocIndex2Block(trie);
    541         if(i2<0) {
    542             return -1;  /* program error */
    543         }
    544         trie->index1[i1]=i2;
    545     }
    546     return i2;
    547 }
    548 
    549 static int32_t
    550 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
    551     int32_t newBlock, newTop;
    552 
    553     if(trie->firstFreeBlock!=0) {
    554         /* get the first free block */
    555         newBlock=trie->firstFreeBlock;
    556         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
    557     } else {
    558         /* get a new block from the high end */
    559         newBlock=trie->dataLength;
    560         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
    561         if(newTop>trie->dataCapacity) {
    562             /* out of memory in the data array */
    563             int32_t capacity;
    564             uint32_t *data;
    565 
    566             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
    567                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
    568             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
    569                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
    570             } else {
    571                 /*
    572                  * Should never occur.
    573                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
    574                  * or the code writes more values than should be possible.
    575                  */
    576                 return -1;
    577             }
    578             data=(uint32_t *)uprv_malloc(capacity*4);
    579             if(data==NULL) {
    580                 return -1;
    581             }
    582             uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
    583             uprv_free(trie->data);
    584             trie->data=data;
    585             trie->dataCapacity=capacity;
    586         }
    587         trie->dataLength=newTop;
    588     }
    589     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
    590     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
    591     return newBlock;
    592 }
    593 
    594 /* call when the block's reference counter reaches 0 */
    595 static void
    596 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
    597     /* put this block at the front of the free-block chain */
    598     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
    599     trie->firstFreeBlock=block;
    600 }
    601 
    602 static inline UBool
    603 isWritableBlock(UNewTrie2 *trie, int32_t block) {
    604     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
    605 }
    606 
    607 static inline void
    608 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
    609     int32_t oldBlock;
    610     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
    611     oldBlock=trie->index2[i2];
    612     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
    613         releaseDataBlock(trie, oldBlock);
    614     }
    615     trie->index2[i2]=block;
    616 }
    617 
    618 /**
    619  * No error checking for illegal arguments.
    620  *
    621  * @return -1 if no new data block available (out of memory in data array)
    622  * @internal
    623  */
    624 static int32_t
    625 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
    626     int32_t i2, oldBlock, newBlock;
    627 
    628     i2=getIndex2Block(trie, c, forLSCP);
    629     if(i2<0) {
    630         return -1;  /* program error */
    631     }
    632 
    633     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
    634     oldBlock=trie->index2[i2];
    635     if(isWritableBlock(trie, oldBlock)) {
    636         return oldBlock;
    637     }
    638 
    639     /* allocate a new data block */
    640     newBlock=allocDataBlock(trie, oldBlock);
    641     if(newBlock<0) {
    642         /* out of memory in the data array */
    643         return -1;
    644     }
    645     setIndex2Entry(trie, i2, newBlock);
    646     return newBlock;
    647 }
    648 
    649 /**
    650  * @return TRUE if the value was successfully set
    651  */
    652 static void
    653 set32(UNewTrie2 *trie,
    654       UChar32 c, UBool forLSCP, uint32_t value,
    655       UErrorCode *pErrorCode) {
    656     int32_t block;
    657 
    658     if(trie==NULL || trie->isCompacted) {
    659         *pErrorCode=U_NO_WRITE_PERMISSION;
    660         return;
    661     }
    662 #ifdef UCPTRIE_DEBUG
    663     umutablecptrie_set(trie->t3, c, value, pErrorCode);
    664 #endif
    665 
    666     block=getDataBlock(trie, c, forLSCP);
    667     if(block<0) {
    668         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    669         return;
    670     }
    671 
    672     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
    673 }
    674 
    675 U_CAPI void U_EXPORT2
    676 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
    677     if(U_FAILURE(*pErrorCode)) {
    678         return;
    679     }
    680     if((uint32_t)c>0x10ffff) {
    681         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    682         return;
    683     }
    684     set32(trie->newTrie, c, TRUE, value, pErrorCode);
    685 }
    686 
    687 U_CAPI void U_EXPORT2
    688 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
    689                                      UChar32 c, uint32_t value,
    690                                      UErrorCode *pErrorCode) {
    691     if(U_FAILURE(*pErrorCode)) {
    692         return;
    693     }
    694     if(!U_IS_LEAD(c)) {
    695         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    696         return;
    697     }
    698     set32(trie->newTrie, c, FALSE, value, pErrorCode);
    699 }
    700 
    701 static void
    702 writeBlock(uint32_t *block, uint32_t value) {
    703     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
    704     while(block<limit) {
    705         *block++=value;
    706     }
    707 }
    708 
    709 /**
    710  * initialValue is ignored if overwrite=TRUE
    711  * @internal
    712  */
    713 static void
    714 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
    715           uint32_t value, uint32_t initialValue, UBool overwrite) {
    716     uint32_t *pLimit;
    717 
    718     pLimit=block+limit;
    719     block+=start;
    720     if(overwrite) {
    721         while(block<pLimit) {
    722             *block++=value;
    723         }
    724     } else {
    725         while(block<pLimit) {
    726             if(*block==initialValue) {
    727                 *block=value;
    728             }
    729             ++block;
    730         }
    731     }
    732 }
    733 
    734 U_CAPI void U_EXPORT2
    735 utrie2_setRange32(UTrie2 *trie,
    736                   UChar32 start, UChar32 end,
    737                   uint32_t value, UBool overwrite,
    738                   UErrorCode *pErrorCode) {
    739     /*
    740      * repeat value in [start..end]
    741      * mark index values for repeat-data blocks by setting bit 31 of the index values
    742      * fill around existing values if any, if(overwrite)
    743      */
    744     UNewTrie2 *newTrie;
    745     int32_t block, rest, repeatBlock;
    746     UChar32 limit;
    747 
    748     if(U_FAILURE(*pErrorCode)) {
    749         return;
    750     }
    751     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
    752         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    753         return;
    754     }
    755     newTrie=trie->newTrie;
    756     if(newTrie==NULL || newTrie->isCompacted) {
    757         *pErrorCode=U_NO_WRITE_PERMISSION;
    758         return;
    759     }
    760 #ifdef UCPTRIE_DEBUG
    761     umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode);
    762 #endif
    763     if(!overwrite && value==newTrie->initialValue) {
    764         return; /* nothing to do */
    765     }
    766 
    767     limit=end+1;
    768     if(start&UTRIE2_DATA_MASK) {
    769         UChar32 nextStart;
    770 
    771         /* set partial block at [start..following block boundary[ */
    772         block=getDataBlock(newTrie, start, TRUE);
    773         if(block<0) {
    774             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    775             return;
    776         }
    777 
    778         nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK;
    779         if(nextStart<=limit) {
    780             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
    781                       value, newTrie->initialValue, overwrite);
    782             start=nextStart;
    783         } else {
    784             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
    785                       value, newTrie->initialValue, overwrite);
    786             return;
    787         }
    788     }
    789 
    790     /* number of positions in the last, partial block */
    791     rest=limit&UTRIE2_DATA_MASK;
    792 
    793     /* round down limit to a block boundary */
    794     limit&=~UTRIE2_DATA_MASK;
    795 
    796     /* iterate over all-value blocks */
    797     if(value==newTrie->initialValue) {
    798         repeatBlock=newTrie->dataNullOffset;
    799     } else {
    800         repeatBlock=-1;
    801     }
    802 
    803     while(start<limit) {
    804         int32_t i2;
    805         UBool setRepeatBlock=FALSE;
    806 
    807         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
    808             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
    809             continue;
    810         }
    811 
    812         /* get index value */
    813         i2=getIndex2Block(newTrie, start, TRUE);
    814         if(i2<0) {
    815             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    816             return;
    817         }
    818         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
    819         block=newTrie->index2[i2];
    820         if(isWritableBlock(newTrie, block)) {
    821             /* already allocated */
    822             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
    823                 /*
    824                  * We overwrite all values, and it's not a
    825                  * protected (ASCII-linear or 2-byte UTF-8) block:
    826                  * replace with the repeatBlock.
    827                  */
    828                 setRepeatBlock=TRUE;
    829             } else {
    830                 /* !overwrite, or protected block: just write the values into this block */
    831                 fillBlock(newTrie->data+block,
    832                           0, UTRIE2_DATA_BLOCK_LENGTH,
    833                           value, newTrie->initialValue, overwrite);
    834             }
    835         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
    836             /*
    837              * Set the repeatBlock instead of the null block or previous repeat block:
    838              *
    839              * If !isWritableBlock() then all entries in the block have the same value
    840              * because it's the null block or a range block (the repeatBlock from a previous
    841              * call to utrie2_setRange32()).
    842              * No other blocks are used multiple times before compacting.
    843              *
    844              * The null block is the only non-writable block with the initialValue because
    845              * of the repeatBlock initialization above. (If value==initialValue, then
    846              * the repeatBlock will be the null data block.)
    847              *
    848              * We set our repeatBlock if the desired value differs from the block's value,
    849              * and if we overwrite any data or if the data is all initial values
    850              * (which is the same as the block being the null block, see above).
    851              */
    852             setRepeatBlock=TRUE;
    853         }
    854         if(setRepeatBlock) {
    855             if(repeatBlock>=0) {
    856                 setIndex2Entry(newTrie, i2, repeatBlock);
    857             } else {
    858                 /* create and set and fill the repeatBlock */
    859                 repeatBlock=getDataBlock(newTrie, start, TRUE);
    860                 if(repeatBlock<0) {
    861                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    862                     return;
    863                 }
    864                 writeBlock(newTrie->data+repeatBlock, value);
    865             }
    866         }
    867 
    868         start+=UTRIE2_DATA_BLOCK_LENGTH;
    869     }
    870 
    871     if(rest>0) {
    872         /* set partial block at [last block boundary..limit[ */
    873         block=getDataBlock(newTrie, start, TRUE);
    874         if(block<0) {
    875             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    876             return;
    877         }
    878 
    879         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
    880     }
    881 
    882     return;
    883 }
    884 
    885 /* compaction --------------------------------------------------------------- */
    886 
    887 static inline UBool
    888 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
    889     while(length>0 && *s==*t) {
    890         ++s;
    891         ++t;
    892         --length;
    893     }
    894     return (UBool)(length==0);
    895 }
    896 
    897 static inline UBool
    898 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
    899     while(length>0 && *s==*t) {
    900         ++s;
    901         ++t;
    902         --length;
    903     }
    904     return (UBool)(length==0);
    905 }
    906 
    907 static int32_t
    908 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
    909     int32_t block;
    910 
    911     /* ensure that we do not even partially get past index2Length */
    912     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
    913 
    914     for(block=0; block<=index2Length; ++block) {
    915         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
    916             return block;
    917         }
    918     }
    919     return -1;
    920 }
    921 
    922 static int32_t
    923 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
    924     int32_t block;
    925 
    926     /* ensure that we do not even partially get past dataLength */
    927     dataLength-=blockLength;
    928 
    929     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
    930         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
    931             return block;
    932         }
    933     }
    934     return -1;
    935 }
    936 
    937 /*
    938  * Find the start of the last range in the trie by enumerating backward.
    939  * Indexes for supplementary code points higher than this will be omitted.
    940  */
    941 static UChar32
    942 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
    943     const uint32_t *data32;
    944 
    945     uint32_t value, initialValue;
    946     UChar32 c, prev;
    947     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
    948 
    949     data32=trie->data;
    950     initialValue=trie->initialValue;
    951 
    952     index2NullOffset=trie->index2NullOffset;
    953     nullBlock=trie->dataNullOffset;
    954 
    955     /* set variables for previous range */
    956     if(highValue==initialValue) {
    957         prevI2Block=index2NullOffset;
    958         prevBlock=nullBlock;
    959     } else {
    960         prevI2Block=-1;
    961         prevBlock=-1;
    962     }
    963     prev=0x110000;
    964 
    965     /* enumerate index-2 blocks */
    966     i1=UNEWTRIE2_INDEX_1_LENGTH;
    967     c=prev;
    968     while(c>0) {
    969         i2Block=trie->index1[--i1];
    970         if(i2Block==prevI2Block) {
    971             /* the index-2 block is the same as the previous one, and filled with highValue */
    972             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
    973             continue;
    974         }
    975         prevI2Block=i2Block;
    976         if(i2Block==index2NullOffset) {
    977             /* this is the null index-2 block */
    978             if(highValue!=initialValue) {
    979                 return c;
    980             }
    981             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
    982         } else {
    983             /* enumerate data blocks for one index-2 block */
    984             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
    985                 block=trie->index2[i2Block+ --i2];
    986                 if(block==prevBlock) {
    987                     /* the block is the same as the previous one, and filled with highValue */
    988                     c-=UTRIE2_DATA_BLOCK_LENGTH;
    989                     continue;
    990                 }
    991                 prevBlock=block;
    992                 if(block==nullBlock) {
    993                     /* this is the null data block */
    994                     if(highValue!=initialValue) {
    995                         return c;
    996                     }
    997                     c-=UTRIE2_DATA_BLOCK_LENGTH;
    998                 } else {
    999                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
   1000                         value=data32[block+ --j];
   1001                         if(value!=highValue) {
   1002                             return c;
   1003                         }
   1004                         --c;
   1005                     }
   1006                 }
   1007             }
   1008         }
   1009     }
   1010 
   1011     /* deliver last range */
   1012     return 0;
   1013 }
   1014 
   1015 /*
   1016  * Compact a build-time trie.
   1017  *
   1018  * The compaction
   1019  * - removes blocks that are identical with earlier ones
   1020  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
   1021  * - moves blocks in steps of the data granularity
   1022  * - moves and overlaps blocks that overlap with multiple values in the overlap region
   1023  *
   1024  * It does not
   1025  * - try to move and overlap blocks that are not already adjacent
   1026  */
   1027 static void
   1028 compactData(UNewTrie2 *trie) {
   1029 #ifdef UTRIE2_DEBUG
   1030     int32_t countSame=0, sumOverlaps=0;
   1031 #endif
   1032 
   1033     int32_t start, newStart, movedStart;
   1034     int32_t blockLength, overlap;
   1035     int32_t i, mapIndex, blockCount;
   1036 
   1037     /* do not compact linear-ASCII data */
   1038     newStart=UTRIE2_DATA_START_OFFSET;
   1039     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
   1040         trie->map[i]=start;
   1041     }
   1042 
   1043     /*
   1044      * Start with a block length of 64 for 2-byte UTF-8,
   1045      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
   1046      */
   1047     blockLength=64;
   1048     blockCount=blockLength>>UTRIE2_SHIFT_2;
   1049     for(start=newStart; start<trie->dataLength;) {
   1050         /*
   1051          * start: index of first entry of current block
   1052          * newStart: index where the current block is to be moved
   1053          *           (right after current end of already-compacted data)
   1054          */
   1055         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
   1056             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
   1057             blockCount=1;
   1058         }
   1059 
   1060         /* skip blocks that are not used */
   1061         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
   1062             /* advance start to the next block */
   1063             start+=blockLength;
   1064 
   1065             /* leave newStart with the previous block! */
   1066             continue;
   1067         }
   1068 
   1069         /* search for an identical block */
   1070         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
   1071              >=0
   1072         ) {
   1073 #ifdef UTRIE2_DEBUG
   1074             ++countSame;
   1075 #endif
   1076             /* found an identical block, set the other block's index value for the current block */
   1077             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1078                 trie->map[mapIndex++]=movedStart;
   1079                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
   1080             }
   1081 
   1082             /* advance start to the next block */
   1083             start+=blockLength;
   1084 
   1085             /* leave newStart with the previous block! */
   1086             continue;
   1087         }
   1088 
   1089         /* see if the beginning of this block can be overlapped with the end of the previous block */
   1090         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
   1091         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
   1092             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
   1093             overlap-=UTRIE2_DATA_GRANULARITY) {}
   1094 
   1095 #ifdef UTRIE2_DEBUG
   1096             sumOverlaps+=overlap;
   1097 #endif
   1098         if(overlap>0 || newStart<start) {
   1099             /* some overlap, or just move the whole block */
   1100             movedStart=newStart-overlap;
   1101             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1102                 trie->map[mapIndex++]=movedStart;
   1103                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
   1104             }
   1105 
   1106             /* move the non-overlapping indexes to their new positions */
   1107             start+=overlap;
   1108             for(i=blockLength-overlap; i>0; --i) {
   1109                 trie->data[newStart++]=trie->data[start++];
   1110             }
   1111         } else /* no overlap && newStart==start */ {
   1112             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
   1113                 trie->map[mapIndex++]=start;
   1114                 start+=UTRIE2_DATA_BLOCK_LENGTH;
   1115             }
   1116             newStart=start;
   1117         }
   1118     }
   1119 
   1120     /* now adjust the index-2 table */
   1121     for(i=0; i<trie->index2Length; ++i) {
   1122         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
   1123             /* Gap indexes are invalid (-1). Skip over the gap. */
   1124             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
   1125         }
   1126         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
   1127     }
   1128     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
   1129 
   1130     /* ensure dataLength alignment */
   1131     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
   1132         trie->data[newStart++]=trie->initialValue;
   1133     }
   1134 
   1135 #ifdef UTRIE2_DEBUG
   1136     /* we saved some space */
   1137     printf("compacting UTrie2: count of 32-bit data words %lu->%lu  countSame=%ld  sumOverlaps=%ld\n",
   1138             (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps);
   1139 #endif
   1140 
   1141     trie->dataLength=newStart;
   1142 }
   1143 
   1144 static void
   1145 compactIndex2(UNewTrie2 *trie) {
   1146     int32_t i, start, newStart, movedStart, overlap;
   1147 
   1148     /* do not compact linear-BMP index-2 blocks */
   1149     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
   1150     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
   1151         trie->map[i]=start;
   1152     }
   1153 
   1154     /* Reduce the index table gap to what will be needed at runtime. */
   1155     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
   1156 
   1157     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
   1158         /*
   1159          * start: index of first entry of current block
   1160          * newStart: index where the current block is to be moved
   1161          *           (right after current end of already-compacted data)
   1162          */
   1163 
   1164         /* search for an identical block */
   1165         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
   1166              >=0
   1167         ) {
   1168             /* found an identical block, set the other block's index value for the current block */
   1169             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
   1170 
   1171             /* advance start to the next block */
   1172             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
   1173 
   1174             /* leave newStart with the previous block! */
   1175             continue;
   1176         }
   1177 
   1178         /* see if the beginning of this block can be overlapped with the end of the previous block */
   1179         /* look for maximum overlap with the previous, adjacent block */
   1180         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
   1181             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
   1182             --overlap) {}
   1183 
   1184         if(overlap>0 || newStart<start) {
   1185             /* some overlap, or just move the whole block */
   1186             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
   1187 
   1188             /* move the non-overlapping indexes to their new positions */
   1189             start+=overlap;
   1190             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
   1191                 trie->index2[newStart++]=trie->index2[start++];
   1192             }
   1193         } else /* no overlap && newStart==start */ {
   1194             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
   1195             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
   1196             newStart=start;
   1197         }
   1198     }
   1199 
   1200     /* now adjust the index-1 table */
   1201     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
   1202         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
   1203     }
   1204     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
   1205 
   1206     /*
   1207      * Ensure data table alignment:
   1208      * Needs to be granularity-aligned for 16-bit trie
   1209      * (so that dataMove will be down-shiftable),
   1210      * and 2-aligned for uint32_t data.
   1211      */
   1212     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
   1213         /* Arbitrary value: 0x3fffc not possible for real data. */
   1214         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
   1215     }
   1216 
   1217 #ifdef UTRIE2_DEBUG
   1218     /* we saved some space */
   1219     printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n",
   1220             (long)trie->index2Length, (long)newStart);
   1221 #endif
   1222 
   1223     trie->index2Length=newStart;
   1224 }
   1225 
   1226 static void
   1227 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
   1228     UNewTrie2 *newTrie;
   1229     UChar32 highStart, suppHighStart;
   1230     uint32_t highValue;
   1231 
   1232     newTrie=trie->newTrie;
   1233 
   1234     /* find highStart and round it up */
   1235     highValue=utrie2_get32(trie, 0x10ffff);
   1236     highStart=findHighStart(newTrie, highValue);
   1237     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
   1238     if(highStart==0x110000) {
   1239         highValue=trie->errorValue;
   1240     }
   1241 
   1242     /*
   1243      * Set trie->highStart only after utrie2_get32(trie, highStart).
   1244      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
   1245      */
   1246     trie->highStart=newTrie->highStart=highStart;
   1247 
   1248 #ifdef UTRIE2_DEBUG
   1249     printf("UTrie2: highStart U+%06lx  highValue 0x%lx  initialValue 0x%lx\n",
   1250             (long)highStart, (long)highValue, (long)trie->initialValue);
   1251 #endif
   1252 
   1253     if(highStart<0x110000) {
   1254         /* Blank out [highStart..10ffff] to release associated data blocks. */
   1255         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
   1256         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
   1257         if(U_FAILURE(*pErrorCode)) {
   1258             return;
   1259         }
   1260     }
   1261 
   1262     compactData(newTrie);
   1263     if(highStart>0x10000) {
   1264         compactIndex2(newTrie);
   1265 #ifdef UTRIE2_DEBUG
   1266     } else {
   1267         printf("UTrie2: highStart U+%04lx  count of 16-bit index words %lu->%lu\n",
   1268                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
   1269 #endif
   1270     }
   1271 
   1272     /*
   1273      * Store the highValue in the data array and round up the dataLength.
   1274      * Must be done after compactData() because that assumes that dataLength
   1275      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
   1276      */
   1277     newTrie->data[newTrie->dataLength++]=highValue;
   1278     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
   1279         newTrie->data[newTrie->dataLength++]=trie->initialValue;
   1280     }
   1281 
   1282     newTrie->isCompacted=TRUE;
   1283 }
   1284 
   1285 /* serialization ------------------------------------------------------------ */
   1286 
   1287 /**
   1288  * Maximum length of the runtime index array.
   1289  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
   1290  * (The actual maximum length is lower,
   1291  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
   1292  */
   1293 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
   1294 
   1295 /**
   1296  * Maximum length of the runtime data array.
   1297  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
   1298  * and by uint16_t UTrie2Header.shiftedDataLength.
   1299  */
   1300 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
   1301 
   1302 /* Compact and internally serialize the trie. */
   1303 U_CAPI void U_EXPORT2
   1304 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
   1305     UNewTrie2 *newTrie;
   1306     UTrie2Header *header;
   1307     uint32_t *p;
   1308     uint16_t *dest16;
   1309     int32_t i, length;
   1310     int32_t allIndexesLength;
   1311     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
   1312     UChar32 highStart;
   1313 
   1314     /* argument check */
   1315     if(U_FAILURE(*pErrorCode)) {
   1316         return;
   1317     }
   1318     if( trie==NULL ||
   1319         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
   1320     ) {
   1321         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1322         return;
   1323     }
   1324     newTrie=trie->newTrie;
   1325     if(newTrie==NULL) {
   1326         /* already frozen */
   1327         UTrie2ValueBits frozenValueBits=
   1328             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
   1329         if(valueBits!=frozenValueBits) {
   1330             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1331         }
   1332         return;
   1333     }
   1334 
   1335     /* compact if necessary */
   1336     if(!newTrie->isCompacted) {
   1337         compactTrie(trie, pErrorCode);
   1338         if(U_FAILURE(*pErrorCode)) {
   1339             return;
   1340         }
   1341     }
   1342     highStart=trie->highStart;
   1343 
   1344     if(highStart<=0x10000) {
   1345         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
   1346     } else {
   1347         allIndexesLength=newTrie->index2Length;
   1348     }
   1349     if(valueBits==UTRIE2_16_VALUE_BITS) {
   1350         dataMove=allIndexesLength;
   1351     } else {
   1352         dataMove=0;
   1353     }
   1354 
   1355     /* are indexLength and dataLength within limits? */
   1356     if( /* for unshifted indexLength */
   1357         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
   1358         /* for unshifted dataNullOffset */
   1359         (dataMove+newTrie->dataNullOffset)>0xffff ||
   1360         /* for unshifted 2-byte UTF-8 index-2 values */
   1361         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
   1362         /* for shiftedDataLength */
   1363         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
   1364     ) {
   1365         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1366         return;
   1367     }
   1368 
   1369     /* calculate the total serialized length */
   1370     length=sizeof(UTrie2Header)+allIndexesLength*2;
   1371     if(valueBits==UTRIE2_16_VALUE_BITS) {
   1372         length+=newTrie->dataLength*2;
   1373     } else {
   1374         length+=newTrie->dataLength*4;
   1375     }
   1376 
   1377     trie->memory=uprv_malloc(length);
   1378     if(trie->memory==NULL) {
   1379         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1380         return;
   1381     }
   1382     trie->length=length;
   1383     trie->isMemoryOwned=TRUE;
   1384 
   1385     trie->indexLength=allIndexesLength;
   1386     trie->dataLength=newTrie->dataLength;
   1387     if(highStart<=0x10000) {
   1388         trie->index2NullOffset=0xffff;
   1389     } else {
   1390         trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset);
   1391     }
   1392     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
   1393     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
   1394 
   1395     /* set the header fields */
   1396     header=(UTrie2Header *)trie->memory;
   1397 
   1398     header->signature=UTRIE2_SIG; /* "Tri2" */
   1399     header->options=(uint16_t)valueBits;
   1400 
   1401     header->indexLength=(uint16_t)trie->indexLength;
   1402     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
   1403     header->index2NullOffset=trie->index2NullOffset;
   1404     header->dataNullOffset=trie->dataNullOffset;
   1405     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
   1406 
   1407     /* fill the index and data arrays */
   1408     dest16=(uint16_t *)(header+1);
   1409     trie->index=dest16;
   1410 
   1411     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
   1412     p=(uint32_t *)newTrie->index2;
   1413     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
   1414         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
   1415     }
   1416 
   1417     /* write UTF-8 2-byte index-2 values, not right-shifted */
   1418     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
   1419         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
   1420     }
   1421     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
   1422         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
   1423     }
   1424 
   1425     if(highStart>0x10000) {
   1426         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
   1427         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
   1428 
   1429         /* write 16-bit index-1 values for supplementary code points */
   1430         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
   1431         for(i=index1Length; i>0; --i) {
   1432             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
   1433         }
   1434 
   1435         /*
   1436          * write the index-2 array values for supplementary code points,
   1437          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
   1438          */
   1439         p=(uint32_t *)newTrie->index2+index2Offset;
   1440         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
   1441             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
   1442         }
   1443     }
   1444 
   1445     /* write the 16/32-bit data array */
   1446     switch(valueBits) {
   1447     case UTRIE2_16_VALUE_BITS:
   1448         /* write 16-bit data values */
   1449         trie->data16=dest16;
   1450         trie->data32=NULL;
   1451         p=newTrie->data;
   1452         for(i=newTrie->dataLength; i>0; --i) {
   1453             *dest16++=(uint16_t)*p++;
   1454         }
   1455         break;
   1456     case UTRIE2_32_VALUE_BITS:
   1457         /* write 32-bit data values */
   1458         trie->data16=NULL;
   1459         trie->data32=(uint32_t *)dest16;
   1460         uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
   1461         break;
   1462     default:
   1463         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1464         return;
   1465     }
   1466 
   1467 #ifdef UTRIE2_DEBUG
   1468     utrie2_printLengths(trie, "");
   1469 #endif
   1470 
   1471 #ifdef UCPTRIE_DEBUG
   1472     umutablecptrie_setName(newTrie->t3, trie->name);
   1473     ucptrie_close(
   1474         umutablecptrie_buildImmutable(
   1475             newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode));
   1476 #endif
   1477     /* Delete the UNewTrie2. */
   1478     uprv_free(newTrie->data);
   1479     uprv_free(newTrie);
   1480     trie->newTrie=NULL;
   1481 }
   1482