Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 ******************************************************************************
      5 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
      6 * others. All Rights Reserved.                                               *
      7 ******************************************************************************
      8 */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.io.DataOutputStream;
     13 import java.io.IOException;
     14 import java.io.OutputStream;
     15 import java.util.Arrays;
     16 
     17 import com.ibm.icu.lang.UCharacter;
     18 import com.ibm.icu.text.UTF16;
     19 
     20 /**
     21  * Builder class to manipulate and generate a trie.
     22  * This is useful for ICU data in primitive types.
     23  * Provides a compact way to store information that is indexed by Unicode
     24  * values, such as character properties, types, keyboard values, etc. This is
     25  * very useful when you have a block of Unicode data that contains significant
     26  * values while the rest of the Unicode data is unused in the application or
     27  * when you have a lot of redundance, such as where all 21,000 Han ideographs
     28  * have the same value.  However, lookup is much faster than a hash table.
     29  * A trie of any primitive data type serves two purposes:
     30  * <UL type = round>
     31  *     <LI>Fast access of the indexed values.
     32  *     <LI>Smaller memory footprint.
     33  * </UL>
     34  * This is a direct port from the ICU4C version
     35  * @author             Syn Wee Quek
     36  */
     37 public class IntTrieBuilder extends TrieBuilder
     38 {
     39     // public constructor ----------------------------------------------
     40 
     41     /**
     42      * Copy constructor
     43      */
     44     public IntTrieBuilder(IntTrieBuilder table)
     45     {
     46         super(table);
     47         m_data_ = new int[m_dataCapacity_];
     48         System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
     49         m_initialValue_ = table.m_initialValue_;
     50         m_leadUnitValue_ = table.m_leadUnitValue_;
     51     }
     52 
     53     /**
     54      * Constructs a build table
     55      * @param aliasdata data to be filled into table
     56      * @param maxdatalength maximum data length allowed in table
     57      * @param initialvalue inital data value
     58      * @param latin1linear is latin 1 to be linear
     59      */
     60     public IntTrieBuilder(int aliasdata[], int maxdatalength,
     61                           int initialvalue, int leadunitvalue,
     62                           boolean latin1linear)
     63     {
     64         super();
     65         if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear
     66                                                   && maxdatalength < 1024)) {
     67             throw new IllegalArgumentException(
     68                                                "Argument maxdatalength is too small");
     69         }
     70 
     71         if (aliasdata != null) {
     72             m_data_ = aliasdata;
     73         }
     74         else {
     75             m_data_ = new int[maxdatalength];
     76         }
     77 
     78         // preallocate and reset the first data block (block index 0)
     79         int j = DATA_BLOCK_LENGTH;
     80 
     81         if (latin1linear) {
     82             // preallocate and reset the first block (number 0) and Latin-1
     83             // (U+0000..U+00ff) after that made sure above that
     84             // maxDataLength >= 1024
     85             // set indexes to point to consecutive data blocks
     86             int i = 0;
     87             do {
     88                 // do this at least for trie->index[0] even if that block is
     89                 // only partly used for Latin-1
     90                 m_index_[i ++] = j;
     91                 j += DATA_BLOCK_LENGTH;
     92             } while (i < (256 >> SHIFT_));
     93         }
     94 
     95         m_dataLength_ = j;
     96         // reset the initially allocated blocks to the initial value
     97         Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
     98         m_initialValue_ = initialvalue;
     99         m_leadUnitValue_ = leadunitvalue;
    100         m_dataCapacity_ = maxdatalength;
    101         m_isLatin1Linear_ = latin1linear;
    102         m_isCompacted_ = false;
    103     }
    104 
    105     // public methods -------------------------------------------------------
    106 
    107     /*public final void print()
    108       {
    109       int i = 0;
    110       int oldvalue = m_index_[i];
    111       int count = 0;
    112       System.out.println("index length " + m_indexLength_
    113       + " --------------------------");
    114       while (i < m_indexLength_) {
    115       if (m_index_[i] != oldvalue) {
    116       System.out.println("index has " + count + " counts of "
    117       + Integer.toHexString(oldvalue));
    118       count = 0;
    119       oldvalue = m_index_[i];
    120       }
    121       count ++;
    122       i ++;
    123       }
    124       System.out.println("index has " + count + " counts of "
    125       + Integer.toHexString(oldvalue));
    126       i = 0;
    127       oldvalue = m_data_[i];
    128       count = 0;
    129       System.out.println("data length " + m_dataLength_
    130       + " --------------------------");
    131       while (i < m_dataLength_) {
    132       if (m_data_[i] != oldvalue) {
    133       if ((oldvalue & 0xf1000000) == 0xf1000000) {
    134       int temp = oldvalue & 0xffffff;
    135       temp += 0x320;
    136       oldvalue = 0xf1000000 | temp;
    137       }
    138       if ((oldvalue & 0xf2000000) == 0xf2000000) {
    139       int temp = oldvalue & 0xffffff;
    140       temp += 0x14a;
    141       oldvalue = 0xf2000000 | temp;
    142       }
    143       System.out.println("data has " + count + " counts of "
    144       + Integer.toHexString(oldvalue));
    145       count = 0;
    146       oldvalue = m_data_[i];
    147       }
    148       count ++;
    149       i ++;
    150       }
    151       if ((oldvalue & 0xf1000000) == 0xf1000000) {
    152       int temp = oldvalue & 0xffffff;
    153       temp += 0x320;
    154       oldvalue = 0xf1000000 | temp;
    155       }
    156       if ((oldvalue & 0xf2000000) == 0xf2000000) {
    157       int temp = oldvalue & 0xffffff;
    158       temp += 0x14a;
    159       oldvalue = 0xf2000000 | temp;
    160       }
    161       System.out.println("data has " + count + " counts of "
    162       + Integer.toHexString(oldvalue));
    163       }
    164     */
    165     /**
    166      * Gets a 32 bit data from the table data
    167      * @param ch codepoint which data is to be retrieved
    168      * @return the 32 bit data
    169      */
    170     public int getValue(int ch)
    171     {
    172         // valid, uncompacted trie and valid c?
    173         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
    174             return 0;
    175         }
    176 
    177         int block = m_index_[ch >> SHIFT_];
    178         return m_data_[Math.abs(block) + (ch & MASK_)];
    179     }
    180 
    181     /**
    182      * Get a 32 bit data from the table data
    183      * @param ch  code point for which data is to be retrieved.
    184      * @param inBlockZero  Output parameter, inBlockZero[0] returns true if the
    185      *                      char maps into block zero, otherwise false.
    186      * @return the 32 bit data value.
    187      */
    188     public int getValue(int ch, boolean [] inBlockZero)
    189     {
    190         // valid, uncompacted trie and valid c?
    191         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
    192             if (inBlockZero != null) {
    193                 inBlockZero[0] = true;
    194             }
    195             return 0;
    196         }
    197 
    198         int block = m_index_[ch >> SHIFT_];
    199         if (inBlockZero != null) {
    200             inBlockZero[0] = (block == 0);
    201         }
    202         return m_data_[Math.abs(block) + (ch & MASK_)];
    203     }
    204 
    205 
    206     /**
    207      * Sets a 32 bit data in the table data
    208      * @param ch codepoint which data is to be set
    209      * @param value to set
    210      * @return true if the set is successful, otherwise
    211      *              if the table has been compacted return false
    212      */
    213     public boolean setValue(int ch, int value)
    214     {
    215         // valid, uncompacted trie and valid c?
    216         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
    217             return false;
    218         }
    219 
    220         int block = getDataBlock(ch);
    221         if (block < 0) {
    222             return false;
    223         }
    224 
    225         m_data_[block + (ch & MASK_)] = value;
    226         return true;
    227     }
    228 
    229     /**
    230      * Serializes the build table with 32 bit data
    231      * @param datamanipulate builder raw fold method implementation
    232      * @param triedatamanipulate result trie fold method
    233      * @return a new trie
    234      */
    235     public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
    236                              Trie.DataManipulate triedatamanipulate)
    237     {
    238         if (datamanipulate == null) {
    239             throw new IllegalArgumentException("Parameters can not be null");
    240         }
    241         // fold and compact if necessary, also checks that indexLength is
    242         // within limits
    243         if (!m_isCompacted_) {
    244             // compact once without overlap to improve folding
    245             compact(false);
    246             // fold the supplementary part of the index array
    247             fold(datamanipulate);
    248             // compact again with overlap for minimum data array length
    249             compact(true);
    250             m_isCompacted_ = true;
    251         }
    252         // is dataLength within limits?
    253         if (m_dataLength_ >= MAX_DATA_LENGTH_) {
    254             throw new ArrayIndexOutOfBoundsException("Data length too small");
    255         }
    256 
    257         char index[] = new char[m_indexLength_];
    258         int data[] = new int[m_dataLength_];
    259         // write the index (stage 1) array and the 32-bit data (stage 2) array
    260         // write 16-bit index values shifted right by INDEX_SHIFT_
    261         for (int i = 0; i < m_indexLength_; i ++) {
    262             index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
    263         }
    264         // write 32-bit data values
    265         System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
    266 
    267         int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
    268         options |= OPTIONS_DATA_IS_32_BIT_;
    269         if (m_isLatin1Linear_) {
    270             options |= OPTIONS_LATIN1_IS_LINEAR_;
    271         }
    272         return new IntTrie(index, data, m_initialValue_, options,
    273                            triedatamanipulate);
    274     }
    275 
    276 
    277     /**
    278      * Serializes the build table to an output stream.
    279      *
    280      * Compacts the build-time trie after all values are set, and then
    281      * writes the serialized form onto an output stream.
    282      *
    283      * After this, this build-time Trie can only be serialized again and/or closed;
    284      * no further values can be added.
    285      *
    286      * This function is the rough equivalent of utrie_seriaize() in ICU4C.
    287      *
    288      * @param os the output stream to which the seriaized trie will be written.
    289      *         If nul, the function still returns the size of the serialized Trie.
    290      * @param reduceTo16Bits If true, reduce the data size to 16 bits.  The resulting
    291      *         serialized form can then be used to create a CharTrie.
    292      * @param datamanipulate builder raw fold method implementation
    293      * @return the number of bytes written to the output stream.
    294      */
    295      public int serialize(OutputStream os, boolean reduceTo16Bits,
    296             TrieBuilder.DataManipulate datamanipulate)  throws IOException {
    297          if (datamanipulate == null) {
    298              throw new IllegalArgumentException("Parameters can not be null");
    299          }
    300 
    301          // fold and compact if necessary, also checks that indexLength is
    302          // within limits
    303          if (!m_isCompacted_) {
    304              // compact once without overlap to improve folding
    305              compact(false);
    306              // fold the supplementary part of the index array
    307              fold(datamanipulate);
    308              // compact again with overlap for minimum data array length
    309              compact(true);
    310              m_isCompacted_ = true;
    311          }
    312 
    313          // is dataLength within limits?
    314          int length;
    315          if (reduceTo16Bits) {
    316              length = m_dataLength_ + m_indexLength_;
    317          } else {
    318              length = m_dataLength_;
    319          }
    320          if (length >= MAX_DATA_LENGTH_) {
    321              throw new ArrayIndexOutOfBoundsException("Data length too small");
    322          }
    323 
    324          //  struct UTrieHeader {
    325          //      int32_t   signature;
    326          //      int32_t   options  (a bit field)
    327          //      int32_t   indexLength
    328          //      int32_t   dataLength
    329          length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;
    330          if(reduceTo16Bits) {
    331              length+=2*m_dataLength_;
    332          } else {
    333              length+=4*m_dataLength_;
    334          }
    335 
    336          if (os == null) {
    337              // No output stream.  Just return the length of the serialized Trie, in bytes.
    338              return length;
    339          }
    340 
    341          DataOutputStream dos = new DataOutputStream(os);
    342          dos.writeInt(Trie.HEADER_SIGNATURE_);
    343 
    344          int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<<Trie.HEADER_OPTIONS_INDEX_SHIFT_);
    345          if(!reduceTo16Bits) {
    346              options |= Trie.HEADER_OPTIONS_DATA_IS_32_BIT_;
    347          }
    348          if(m_isLatin1Linear_) {
    349              options |= Trie.HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_;
    350          }
    351          dos.writeInt(options);
    352 
    353          dos.writeInt(m_indexLength_);
    354          dos.writeInt(m_dataLength_);
    355 
    356          /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
    357          if(reduceTo16Bits) {
    358              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
    359              for (int i=0; i<m_indexLength_; i++) {
    360                  int v = (m_index_[i] + m_indexLength_) >>> Trie.INDEX_STAGE_2_SHIFT_;
    361                  dos.writeChar(v);
    362              }
    363 
    364              /* write 16-bit data values */
    365              for(int i=0; i<m_dataLength_; i++) {
    366                  int v = m_data_[i] & 0x0000ffff;
    367                  dos.writeChar(v);
    368              }
    369          } else {
    370              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
    371              for (int i=0; i<m_indexLength_; i++) {
    372                  int v = (m_index_[i]) >>> Trie.INDEX_STAGE_2_SHIFT_;
    373                  dos.writeChar(v);
    374              }
    375 
    376              /* write 32-bit data values */
    377              for(int i=0; i<m_dataLength_; i++) {
    378                  dos.writeInt(m_data_[i]);
    379              }
    380         }
    381 
    382          return length;
    383 
    384     }
    385 
    386 
    387     /**
    388      * Set a value in a range of code points [start..limit].
    389      * All code points c with start &lt;= c &lt; limit will get the value if
    390      * overwrite is true or if the old value is 0.
    391      * @param start the first code point to get the value
    392      * @param limit one past the last code point to get the value
    393      * @param value the value
    394      * @param overwrite flag for whether old non-initial values are to be
    395      *        overwritten
    396      * @return false if a failure occurred (illegal argument or data array
    397      *               overrun)
    398      */
    399     public boolean setRange(int start, int limit, int value,
    400                             boolean overwrite)
    401     {
    402         // repeat value in [start..limit[
    403         // mark index values for repeat-data blocks by setting bit 31 of the
    404         // index values fill around existing values if any, if(overwrite)
    405 
    406         // valid, uncompacted trie and valid indexes?
    407         if (m_isCompacted_ || start < UCharacter.MIN_VALUE
    408             || start > UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE
    409             || limit > (UCharacter.MAX_VALUE + 1) || start > limit) {
    410             return false;
    411         }
    412 
    413         if (start == limit) {
    414             return true; // nothing to do
    415         }
    416 
    417         if ((start & MASK_) != 0) {
    418             // set partial block at [start..following block boundary[
    419             int block = getDataBlock(start);
    420             if (block < 0) {
    421                 return false;
    422             }
    423 
    424             int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
    425             if (nextStart <= limit) {
    426                 fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,
    427                           value, overwrite);
    428                 start = nextStart;
    429             }
    430             else {
    431                 fillBlock(block, start & MASK_, limit & MASK_,
    432                           value, overwrite);
    433                 return true;
    434             }
    435         }
    436 
    437         // number of positions in the last, partial block
    438         int rest = limit & MASK_;
    439 
    440         // round down limit to a block boundary
    441         limit &= ~MASK_;
    442 
    443         // iterate over all-value blocks
    444         int repeatBlock = 0;
    445         if (value == m_initialValue_) {
    446             // repeatBlock = 0; assigned above
    447         }
    448         else {
    449             repeatBlock = -1;
    450         }
    451         while (start < limit) {
    452             // get index value
    453             int block = m_index_[start >> SHIFT_];
    454             if (block > 0) {
    455                 // already allocated, fill in value
    456                 fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
    457             }
    458             else if (m_data_[-block] != value && (block == 0 || overwrite)) {
    459                 // set the repeatBlock instead of the current block 0 or range
    460                 // block
    461                 if (repeatBlock >= 0) {
    462                     m_index_[start >> SHIFT_] = -repeatBlock;
    463                 }
    464                 else {
    465                     // create and set and fill the repeatBlock
    466                     repeatBlock = getDataBlock(start);
    467                     if (repeatBlock < 0) {
    468                         return false;
    469                     }
    470 
    471                     // set the negative block number to indicate that it is a
    472                     // repeat block
    473                     m_index_[start >> SHIFT_] = -repeatBlock;
    474                     fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
    475                 }
    476             }
    477 
    478             start += DATA_BLOCK_LENGTH;
    479         }
    480 
    481         if (rest > 0) {
    482             // set partial block at [last block boundary..limit[
    483             int block = getDataBlock(start);
    484             if (block < 0) {
    485                 return false;
    486             }
    487             fillBlock(block, 0, rest, value, overwrite);
    488         }
    489 
    490         return true;
    491     }
    492 
    493     // protected data member ------------------------------------------------
    494 
    495     protected int m_data_[];
    496     protected int m_initialValue_;
    497 
    498     //  private data member ------------------------------------------------
    499 
    500     private int m_leadUnitValue_;
    501 
    502     // private methods ------------------------------------------------------
    503 
    504     private int allocDataBlock()
    505     {
    506         int newBlock = m_dataLength_;
    507         int newTop = newBlock + DATA_BLOCK_LENGTH;
    508         if (newTop > m_dataCapacity_) {
    509             // out of memory in the data array
    510             return -1;
    511         }
    512         m_dataLength_ = newTop;
    513         return newBlock;
    514     }
    515 
    516     /**
    517      * No error checking for illegal arguments.
    518      * @param ch codepoint to look for
    519      * @return -1 if no new data block available (out of memory in data array)
    520      */
    521     private int getDataBlock(int ch)
    522     {
    523         ch >>= SHIFT_;
    524         int indexValue = m_index_[ch];
    525         if (indexValue > 0) {
    526             return indexValue;
    527         }
    528 
    529         // allocate a new data block
    530         int newBlock = allocDataBlock();
    531         if (newBlock < 0) {
    532             // out of memory in the data array
    533             return -1;
    534         }
    535         m_index_[ch] = newBlock;
    536 
    537         // copy-on-write for a block from a setRange()
    538         System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,
    539                          DATA_BLOCK_LENGTH << 2);
    540         return newBlock;
    541     }
    542 
    543     /**
    544      * Compact a folded build-time trie.
    545      * The compaction
    546      * - removes blocks that are identical with earlier ones
    547      * - overlaps adjacent blocks as much as possible (if overlap == true)
    548      * - moves blocks in steps of the data granularity
    549      * - moves and overlaps blocks that overlap with multiple values in the overlap region
    550      *
    551      * It does not
    552      * - try to move and overlap blocks that are not already adjacent
    553      * @param overlap flag
    554      */
    555     private void compact(boolean overlap)
    556     {
    557         if (m_isCompacted_) {
    558             return; // nothing left to do
    559         }
    560 
    561         // compaction
    562         // initialize the index map with "block is used/unused" flags
    563         findUnusedBlocks();
    564 
    565         // if Latin-1 is preallocated and linear, then do not compact Latin-1
    566         // data
    567         int overlapStart = DATA_BLOCK_LENGTH;
    568         if (m_isLatin1Linear_ && SHIFT_ <= 8) {
    569             overlapStart += 256;
    570         }
    571 
    572         int newStart = DATA_BLOCK_LENGTH;
    573         int i;
    574         for (int start = newStart; start < m_dataLength_;) {
    575             // start: index of first entry of current block
    576             // newStart: index where the current block is to be moved
    577             //           (right after current end of already-compacted data)
    578             // skip blocks that are not used
    579             if (m_map_[start >>> SHIFT_] < 0) {
    580                 // advance start to the next block
    581                 start += DATA_BLOCK_LENGTH;
    582                 // leave newStart with the previous block!
    583                 continue;
    584             }
    585             // search for an identical block
    586             if (start >= overlapStart) {
    587                 i = findSameDataBlock(m_data_, newStart, start,
    588                                           overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
    589                 if (i >= 0) {
    590                     // found an identical block, set the other block's index
    591                     // value for the current block
    592                     m_map_[start >>> SHIFT_] = i;
    593                     // advance start to the next block
    594                     start += DATA_BLOCK_LENGTH;
    595                     // leave newStart with the previous block!
    596                     continue;
    597                 }
    598             }
    599             // see if the beginning of this block can be overlapped with the
    600             // end of the previous block
    601             if(overlap && start>=overlapStart) {
    602                 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
    603                 for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;
    604                     i>0 && !equal_int(m_data_, newStart-i, start, i);
    605                     i-=DATA_GRANULARITY_) {}
    606             } else {
    607                 i=0;
    608             }
    609             if (i > 0) {
    610                 // some overlap
    611                 m_map_[start >>> SHIFT_] = newStart - i;
    612                 // move the non-overlapping indexes to their new positions
    613                 start += i;
    614                 for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
    615                     m_data_[newStart ++] = m_data_[start ++];
    616                 }
    617             }
    618             else if (newStart < start) {
    619                 // no overlap, just move the indexes to their new positions
    620                 m_map_[start >>> SHIFT_] = newStart;
    621                 for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
    622                     m_data_[newStart ++] = m_data_[start ++];
    623                 }
    624             }
    625             else { // no overlap && newStart==start
    626                 m_map_[start >>> SHIFT_] = start;
    627                 newStart += DATA_BLOCK_LENGTH;
    628                 start = newStart;
    629             }
    630         }
    631         // now adjust the index (stage 1) table
    632         for (i = 0; i < m_indexLength_; ++ i) {
    633             m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
    634         }
    635         m_dataLength_ = newStart;
    636     }
    637 
    638     /**
    639      * Find the same data block
    640      * @param data array
    641      * @param dataLength
    642      * @param otherBlock
    643      * @param step
    644      */
    645     private static final int findSameDataBlock(int data[], int dataLength,
    646                                                int otherBlock, int step)
    647     {
    648         // ensure that we do not even partially get past dataLength
    649         dataLength -= DATA_BLOCK_LENGTH;
    650 
    651         for (int block = 0; block <= dataLength; block += step) {
    652             if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {
    653                 return block;
    654             }
    655         }
    656         return -1;
    657     }
    658 
    659     /**
    660      * Fold the normalization data for supplementary code points into
    661      * a compact area on top of the BMP-part of the trie index,
    662      * with the lead surrogates indexing this compact area.
    663      *
    664      * Duplicate the index values for lead surrogates:
    665      * From inside the BMP area, where some may be overridden with folded values,
    666      * to just after the BMP area, where they can be retrieved for
    667      * code point lookups.
    668      * @param manipulate fold implementation
    669      */
    670     private final void fold(DataManipulate manipulate)
    671     {
    672         int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
    673         int index[] = m_index_;
    674         // copy the lead surrogate indexes into a temporary array
    675         System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
    676                          SURROGATE_BLOCK_COUNT_);
    677 
    678         // set all values for lead surrogate code *units* to leadUnitValue
    679         // so that by default runtime lookups will find no data for associated
    680         // supplementary code points, unless there is data for such code points
    681         // which will result in a non-zero folding value below that is set for
    682         // the respective lead units
    683         // the above saved the indexes for surrogate code *points*
    684         // fill the indexes with simplified code from utrie_setRange32()
    685         int block = 0;
    686         if (m_leadUnitValue_ == m_initialValue_) {
    687             // leadUnitValue == initialValue, use all-initial-value block
    688             // block = 0; if block here left empty
    689         }
    690         else {
    691             // create and fill the repeatBlock
    692             block = allocDataBlock();
    693             if (block < 0) {
    694                 // data table overflow
    695                 throw new IllegalStateException("Internal error: Out of memory space");
    696             }
    697             fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
    698             // negative block number to indicate that it is a repeat block
    699             block = -block;
    700         }
    701         for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
    702             m_index_[c] = block;
    703         }
    704 
    705         // Fold significant index values into the area just after the BMP
    706         // indexes.
    707         // In case the first lead surrogate has significant data,
    708         // its index block must be used first (in which case the folding is a
    709         // no-op).
    710         // Later all folded index blocks are moved up one to insert the copied
    711         // lead surrogate indexes.
    712         int indexLength = BMP_INDEX_LENGTH_;
    713         // search for any index (stage 1) entries for supplementary code points
    714         for (int c = 0x10000; c < 0x110000;) {
    715             if (index[c >> SHIFT_] != 0) {
    716                 // there is data, treat the full block for a lead surrogate
    717                 c &= ~0x3ff;
    718                 // is there an identical index block?
    719                 block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
    720 
    721                 // get a folded value for [c..c+0x400[ and,
    722                 // if different from the value for the lead surrogate code
    723                 // point, set it for the lead surrogate code unit
    724 
    725                 int value = manipulate.getFoldedValue(c,
    726                                                       block + SURROGATE_BLOCK_COUNT_);
    727                 if (value != getValue(UTF16.getLeadSurrogate(c))) {
    728                     if (!setValue(UTF16.getLeadSurrogate(c), value)) {
    729                         // data table overflow
    730                         throw new ArrayIndexOutOfBoundsException(
    731                                                                  "Data table overflow");
    732                     }
    733                     // if we did not find an identical index block...
    734                     if (block == indexLength) {
    735                         // move the actual index (stage 1) entries from the
    736                         // supplementary position to the new one
    737                         System.arraycopy(index, c >> SHIFT_, index, indexLength,
    738                                          SURROGATE_BLOCK_COUNT_);
    739                         indexLength += SURROGATE_BLOCK_COUNT_;
    740                     }
    741                 }
    742                 c += 0x400;
    743             }
    744             else {
    745                 c += DATA_BLOCK_LENGTH;
    746             }
    747         }
    748 
    749         // index array overflow?
    750         // This is to guarantee that a folding offset is of the form
    751         // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
    752         // If the index is too large, then n>=1024 and more than 10 bits are
    753         // necessary.
    754         // In fact, it can only ever become n==1024 with completely unfoldable
    755         // data and the additional block of duplicated values for lead
    756         // surrogates.
    757         if (indexLength >= MAX_INDEX_LENGTH_) {
    758             throw new ArrayIndexOutOfBoundsException("Index table overflow");
    759         }
    760         // make space for the lead surrogate index block and insert it between
    761         // the BMP indexes and the folded ones
    762         System.arraycopy(index, BMP_INDEX_LENGTH_, index,
    763                          BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
    764                          indexLength - BMP_INDEX_LENGTH_);
    765         System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
    766                          SURROGATE_BLOCK_COUNT_);
    767         indexLength += SURROGATE_BLOCK_COUNT_;
    768         m_indexLength_ = indexLength;
    769     }
    770 
    771     /**
    772      * @internal
    773      */
    774     private void fillBlock(int block, int start, int limit, int value,
    775                            boolean overwrite)
    776     {
    777         limit += block;
    778         block += start;
    779         if (overwrite) {
    780             while (block < limit) {
    781                 m_data_[block ++] = value;
    782             }
    783         }
    784         else {
    785             while (block < limit) {
    786                 if (m_data_[block] == m_initialValue_) {
    787                     m_data_[block] = value;
    788                 }
    789                 ++ block;
    790             }
    791         }
    792     }
    793 }
    794 
    795