Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 ******************************************************************************
      6 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
      7 * others. All Rights Reserved.                                               *
      8 ******************************************************************************
      9 */
     10 
     11 package android.icu.impl;
     12 
     13 import java.util.Arrays;
     14 
     15 import android.icu.lang.UCharacter;
     16 
     17 /**
     18  * Builder class to manipulate and generate a trie.
     19  * This is useful for ICU data in primitive types.
     20  * Provides a compact way to store information that is indexed by Unicode
     21  * values, such as character properties, types, keyboard values, etc. This is
     22  * very useful when you have a block of Unicode data that contains significant
     23  * values while the rest of the Unicode data is unused in the application or
     24  * when you have a lot of redundance, such as where all 21,000 Han ideographs
     25  * have the same value.  However, lookup is much faster than a hash table.
     26  * A trie of any primitive data type serves two purposes:
     27  * <UL type = round>
     28  *     <LI>Fast access of the indexed values.
     29  *     <LI>Smaller memory footprint.
     30  * </UL>
     31  * This is a direct port from the ICU4C version
     32  * @author             Syn Wee Quek
     33  * @hide Only a subset of ICU is exposed in Android
     34  */
     35 public class TrieBuilder
     36 {
     37     // public data member ----------------------------------------------
     38 
     39     /**
     40      * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
     41      * 0x200
     42      */
     43     public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
     44 
     45     // public class declaration ----------------------------------------
     46 
     47     /**
     48      * Character data in com.ibm.impl.Trie have different user-specified format
     49      * for different purposes.
     50      * This interface specifies methods to be implemented in order for
     51      * com.ibm.impl.Trie, to surrogate offset information encapsulated within
     52      * the data.
     53      */
     54     public static interface DataManipulate
     55     {
     56         /**
     57          * Build-time trie callback function, used with serialize().
     58          * This function calculates a lead surrogate's value including a
     59          * folding offset from the 1024 supplementary code points
     60          * [start..start+1024[ .
     61          * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
     62          * The folding offset is provided by the caller.
     63          * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
     64          * with n=0..1023.
     65          * Instead of the offset itself, n can be stored in 10 bits - or fewer
     66          * if it can be assumed that few lead surrogates have associated data.
     67          * The returned value must be
     68          *  - not zero if and only if there is relevant data for the
     69          *                        corresponding 1024 supplementary code points
     70          *  - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
     71          *                                                    offset))==offset
     72          * @return a folded value, or 0 if there is no relevant data for the
     73          *         lead surrogate.
     74          */
     75         public int getFoldedValue(int start, int offset);
     76     }
     77 
     78     // public methods ----------------------------------------------------
     79 
     80     /**
     81      * Checks if the character belongs to a zero block in the trie
     82      * @param ch codepoint which data is to be retrieved
     83      * @return true if ch is in the zero block
     84      */
     85     public boolean isInZeroBlock(int ch)
     86     {
     87         // valid, uncompacted trie and valid c?
     88         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
     89             || ch < UCharacter.MIN_VALUE) {
     90             return true;
     91         }
     92 
     93         return m_index_[ch >> SHIFT_] == 0;
     94     }
     95 
     96     // package private method -----------------------------------------------
     97 
     98     // protected data member -----------------------------------------------
     99 
    100     /**
    101      * Index values at build-time are 32 bits wide for easier processing.
    102      * Bit 31 is set if the data block is used by multiple index values
    103      * (from setRange()).
    104      */
    105     protected int m_index_[];
    106     protected int m_indexLength_;
    107     protected int m_dataCapacity_;
    108     protected int m_dataLength_;
    109     protected boolean m_isLatin1Linear_;
    110     protected boolean m_isCompacted_;
    111     /**
    112      * Map of adjusted indexes, used in utrie_compact().
    113      * Maps from original indexes to new ones.
    114      */
    115     protected int m_map_[];
    116 
    117     /**
    118      * Shift size for shifting right the input index. 1..9
    119      */
    120     protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
    121     /**
    122      * Length of the index (stage 1) array before folding.
    123      * Maximum number of Unicode code points (0x110000) shifted right by
    124      * SHIFT.
    125      */
    126     protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
    127     /**
    128      * Length of the BMP portion of the index (stage 1) array.
    129      */
    130     protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
    131     /**
    132      * Number of index (stage 1) entries per lead surrogate.
    133      * Same as number of indexe entries for 1024 trail surrogates,
    134      * ==0x400>>UTRIE_SHIFT
    135      * 10 - SHIFT == Number of bits of a trail surrogate that are used in
    136      *               index table lookups.
    137      */
    138     protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
    139     /**
    140      * Mask for getting the lower bits from the input index.
    141      * DATA_BLOCK_LENGTH - 1.
    142      */
    143     protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
    144     /**
    145      * Shift size for shifting left the index array values.
    146      * Increases possible data size with 16-bit index values at the cost
    147      * of compactability.
    148      * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
    149      * 0..UTRIE_SHIFT
    150      */
    151     protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
    152     /**
    153      * Maximum length of the runtime data (stage 2) array.
    154      * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
    155      */
    156     protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
    157     /**
    158      * Shifting to position the index value in options
    159      */
    160     protected static final int OPTIONS_INDEX_SHIFT_ = 4;
    161     /**
    162      * If set, then the data (stage 2) array is 32 bits wide.
    163      */
    164     protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
    165     /**
    166      * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
    167      * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
    168      */
    169     protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
    170     /**
    171      * The alignment size of a stage 2 data block. Also the granularity for
    172      * compaction.
    173      */
    174     protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
    175 
    176     // protected constructor ----------------------------------------------
    177 
    178     protected TrieBuilder()
    179     {
    180         m_index_ = new int[MAX_INDEX_LENGTH_];
    181         m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
    182         m_isLatin1Linear_ = false;
    183         m_isCompacted_ = false;
    184         m_indexLength_ = MAX_INDEX_LENGTH_;
    185     }
    186 
    187     protected TrieBuilder(TrieBuilder table)
    188     {
    189         m_index_ = new int[MAX_INDEX_LENGTH_];
    190         m_indexLength_ = table.m_indexLength_;
    191         System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
    192         m_dataCapacity_ = table.m_dataCapacity_;
    193         m_dataLength_ = table.m_dataLength_;
    194         m_map_ = new int[table.m_map_.length];
    195         System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
    196         m_isLatin1Linear_ = table.m_isLatin1Linear_;
    197         m_isCompacted_ = table.m_isCompacted_;
    198     }
    199 
    200     // protected functions ------------------------------------------------
    201 
    202     /**
    203      * Compare two sections of an array for equality.
    204      */
    205     protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
    206         while(length>0 && array[start1]==array[start2]) {
    207             ++start1;
    208             ++start2;
    209             --length;
    210         }
    211         return length==0;
    212     }
    213 
    214     /**
    215      * Set a value in the trie index map to indicate which data block
    216      * is referenced and which one is not.
    217      * utrie_compact() will remove data blocks that are not used at all.
    218      * Set
    219      * - 0 if it is used
    220      * - -1 if it is not used
    221      */
    222     protected void findUnusedBlocks()
    223     {
    224         // fill the entire map with "not used"
    225         Arrays.fill(m_map_, 0xff);
    226 
    227         // mark each block that _is_ used with 0
    228         for (int i = 0; i < m_indexLength_; ++ i) {
    229             m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
    230         }
    231 
    232         // never move the all-initial-value block 0
    233         m_map_[0] = 0;
    234     }
    235 
    236     /**
    237      * Finds the same index block as the otherBlock
    238      * @param index array
    239      * @param indexLength size of index
    240      * @param otherBlock
    241      * @return same index block
    242      */
    243     protected static final int findSameIndexBlock(int index[], int indexLength,
    244                                                   int otherBlock)
    245     {
    246         for (int block = BMP_INDEX_LENGTH_; block < indexLength;
    247              block += SURROGATE_BLOCK_COUNT_) {
    248             if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
    249                 return block;
    250             }
    251         }
    252         return indexLength;
    253     }
    254 
    255     // private data member ------------------------------------------------
    256 
    257     /**
    258      * Maximum length of the build-time data (stage 2) array.
    259      * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
    260      * (Number of Unicode code points + one all-initial-value block +
    261      *  possible duplicate entries for 1024 lead surrogates.)
    262      */
    263     private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
    264         0x110000 + DATA_BLOCK_LENGTH + 0x400;
    265 }
    266