Home | History | Annotate | Download | only in impl
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ******************************************************************************
      5  * Copyright (C) 1996-2015, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.io.IOException;
     13 import java.nio.ByteBuffer;
     14 import java.util.Arrays;
     15 
     16 import com.ibm.icu.text.UTF16;
     17 
     18 /**
     19  * Trie implementation which stores data in int, 32 bits.
     20  * 2015-sep-03: Used only in CharsetSelector which could be switched to {@link Trie2_32}
     21  * as long as that does not load ICU4C selector data.
     22  *
     23  * @author synwee
     24  * @see com.ibm.icu.impl.Trie
     25  * @since release 2.1, Jan 01 2002
     26  */
     27 public class IntTrie extends Trie
     28 {
     29     // public constructors ---------------------------------------------
     30 
     31     /**
     32     * <p>Creates a new Trie with the settings for the trie data.</p>
     33     * <p>Unserialize the 32-bit-aligned input stream and use the data for the
     34     * trie.</p>
     35     * @param bytes file buffer to a ICU data file, containing the trie
     36     * @param dataManipulate object which provides methods to parse the char
     37     *                        data
     38     * @throws IOException thrown when data reading fails
     39     */
     40     public IntTrie(ByteBuffer bytes, DataManipulate dataManipulate)
     41                                                     throws IOException
     42     {
     43         super(bytes, dataManipulate);
     44         if (!isIntTrie()) {
     45             throw new IllegalArgumentException(
     46                                "Data given does not belong to a int trie.");
     47         }
     48     }
     49 
     50     /**
     51      * Make a dummy IntTrie.
     52      * A dummy trie is an empty runtime trie, used when a real data trie cannot
     53      * be loaded.
     54      *
     55      * The trie always returns the initialValue,
     56      * or the leadUnitValue for lead surrogate code points.
     57      * The Latin-1 part is always set up to be linear.
     58      *
     59      * @param initialValue the initial value that is set for all code points
     60      * @param leadUnitValue the value for lead surrogate code _units_ that do not
     61      *                      have associated supplementary data
     62      * @param dataManipulate object which provides methods to parse the char data
     63      */
     64     @SuppressWarnings("all") // No way to ignore dead code warning specifically - see eclipse bug#282770
     65     public IntTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
     66         super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
     67 
     68         int dataLength, latin1Length, i, limit;
     69         char block;
     70 
     71         /* calculate the actual size of the dummy trie data */
     72 
     73         /* max(Latin-1, block 0) */
     74         dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
     75         if(leadUnitValue!=initialValue) {
     76             dataLength+=DATA_BLOCK_LENGTH;
     77         }
     78         m_data_=new int[dataLength];
     79         m_dataLength_=dataLength;
     80 
     81         m_initialValue_=initialValue;
     82 
     83         /* fill the index and data arrays */
     84 
     85         /* indexes are preset to 0 (block 0) */
     86 
     87         /* Latin-1 data */
     88         for(i=0; i<latin1Length; ++i) {
     89             m_data_[i]=initialValue;
     90         }
     91 
     92         if(leadUnitValue!=initialValue) {
     93             /* indexes for lead surrogate code units to the block after Latin-1 */
     94             block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
     95             i=0xd800>>INDEX_STAGE_1_SHIFT_;
     96             limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
     97             for(; i<limit; ++i) {
     98                 m_index_[i]=block;
     99             }
    100 
    101             /* data for lead surrogate code units */
    102             limit=latin1Length+DATA_BLOCK_LENGTH;
    103             for(i=latin1Length; i<limit; ++i) {
    104                 m_data_[i]=leadUnitValue;
    105             }
    106         }
    107     }
    108 
    109     // public methods --------------------------------------------------
    110 
    111     /**
    112     * Gets the value associated with the codepoint.
    113     * If no value is associated with the codepoint, a default value will be
    114     * returned.
    115     * @param ch codepoint
    116     * @return offset to data
    117     */
    118     public final int getCodePointValue(int ch)
    119     {
    120         int offset;
    121 
    122         // fastpath for U+0000..U+D7FF
    123         if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
    124             // copy of getRawOffset()
    125             offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
    126                     + (ch & INDEX_STAGE_3_MASK_);
    127             return m_data_[offset];
    128         }
    129 
    130         // handle U+D800..U+10FFFF
    131         offset = getCodePointOffset(ch);
    132         return (offset >= 0) ? m_data_[offset] : m_initialValue_;
    133     }
    134 
    135     /**
    136     * Gets the value to the data which this lead surrogate character points
    137     * to.
    138     * Returned data may contain folding offset information for the next
    139     * trailing surrogate character.
    140     * This method does not guarantee correct results for trail surrogates.
    141     * @param ch lead surrogate character
    142     * @return data value
    143     */
    144     public final int getLeadValue(char ch)
    145     {
    146         return m_data_[getLeadOffset(ch)];
    147     }
    148 
    149     /**
    150     * Get the value associated with the BMP code point.
    151     * Lead surrogate code points are treated as normal code points, with
    152     * unfolded values that may differ from getLeadValue() results.
    153     * @param ch the input BMP code point
    154     * @return trie data value associated with the BMP codepoint
    155     */
    156     public final int getBMPValue(char ch)
    157     {
    158         return m_data_[getBMPOffset(ch)];
    159     }
    160 
    161     /**
    162     * Get the value associated with a pair of surrogates.
    163     * @param lead a lead surrogate
    164     * @param trail a trail surrogate
    165     */
    166     public final int getSurrogateValue(char lead, char trail)
    167     {
    168         if (!UTF16.isLeadSurrogate(lead) || !UTF16.isTrailSurrogate(trail)) {
    169             throw new IllegalArgumentException(
    170                 "Argument characters do not form a supplementary character");
    171         }
    172         // get fold position for the next trail surrogate
    173         int offset = getSurrogateOffset(lead, trail);
    174 
    175         // get the real data from the folded lead/trail units
    176         if (offset > 0) {
    177             return m_data_[offset];
    178         }
    179 
    180         // return m_initialValue_ if there is an error
    181         return m_initialValue_;
    182     }
    183 
    184     /**
    185     * Get a value from a folding offset (from the value of a lead surrogate)
    186     * and a trail surrogate.
    187     * @param leadvalue the value of a lead surrogate that contains the
    188     *        folding offset
    189     * @param trail surrogate
    190     * @return trie data value associated with the trail character
    191     */
    192     public final int getTrailValue(int leadvalue, char trail)
    193     {
    194         if (m_dataManipulate_ == null) {
    195             throw new NullPointerException(
    196                              "The field DataManipulate in this Trie is null");
    197         }
    198         int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
    199         if (offset > 0) {
    200             return m_data_[getRawOffset(offset,
    201                                          (char)(trail & SURROGATE_MASK_))];
    202         }
    203         return m_initialValue_;
    204     }
    205 
    206     /**
    207      * <p>Gets the latin 1 fast path value.</p>
    208      * <p>Note this only works if latin 1 characters have their own linear
    209      * array.</p>
    210      * @param ch latin 1 characters
    211      * @return value associated with latin character
    212      */
    213     public final int getLatin1LinearValue(char ch)
    214     {
    215         return m_data_[INDEX_STAGE_3_MASK_ + 1 + ch];
    216     }
    217 
    218     /**
    219      * Checks if the argument Trie has the same data as this Trie
    220      * @param other Trie to check
    221      * @return true if the argument Trie has the same data as this Trie, false
    222      *         otherwise
    223      */
    224     ///CLOVER:OFF
    225     @Override
    226     public boolean equals(Object other)
    227     {
    228         boolean result = super.equals(other);
    229         if (result && other instanceof IntTrie) {
    230             IntTrie othertrie = (IntTrie)other;
    231             if (m_initialValue_ != othertrie.m_initialValue_
    232                 || !Arrays.equals(m_data_, othertrie.m_data_)) {
    233                 return false;
    234             }
    235             return true;
    236         }
    237         return false;
    238     }
    239 
    240     @Override
    241     public int hashCode() {
    242         assert false : "hashCode not designed";
    243         return 42;
    244     }
    245     ///CLOVER:ON
    246 
    247     // protected methods -----------------------------------------------
    248 
    249     /**
    250     * <p>Parses the input stream and stores its trie content into a index and
    251     * data array</p>
    252     * @param bytes data buffer containing trie data
    253     */
    254     @Override
    255     protected final void unserialize(ByteBuffer bytes)
    256     {
    257         super.unserialize(bytes);
    258         // one used for initial value
    259         m_data_ = ICUBinary.getInts(bytes, m_dataLength_, 0);
    260         m_initialValue_ = m_data_[0];
    261     }
    262 
    263     /**
    264     * Gets the offset to the data which the surrogate pair points to.
    265     * @param lead lead surrogate
    266     * @param trail trailing surrogate
    267     * @return offset to data
    268     */
    269     @Override
    270     protected final int getSurrogateOffset(char lead, char trail)
    271     {
    272         if (m_dataManipulate_ == null) {
    273             throw new NullPointerException(
    274                              "The field DataManipulate in this Trie is null");
    275         }
    276         // get fold position for the next trail surrogate
    277         int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
    278 
    279         // get the real data from the folded lead/trail units
    280         if (offset > 0) {
    281             return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
    282         }
    283 
    284         // return -1 if there is an error, in this case we return the default
    285         // value: m_initialValue_
    286         return -1;
    287     }
    288 
    289     /**
    290     * Gets the value at the argument index.
    291     * For use internally in TrieIterator
    292     * @param index value at index will be retrieved
    293     * @return 32 bit value
    294     * @see com.ibm.icu.impl.TrieIterator
    295     */
    296     @Override
    297     protected final int getValue(int index)
    298     {
    299       return m_data_[index];
    300     }
    301 
    302     /**
    303     * Gets the default initial value
    304     * @return 32 bit value
    305     */
    306     @Override
    307     protected final int getInitialValue()
    308     {
    309         return m_initialValue_;
    310     }
    311 
    312     // package private methods -----------------------------------------
    313 
    314     /**
    315      * Internal constructor for builder use
    316      * @param index the index array to be slotted into this trie
    317      * @param data the data array to be slotted into this trie
    318      * @param initialvalue the initial value for this trie
    319      * @param options trie options to use
    320      * @param datamanipulate folding implementation
    321      */
    322     IntTrie(char index[], int data[], int initialvalue, int options,
    323             DataManipulate datamanipulate)
    324     {
    325         super(index, options, datamanipulate);
    326         m_data_ = data;
    327         m_dataLength_ = m_data_.length;
    328         m_initialValue_ = initialvalue;
    329     }
    330 
    331     // private data members --------------------------------------------
    332 
    333     /**
    334     * Default value
    335     */
    336     private int m_initialValue_;
    337     /**
    338     * Array of char data
    339     */
    340     private int m_data_[];
    341 }
    342