Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ******************************************************************************
      5  * Copyright (C) 1996-2015, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.nio.ByteBuffer;
     13 import java.util.Arrays;
     14 
     15 import com.ibm.icu.lang.UCharacter;
     16 import com.ibm.icu.text.UTF16;
     17 
     18 /**
     19  * <p>A trie is a kind of compressed, serializable table of values
     20  * associated with Unicode code points (0..0x10ffff).</p>
     21  * <p>This class defines the basic structure of a trie and provides methods
     22  * to <b>retrieve the offsets to the actual data</b>.</p>
     23  * <p>Data will be the form of an array of basic types, char or int.</p>
     24  * <p>The actual data format will have to be specified by the user in the
     25  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
     26  * <p>This trie implementation is optimized for getting offset while walking
     27  * forward through a UTF-16 string.
     28  * Therefore, the simplest and fastest access macros are the
     29  * fromLead() and fromOffsetTrail() methods.
     30  * The fromBMP() method are a little more complicated; they get offsets even
     31  * for lead surrogate codepoints, while the fromLead() method get special
     32  * "folded" offsets for lead surrogate code units if there is relevant data
     33  * associated with them.
     34  * From such a folded offsets, an offset needs to be extracted to supply
     35  * to the fromOffsetTrail() methods.
     36  * To handle such supplementary codepoints, some offset information are kept
     37  * in the data.</p>
     38  * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
     39  * that offset from the folded value for the lead surrogate unit.</p>
     40  * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
     41  * com.ibm.icu.impl.IntTrie.</p>
     42  * @author synwee
     43  * @see com.ibm.icu.impl.CharTrie
     44  * @see com.ibm.icu.impl.IntTrie
     45  * @since release 2.1, Jan 01 2002
     46  */
     47 public abstract class Trie
     48 {
     49     // public class declaration ----------------------------------------
     50 
     51     /**
     52     * Character data in com.ibm.impl.Trie have different user-specified format
     53     * for different purposes.
     54     * This interface specifies methods to be implemented in order for
     55     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
     56     * the data.
     57     */
     58     public static interface DataManipulate
     59     {
     60         /**
     61         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
     62         * data
     63         * the index array offset of the indexes for that lead surrogate.
     64         * @param value data value for a surrogate from the trie, including the
     65         *        folding offset
     66         * @return data offset or 0 if there is no data for the lead surrogate
     67         */
     68         public int getFoldingOffset(int value);
     69     }
     70 
     71     // default implementation
     72     private static class DefaultGetFoldingOffset implements DataManipulate {
     73         @Override
     74         public int getFoldingOffset(int value) {
     75             return value;
     76         }
     77     }
     78 
     79     // public methods --------------------------------------------------
     80 
     81     /**
     82      * Determines if this trie has a linear latin 1 array
     83      * @return true if this trie has a linear latin 1 array, false otherwise
     84      */
     85     public final boolean isLatin1Linear()
     86     {
     87         return m_isLatin1Linear_;
     88     }
     89 
     90     /**
     91      * Checks if the argument Trie has the same data as this Trie.
     92      * Attributes are checked but not the index data.
     93      * @param other Trie to check
     94      * @return true if the argument Trie has the same data as this Trie, false
     95      *         otherwise
     96      */
     97     ///CLOVER:OFF
     98     @Override
     99     public boolean equals(Object other)
    100     {
    101         if (other == this) {
    102             return true;
    103         }
    104         if (!(other instanceof Trie)) {
    105             return false;
    106         }
    107         Trie othertrie = (Trie)other;
    108         return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
    109                && m_options_ == othertrie.m_options_
    110                && m_dataLength_ == othertrie.m_dataLength_
    111                && Arrays.equals(m_index_, othertrie.m_index_);
    112     }
    113 
    114     @Override
    115     public int hashCode() {
    116         assert false : "hashCode not designed";
    117         return 42;
    118     }
    119     ///CLOVER:ON
    120 
    121     /**
    122      * Gets the serialized data file size of the Trie. This is used during
    123      * trie data reading for size checking purposes.
    124      * @return size size of serialized trie data file in terms of the number
    125      *              of bytes
    126      */
    127     public int getSerializedDataSize()
    128     {
    129         // includes signature, option, dataoffset and datalength output
    130         int result = (4 << 2);
    131         result += (m_dataOffset_ << 1);
    132         if (isCharTrie()) {
    133             result += (m_dataLength_ << 1);
    134         }
    135         else if (isIntTrie()) {
    136             result += (m_dataLength_ << 2);
    137         }
    138         return result;
    139     }
    140 
    141     // protected constructor -------------------------------------------
    142 
    143     /**
    144      * Trie constructor for CharTrie use.
    145      * @param bytes data of an ICU data file, containing the trie
    146      * @param dataManipulate object containing the information to parse the
    147      *                       trie data
    148      */
    149     protected Trie(ByteBuffer bytes, DataManipulate  dataManipulate)
    150     {
    151         // Magic number to authenticate the data.
    152         int signature = bytes.getInt();
    153         m_options_    = bytes.getInt();
    154 
    155         if (!checkHeader(signature)) {
    156             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
    157         }
    158 
    159         if(dataManipulate != null) {
    160             m_dataManipulate_ = dataManipulate;
    161         } else {
    162             m_dataManipulate_ = new DefaultGetFoldingOffset();
    163         }
    164         m_isLatin1Linear_ = (m_options_ &
    165                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
    166         m_dataOffset_     = bytes.getInt();
    167         m_dataLength_     = bytes.getInt();
    168         unserialize(bytes);
    169     }
    170 
    171     /**
    172     * Trie constructor
    173     * @param index array to be used for index
    174     * @param options used by the trie
    175     * @param dataManipulate object containing the information to parse the
    176     *                       trie data
    177     */
    178     protected Trie(char index[], int options, DataManipulate dataManipulate)
    179     {
    180         m_options_ = options;
    181         if(dataManipulate != null) {
    182             m_dataManipulate_ = dataManipulate;
    183         } else {
    184             m_dataManipulate_ = new DefaultGetFoldingOffset();
    185         }
    186         m_isLatin1Linear_ = (m_options_ &
    187                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
    188         m_index_ = index;
    189         m_dataOffset_ = m_index_.length;
    190     }
    191 
    192 
    193     // protected data members ------------------------------------------
    194 
    195     /**
    196     * Lead surrogate code points' index displacement in the index array.
    197     * 0x10000-0xd800=0x2800
    198     * 0x2800 >> INDEX_STAGE_1_SHIFT_
    199     */
    200     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
    201     /**
    202     * Shift size for shifting right the input index. 1..9
    203     */
    204     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
    205     /**
    206     * Shift size for shifting left the index array values.
    207     * Increases possible data size with 16-bit index values at the cost
    208     * of compactability.
    209     * This requires blocks of stage 2 data to be aligned by
    210     * DATA_GRANULARITY.
    211     * 0..INDEX_STAGE_1_SHIFT
    212     */
    213     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
    214     /**
    215      * Number of data values in a stage 2 (data array) block.
    216      */
    217     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
    218     /**
    219     * Mask for getting the lower bits from the input index.
    220     * DATA_BLOCK_LENGTH - 1.
    221     */
    222     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
    223     /** Number of bits of a trail surrogate that are used in index table lookups. */
    224     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
    225     /**
    226      * Number of index (stage 1) entries per lead surrogate.
    227      * Same as number of index entries for 1024 trail surrogates,
    228      * ==0x400>>INDEX_STAGE_1_SHIFT_
    229      */
    230     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
    231     /** Length of the BMP portion of the index (stage 1) array. */
    232     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
    233     /**
    234     * Surrogate mask to use when shifting offset to retrieve supplementary
    235     * values
    236     */
    237     protected static final int SURROGATE_MASK_ = 0x3FF;
    238     /**
    239     * Index or UTF16 characters
    240     */
    241     protected char m_index_[];
    242     /**
    243     * Internal TrieValue which handles the parsing of the data value.
    244     * This class is to be implemented by the user
    245     */
    246     protected DataManipulate m_dataManipulate_;
    247     /**
    248     * Start index of the data portion of the trie. CharTrie combines
    249     * index and data into a char array, so this is used to indicate the
    250     * initial offset to the data portion.
    251     * Note this index always points to the initial value.
    252     */
    253     protected int m_dataOffset_;
    254     /**
    255     * Length of the data array
    256     */
    257     protected int m_dataLength_;
    258 
    259     // protected methods -----------------------------------------------
    260 
    261     /**
    262     * Gets the offset to the data which the surrogate pair points to.
    263     * @param lead lead surrogate
    264     * @param trail trailing surrogate
    265     * @return offset to data
    266     */
    267     protected abstract int getSurrogateOffset(char lead, char trail);
    268 
    269     /**
    270     * Gets the value at the argument index
    271     * @param index value at index will be retrieved
    272     * @return 32 bit value
    273     */
    274     protected abstract int getValue(int index);
    275 
    276     /**
    277     * Gets the default initial value
    278     * @return 32 bit value
    279     */
    280     protected abstract int getInitialValue();
    281 
    282     /**
    283     * Gets the offset to the data which the index ch after variable offset
    284     * points to.
    285     * Note for locating a non-supplementary character data offset, calling
    286     * <p>
    287     * getRawOffset(0, ch);
    288     * </p>
    289     * will do. Otherwise if it is a supplementary character formed by
    290     * surrogates lead and trail. Then we would have to call getRawOffset()
    291     * with getFoldingIndexOffset(). See getSurrogateOffset().
    292     * @param offset index offset which ch is to start from
    293     * @param ch index to be used after offset
    294     * @return offset to the data
    295     */
    296     protected final int getRawOffset(int offset, char ch)
    297     {
    298         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
    299                 << INDEX_STAGE_2_SHIFT_)
    300                 + (ch & INDEX_STAGE_3_MASK_);
    301     }
    302 
    303     /**
    304     * Gets the offset to data which the BMP character points to
    305     * Treats a lead surrogate as a normal code point.
    306     * @param ch BMP character
    307     * @return offset to data
    308     */
    309     protected final int getBMPOffset(char ch)
    310     {
    311         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
    312                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
    313                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
    314                 : getRawOffset(0, ch);
    315                 // using a getRawOffset(ch) makes no diff
    316     }
    317 
    318     /**
    319     * Gets the offset to the data which this lead surrogate character points
    320     * to.
    321     * Data at the returned offset may contain folding offset information for
    322     * the next trailing surrogate character.
    323     * @param ch lead surrogate character
    324     * @return offset to data
    325     */
    326     protected final int getLeadOffset(char ch)
    327     {
    328        return getRawOffset(0, ch);
    329     }
    330 
    331     /**
    332     * Internal trie getter from a code point.
    333     * Could be faster(?) but longer with
    334     *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
    335     * Gets the offset to data which the codepoint points to
    336     * @param ch codepoint
    337     * @return offset to data
    338     */
    339     protected final int getCodePointOffset(int ch)
    340     {
    341         // if ((ch >> 16) == 0) slower
    342         if (ch < 0) {
    343             return -1;
    344         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
    345             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
    346             return getRawOffset(0, (char)ch);
    347         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
    348             // BMP codepoint
    349             return getBMPOffset((char)ch);
    350         } else if (ch <= UCharacter.MAX_VALUE) {
    351             // look at the construction of supplementary characters
    352             // trail forms the ends of it.
    353             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
    354                                       (char)(ch & SURROGATE_MASK_));
    355         } else {
    356             // return -1 if there is an error, in this case we return
    357             return -1;
    358         }
    359     }
    360 
    361     /**
    362      * <p>Parses the byte buffer and creates the trie index with it.</p>
    363      * <p>The position of the input ByteBuffer must be right after the trie header.</p>
    364      * <p>This is overwritten by the child classes.
    365      * @param bytes buffer containing trie data
    366      */
    367     protected void unserialize(ByteBuffer bytes)
    368     {
    369         m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
    370     }
    371 
    372     /**
    373     * Determines if this is a 32 bit trie
    374     * @return true if options specifies this is a 32 bit trie
    375     */
    376     protected final boolean isIntTrie()
    377     {
    378         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
    379     }
    380 
    381     /**
    382     * Determines if this is a 16 bit trie
    383     * @return true if this is a 16 bit trie
    384     */
    385     protected final boolean isCharTrie()
    386     {
    387         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
    388     }
    389 
    390     // private data members --------------------------------------------
    391 
    392     //  struct UTrieHeader {
    393     //      int32_t   signature;
    394     //      int32_t   options  (a bit field)
    395     //      int32_t   indexLength
    396     //      int32_t   dataLength
    397 
    398     /**
    399     * Size of Trie header in bytes
    400     */
    401     protected static final int HEADER_LENGTH_ = 4 * 4;
    402     /**
    403     * Latin 1 option mask
    404     */
    405     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
    406     /**
    407     * Constant number to authenticate the byte block
    408     */
    409     protected static final int HEADER_SIGNATURE_ = 0x54726965;
    410     /**
    411     * Header option formatting
    412     */
    413     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
    414     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
    415     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
    416 
    417     /**
    418     * Flag indicator for Latin quick access data block
    419     */
    420     private boolean m_isLatin1Linear_;
    421 
    422     /**
    423     * <p>Trie options field.</p>
    424     * <p>options bit field:<br>
    425     * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
    426     * 8  0 = 16-bit data, 1=32-bit data<br>
    427     * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
    428     * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
    429     */
    430     private int m_options_;
    431 
    432     // private methods ---------------------------------------------------
    433 
    434     /**
    435     * Authenticates raw data header.
    436     * Checking the header information, signature and options.
    437     * @param signature This contains the options and type of a Trie
    438     * @return true if the header is authenticated valid
    439     */
    440     private final boolean checkHeader(int signature)
    441     {
    442         // check the signature
    443         // Trie in big-endian US-ASCII (0x54726965).
    444         // Magic number to authenticate the data.
    445         if (signature != HEADER_SIGNATURE_) {
    446             return false;
    447         }
    448 
    449         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
    450                                                     INDEX_STAGE_1_SHIFT_ ||
    451             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
    452                                                 HEADER_OPTIONS_SHIFT_MASK_)
    453                                                  != INDEX_STAGE_2_SHIFT_) {
    454             return false;
    455         }
    456         return true;
    457     }
    458 }
    459