Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2009-2014, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.io.DataOutputStream;
     13 import java.io.IOException;
     14 import java.io.OutputStream;
     15 import java.nio.ByteBuffer;
     16 
     17 
     18 /**
     19  * @author aheninger
     20  *
     21  * A read-only Trie2, holding 16 bit data values.
     22  *
     23  * A Trie2 is a highly optimized data structure for mapping from Unicode
     24  * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
     25  *
     26  * See class Trie2 for descriptions of the API for accessing the contents of a trie.
     27  *
     28  * The fundamental data access methods are declared final in this class, with
     29  * the intent that applications might gain a little extra performance, when compared
     30  * with calling the same methods via the abstract UTrie2 base class.
     31  */
     32 public final class Trie2_16 extends Trie2 {
     33 
     34 
     35     /**
     36      *  Internal constructor, not for general use.
     37      */
     38     Trie2_16() {
     39     }
     40 
     41 
     42     /**
     43      * Create a Trie2 from its serialized form.  Inverse of utrie2_serialize().
     44      * The serialized format is identical between ICU4C and ICU4J, so this function
     45      * will work with serialized Trie2s from either.
     46      *
     47      * The serialized Trie2 in the bytes may be in either little or big endian byte order.
     48      * This allows using serialized Tries from ICU4C without needing to consider the
     49      * byte order of the system that created them.
     50      *
     51      * @param bytes a byte buffer to the serialized form of a UTrie2.
     52      * @return An unserialized Trie2_16, ready for use.
     53      * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
     54      * @throws IOException if a read error occurs in the buffer.
     55      * @throws ClassCastException if the bytes contain a serialized Trie2_32
     56      */
     57     public static Trie2_16  createFromSerialized(ByteBuffer bytes) throws IOException {
     58         return (Trie2_16) Trie2.createFromSerialized(bytes);
     59     }
     60 
     61     /**
     62      * Get the value for a code point as stored in the Trie2.
     63      *
     64      * @param codePoint the code point
     65      * @return the value
     66      */
     67     @Override
     68     public final int get(int codePoint) {
     69         int value;
     70         int ix;
     71 
     72         if (codePoint >= 0) {
     73             if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
     74                 // Ordinary BMP code point, excluding leading surrogates.
     75                 // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
     76                 // 16 bit data is stored in the index array itself.
     77                 ix = index[codePoint >> UTRIE2_SHIFT_2];
     78                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
     79                 value = index[ix];
     80                 return value;
     81             }
     82             if (codePoint <= 0xffff) {
     83                 // Lead Surrogate Code Point.  A Separate index section is stored for
     84                 // lead surrogate code units and code points.
     85                 //   The main index has the code unit data.
     86                 //   For this function, we need the code point data.
     87                 // Note: this expression could be refactored for slightly improved efficiency, but
     88                 //       surrogate code points will be so rare in practice that it's not worth it.
     89                 ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
     90                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
     91                 value = index[ix];
     92                 return value;
     93             }
     94             if (codePoint < highStart) {
     95                 // Supplemental code point, use two-level lookup.
     96                 ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
     97                 ix = index[ix];
     98                 ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
     99                 ix = index[ix];
    100                 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
    101                 value = index[ix];
    102                 return value;
    103             }
    104             if (codePoint <= 0x10ffff) {
    105                 value = index[highValueIndex];
    106                 return value;
    107             }
    108         }
    109 
    110         // Fall through.  The code point is outside of the legal range of 0..0x10ffff.
    111         return errorValue;
    112     }
    113 
    114 
    115     /**
    116      * Get a Trie2 value for a UTF-16 code unit.
    117      *
    118      * This function returns the same value as get() if the input
    119      * character is outside of the lead surrogate range
    120      *
    121      * There are two values stored in a Trie2 for inputs in the lead
    122      * surrogate range.  This function returns the alternate value,
    123      * while Trie2.get() returns the main value.
    124      *
    125      * @param codeUnit a 16 bit code unit or lead surrogate value.
    126      * @return the value
    127      */
    128     @Override
    129     public int getFromU16SingleLead(char codeUnit) {
    130         int value;
    131         int ix;
    132 
    133         // Because the input is a 16 bit char, we can skip the tests for it being in
    134         // the BMP range.  It is.
    135         ix = index[codeUnit >> UTRIE2_SHIFT_2];
    136         ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
    137         value = index[ix];
    138         return value;
    139     }
    140 
    141 
    142     /**
    143      * Serialize a Trie2_16 onto an OutputStream.
    144      *
    145      * A Trie2 can be serialized multiple times.
    146      * The serialized data is compatible with ICU4C UTrie2 serialization.
    147      * Trie2 serialization is unrelated to Java object serialization.
    148      *
    149      * @param os the stream to which the serialized Trie2 data will be written.
    150      * @return the number of bytes written.
    151      * @throw IOException on an error writing to the OutputStream.
    152      */
    153     public int serialize(OutputStream os) throws IOException {
    154         DataOutputStream dos = new DataOutputStream(os);
    155         int  bytesWritten = 0;
    156 
    157         bytesWritten += serializeHeader(dos);
    158         for (int i=0; i<dataLength; i++) {
    159             dos.writeChar(index[data16+i]);
    160         }
    161         bytesWritten += dataLength*2;
    162         return bytesWritten;
    163     }
    164 
    165     /**
    166      * @return the number of bytes of the serialized trie
    167      */
    168     public int getSerializedLength() {
    169         return 16+(header.indexLength+dataLength)*2;
    170     }
    171 
    172     /**
    173      * Given a starting code point, find the last in a range of code points,
    174      * all with the same value.
    175      *
    176      * This function is part of the implementation of iterating over the
    177      * Trie2's contents.
    178      * @param startingCP The code point at which to begin looking.
    179      * @return The last code point with the same value as the starting code point.
    180      */
    181     @Override
    182     int rangeEnd(int startingCP, int limit, int value) {
    183         int   cp = startingCP;
    184         int   block = 0;
    185         int   index2Block = 0;
    186 
    187         // Loop runs once for each of
    188         //   - a partial data block
    189         //   - a reference to the null (default) data block.
    190         //   - a reference to the index2 null block
    191 
    192       outerLoop:
    193         for (;;) {
    194             if (cp >= limit) {
    195                 break;
    196             }
    197             if (cp < 0x0d800 || (cp > 0x0dbff && cp <= 0x0ffff)) {
    198                 // Ordinary BMP code point, excluding leading surrogates.
    199                 // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
    200                 // 16 bit data is stored in the index array itself.
    201                 index2Block = 0;
    202                 block       = index[cp >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT;
    203             } else if (cp < 0xffff) {
    204                 // Lead Surrogate Code Point, 0xd800 <= cp < 0xdc00
    205                 index2Block = UTRIE2_LSCP_INDEX_2_OFFSET;
    206                 block       = index[index2Block + ((cp - 0xd800) >> UTRIE2_SHIFT_2)] << UTRIE2_INDEX_SHIFT;
    207             } else if (cp < highStart) {
    208                 // Supplemental code point, use two-level lookup.
    209                 int ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (cp >> UTRIE2_SHIFT_1);
    210                 index2Block = index[ix];
    211                 block = index[index2Block + ((cp >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK)] << UTRIE2_INDEX_SHIFT;
    212             } else  {
    213                 // Code point above highStart.
    214                 if (value == index[highValueIndex]) {
    215                     cp = limit;
    216                 }
    217                 break;
    218             }
    219 
    220             if (index2Block == index2NullOffset) {
    221                 if (value != initialValue) {
    222                     break;
    223                 }
    224                 cp += UTRIE2_CP_PER_INDEX_1_ENTRY;
    225             } else if (block == dataNullOffset) {
    226                 // The block at dataNullOffset has all values == initialValue.
    227                 // Because Trie2 iteration always proceeds in ascending order, we will always
    228                 //   encounter a null block at its beginning, and can skip over
    229                 //   a number of code points equal to the length of the block.
    230                 if (value != initialValue) {
    231                     break;
    232                 }
    233                 cp += UTRIE2_DATA_BLOCK_LENGTH;
    234             } else {
    235                 // Current position refers to an ordinary data block.
    236                 // Walk over the data entries, checking the values.
    237                 int startIx = block + (cp & UTRIE2_DATA_MASK);
    238                 int limitIx = block + UTRIE2_DATA_BLOCK_LENGTH;
    239                 for (int ix = startIx; ix<limitIx; ix++) {
    240                     if (index[ix] != value) {
    241                         // We came to an entry with a different value.
    242                         //   We are done.
    243                         cp += (ix - startIx);
    244                         break outerLoop;
    245                     }
    246                 }
    247                 // The ordinary data block contained our value until its end.
    248                 //  Advance the current code point, and continue the outerloop.
    249                 cp += limitIx - startIx;
    250             }
    251         }
    252         if (cp > limit) {
    253             cp = limit;
    254         }
    255 
    256         return cp - 1;
    257     }
    258 }
    259