1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.DataOutputStream; 13 import java.io.IOException; 14 import java.io.OutputStream; 15 import java.nio.ByteBuffer; 16 17 18 /** 19 * @author aheninger 20 * 21 * A read-only Trie2, holding 16 bit data values. 22 * 23 * A Trie2 is a highly optimized data structure for mapping from Unicode 24 * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value. 25 * 26 * See class Trie2 for descriptions of the API for accessing the contents of a trie. 27 * 28 * The fundamental data access methods are declared final in this class, with 29 * the intent that applications might gain a little extra performance, when compared 30 * with calling the same methods via the abstract UTrie2 base class. 31 */ 32 public final class Trie2_16 extends Trie2 { 33 34 35 /** 36 * Internal constructor, not for general use. 37 */ 38 Trie2_16() { 39 } 40 41 42 /** 43 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). 44 * The serialized format is identical between ICU4C and ICU4J, so this function 45 * will work with serialized Trie2s from either. 46 * 47 * The serialized Trie2 in the bytes may be in either little or big endian byte order. 48 * This allows using serialized Tries from ICU4C without needing to consider the 49 * byte order of the system that created them. 50 * 51 * @param bytes a byte buffer to the serialized form of a UTrie2. 52 * @return An unserialized Trie2_16, ready for use. 53 * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2. 54 * @throws IOException if a read error occurs in the buffer. 55 * @throws ClassCastException if the bytes contain a serialized Trie2_32 56 */ 57 public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException { 58 return (Trie2_16) Trie2.createFromSerialized(bytes); 59 } 60 61 /** 62 * Get the value for a code point as stored in the Trie2. 63 * 64 * @param codePoint the code point 65 * @return the value 66 */ 67 @Override 68 public final int get(int codePoint) { 69 int value; 70 int ix; 71 72 if (codePoint >= 0) { 73 if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) { 74 // Ordinary BMP code point, excluding leading surrogates. 75 // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index. 76 // 16 bit data is stored in the index array itself. 77 ix = index[codePoint >> UTRIE2_SHIFT_2]; 78 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); 79 value = index[ix]; 80 return value; 81 } 82 if (codePoint <= 0xffff) { 83 // Lead Surrogate Code Point. A Separate index section is stored for 84 // lead surrogate code units and code points. 85 // The main index has the code unit data. 86 // For this function, we need the code point data. 87 // Note: this expression could be refactored for slightly improved efficiency, but 88 // surrogate code points will be so rare in practice that it's not worth it. 89 ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)]; 90 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); 91 value = index[ix]; 92 return value; 93 } 94 if (codePoint < highStart) { 95 // Supplemental code point, use two-level lookup. 96 ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1); 97 ix = index[ix]; 98 ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; 99 ix = index[ix]; 100 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); 101 value = index[ix]; 102 return value; 103 } 104 if (codePoint <= 0x10ffff) { 105 value = index[highValueIndex]; 106 return value; 107 } 108 } 109 110 // Fall through. The code point is outside of the legal range of 0..0x10ffff. 111 return errorValue; 112 } 113 114 115 /** 116 * Get a Trie2 value for a UTF-16 code unit. 117 * 118 * This function returns the same value as get() if the input 119 * character is outside of the lead surrogate range 120 * 121 * There are two values stored in a Trie2 for inputs in the lead 122 * surrogate range. This function returns the alternate value, 123 * while Trie2.get() returns the main value. 124 * 125 * @param codeUnit a 16 bit code unit or lead surrogate value. 126 * @return the value 127 */ 128 @Override 129 public int getFromU16SingleLead(char codeUnit) { 130 int value; 131 int ix; 132 133 // Because the input is a 16 bit char, we can skip the tests for it being in 134 // the BMP range. It is. 135 ix = index[codeUnit >> UTRIE2_SHIFT_2]; 136 ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK); 137 value = index[ix]; 138 return value; 139 } 140 141 142 /** 143 * Serialize a Trie2_16 onto an OutputStream. 144 * 145 * A Trie2 can be serialized multiple times. 146 * The serialized data is compatible with ICU4C UTrie2 serialization. 147 * Trie2 serialization is unrelated to Java object serialization. 148 * 149 * @param os the stream to which the serialized Trie2 data will be written. 150 * @return the number of bytes written. 151 * @throw IOException on an error writing to the OutputStream. 152 */ 153 public int serialize(OutputStream os) throws IOException { 154 DataOutputStream dos = new DataOutputStream(os); 155 int bytesWritten = 0; 156 157 bytesWritten += serializeHeader(dos); 158 for (int i=0; i<dataLength; i++) { 159 dos.writeChar(index[data16+i]); 160 } 161 bytesWritten += dataLength*2; 162 return bytesWritten; 163 } 164 165 /** 166 * @return the number of bytes of the serialized trie 167 */ 168 public int getSerializedLength() { 169 return 16+(header.indexLength+dataLength)*2; 170 } 171 172 /** 173 * Given a starting code point, find the last in a range of code points, 174 * all with the same value. 175 * 176 * This function is part of the implementation of iterating over the 177 * Trie2's contents. 178 * @param startingCP The code point at which to begin looking. 179 * @return The last code point with the same value as the starting code point. 180 */ 181 @Override 182 int rangeEnd(int startingCP, int limit, int value) { 183 int cp = startingCP; 184 int block = 0; 185 int index2Block = 0; 186 187 // Loop runs once for each of 188 // - a partial data block 189 // - a reference to the null (default) data block. 190 // - a reference to the index2 null block 191 192 outerLoop: 193 for (;;) { 194 if (cp >= limit) { 195 break; 196 } 197 if (cp < 0x0d800 || (cp > 0x0dbff && cp <= 0x0ffff)) { 198 // Ordinary BMP code point, excluding leading surrogates. 199 // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index. 200 // 16 bit data is stored in the index array itself. 201 index2Block = 0; 202 block = index[cp >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT; 203 } else if (cp < 0xffff) { 204 // Lead Surrogate Code Point, 0xd800 <= cp < 0xdc00 205 index2Block = UTRIE2_LSCP_INDEX_2_OFFSET; 206 block = index[index2Block + ((cp - 0xd800) >> UTRIE2_SHIFT_2)] << UTRIE2_INDEX_SHIFT; 207 } else if (cp < highStart) { 208 // Supplemental code point, use two-level lookup. 209 int ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (cp >> UTRIE2_SHIFT_1); 210 index2Block = index[ix]; 211 block = index[index2Block + ((cp >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK)] << UTRIE2_INDEX_SHIFT; 212 } else { 213 // Code point above highStart. 214 if (value == index[highValueIndex]) { 215 cp = limit; 216 } 217 break; 218 } 219 220 if (index2Block == index2NullOffset) { 221 if (value != initialValue) { 222 break; 223 } 224 cp += UTRIE2_CP_PER_INDEX_1_ENTRY; 225 } else if (block == dataNullOffset) { 226 // The block at dataNullOffset has all values == initialValue. 227 // Because Trie2 iteration always proceeds in ascending order, we will always 228 // encounter a null block at its beginning, and can skip over 229 // a number of code points equal to the length of the block. 230 if (value != initialValue) { 231 break; 232 } 233 cp += UTRIE2_DATA_BLOCK_LENGTH; 234 } else { 235 // Current position refers to an ordinary data block. 236 // Walk over the data entries, checking the values. 237 int startIx = block + (cp & UTRIE2_DATA_MASK); 238 int limitIx = block + UTRIE2_DATA_BLOCK_LENGTH; 239 for (int ix = startIx; ix<limitIx; ix++) { 240 if (index[ix] != value) { 241 // We came to an entry with a different value. 242 // We are done. 243 cp += (ix - startIx); 244 break outerLoop; 245 } 246 } 247 // The ordinary data block contained our value until its end. 248 // Advance the current code point, and continue the outerloop. 249 cp += limitIx - startIx; 250 } 251 } 252 if (cp > limit) { 253 cp = limit; 254 } 255 256 return cp - 1; 257 } 258 } 259