1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationDataReader.java, ported from collationdatareader.h/.cpp 10 * 11 * C++ version created on: 2013feb07 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import java.io.IOException; 18 import java.nio.ByteBuffer; 19 import java.nio.CharBuffer; 20 import java.util.Arrays; 21 22 import android.icu.impl.ICUBinary; 23 import android.icu.impl.Trie2_32; 24 import android.icu.impl.USerializedSet; 25 import android.icu.text.Collator; 26 import android.icu.text.UnicodeSet; 27 import android.icu.util.ICUException; 28 29 /** 30 * Collation binary data reader. 31 */ 32 final class CollationDataReader /* all static */ { 33 // The following constants are also copied into source/common/ucol_swp.cpp. 34 // Keep them in sync! 35 /** 36 * Number of int indexes. 37 * 38 * Can be 2 if there are only options. 39 * Can be 7 or 8 if there are only options and a script reordering. 40 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 41 */ 42 static final int IX_INDEXES_LENGTH = 0; 43 /** 44 * Bits 31..24: numericPrimary, for numeric collation 45 * 23..16: fast Latin format version (0 = no fast Latin table) 46 * 15.. 0: options bit set 47 */ 48 static final int IX_OPTIONS = 1; 49 static final int IX_RESERVED2 = 2; 50 static final int IX_RESERVED3 = 3; 51 52 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 53 static final int IX_JAMO_CE32S_START = 4; 54 55 // Byte offsets from the start of the data, after the generic header. 56 // The indexes[] are at byte offset 0, other data follows. 57 // Each data item is aligned properly. 58 // The data items should be in descending order of unit size, 59 // to minimize the need for padding. 60 // Each item's byte length is given by the difference between its offset and 61 // the next index/offset value. 62 /** Byte offset to int reorderCodes[]. */ 63 static final int IX_REORDER_CODES_OFFSET = 5; 64 /** 65 * Byte offset to uint8_t reorderTable[]. 66 * Empty table if <256 bytes (padding only). 67 * Otherwise 256 bytes or more (with padding). 68 */ 69 static final int IX_REORDER_TABLE_OFFSET = 6; 70 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 71 static final int IX_TRIE_OFFSET = 7; 72 73 static final int IX_RESERVED8_OFFSET = 8; 74 /** Byte offset to long ces[]. */ 75 static final int IX_CES_OFFSET = 9; 76 static final int IX_RESERVED10_OFFSET = 10; 77 /** Byte offset to int ce32s[]. */ 78 static final int IX_CE32S_OFFSET = 11; 79 80 /** Byte offset to uint32_t rootElements[]. */ 81 static final int IX_ROOT_ELEMENTS_OFFSET = 12; 82 /** Byte offset to UChar *contexts[]. */ 83 static final int IX_CONTEXTS_OFFSET = 13; 84 /** Byte offset to char [] with serialized unsafeBackwardSet. */ 85 static final int IX_UNSAFE_BWD_OFFSET = 14; 86 /** Byte offset to char fastLatinTable[]. */ 87 static final int IX_FAST_LATIN_TABLE_OFFSET = 15; 88 89 /** Byte offset to char scripts[]. */ 90 static final int IX_SCRIPTS_OFFSET = 16; 91 /** 92 * Byte offset to boolean compressibleBytes[]. 93 * Empty table if <256 bytes (padding only). 94 * Otherwise 256 bytes or more (with padding). 95 */ 96 static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17; 97 static final int IX_RESERVED18_OFFSET = 18; 98 static final int IX_TOTAL_SIZE = 19; 99 100 static void read(CollationTailoring base, ByteBuffer inBytes, 101 CollationTailoring tailoring) throws IOException { 102 tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE); 103 if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) { 104 throw new ICUException("Tailoring UCA version differs from base data UCA version"); 105 } 106 107 int inLength = inBytes.remaining(); 108 if(inLength < 8) { 109 throw new ICUException("not enough bytes"); 110 } 111 int indexesLength = inBytes.getInt(); // inIndexes[IX_INDEXES_LENGTH] 112 if(indexesLength < 2 || inLength < indexesLength * 4) { 113 throw new ICUException("not enough indexes"); 114 } 115 int[] inIndexes = new int[IX_TOTAL_SIZE + 1]; 116 inIndexes[0] = indexesLength; 117 for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) { 118 inIndexes[i] = inBytes.getInt(); 119 } 120 for(int i = indexesLength; i < inIndexes.length; ++i) { 121 inIndexes[i] = -1; 122 } 123 if(indexesLength > inIndexes.length) { 124 ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4); 125 } 126 127 // Assume that the tailoring data is in initial state, 128 // with null pointers and 0 lengths. 129 130 // Set pointers to non-empty data parts. 131 // Do this in order of their byte offsets. (Should help porting to Java.) 132 133 int index; // one of the indexes[] slots 134 int offset; // byte offset for the index part 135 int length; // number of bytes in the index part 136 137 if(indexesLength > IX_TOTAL_SIZE) { 138 length = inIndexes[IX_TOTAL_SIZE]; 139 } else if(indexesLength > IX_REORDER_CODES_OFFSET) { 140 length = inIndexes[indexesLength - 1]; 141 } else { 142 length = 0; // only indexes, and inLength was already checked for them 143 } 144 if(inLength < length) { 145 throw new ICUException("not enough bytes"); 146 } 147 148 CollationData baseData = base == null ? null : base.data; 149 int[] reorderCodes; 150 int reorderCodesLength; 151 index = IX_REORDER_CODES_OFFSET; 152 offset = inIndexes[index]; 153 length = inIndexes[index + 1] - offset; 154 if(length >= 4) { 155 if(baseData == null) { 156 // We assume for collation settings that 157 // the base data does not have a reordering. 158 throw new ICUException("Collation base data must not reorder scripts"); 159 } 160 reorderCodesLength = length / 4; 161 reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3); 162 163 // The reorderRanges (if any) are the trailing reorderCodes entries. 164 // Split the array at the boundary. 165 // Script or reorder codes do not exceed 16-bit values. 166 // Range limits are stored in the upper 16 bits, and are never 0. 167 int reorderRangesLength = 0; 168 while(reorderRangesLength < reorderCodesLength && 169 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) { 170 ++reorderRangesLength; 171 } 172 assert(reorderRangesLength < reorderCodesLength); 173 reorderCodesLength -= reorderRangesLength; 174 } else { 175 reorderCodes = new int[0]; 176 reorderCodesLength = 0; 177 ICUBinary.skipBytes(inBytes, length); 178 } 179 180 // There should be a reorder table only if there are reorder codes. 181 // However, when there are reorder codes the reorder table may be omitted to reduce 182 // the data size. 183 byte[] reorderTable = null; 184 index = IX_REORDER_TABLE_OFFSET; 185 offset = inIndexes[index]; 186 length = inIndexes[index + 1] - offset; 187 if(length >= 256) { 188 if(reorderCodesLength == 0) { 189 throw new ICUException("Reordering table without reordering codes"); 190 } 191 reorderTable = new byte[256]; 192 inBytes.get(reorderTable); 193 length -= 256; 194 } else { 195 // If we have reorder codes, then build the reorderTable at the end, 196 // when the CollationData is otherwise complete. 197 } 198 ICUBinary.skipBytes(inBytes, length); 199 200 if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) { 201 throw new ICUException("Tailoring numeric primary weight differs from base data"); 202 } 203 CollationData data = null; // Remains null if there are no mappings. 204 205 index = IX_TRIE_OFFSET; 206 offset = inIndexes[index]; 207 length = inIndexes[index + 1] - offset; 208 if(length >= 8) { 209 tailoring.ensureOwnedData(); 210 data = tailoring.ownedData; 211 data.base = baseData; 212 data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L; 213 data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes); 214 int trieLength = data.trie.getSerializedLength(); 215 if(trieLength > length) { 216 throw new ICUException("Not enough bytes for the mappings trie"); // No mappings. 217 } 218 length -= trieLength; 219 } else if(baseData != null) { 220 // Use the base data. Only the settings are tailored. 221 tailoring.data = baseData; 222 } else { 223 throw new ICUException("Missing collation data mappings"); // No mappings. 224 } 225 ICUBinary.skipBytes(inBytes, length); 226 227 index = IX_RESERVED8_OFFSET; 228 offset = inIndexes[index]; 229 length = inIndexes[index + 1] - offset; 230 ICUBinary.skipBytes(inBytes, length); 231 232 index = IX_CES_OFFSET; 233 offset = inIndexes[index]; 234 length = inIndexes[index + 1] - offset; 235 if(length >= 8) { 236 if(data == null) { 237 throw new ICUException("Tailored ces without tailored trie"); 238 } 239 data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7); 240 } else { 241 ICUBinary.skipBytes(inBytes, length); 242 } 243 244 index = IX_RESERVED10_OFFSET; 245 offset = inIndexes[index]; 246 length = inIndexes[index + 1] - offset; 247 ICUBinary.skipBytes(inBytes, length); 248 249 index = IX_CE32S_OFFSET; 250 offset = inIndexes[index]; 251 length = inIndexes[index + 1] - offset; 252 if(length >= 4) { 253 if(data == null) { 254 throw new ICUException("Tailored ce32s without tailored trie"); 255 } 256 data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3); 257 } else { 258 ICUBinary.skipBytes(inBytes, length); 259 } 260 261 int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START]; 262 if(jamoCE32sStart >= 0) { 263 if(data == null || data.ce32s == null) { 264 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]"); 265 } 266 data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 267 System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH); 268 } else if(data == null) { 269 // Nothing to do. 270 } else if(baseData != null) { 271 data.jamoCE32s = baseData.jamoCE32s; 272 } else { 273 throw new ICUException("Missing Jamo CE32s for Hangul processing"); 274 } 275 276 index = IX_ROOT_ELEMENTS_OFFSET; 277 offset = inIndexes[index]; 278 length = inIndexes[index + 1] - offset; 279 if(length >= 4) { 280 int rootElementsLength = length / 4; 281 if(data == null) { 282 throw new ICUException("Root elements but no mappings"); 283 } 284 if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) { 285 throw new ICUException("Root elements array too short"); 286 } 287 data.rootElements = new long[rootElementsLength]; 288 for(int i = 0; i < rootElementsLength; ++i) { 289 data.rootElements[i] = inBytes.getInt() & 0xffffffffL; // unsigned int -> long 290 } 291 long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE]; 292 if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) { 293 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value"); 294 } 295 long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES]; 296 if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) { 297 // [fixed last secondary common byte] is too low, 298 // and secondary weights would collide with compressed common secondaries. 299 throw new ICUException("[fixed last secondary common byte] is too low"); 300 } 301 length &= 3; 302 } 303 ICUBinary.skipBytes(inBytes, length); 304 305 index = IX_CONTEXTS_OFFSET; 306 offset = inIndexes[index]; 307 length = inIndexes[index + 1] - offset; 308 if(length >= 2) { 309 if(data == null) { 310 throw new ICUException("Tailored contexts without tailored trie"); 311 } 312 data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1); 313 } else { 314 ICUBinary.skipBytes(inBytes, length); 315 } 316 317 index = IX_UNSAFE_BWD_OFFSET; 318 offset = inIndexes[index]; 319 length = inIndexes[index + 1] - offset; 320 if(length >= 2) { 321 if(data == null) { 322 throw new ICUException("Unsafe-backward-set but no mappings"); 323 } 324 if(baseData == null) { 325 // Create the unsafe-backward set for the root collator. 326 // Include all non-zero combining marks and trail surrogates. 327 // We do this at load time, rather than at build time, 328 // to simplify Unicode version bootstrapping: 329 // The root data builder only needs the new FractionalUCA.txt data, 330 // but it need not be built with a version of ICU already updated to 331 // the corresponding new Unicode Character Database. 332 // 333 // The following is an optimized version of 334 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). 335 // It is faster and requires fewer code dependencies. 336 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates 337 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet); 338 } else { 339 // Clone the root collator's set contents. 340 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed(); 341 } 342 // Add the ranges from the data file to the unsafe-backward set. 343 USerializedSet sset = new USerializedSet(); 344 char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1); 345 length = 0; 346 sset.getSet(unsafeData, 0); 347 int count = sset.countRanges(); 348 int[] range = new int[2]; 349 for(int i = 0; i < count; ++i) { 350 sset.getRange(i, range); 351 tailoring.unsafeBackwardSet.add(range[0], range[1]); 352 } 353 // Mark each lead surrogate as "unsafe" 354 // if any of its 1024 associated supplementary code points is "unsafe". 355 int c = 0x10000; 356 for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 357 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) { 358 tailoring.unsafeBackwardSet.add(lead); 359 } 360 } 361 tailoring.unsafeBackwardSet.freeze(); 362 data.unsafeBackwardSet = tailoring.unsafeBackwardSet; 363 } else if(data == null) { 364 // Nothing to do. 365 } else if(baseData != null) { 366 // No tailoring-specific data: Alias the root collator's set. 367 data.unsafeBackwardSet = baseData.unsafeBackwardSet; 368 } else { 369 throw new ICUException("Missing unsafe-backward-set"); 370 } 371 ICUBinary.skipBytes(inBytes, length); 372 373 // If the fast Latin format version is different, 374 // or the version is set to 0 for "no fast Latin table", 375 // then just always use the normal string comparison path. 376 index = IX_FAST_LATIN_TABLE_OFFSET; 377 offset = inIndexes[index]; 378 length = inIndexes[index + 1] - offset; 379 if(data != null) { 380 data.fastLatinTable = null; 381 data.fastLatinTableHeader = null; 382 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) { 383 if(length >= 2) { 384 char header0 = inBytes.getChar(); 385 int headerLength = header0 & 0xff; 386 data.fastLatinTableHeader = new char[headerLength]; 387 data.fastLatinTableHeader[0] = header0; 388 for(int i = 1; i < headerLength; ++i) { 389 data.fastLatinTableHeader[i] = inBytes.getChar(); 390 } 391 int tableLength = length / 2 - headerLength; 392 data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1); 393 length = 0; 394 if((header0 >> 8) != CollationFastLatin.VERSION) { 395 throw new ICUException("Fast-Latin table version differs from version in data header"); 396 } 397 } else if(baseData != null) { 398 data.fastLatinTable = baseData.fastLatinTable; 399 data.fastLatinTableHeader = baseData.fastLatinTableHeader; 400 } 401 } 402 } 403 ICUBinary.skipBytes(inBytes, length); 404 405 index = IX_SCRIPTS_OFFSET; 406 offset = inIndexes[index]; 407 length = inIndexes[index + 1] - offset; 408 if(length >= 2) { 409 if(data == null) { 410 throw new ICUException("Script order data but no mappings"); 411 } 412 int scriptsLength = length / 2; 413 CharBuffer inChars = inBytes.asCharBuffer(); 414 data.numScripts = inChars.get(); 415 // There must be enough entries for both arrays, including more than two range starts. 416 int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16); 417 if(scriptStartsLength <= 2) { 418 throw new ICUException("Script order data too short"); 419 } 420 inChars.get(data.scriptsIndex = new char[data.numScripts + 16]); 421 inChars.get(data.scriptStarts = new char[scriptStartsLength]); 422 if(!(data.scriptStarts[0] == 0 && 423 data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && 424 data.scriptStarts[scriptStartsLength - 1] == 425 (Collation.TRAIL_WEIGHT_BYTE << 8))) { 426 throw new ICUException("Script order data not valid"); 427 } 428 } else if(data == null) { 429 // Nothing to do. 430 } else if(baseData != null) { 431 data.numScripts = baseData.numScripts; 432 data.scriptsIndex = baseData.scriptsIndex; 433 data.scriptStarts = baseData.scriptStarts; 434 } 435 ICUBinary.skipBytes(inBytes, length); 436 437 index = IX_COMPRESSIBLE_BYTES_OFFSET; 438 offset = inIndexes[index]; 439 length = inIndexes[index + 1] - offset; 440 if(length >= 256) { 441 if(data == null) { 442 throw new ICUException("Data for compressible primary lead bytes but no mappings"); 443 } 444 data.compressibleBytes = new boolean[256]; 445 for(int i = 0; i < 256; ++i) { 446 data.compressibleBytes[i] = inBytes.get() != 0; 447 } 448 length -= 256; 449 } else if(data == null) { 450 // Nothing to do. 451 } else if(baseData != null) { 452 data.compressibleBytes = baseData.compressibleBytes; 453 } else { 454 throw new ICUException("Missing data for compressible primary lead bytes"); 455 } 456 ICUBinary.skipBytes(inBytes, length); 457 458 index = IX_RESERVED18_OFFSET; 459 offset = inIndexes[index]; 460 length = inIndexes[index + 1] - offset; 461 ICUBinary.skipBytes(inBytes, length); 462 463 CollationSettings ts = tailoring.settings.readOnly(); 464 int options = inIndexes[IX_OPTIONS] & 0xffff; 465 char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT]; 466 int fastLatinOptions = CollationFastLatin.getOptions( 467 tailoring.data, ts, fastLatinPrimaries); 468 if(options == ts.options && ts.variableTop != 0 && 469 Arrays.equals(reorderCodes, ts.reorderCodes) && 470 fastLatinOptions == ts.fastLatinOptions && 471 (fastLatinOptions < 0 || 472 Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) { 473 return; 474 } 475 476 CollationSettings settings = tailoring.settings.copyOnWrite(); 477 settings.options = options; 478 // Set variableTop from options and scripts data. 479 settings.variableTop = tailoring.data.getLastPrimaryForGroup( 480 Collator.ReorderCodes.FIRST + settings.getMaxVariable()); 481 if(settings.variableTop == 0) { 482 throw new ICUException("The maxVariable could not be mapped to a variableTop"); 483 } 484 485 if(reorderCodesLength != 0) { 486 settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable); 487 } 488 489 settings.fastLatinOptions = CollationFastLatin.getOptions( 490 tailoring.data, settings, 491 settings.fastLatinPrimaries); 492 } 493 494 private static final class IsAcceptable implements ICUBinary.Authenticate { 495 @Override 496 public boolean isDataVersionAcceptable(byte version[]) { 497 return version[0] == 5; 498 } 499 } 500 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 501 private static final int DATA_FORMAT = 0x55436f6c; // "UCol" 502 503 private CollationDataReader() {} // no constructor 504 } 505 506 /* 507 * Format of collation data (ucadata.icu, binary data in coll/ *.res files): 508 * See ICU4C source/common/collationdatareader.h. 509 */ 510