1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationData.java, ported from collationdata.h/.cpp 10 * 11 * C++ version created on: 2010oct27 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import android.icu.impl.Normalizer2Impl; 18 import android.icu.impl.Trie2_32; 19 import android.icu.lang.UScript; 20 import android.icu.text.Collator; 21 import android.icu.text.UnicodeSet; 22 import android.icu.util.ICUException; 23 24 /** 25 * Collation data container. 26 * Immutable data created by a CollationDataBuilder, or loaded from a file, 27 * or deserialized from API-provided binary data. 28 * 29 * Includes data for the collation base (root/default), aliased if this is not the base. 30 * @hide Only a subset of ICU is exposed in Android 31 */ 32 public final class CollationData { 33 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array 34 // parallel with the ranges, and resetting ranges that are indexed. 35 // The reordering builder code could clone the resulting template array. 36 static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14; 37 static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15; 38 39 static final int MAX_NUM_SPECIAL_REORDER_CODES = 8; 40 41 CollationData(Normalizer2Impl nfc) { 42 nfcImpl = nfc; 43 } 44 45 public int getCE32(int c) { 46 return trie.get(c); 47 } 48 49 int getCE32FromSupplementary(int c) { 50 return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java? 51 } 52 53 boolean isDigit(int c) { 54 return c < 0x660 ? c <= 0x39 && 0x30 <= c : 55 Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG); 56 } 57 58 public boolean isUnsafeBackward(int c, boolean numeric) { 59 return unsafeBackwardSet.contains(c) || (numeric && isDigit(c)); 60 } 61 62 public boolean isCompressibleLeadByte(int b) { 63 return compressibleBytes[b]; 64 } 65 66 public boolean isCompressiblePrimary(long p) { 67 return isCompressibleLeadByte((int)p >>> 24); 68 } 69 70 /** 71 * Returns the CE32 from two contexts words. 72 * Access to the defaultCE32 for contraction and prefix matching. 73 */ 74 int getCE32FromContexts(int index) { 75 return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1); 76 } 77 78 /** 79 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG). 80 * Requires that ce32 is special. 81 */ 82 int getIndirectCE32(int ce32) { 83 assert(Collation.isSpecialCE32(ce32)); 84 int tag = Collation.tagFromCE32(ce32); 85 if(tag == Collation.DIGIT_TAG) { 86 // Fetch the non-numeric-collation CE32. 87 ce32 = ce32s[Collation.indexFromCE32(ce32)]; 88 } else if(tag == Collation.LEAD_SURROGATE_TAG) { 89 ce32 = Collation.UNASSIGNED_CE32; 90 } else if(tag == Collation.U0000_TAG) { 91 // Fetch the normal ce32 for U+0000. 92 ce32 = ce32s[0]; 93 } 94 return ce32; 95 } 96 97 /** 98 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG), 99 * if ce32 is special. 100 */ 101 int getFinalCE32(int ce32) { 102 if(Collation.isSpecialCE32(ce32)) { 103 ce32 = getIndirectCE32(ce32); 104 } 105 return ce32; 106 } 107 108 /** 109 * Computes a CE from c's ce32 which has the OFFSET_TAG. 110 */ 111 long getCEFromOffsetCE32(int c, int ce32) { 112 long dataCE = ces[Collation.indexFromCE32(ce32)]; 113 return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE)); 114 } 115 116 /** 117 * Returns the single CE that c maps to. 118 * Throws UnsupportedOperationException if c does not map to a single CE. 119 */ 120 long getSingleCE(int c) { 121 CollationData d; 122 int ce32 = getCE32(c); 123 if(ce32 == Collation.FALLBACK_CE32) { 124 d = base; 125 ce32 = base.getCE32(c); 126 } else { 127 d = this; 128 } 129 while(Collation.isSpecialCE32(ce32)) { 130 switch(Collation.tagFromCE32(ce32)) { 131 case Collation.LATIN_EXPANSION_TAG: 132 case Collation.BUILDER_DATA_TAG: 133 case Collation.PREFIX_TAG: 134 case Collation.CONTRACTION_TAG: 135 case Collation.HANGUL_TAG: 136 case Collation.LEAD_SURROGATE_TAG: 137 throw new UnsupportedOperationException(String.format( 138 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 139 c, ce32)); 140 case Collation.FALLBACK_TAG: 141 case Collation.RESERVED_TAG_3: 142 throw new AssertionError(String.format( 143 "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32)); 144 case Collation.LONG_PRIMARY_TAG: 145 return Collation.ceFromLongPrimaryCE32(ce32); 146 case Collation.LONG_SECONDARY_TAG: 147 return Collation.ceFromLongSecondaryCE32(ce32); 148 case Collation.EXPANSION32_TAG: 149 if(Collation.lengthFromCE32(ce32) == 1) { 150 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 151 break; 152 } else { 153 throw new UnsupportedOperationException(String.format( 154 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 155 c, ce32)); 156 } 157 case Collation.EXPANSION_TAG: { 158 if(Collation.lengthFromCE32(ce32) == 1) { 159 return d.ces[Collation.indexFromCE32(ce32)]; 160 } else { 161 throw new UnsupportedOperationException(String.format( 162 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 163 c, ce32)); 164 } 165 } 166 case Collation.DIGIT_TAG: 167 // Fetch the non-numeric-collation CE32 and continue. 168 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 169 break; 170 case Collation.U0000_TAG: 171 assert(c == 0); 172 // Fetch the normal ce32 for U+0000 and continue. 173 ce32 = d.ce32s[0]; 174 break; 175 case Collation.OFFSET_TAG: 176 return d.getCEFromOffsetCE32(c, ce32); 177 case Collation.IMPLICIT_TAG: 178 return Collation.unassignedCEFromCodePoint(c); 179 } 180 } 181 return Collation.ceFromSimpleCE32(ce32); 182 } 183 184 /** 185 * Returns the FCD16 value for code point c. c must be >= 0. 186 */ 187 int getFCD16(int c) { 188 return nfcImpl.getFCD16(c); 189 } 190 191 /** 192 * Returns the first primary for the script's reordering group. 193 * @return the primary with only the first primary lead byte of the group 194 * (not necessarily an actual root collator primary weight), 195 * or 0 if the script is unknown 196 */ 197 long getFirstPrimaryForGroup(int script) { 198 int index = getScriptIndex(script); 199 return index == 0 ? 0 : (long)scriptStarts[index] << 16; 200 } 201 202 /** 203 * Returns the last primary for the script's reordering group. 204 * @return the last primary of the group 205 * (not an actual root collator primary weight), 206 * or 0 if the script is unknown 207 */ 208 public long getLastPrimaryForGroup(int script) { 209 int index = getScriptIndex(script); 210 if(index == 0) { 211 return 0; 212 } 213 long limit = scriptStarts[index + 1]; 214 return (limit << 16) - 1; 215 } 216 217 /** 218 * Finds the reordering group which contains the primary weight. 219 * @return the first script of the group, or -1 if the weight is beyond the last group 220 */ 221 public int getGroupForPrimary(long p) { 222 p >>= 16; 223 if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) { 224 return -1; 225 } 226 int index = 1; 227 while(p >= scriptStarts[index + 1]) { ++index; } 228 for(int i = 0; i < numScripts; ++i) { 229 if(scriptsIndex[i] == index) { 230 return i; 231 } 232 } 233 for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 234 if(scriptsIndex[numScripts + i] == index) { 235 return Collator.ReorderCodes.FIRST + i; 236 } 237 } 238 return -1; 239 } 240 241 private int getScriptIndex(int script) { 242 if(script < 0) { 243 return 0; 244 } else if(script < numScripts) { 245 return scriptsIndex[script]; 246 } else if(script < Collator.ReorderCodes.FIRST) { 247 return 0; 248 } else { 249 script -= Collator.ReorderCodes.FIRST; 250 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { 251 return scriptsIndex[numScripts + script]; 252 } else { 253 return 0; 254 } 255 } 256 } 257 258 public int[] getEquivalentScripts(int script) { 259 int index = getScriptIndex(script); 260 if(index == 0) { return EMPTY_INT_ARRAY; } 261 if(script >= Collator.ReorderCodes.FIRST) { 262 // Special groups have no aliases. 263 return new int[] { script }; 264 } 265 266 int length = 0; 267 for(int i = 0; i < numScripts; ++i) { 268 if(scriptsIndex[i] == index) { 269 ++length; 270 } 271 } 272 int[] dest = new int[length]; 273 if(length == 1) { 274 dest[0] = script; 275 return dest; 276 } 277 length = 0; 278 for(int i = 0; i < numScripts; ++i) { 279 if(scriptsIndex[i] == index) { 280 dest[length++] = i; 281 } 282 } 283 return dest; 284 } 285 286 /** 287 * Writes the permutation of primary-weight ranges 288 * for the given reordering of scripts and groups. 289 * The caller checks for illegal arguments and 290 * takes care of [DEFAULT] and memory allocation. 291 * 292 * <p>Each list element will be a (limit, offset) pair as described 293 * for the CollationSettings.reorderRanges. 294 * The list will be empty if no ranges are reordered. 295 */ 296 void makeReorderRanges(int[] reorder, UVector32 ranges) { 297 makeReorderRanges(reorder, false, ranges); 298 } 299 300 private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) { 301 ranges.removeAllElements(); 302 int length = reorder.length; 303 if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) { 304 return; 305 } 306 307 // Maps each script-or-group range to a new lead byte. 308 short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[] 309 310 { 311 // Set "don't care" values for reserved ranges. 312 int index = scriptsIndex[ 313 numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST]; 314 if(index != 0) { 315 table[index] = 0xff; 316 } 317 index = scriptsIndex[ 318 numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST]; 319 if(index != 0) { 320 table[index] = 0xff; 321 } 322 } 323 324 // Never reorder special low and high primary lead bytes. 325 assert(scriptStarts.length >= 2); 326 assert(scriptStarts[0] == 0); 327 int lowStart = scriptStarts[1]; 328 assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8)); 329 int highLimit = scriptStarts[scriptStarts.length - 1]; 330 assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8)); 331 332 // Get the set of special reorder codes in the input list. 333 // This supports a fixed number of special reorder codes; 334 // it works for data with codes beyond Collator.ReorderCodes.LIMIT. 335 int specials = 0; 336 for(int i = 0; i < length; ++i) { 337 int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST; 338 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { 339 specials |= 1 << reorderCode; 340 } 341 } 342 343 // Start the reordering with the special low reorder codes that do not occur in the input. 344 for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 345 int index = scriptsIndex[numScripts + i]; 346 if(index != 0 && (specials & (1 << i)) == 0) { 347 lowStart = addLowScriptRange(table, index, lowStart); 348 } 349 } 350 351 // Skip the reserved range before Latin if Latin is the first script, 352 // so that we do not move it unnecessarily. 353 int skippedReserved = 0; 354 if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) { 355 int index = scriptsIndex[UScript.LATIN]; 356 assert(index != 0); 357 int start = scriptStarts[index]; 358 assert(lowStart <= start); 359 skippedReserved = start - lowStart; 360 lowStart = start; 361 } 362 363 // Reorder according to the input scripts, continuing from the bottom of the primary range. 364 boolean hasReorderToEnd = false; 365 for(int i = 0; i < length;) { 366 int script = reorder[i++]; 367 if(script == UScript.UNKNOWN) { 368 // Put the remaining scripts at the top. 369 hasReorderToEnd = true; 370 while(i < length) { 371 script = reorder[--length]; 372 if(script == UScript.UNKNOWN) { // Must occur at most once. 373 throw new IllegalArgumentException( 374 "setReorderCodes(): duplicate UScript.UNKNOWN"); 375 } 376 if(script == Collator.ReorderCodes.DEFAULT) { 377 throw new IllegalArgumentException( 378 "setReorderCodes(): UScript.DEFAULT together with other scripts"); 379 } 380 int index = getScriptIndex(script); 381 if(index == 0) { continue; } 382 if(table[index] != 0) { // Duplicate or equivalent script. 383 throw new IllegalArgumentException( 384 "setReorderCodes(): duplicate or equivalent script " + 385 scriptCodeString(script)); 386 } 387 highLimit = addHighScriptRange(table, index, highLimit); 388 } 389 break; 390 } 391 if(script == Collator.ReorderCodes.DEFAULT) { 392 // The default code must be the only one in the list, and that is handled by the caller. 393 // Otherwise it must not be used. 394 throw new IllegalArgumentException( 395 "setReorderCodes(): UScript.DEFAULT together with other scripts"); 396 } 397 int index = getScriptIndex(script); 398 if(index == 0) { continue; } 399 if(table[index] != 0) { // Duplicate or equivalent script. 400 throw new IllegalArgumentException( 401 "setReorderCodes(): duplicate or equivalent script " + 402 scriptCodeString(script)); 403 } 404 lowStart = addLowScriptRange(table, index, lowStart); 405 } 406 407 // Put all remaining scripts into the middle. 408 for(int i = 1; i < scriptStarts.length - 1; ++i) { 409 int leadByte = table[i]; 410 if(leadByte != 0) { continue; } 411 int start = scriptStarts[i]; 412 if(!hasReorderToEnd && start > lowStart) { 413 // No need to move this script. 414 lowStart = start; 415 } 416 lowStart = addLowScriptRange(table, i, lowStart); 417 } 418 if(lowStart > highLimit) { 419 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { 420 // Try not skipping the before-Latin reserved range. 421 makeReorderRanges(reorder, true, ranges); 422 return; 423 } 424 // We need more primary lead bytes than available, despite the reserved ranges. 425 throw new ICUException( 426 "setReorderCodes(): reordering too many partial-primary-lead-byte scripts"); 427 } 428 429 // Turn lead bytes into a list of (limit, offset) pairs. 430 // Encode each pair in one list element: 431 // Upper 16 bits = limit, lower 16 = signed lead byte offset. 432 int offset = 0; 433 for(int i = 1;; ++i) { 434 int nextOffset = offset; 435 while(i < scriptStarts.length - 1) { 436 int newLeadByte = table[i]; 437 if(newLeadByte == 0xff) { 438 // "Don't care" lead byte for reserved range, continue with current offset. 439 } else { 440 nextOffset = newLeadByte - (scriptStarts[i] >> 8); 441 if(nextOffset != offset) { break; } 442 } 443 ++i; 444 } 445 if(offset != 0 || i < scriptStarts.length - 1) { 446 ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff)); 447 } 448 if(i == scriptStarts.length - 1) { break; } 449 offset = nextOffset; 450 } 451 } 452 453 private int addLowScriptRange(short[] table, int index, int lowStart) { 454 int start = scriptStarts[index]; 455 if((start & 0xff) < (lowStart & 0xff)) { 456 lowStart += 0x100; 457 } 458 table[index] = (short)(lowStart >> 8); 459 int limit = scriptStarts[index + 1]; 460 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); 461 return lowStart; 462 } 463 464 private int addHighScriptRange(short[] table, int index, int highLimit) { 465 int limit = scriptStarts[index + 1]; 466 if((limit & 0xff) > (highLimit & 0xff)) { 467 highLimit -= 0x100; 468 } 469 int start = scriptStarts[index]; 470 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); 471 table[index] = (short)(highLimit >> 8); 472 return highLimit; 473 } 474 475 private static String scriptCodeString(int script) { 476 // Do not use the script name here: We do not want to depend on that data. 477 return (script < Collator.ReorderCodes.FIRST) ? 478 Integer.toString(script) : "0x" + Integer.toHexString(script); 479 } 480 481 private static final int[] EMPTY_INT_ARRAY = new int[0]; 482 483 /** @see jamoCE32s */ 484 static final int JAMO_CE32S_LENGTH = 19 + 21 + 27; 485 486 /** Main lookup trie. */ 487 Trie2_32 trie; 488 /** 489 * Array of CE32 values. 490 * At index 0 there must be CE32(U+0000) 491 * to support U+0000's special-tag for NUL-termination handling. 492 */ 493 int[] ce32s; 494 /** Array of CE values for expansions and OFFSET_TAG. */ 495 long[] ces; 496 /** Array of prefix and contraction-suffix matching data. */ 497 String contexts; 498 /** Base collation data, or null if this data itself is a base. */ 499 public CollationData base; 500 /** 501 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T. 502 * They are normally simple CE32s, rarely expansions. 503 * For fast handling of HANGUL_TAG. 504 */ 505 int[] jamoCE32s = new int[JAMO_CE32S_LENGTH]; 506 public Normalizer2Impl nfcImpl; 507 /** The single-byte primary weight (xx000000) for numeric collation. */ 508 long numericPrimary = 0x12000000; 509 510 /** 256 flags for which primary-weight lead bytes are compressible. */ 511 public boolean[] compressibleBytes; 512 /** 513 * Set of code points that are unsafe for starting string comparison after an identical prefix, 514 * or in backwards CE iteration. 515 */ 516 UnicodeSet unsafeBackwardSet; 517 518 /** 519 * Fast Latin table for common-Latin-text string comparisons. 520 * Data structure see class CollationFastLatin. 521 */ 522 public char[] fastLatinTable; 523 /** 524 * Header portion of the fastLatinTable. 525 * In C++, these are one array, and the header is skipped for mapping characters. 526 * In Java, two arrays work better. 527 */ 528 char[] fastLatinTableHeader; 529 530 /** 531 * Data for scripts and reordering groups. 532 * Uses include building a reordering permutation table and 533 * providing script boundaries to AlphabeticIndex. 534 */ 535 int numScripts; 536 /** 537 * The length of scriptsIndex is numScripts+16. 538 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts. 539 * 16 special reorder codes (not all used) are mapped starting at numScripts. 540 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit. 541 * There are special codes at the end for reorder-reserved primary ranges. 542 * 543 * <p>Multiple scripts may share a range and index, for example Hira & Kana. 544 */ 545 char[] scriptsIndex; 546 /** 547 * Start primary weight (top 16 bits only) for a group/script/reserved range 548 * indexed by scriptsIndex. 549 * The first range (separators & terminators) and the last range (trailing weights) 550 * are not reorderable, and no scriptsIndex entry points to them. 551 */ 552 char[] scriptStarts; 553 554 /** 555 * Collation elements in the root collator. 556 * Used by the CollationRootElements class. The data structure is described there. 557 * null in a tailoring. 558 */ 559 public long[] rootElements; 560 } 561