1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationData.java, ported from collationdata.h/.cpp 9 * 10 * C++ version created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.impl.Normalizer2Impl; 17 import com.ibm.icu.impl.Trie2_32; 18 import com.ibm.icu.lang.UScript; 19 import com.ibm.icu.text.Collator; 20 import com.ibm.icu.text.UnicodeSet; 21 import com.ibm.icu.util.ICUException; 22 23 /** 24 * Collation data container. 25 * Immutable data created by a CollationDataBuilder, or loaded from a file, 26 * or deserialized from API-provided binary data. 27 * 28 * Includes data for the collation base (root/default), aliased if this is not the base. 29 */ 30 public final class CollationData { 31 // Note: The ucadata.icu loader could discover the reserved ranges by setting an array 32 // parallel with the ranges, and resetting ranges that are indexed. 33 // The reordering builder code could clone the resulting template array. 34 static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14; 35 static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15; 36 37 static final int MAX_NUM_SPECIAL_REORDER_CODES = 8; 38 39 CollationData(Normalizer2Impl nfc) { 40 nfcImpl = nfc; 41 } 42 43 public int getCE32(int c) { 44 return trie.get(c); 45 } 46 47 int getCE32FromSupplementary(int c) { 48 return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java? 49 } 50 51 boolean isDigit(int c) { 52 return c < 0x660 ? c <= 0x39 && 0x30 <= c : 53 Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG); 54 } 55 56 public boolean isUnsafeBackward(int c, boolean numeric) { 57 return unsafeBackwardSet.contains(c) || (numeric && isDigit(c)); 58 } 59 60 public boolean isCompressibleLeadByte(int b) { 61 return compressibleBytes[b]; 62 } 63 64 public boolean isCompressiblePrimary(long p) { 65 return isCompressibleLeadByte((int)p >>> 24); 66 } 67 68 /** 69 * Returns the CE32 from two contexts words. 70 * Access to the defaultCE32 for contraction and prefix matching. 71 */ 72 int getCE32FromContexts(int index) { 73 return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1); 74 } 75 76 /** 77 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG). 78 * Requires that ce32 is special. 79 */ 80 int getIndirectCE32(int ce32) { 81 assert(Collation.isSpecialCE32(ce32)); 82 int tag = Collation.tagFromCE32(ce32); 83 if(tag == Collation.DIGIT_TAG) { 84 // Fetch the non-numeric-collation CE32. 85 ce32 = ce32s[Collation.indexFromCE32(ce32)]; 86 } else if(tag == Collation.LEAD_SURROGATE_TAG) { 87 ce32 = Collation.UNASSIGNED_CE32; 88 } else if(tag == Collation.U0000_TAG) { 89 // Fetch the normal ce32 for U+0000. 90 ce32 = ce32s[0]; 91 } 92 return ce32; 93 } 94 95 /** 96 * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG), 97 * if ce32 is special. 98 */ 99 int getFinalCE32(int ce32) { 100 if(Collation.isSpecialCE32(ce32)) { 101 ce32 = getIndirectCE32(ce32); 102 } 103 return ce32; 104 } 105 106 /** 107 * Computes a CE from c's ce32 which has the OFFSET_TAG. 108 */ 109 long getCEFromOffsetCE32(int c, int ce32) { 110 long dataCE = ces[Collation.indexFromCE32(ce32)]; 111 return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE)); 112 } 113 114 /** 115 * Returns the single CE that c maps to. 116 * Throws UnsupportedOperationException if c does not map to a single CE. 117 */ 118 long getSingleCE(int c) { 119 CollationData d; 120 int ce32 = getCE32(c); 121 if(ce32 == Collation.FALLBACK_CE32) { 122 d = base; 123 ce32 = base.getCE32(c); 124 } else { 125 d = this; 126 } 127 while(Collation.isSpecialCE32(ce32)) { 128 switch(Collation.tagFromCE32(ce32)) { 129 case Collation.LATIN_EXPANSION_TAG: 130 case Collation.BUILDER_DATA_TAG: 131 case Collation.PREFIX_TAG: 132 case Collation.CONTRACTION_TAG: 133 case Collation.HANGUL_TAG: 134 case Collation.LEAD_SURROGATE_TAG: 135 throw new UnsupportedOperationException(String.format( 136 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 137 c, ce32)); 138 case Collation.FALLBACK_TAG: 139 case Collation.RESERVED_TAG_3: 140 throw new AssertionError(String.format( 141 "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32)); 142 case Collation.LONG_PRIMARY_TAG: 143 return Collation.ceFromLongPrimaryCE32(ce32); 144 case Collation.LONG_SECONDARY_TAG: 145 return Collation.ceFromLongSecondaryCE32(ce32); 146 case Collation.EXPANSION32_TAG: 147 if(Collation.lengthFromCE32(ce32) == 1) { 148 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 149 break; 150 } else { 151 throw new UnsupportedOperationException(String.format( 152 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 153 c, ce32)); 154 } 155 case Collation.EXPANSION_TAG: { 156 if(Collation.lengthFromCE32(ce32) == 1) { 157 return d.ces[Collation.indexFromCE32(ce32)]; 158 } else { 159 throw new UnsupportedOperationException(String.format( 160 "there is not exactly one collation element for U+%04X (CE32 0x%08x)", 161 c, ce32)); 162 } 163 } 164 case Collation.DIGIT_TAG: 165 // Fetch the non-numeric-collation CE32 and continue. 166 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 167 break; 168 case Collation.U0000_TAG: 169 assert(c == 0); 170 // Fetch the normal ce32 for U+0000 and continue. 171 ce32 = d.ce32s[0]; 172 break; 173 case Collation.OFFSET_TAG: 174 return d.getCEFromOffsetCE32(c, ce32); 175 case Collation.IMPLICIT_TAG: 176 return Collation.unassignedCEFromCodePoint(c); 177 } 178 } 179 return Collation.ceFromSimpleCE32(ce32); 180 } 181 182 /** 183 * Returns the FCD16 value for code point c. c must be >= 0. 184 */ 185 int getFCD16(int c) { 186 return nfcImpl.getFCD16(c); 187 } 188 189 /** 190 * Returns the first primary for the script's reordering group. 191 * @return the primary with only the first primary lead byte of the group 192 * (not necessarily an actual root collator primary weight), 193 * or 0 if the script is unknown 194 */ 195 long getFirstPrimaryForGroup(int script) { 196 int index = getScriptIndex(script); 197 return index == 0 ? 0 : (long)scriptStarts[index] << 16; 198 } 199 200 /** 201 * Returns the last primary for the script's reordering group. 202 * @return the last primary of the group 203 * (not an actual root collator primary weight), 204 * or 0 if the script is unknown 205 */ 206 public long getLastPrimaryForGroup(int script) { 207 int index = getScriptIndex(script); 208 if(index == 0) { 209 return 0; 210 } 211 long limit = scriptStarts[index + 1]; 212 return (limit << 16) - 1; 213 } 214 215 /** 216 * Finds the reordering group which contains the primary weight. 217 * @return the first script of the group, or -1 if the weight is beyond the last group 218 */ 219 public int getGroupForPrimary(long p) { 220 p >>= 16; 221 if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) { 222 return -1; 223 } 224 int index = 1; 225 while(p >= scriptStarts[index + 1]) { ++index; } 226 for(int i = 0; i < numScripts; ++i) { 227 if(scriptsIndex[i] == index) { 228 return i; 229 } 230 } 231 for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 232 if(scriptsIndex[numScripts + i] == index) { 233 return Collator.ReorderCodes.FIRST + i; 234 } 235 } 236 return -1; 237 } 238 239 private int getScriptIndex(int script) { 240 if(script < 0) { 241 return 0; 242 } else if(script < numScripts) { 243 return scriptsIndex[script]; 244 } else if(script < Collator.ReorderCodes.FIRST) { 245 return 0; 246 } else { 247 script -= Collator.ReorderCodes.FIRST; 248 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { 249 return scriptsIndex[numScripts + script]; 250 } else { 251 return 0; 252 } 253 } 254 } 255 256 public int[] getEquivalentScripts(int script) { 257 int index = getScriptIndex(script); 258 if(index == 0) { return EMPTY_INT_ARRAY; } 259 if(script >= Collator.ReorderCodes.FIRST) { 260 // Special groups have no aliases. 261 return new int[] { script }; 262 } 263 264 int length = 0; 265 for(int i = 0; i < numScripts; ++i) { 266 if(scriptsIndex[i] == index) { 267 ++length; 268 } 269 } 270 int[] dest = new int[length]; 271 if(length == 1) { 272 dest[0] = script; 273 return dest; 274 } 275 length = 0; 276 for(int i = 0; i < numScripts; ++i) { 277 if(scriptsIndex[i] == index) { 278 dest[length++] = i; 279 } 280 } 281 return dest; 282 } 283 284 /** 285 * Writes the permutation of primary-weight ranges 286 * for the given reordering of scripts and groups. 287 * The caller checks for illegal arguments and 288 * takes care of [DEFAULT] and memory allocation. 289 * 290 * <p>Each list element will be a (limit, offset) pair as described 291 * for the CollationSettings.reorderRanges. 292 * The list will be empty if no ranges are reordered. 293 */ 294 void makeReorderRanges(int[] reorder, UVector32 ranges) { 295 makeReorderRanges(reorder, false, ranges); 296 } 297 298 private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) { 299 ranges.removeAllElements(); 300 int length = reorder.length; 301 if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) { 302 return; 303 } 304 305 // Maps each script-or-group range to a new lead byte. 306 short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[] 307 308 { 309 // Set "don't care" values for reserved ranges. 310 int index = scriptsIndex[ 311 numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST]; 312 if(index != 0) { 313 table[index] = 0xff; 314 } 315 index = scriptsIndex[ 316 numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST]; 317 if(index != 0) { 318 table[index] = 0xff; 319 } 320 } 321 322 // Never reorder special low and high primary lead bytes. 323 assert(scriptStarts.length >= 2); 324 assert(scriptStarts[0] == 0); 325 int lowStart = scriptStarts[1]; 326 assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8)); 327 int highLimit = scriptStarts[scriptStarts.length - 1]; 328 assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8)); 329 330 // Get the set of special reorder codes in the input list. 331 // This supports a fixed number of special reorder codes; 332 // it works for data with codes beyond Collator.ReorderCodes.LIMIT. 333 int specials = 0; 334 for(int i = 0; i < length; ++i) { 335 int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST; 336 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { 337 specials |= 1 << reorderCode; 338 } 339 } 340 341 // Start the reordering with the special low reorder codes that do not occur in the input. 342 for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 343 int index = scriptsIndex[numScripts + i]; 344 if(index != 0 && (specials & (1 << i)) == 0) { 345 lowStart = addLowScriptRange(table, index, lowStart); 346 } 347 } 348 349 // Skip the reserved range before Latin if Latin is the first script, 350 // so that we do not move it unnecessarily. 351 int skippedReserved = 0; 352 if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) { 353 int index = scriptsIndex[UScript.LATIN]; 354 assert(index != 0); 355 int start = scriptStarts[index]; 356 assert(lowStart <= start); 357 skippedReserved = start - lowStart; 358 lowStart = start; 359 } 360 361 // Reorder according to the input scripts, continuing from the bottom of the primary range. 362 boolean hasReorderToEnd = false; 363 for(int i = 0; i < length;) { 364 int script = reorder[i++]; 365 if(script == UScript.UNKNOWN) { 366 // Put the remaining scripts at the top. 367 hasReorderToEnd = true; 368 while(i < length) { 369 script = reorder[--length]; 370 if(script == UScript.UNKNOWN) { // Must occur at most once. 371 throw new IllegalArgumentException( 372 "setReorderCodes(): duplicate UScript.UNKNOWN"); 373 } 374 if(script == Collator.ReorderCodes.DEFAULT) { 375 throw new IllegalArgumentException( 376 "setReorderCodes(): UScript.DEFAULT together with other scripts"); 377 } 378 int index = getScriptIndex(script); 379 if(index == 0) { continue; } 380 if(table[index] != 0) { // Duplicate or equivalent script. 381 throw new IllegalArgumentException( 382 "setReorderCodes(): duplicate or equivalent script " + 383 scriptCodeString(script)); 384 } 385 highLimit = addHighScriptRange(table, index, highLimit); 386 } 387 break; 388 } 389 if(script == Collator.ReorderCodes.DEFAULT) { 390 // The default code must be the only one in the list, and that is handled by the caller. 391 // Otherwise it must not be used. 392 throw new IllegalArgumentException( 393 "setReorderCodes(): UScript.DEFAULT together with other scripts"); 394 } 395 int index = getScriptIndex(script); 396 if(index == 0) { continue; } 397 if(table[index] != 0) { // Duplicate or equivalent script. 398 throw new IllegalArgumentException( 399 "setReorderCodes(): duplicate or equivalent script " + 400 scriptCodeString(script)); 401 } 402 lowStart = addLowScriptRange(table, index, lowStart); 403 } 404 405 // Put all remaining scripts into the middle. 406 for(int i = 1; i < scriptStarts.length - 1; ++i) { 407 int leadByte = table[i]; 408 if(leadByte != 0) { continue; } 409 int start = scriptStarts[i]; 410 if(!hasReorderToEnd && start > lowStart) { 411 // No need to move this script. 412 lowStart = start; 413 } 414 lowStart = addLowScriptRange(table, i, lowStart); 415 } 416 if(lowStart > highLimit) { 417 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { 418 // Try not skipping the before-Latin reserved range. 419 makeReorderRanges(reorder, true, ranges); 420 return; 421 } 422 // We need more primary lead bytes than available, despite the reserved ranges. 423 throw new ICUException( 424 "setReorderCodes(): reordering too many partial-primary-lead-byte scripts"); 425 } 426 427 // Turn lead bytes into a list of (limit, offset) pairs. 428 // Encode each pair in one list element: 429 // Upper 16 bits = limit, lower 16 = signed lead byte offset. 430 int offset = 0; 431 for(int i = 1;; ++i) { 432 int nextOffset = offset; 433 while(i < scriptStarts.length - 1) { 434 int newLeadByte = table[i]; 435 if(newLeadByte == 0xff) { 436 // "Don't care" lead byte for reserved range, continue with current offset. 437 } else { 438 nextOffset = newLeadByte - (scriptStarts[i] >> 8); 439 if(nextOffset != offset) { break; } 440 } 441 ++i; 442 } 443 if(offset != 0 || i < scriptStarts.length - 1) { 444 ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff)); 445 } 446 if(i == scriptStarts.length - 1) { break; } 447 offset = nextOffset; 448 } 449 } 450 451 private int addLowScriptRange(short[] table, int index, int lowStart) { 452 int start = scriptStarts[index]; 453 if((start & 0xff) < (lowStart & 0xff)) { 454 lowStart += 0x100; 455 } 456 table[index] = (short)(lowStart >> 8); 457 int limit = scriptStarts[index + 1]; 458 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); 459 return lowStart; 460 } 461 462 private int addHighScriptRange(short[] table, int index, int highLimit) { 463 int limit = scriptStarts[index + 1]; 464 if((limit & 0xff) > (highLimit & 0xff)) { 465 highLimit -= 0x100; 466 } 467 int start = scriptStarts[index]; 468 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); 469 table[index] = (short)(highLimit >> 8); 470 return highLimit; 471 } 472 473 private static String scriptCodeString(int script) { 474 // Do not use the script name here: We do not want to depend on that data. 475 return (script < Collator.ReorderCodes.FIRST) ? 476 Integer.toString(script) : "0x" + Integer.toHexString(script); 477 } 478 479 private static final int[] EMPTY_INT_ARRAY = new int[0]; 480 481 /** @see jamoCE32s */ 482 static final int JAMO_CE32S_LENGTH = 19 + 21 + 27; 483 484 /** Main lookup trie. */ 485 Trie2_32 trie; 486 /** 487 * Array of CE32 values. 488 * At index 0 there must be CE32(U+0000) 489 * to support U+0000's special-tag for NUL-termination handling. 490 */ 491 int[] ce32s; 492 /** Array of CE values for expansions and OFFSET_TAG. */ 493 long[] ces; 494 /** Array of prefix and contraction-suffix matching data. */ 495 String contexts; 496 /** Base collation data, or null if this data itself is a base. */ 497 public CollationData base; 498 /** 499 * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T. 500 * They are normally simple CE32s, rarely expansions. 501 * For fast handling of HANGUL_TAG. 502 */ 503 int[] jamoCE32s = new int[JAMO_CE32S_LENGTH]; 504 public Normalizer2Impl nfcImpl; 505 /** The single-byte primary weight (xx000000) for numeric collation. */ 506 long numericPrimary = 0x12000000; 507 508 /** 256 flags for which primary-weight lead bytes are compressible. */ 509 public boolean[] compressibleBytes; 510 /** 511 * Set of code points that are unsafe for starting string comparison after an identical prefix, 512 * or in backwards CE iteration. 513 */ 514 UnicodeSet unsafeBackwardSet; 515 516 /** 517 * Fast Latin table for common-Latin-text string comparisons. 518 * Data structure see class CollationFastLatin. 519 */ 520 public char[] fastLatinTable; 521 /** 522 * Header portion of the fastLatinTable. 523 * In C++, these are one array, and the header is skipped for mapping characters. 524 * In Java, two arrays work better. 525 */ 526 char[] fastLatinTableHeader; 527 528 /** 529 * Data for scripts and reordering groups. 530 * Uses include building a reordering permutation table and 531 * providing script boundaries to AlphabeticIndex. 532 */ 533 int numScripts; 534 /** 535 * The length of scriptsIndex is numScripts+16. 536 * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts. 537 * 16 special reorder codes (not all used) are mapped starting at numScripts. 538 * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit. 539 * There are special codes at the end for reorder-reserved primary ranges. 540 * 541 * <p>Multiple scripts may share a range and index, for example Hira & Kana. 542 */ 543 char[] scriptsIndex; 544 /** 545 * Start primary weight (top 16 bits only) for a group/script/reserved range 546 * indexed by scriptsIndex. 547 * The first range (separators & terminators) and the last range (trailing weights) 548 * are not reorderable, and no scriptsIndex entry points to them. 549 */ 550 char[] scriptStarts; 551 552 /** 553 * Collation elements in the root collator. 554 * Used by the CollationRootElements class. The data structure is described there. 555 * null in a tailoring. 556 */ 557 public long[] rootElements; 558 } 559