1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationFastLatin.java, ported from collationfastlatin.h/.cpp 10 * 11 * C++ version created on: 2013aug09 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import android.icu.lang.UScript; 18 import android.icu.text.Collator; 19 20 /** 21 * @hide Only a subset of ICU is exposed in Android 22 */ 23 public final class CollationFastLatin /* all static */ { 24 /** 25 * Fast Latin format version (one byte 1..FF). 26 * Must be incremented for any runtime-incompatible changes, 27 * in particular, for changes to any of the following constants. 28 * 29 * When the major version number of the main data format changes, 30 * we can reset this fast Latin version to 1. 31 */ 32 public static final int VERSION = 2; 33 34 public static final int LATIN_MAX = 0x17f; 35 public static final int LATIN_LIMIT = LATIN_MAX + 1; 36 37 static final int LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX 38 39 static final int PUNCT_START = 0x2000; 40 static final int PUNCT_LIMIT = 0x2040; 41 42 // excludes U+FFFE & U+FFFF 43 static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START); 44 45 // Note on the supported weight ranges: 46 // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that 47 // the CEs for characters in the above ranges, excluding expansions with length >2, 48 // excluding contractions of >2 characters, and other restrictions 49 // (see the builder's getCEsFromCE32()), 50 // use at most about 150 primary weights, 51 // where about 94 primary weights are possibly-variable (space/punct/symbol/currency), 52 // at most 4 secondary before-common weights, 53 // at most 4 secondary after-common weights, 54 // at most 16 secondary high weights (in secondary CEs), and 55 // at most 4 tertiary after-common weights. 56 // The following ranges are designed to support slightly more weights than that. 57 // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.) 58 59 // Digits may use long primaries (preserving more short ones) 60 // or short primaries (faster) without changing this data structure. 61 // (If we supported numeric collation, then digits would have to have long primaries 62 // so that special handling does not affect the fast path.) 63 64 static final int SHORT_PRIMARY_MASK = 0xfc00; // bits 15..10 65 static final int INDEX_MASK = 0x3ff; // bits 9..0 for expansions & contractions 66 static final int SECONDARY_MASK = 0x3e0; // bits 9..5 67 static final int CASE_MASK = 0x18; // bits 4..3 68 static final int LONG_PRIMARY_MASK = 0xfff8; // bits 15..3 69 static final int TERTIARY_MASK = 7; // bits 2..0 70 static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK; 71 72 static final int TWO_SHORT_PRIMARIES_MASK = 73 (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK; // 0xfc00fc00 74 static final int TWO_LONG_PRIMARIES_MASK = 75 (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK; // 0xfff8fff8 76 static final int TWO_SECONDARIES_MASK = 77 (SECONDARY_MASK << 16) | SECONDARY_MASK; // 0x3e003e0 78 static final int TWO_CASES_MASK = 79 (CASE_MASK << 16) | CASE_MASK; // 0x180018 80 static final int TWO_TERTIARIES_MASK = 81 (TERTIARY_MASK << 16) | TERTIARY_MASK; // 0x70007 82 83 /** 84 * Contraction with one fast Latin character. 85 * Use INDEX_MASK to find the start of the contraction list after the fixed table. 86 * The first entry contains the default mapping. 87 * Otherwise use CONTR_CHAR_MASK for the contraction character index 88 * (in ascending order). 89 * Use CONTR_LENGTH_SHIFT for the length of the entry 90 * (1=BAIL_OUT, 2=one CE, 3=two CEs). 91 * 92 * Also, U+0000 maps to a contraction entry, so that the fast path need not 93 * check for NUL termination. 94 * It usually maps to a contraction list with only the completely ignorable default value. 95 */ 96 static final int CONTRACTION = 0x400; 97 /** 98 * An expansion encodes two CEs. 99 * Use INDEX_MASK to find the pair of CEs after the fixed table. 100 * 101 * The higher a mini CE value, the easier it is to process. 102 * For expansions and higher, no context needs to be considered. 103 */ 104 static final int EXPANSION = 0x800; 105 /** 106 * Encodes one CE with a long/low mini primary (there are 128). 107 * All potentially-variable primaries must be in this range, 108 * to make the short-primary path as fast as possible. 109 */ 110 static final int MIN_LONG = 0xc00; 111 static final int LONG_INC = 8; 112 static final int MAX_LONG = 0xff8; 113 /** 114 * Encodes one CE with a short/high primary (there are 60), 115 * plus a secondary CE if the secondary weight is high. 116 * Fast handling: At least all letter primaries should be in this range. 117 */ 118 static final int MIN_SHORT = 0x1000; 119 static final int SHORT_INC = 0x400; 120 /** The highest primary weight is reserved for U+FFFF. */ 121 static final int MAX_SHORT = SHORT_PRIMARY_MASK; 122 123 static final int MIN_SEC_BEFORE = 0; // must add SEC_OFFSET 124 static final int SEC_INC = 0x20; 125 static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC; // 5 before common 126 static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC; 127 static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC; 128 static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC; // 6 after common 129 static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC; // 20 high secondaries 130 static final int MAX_SEC_HIGH = SECONDARY_MASK; 131 132 /** 133 * Lookup: Add this offset to secondary weights, except for completely ignorable CEs. 134 * Must be greater than any special value, e.g., MERGE_WEIGHT. 135 * The exact value is not relevant for the format version. 136 */ 137 static final int SEC_OFFSET = SEC_INC; 138 static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET; 139 140 static final int TWO_SEC_OFFSETS = 141 (SEC_OFFSET << 16) | SEC_OFFSET; // 0x200020 142 static final int TWO_COMMON_SEC_PLUS_OFFSET = 143 (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET; 144 145 static final int LOWER_CASE = 8; // case bits include this offset 146 static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE; // 0x80008 147 148 static final int COMMON_TER = 0; // must add TER_OFFSET 149 static final int MAX_TER_AFTER = 7; // 7 after common 150 151 /** 152 * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs. 153 * Must be greater than any special value, e.g., MERGE_WEIGHT. 154 * Must be greater than case bits as well, so that with combined case+tertiary weights 155 * plus the offset the tertiary bits does not spill over into the case bits. 156 * The exact value is not relevant for the format version. 157 */ 158 static final int TER_OFFSET = SEC_OFFSET; 159 static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET; 160 161 static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET; 162 static final int TWO_COMMON_TER_PLUS_OFFSET = 163 (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET; 164 165 static final int MERGE_WEIGHT = 3; 166 static final int EOS = 2; // end of string 167 static final int BAIL_OUT = 1; 168 169 /** 170 * Contraction result first word bits 8..0 contain the 171 * second contraction character, as a char index 0..NUM_FAST_CHARS-1. 172 * Each contraction list is terminated with a word containing CONTR_CHAR_MASK. 173 */ 174 static final int CONTR_CHAR_MASK = 0x1ff; 175 /** 176 * Contraction result first word bits 10..9 contain the result length: 177 * 1=bail out, 2=one mini CE, 3=two mini CEs 178 */ 179 static final int CONTR_LENGTH_SHIFT = 9; 180 181 /** 182 * Comparison return value when the regular comparison must be used. 183 * The exact value is not relevant for the format version. 184 */ 185 public static final int BAIL_OUT_RESULT = -2; 186 187 static int getCharIndex(char c) { 188 if(c <= LATIN_MAX) { 189 return c; 190 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 191 return c - (PUNCT_START - LATIN_LIMIT); 192 } else { 193 // Not a fast Latin character. 194 // Note: U+FFFE & U+FFFF are forbidden in tailorings 195 // and thus do not occur in any contractions. 196 return -1; 197 } 198 } 199 200 /** 201 * Computes the options value for the compare functions 202 * and writes the precomputed primary weights. 203 * Returns -1 if the Latin fastpath is not supported for the data and settings. 204 * The capacity must be LATIN_LIMIT. 205 */ 206 public static int getOptions(CollationData data, CollationSettings settings, 207 char[] primaries) { 208 char[] header = data.fastLatinTableHeader; 209 if(header == null) { return -1; } 210 assert((header[0] >> 8) == VERSION); 211 if(primaries.length != LATIN_LIMIT) { 212 assert false; 213 return -1; 214 } 215 216 int miniVarTop; 217 if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) { 218 // No mini primaries are variable, set a variableTop just below the 219 // lowest long mini primary. 220 miniVarTop = MIN_LONG - 1; 221 } else { 222 int headerLength = header[0] & 0xff; 223 int i = 1 + settings.getMaxVariable(); 224 if(i >= headerLength) { 225 return -1; // variableTop >= digits, should not occur 226 } 227 miniVarTop = header[i]; 228 } 229 230 boolean digitsAreReordered = false; 231 if(settings.hasReordering()) { 232 long prevStart = 0; 233 long beforeDigitStart = 0; 234 long digitStart = 0; 235 long afterDigitStart = 0; 236 for(int group = Collator.ReorderCodes.FIRST; 237 group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES; 238 ++group) { 239 long start = data.getFirstPrimaryForGroup(group); 240 start = settings.reorder(start); 241 if(group == Collator.ReorderCodes.DIGIT) { 242 beforeDigitStart = prevStart; 243 digitStart = start; 244 } else if(start != 0) { 245 if(start < prevStart) { 246 // The permutation affects the groups up to Latin. 247 return -1; 248 } 249 // In the future, there might be a special group between digits & Latin. 250 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 251 afterDigitStart = start; 252 } 253 prevStart = start; 254 } 255 } 256 long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN); 257 latinStart = settings.reorder(latinStart); 258 if(latinStart < prevStart) { 259 return -1; 260 } 261 if(afterDigitStart == 0) { 262 afterDigitStart = latinStart; 263 } 264 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 265 digitsAreReordered = true; 266 } 267 } 268 269 char[] table = data.fastLatinTable; // skip the header 270 for(int c = 0; c < LATIN_LIMIT; ++c) { 271 int p = table[c]; 272 if(p >= MIN_SHORT) { 273 p &= SHORT_PRIMARY_MASK; 274 } else if(p > miniVarTop) { 275 p &= LONG_PRIMARY_MASK; 276 } else { 277 p = 0; 278 } 279 primaries[c] = (char)p; 280 } 281 if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) { 282 // Bail out for digits. 283 for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 284 } 285 286 // Shift the miniVarTop above other options. 287 return (miniVarTop << 16) | settings.options; 288 } 289 290 public static int compareUTF16(char[] table, char[] primaries, int options, 291 CharSequence left, CharSequence right, int startIndex) { 292 // This is a modified copy of CollationCompare.compareUpToQuaternary(), 293 // optimized for common Latin text. 294 // Keep them in sync! 295 296 int variableTop = options >> 16; // see getOptions() 297 options &= 0xffff; // needed for CollationSettings.getStrength() to work 298 299 // Check for supported characters, fetch mini CEs, and compare primaries. 300 int leftIndex = startIndex, rightIndex = startIndex; 301 /** 302 * Single mini CE or a pair. 303 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 304 * If there is only one, then it is in the lower bits, and the upper bits are 0. 305 */ 306 int leftPair = 0, rightPair = 0; 307 for(;;) { 308 // We fetch CEs until we get a non-ignorable primary or reach the end. 309 while(leftPair == 0) { 310 if(leftIndex == left.length()) { 311 leftPair = EOS; 312 break; 313 } 314 int c = left.charAt(leftIndex++); 315 if(c <= LATIN_MAX) { 316 leftPair = primaries[c]; 317 if(leftPair != 0) { break; } 318 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 319 return BAIL_OUT_RESULT; 320 } 321 leftPair = table[c]; 322 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 323 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 324 } else { 325 leftPair = lookup(table, c); 326 } 327 if(leftPair >= MIN_SHORT) { 328 leftPair &= SHORT_PRIMARY_MASK; 329 break; 330 } else if(leftPair > variableTop) { 331 leftPair &= LONG_PRIMARY_MASK; 332 break; 333 } else { 334 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 335 if(pairAndInc < 0) { 336 ++leftIndex; 337 pairAndInc = ~pairAndInc; 338 } 339 leftPair = (int)pairAndInc; 340 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 341 leftPair = getPrimaries(variableTop, leftPair); 342 } 343 } 344 345 while(rightPair == 0) { 346 if(rightIndex == right.length()) { 347 rightPair = EOS; 348 break; 349 } 350 int c = right.charAt(rightIndex++); 351 if(c <= LATIN_MAX) { 352 rightPair = primaries[c]; 353 if(rightPair != 0) { break; } 354 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 355 return BAIL_OUT_RESULT; 356 } 357 rightPair = table[c]; 358 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 359 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 360 } else { 361 rightPair = lookup(table, c); 362 } 363 if(rightPair >= MIN_SHORT) { 364 rightPair &= SHORT_PRIMARY_MASK; 365 break; 366 } else if(rightPair > variableTop) { 367 rightPair &= LONG_PRIMARY_MASK; 368 break; 369 } else { 370 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 371 if(pairAndInc < 0) { 372 ++rightIndex; 373 pairAndInc = ~pairAndInc; 374 } 375 rightPair = (int)pairAndInc; 376 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 377 rightPair = getPrimaries(variableTop, rightPair); 378 } 379 } 380 381 if(leftPair == rightPair) { 382 if(leftPair == EOS) { break; } 383 leftPair = rightPair = 0; 384 continue; 385 } 386 int leftPrimary = leftPair & 0xffff; 387 int rightPrimary = rightPair & 0xffff; 388 if(leftPrimary != rightPrimary) { 389 // Return the primary difference. 390 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER; 391 } 392 if(leftPair == EOS) { break; } 393 leftPair >>>= 16; 394 rightPair >>>= 16; 395 } 396 // In the following, we need to re-fetch each character because we did not buffer the CEs, 397 // but we know that the string is well-formed and 398 // only contains supported characters and mappings. 399 400 // We might skip the secondary level but continue with the case level 401 // which is turned on separately. 402 if(CollationSettings.getStrength(options) >= Collator.SECONDARY) { 403 leftIndex = rightIndex = startIndex; 404 leftPair = rightPair = 0; 405 for(;;) { 406 while(leftPair == 0) { 407 if(leftIndex == left.length()) { 408 leftPair = EOS; 409 break; 410 } 411 int c = left.charAt(leftIndex++); 412 if(c <= LATIN_MAX) { 413 leftPair = table[c]; 414 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 415 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 416 } else { 417 leftPair = lookup(table, c); 418 } 419 if(leftPair >= MIN_SHORT) { 420 leftPair = getSecondariesFromOneShortCE(leftPair); 421 break; 422 } else if(leftPair > variableTop) { 423 leftPair = COMMON_SEC_PLUS_OFFSET; 424 break; 425 } else { 426 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 427 if(pairAndInc < 0) { 428 ++leftIndex; 429 pairAndInc = ~pairAndInc; 430 } 431 leftPair = getSecondaries(variableTop, (int)pairAndInc); 432 } 433 } 434 435 while(rightPair == 0) { 436 if(rightIndex == right.length()) { 437 rightPair = EOS; 438 break; 439 } 440 int c = right.charAt(rightIndex++); 441 if(c <= LATIN_MAX) { 442 rightPair = table[c]; 443 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 444 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 445 } else { 446 rightPair = lookup(table, c); 447 } 448 if(rightPair >= MIN_SHORT) { 449 rightPair = getSecondariesFromOneShortCE(rightPair); 450 break; 451 } else if(rightPair > variableTop) { 452 rightPair = COMMON_SEC_PLUS_OFFSET; 453 break; 454 } else { 455 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 456 if(pairAndInc < 0) { 457 ++rightIndex; 458 pairAndInc = ~pairAndInc; 459 } 460 rightPair = getSecondaries(variableTop, (int)pairAndInc); 461 } 462 } 463 464 if(leftPair == rightPair) { 465 if(leftPair == EOS) { break; } 466 leftPair = rightPair = 0; 467 continue; 468 } 469 int leftSecondary = leftPair & 0xffff; 470 int rightSecondary = rightPair & 0xffff; 471 if(leftSecondary != rightSecondary) { 472 if((options & CollationSettings.BACKWARD_SECONDARY) != 0) { 473 // Full support for backwards secondary requires backwards contraction matching 474 // and moving backwards between merge separators. 475 return BAIL_OUT_RESULT; 476 } 477 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 478 } 479 if(leftPair == EOS) { break; } 480 leftPair >>>= 16; 481 rightPair >>>= 16; 482 } 483 } 484 485 if((options & CollationSettings.CASE_LEVEL) != 0) { 486 boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY; 487 leftIndex = rightIndex = startIndex; 488 leftPair = rightPair = 0; 489 for(;;) { 490 while(leftPair == 0) { 491 if(leftIndex == left.length()) { 492 leftPair = EOS; 493 break; 494 } 495 int c = left.charAt(leftIndex++); 496 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 497 if(leftPair < MIN_LONG) { 498 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 499 if(pairAndInc < 0) { 500 ++leftIndex; 501 pairAndInc = ~pairAndInc; 502 } 503 leftPair = (int)pairAndInc; 504 } 505 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 506 } 507 508 while(rightPair == 0) { 509 if(rightIndex == right.length()) { 510 rightPair = EOS; 511 break; 512 } 513 int c = right.charAt(rightIndex++); 514 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 515 if(rightPair < MIN_LONG) { 516 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 517 if(pairAndInc < 0) { 518 ++rightIndex; 519 pairAndInc = ~pairAndInc; 520 } 521 rightPair = (int)pairAndInc; 522 } 523 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 524 } 525 526 if(leftPair == rightPair) { 527 if(leftPair == EOS) { break; } 528 leftPair = rightPair = 0; 529 continue; 530 } 531 int leftCase = leftPair & 0xffff; 532 int rightCase = rightPair & 0xffff; 533 if(leftCase != rightCase) { 534 if((options & CollationSettings.UPPER_FIRST) == 0) { 535 return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER; 536 } else { 537 return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS; 538 } 539 } 540 if(leftPair == EOS) { break; } 541 leftPair >>>= 16; 542 rightPair >>>= 16; 543 } 544 } 545 if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; } 546 547 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 548 boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options); 549 550 leftIndex = rightIndex = startIndex; 551 leftPair = rightPair = 0; 552 for(;;) { 553 while(leftPair == 0) { 554 if(leftIndex == left.length()) { 555 leftPair = EOS; 556 break; 557 } 558 int c = left.charAt(leftIndex++); 559 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 560 if(leftPair < MIN_LONG) { 561 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 562 if(pairAndInc < 0) { 563 ++leftIndex; 564 pairAndInc = ~pairAndInc; 565 } 566 leftPair = (int)pairAndInc; 567 } 568 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 569 } 570 571 while(rightPair == 0) { 572 if(rightIndex == right.length()) { 573 rightPair = EOS; 574 break; 575 } 576 int c = right.charAt(rightIndex++); 577 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 578 if(rightPair < MIN_LONG) { 579 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 580 if(pairAndInc < 0) { 581 ++rightIndex; 582 pairAndInc = ~pairAndInc; 583 } 584 rightPair = (int)pairAndInc; 585 } 586 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 587 } 588 589 if(leftPair == rightPair) { 590 if(leftPair == EOS) { break; } 591 leftPair = rightPair = 0; 592 continue; 593 } 594 int leftTertiary = leftPair & 0xffff; 595 int rightTertiary = rightPair & 0xffff; 596 if(leftTertiary != rightTertiary) { 597 if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) { 598 // Pass through EOS and MERGE_WEIGHT 599 // and keep real tertiary weights larger than the MERGE_WEIGHT. 600 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 601 if(leftTertiary > MERGE_WEIGHT) { 602 leftTertiary ^= CASE_MASK; 603 } 604 if(rightTertiary > MERGE_WEIGHT) { 605 rightTertiary ^= CASE_MASK; 606 } 607 } 608 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER; 609 } 610 if(leftPair == EOS) { break; } 611 leftPair >>>= 16; 612 rightPair >>>= 16; 613 } 614 if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; } 615 616 leftIndex = rightIndex = startIndex; 617 leftPair = rightPair = 0; 618 for(;;) { 619 while(leftPair == 0) { 620 if(leftIndex == left.length()) { 621 leftPair = EOS; 622 break; 623 } 624 int c = left.charAt(leftIndex++); 625 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 626 if(leftPair < MIN_LONG) { 627 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 628 if(pairAndInc < 0) { 629 ++leftIndex; 630 pairAndInc = ~pairAndInc; 631 } 632 leftPair = (int)pairAndInc; 633 } 634 leftPair = getQuaternaries(variableTop, leftPair); 635 } 636 637 while(rightPair == 0) { 638 if(rightIndex == right.length()) { 639 rightPair = EOS; 640 break; 641 } 642 int c = right.charAt(rightIndex++); 643 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 644 if(rightPair < MIN_LONG) { 645 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 646 if(pairAndInc < 0) { 647 ++rightIndex; 648 pairAndInc = ~pairAndInc; 649 } 650 rightPair = (int)pairAndInc; 651 } 652 rightPair = getQuaternaries(variableTop, rightPair); 653 } 654 655 if(leftPair == rightPair) { 656 if(leftPair == EOS) { break; } 657 leftPair = rightPair = 0; 658 continue; 659 } 660 int leftQuaternary = leftPair & 0xffff; 661 int rightQuaternary = rightPair & 0xffff; 662 if(leftQuaternary != rightQuaternary) { 663 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER; 664 } 665 if(leftPair == EOS) { break; } 666 leftPair >>>= 16; 667 rightPair >>>= 16; 668 } 669 return Collation.EQUAL; 670 } 671 672 private static int lookup(char[] table, int c) { 673 assert(c > LATIN_MAX); 674 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 675 return table[c - PUNCT_START + LATIN_LIMIT]; 676 } else if(c == 0xfffe) { 677 return MERGE_WEIGHT; 678 } else if(c == 0xffff) { 679 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 680 } else { 681 return BAIL_OUT; 682 } 683 } 684 685 /** 686 * Java returns a negative result (use the '~' operator) if sIndex is to be incremented. 687 * C++ modifies sIndex. 688 */ 689 private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) { 690 if(ce >= MIN_LONG || ce < CONTRACTION) { 691 return ce; // simple or special mini CE 692 } else if(ce >= EXPANSION) { 693 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 694 return ((long)table[index + 1] << 16) | table[index]; 695 } else /* ce >= CONTRACTION */ { 696 // Contraction list: Default mapping followed by 697 // 0 or more single-character contraction suffix mappings. 698 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 699 boolean inc = false; // true if the next char is consumed. 700 if(sIndex != s16.length()) { 701 // Read the next character. 702 int c2; 703 int nextIndex = sIndex; 704 c2 = s16.charAt(nextIndex++); 705 if(c2 > LATIN_MAX) { 706 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 707 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 708 } else if(c2 == 0xfffe || c2 == 0xffff) { 709 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 710 } else { 711 return BAIL_OUT; 712 } 713 } 714 // Look for the next character in the contraction suffix list, 715 // which is in ascending order of single suffix characters. 716 int i = index; 717 int head = table[i]; // first skip the default mapping 718 int x; 719 do { 720 i += head >> CONTR_LENGTH_SHIFT; 721 head = table[i]; 722 x = head & CONTR_CHAR_MASK; 723 } while(x < c2); 724 if(x == c2) { 725 index = i; 726 inc = true; 727 } 728 } 729 // Return the CE or CEs for the default or contraction mapping. 730 int length = table[index] >> CONTR_LENGTH_SHIFT; 731 if(length == 1) { 732 return BAIL_OUT; 733 } 734 ce = table[index + 1]; 735 long result; 736 if(length == 2) { 737 result = ce; 738 } else { 739 result = ((long)table[index + 2] << 16) | ce; 740 } 741 return inc ? ~result : result; 742 } 743 } 744 745 private static int getPrimaries(int variableTop, int pair) { 746 int ce = pair & 0xffff; 747 if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; } 748 if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; } 749 if(ce >= MIN_LONG) { return 0; } // variable 750 return pair; // special mini CE 751 } 752 753 private static int getSecondariesFromOneShortCE(int ce) { 754 ce &= SECONDARY_MASK; 755 if(ce < MIN_SEC_HIGH) { 756 return ce + SEC_OFFSET; 757 } else { 758 return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET; 759 } 760 } 761 762 private static int getSecondaries(int variableTop, int pair) { 763 if(pair <= 0xffff) { 764 // one mini CE 765 if(pair >= MIN_SHORT) { 766 pair = getSecondariesFromOneShortCE(pair); 767 } else if(pair > variableTop) { 768 pair = COMMON_SEC_PLUS_OFFSET; 769 } else if(pair >= MIN_LONG) { 770 pair = 0; // variable 771 } 772 // else special mini CE 773 } else { 774 int ce = pair & 0xffff; 775 if(ce >= MIN_SHORT) { 776 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 777 } else if(ce > variableTop) { 778 pair = TWO_COMMON_SEC_PLUS_OFFSET; 779 } else { 780 assert(ce >= MIN_LONG); 781 pair = 0; // variable 782 } 783 } 784 return pair; 785 } 786 787 private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) { 788 // Primary+caseLevel: Ignore case level weights of primary ignorables. 789 // Otherwise: Ignore case level weights of secondary ignorables. 790 // For details see the comments in the CollationCompare class. 791 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 792 if(pair <= 0xffff) { 793 // one mini CE 794 if(pair >= MIN_SHORT) { 795 // A high secondary weight means we really have two CEs, 796 // a primary CE and a secondary CE. 797 int ce = pair; 798 pair &= CASE_MASK; // explicit weight of primary CE 799 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 800 pair |= LOWER_CASE << 16; // implied weight of secondary CE 801 } 802 } else if(pair > variableTop) { 803 pair = LOWER_CASE; 804 } else if(pair >= MIN_LONG) { 805 pair = 0; // variable 806 } 807 // else special mini CE 808 } else { 809 // two mini CEs, same primary groups, neither expands like above 810 int ce = pair & 0xffff; 811 if(ce >= MIN_SHORT) { 812 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 813 pair &= CASE_MASK; 814 } else { 815 pair &= TWO_CASES_MASK; 816 } 817 } else if(ce > variableTop) { 818 pair = TWO_LOWER_CASES; 819 } else { 820 assert(ce >= MIN_LONG); 821 pair = 0; // variable 822 } 823 } 824 return pair; 825 } 826 827 private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) { 828 if(pair <= 0xffff) { 829 // one mini CE 830 if(pair >= MIN_SHORT) { 831 // A high secondary weight means we really have two CEs, 832 // a primary CE and a secondary CE. 833 int ce = pair; 834 if(withCaseBits) { 835 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 836 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 837 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 838 } 839 } else { 840 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 841 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 842 pair |= COMMON_TER_PLUS_OFFSET << 16; 843 } 844 } 845 } else if(pair > variableTop) { 846 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 847 if(withCaseBits) { 848 pair |= LOWER_CASE; 849 } 850 } else if(pair >= MIN_LONG) { 851 pair = 0; // variable 852 } 853 // else special mini CE 854 } else { 855 // two mini CEs, same primary groups, neither expands like above 856 int ce = pair & 0xffff; 857 if(ce >= MIN_SHORT) { 858 if(withCaseBits) { 859 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 860 } else { 861 pair &= TWO_TERTIARIES_MASK; 862 } 863 pair += TWO_TER_OFFSETS; 864 } else if(ce > variableTop) { 865 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 866 if(withCaseBits) { 867 pair |= TWO_LOWER_CASES; 868 } 869 } else { 870 assert(ce >= MIN_LONG); 871 pair = 0; // variable 872 } 873 } 874 return pair; 875 } 876 877 private static int getQuaternaries(int variableTop, int pair) { 878 // Return the primary weight of a variable CE, 879 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 880 if(pair <= 0xffff) { 881 // one mini CE 882 if(pair >= MIN_SHORT) { 883 // A high secondary weight means we really have two CEs, 884 // a primary CE and a secondary CE. 885 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 886 pair = TWO_SHORT_PRIMARIES_MASK; 887 } else { 888 pair = SHORT_PRIMARY_MASK; 889 } 890 } else if(pair > variableTop) { 891 pair = SHORT_PRIMARY_MASK; 892 } else if(pair >= MIN_LONG) { 893 pair &= LONG_PRIMARY_MASK; // variable 894 } 895 // else special mini CE 896 } else { 897 // two mini CEs, same primary groups, neither expands like above 898 int ce = pair & 0xffff; 899 if(ce > variableTop) { 900 pair = TWO_SHORT_PRIMARIES_MASK; 901 } else { 902 assert(ce >= MIN_LONG); 903 pair &= TWO_LONG_PRIMARIES_MASK; // variable 904 } 905 } 906 return pair; 907 } 908 909 private CollationFastLatin() {} // no constructor 910 } 911