1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationfastlatin.cpp 7 * 8 * created on: 2013aug18 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "collationdata.h" 18 #include "collationfastlatin.h" 19 #include "collationsettings.h" 20 #include "uassert.h" 21 22 U_NAMESPACE_BEGIN 23 24 int32_t 25 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, 26 uint16_t *primaries, int32_t capacity) { 27 const uint16_t *table = data->fastLatinTable; 28 if(table == NULL) { return -1; } 29 U_ASSERT(capacity == LATIN_LIMIT); 30 if(capacity != LATIN_LIMIT) { return -1; } 31 32 uint32_t miniVarTop; 33 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { 34 // No mini primaries are variable, set a variableTop just below the 35 // lowest long mini primary. 36 miniVarTop = MIN_LONG - 1; 37 } else { 38 int32_t headerLength = *table & 0xff; 39 int32_t i = 1 + settings.getMaxVariable(); 40 if(i >= headerLength) { 41 return -1; // variableTop >= digits, should not occur 42 } 43 miniVarTop = table[i]; 44 } 45 46 UBool digitsAreReordered = FALSE; 47 if(settings.hasReordering()) { 48 uint32_t prevStart = 0; 49 uint32_t beforeDigitStart = 0; 50 uint32_t digitStart = 0; 51 uint32_t afterDigitStart = 0; 52 for(int32_t group = UCOL_REORDER_CODE_FIRST; 53 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; 54 ++group) { 55 uint32_t start = data->getFirstPrimaryForGroup(group); 56 start = settings.reorder(start); 57 if(group == UCOL_REORDER_CODE_DIGIT) { 58 beforeDigitStart = prevStart; 59 digitStart = start; 60 } else if(start != 0) { 61 if(start < prevStart) { 62 // The permutation affects the groups up to Latin. 63 return -1; 64 } 65 // In the future, there might be a special group between digits & Latin. 66 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 67 afterDigitStart = start; 68 } 69 prevStart = start; 70 } 71 } 72 uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); 73 latinStart = settings.reorder(latinStart); 74 if(latinStart < prevStart) { 75 return -1; 76 } 77 if(afterDigitStart == 0) { 78 afterDigitStart = latinStart; 79 } 80 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 81 digitsAreReordered = TRUE; 82 } 83 } 84 85 table += (table[0] & 0xff); // skip the header 86 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { 87 uint32_t p = table[c]; 88 if(p >= MIN_SHORT) { 89 p &= SHORT_PRIMARY_MASK; 90 } else if(p > miniVarTop) { 91 p &= LONG_PRIMARY_MASK; 92 } else { 93 p = 0; 94 } 95 primaries[c] = (uint16_t)p; 96 } 97 if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { 98 // Bail out for digits. 99 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 100 } 101 102 // Shift the miniVarTop above other options. 103 return ((int32_t)miniVarTop << 16) | settings.options; 104 } 105 106 int32_t 107 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, 108 const UChar *left, int32_t leftLength, 109 const UChar *right, int32_t rightLength) { 110 // This is a modified copy of CollationCompare::compareUpToQuaternary(), 111 // optimized for common Latin text. 112 // Keep them in sync! 113 // Keep compareUTF16() and compareUTF8() in sync very closely! 114 115 U_ASSERT((table[0] >> 8) == VERSION); 116 table += (table[0] & 0xff); // skip the header 117 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() 118 options &= 0xffff; // needed for CollationSettings::getStrength() to work 119 120 // Check for supported characters, fetch mini CEs, and compare primaries. 121 int32_t leftIndex = 0, rightIndex = 0; 122 /** 123 * Single mini CE or a pair. 124 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 125 * If there is only one, then it is in the lower bits, and the upper bits are 0. 126 */ 127 uint32_t leftPair = 0, rightPair = 0; 128 for(;;) { 129 // We fetch CEs until we get a non-ignorable primary or reach the end. 130 while(leftPair == 0) { 131 if(leftIndex == leftLength) { 132 leftPair = EOS; 133 break; 134 } 135 UChar32 c = left[leftIndex++]; 136 if(c <= LATIN_MAX) { 137 leftPair = primaries[c]; 138 if(leftPair != 0) { break; } 139 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 140 return BAIL_OUT_RESULT; 141 } 142 leftPair = table[c]; 143 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 144 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 145 } else { 146 leftPair = lookup(table, c); 147 } 148 if(leftPair >= MIN_SHORT) { 149 leftPair &= SHORT_PRIMARY_MASK; 150 break; 151 } else if(leftPair > variableTop) { 152 leftPair &= LONG_PRIMARY_MASK; 153 break; 154 } else { 155 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 156 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 157 leftPair = getPrimaries(variableTop, leftPair); 158 } 159 } 160 161 while(rightPair == 0) { 162 if(rightIndex == rightLength) { 163 rightPair = EOS; 164 break; 165 } 166 UChar32 c = right[rightIndex++]; 167 if(c <= LATIN_MAX) { 168 rightPair = primaries[c]; 169 if(rightPair != 0) { break; } 170 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 171 return BAIL_OUT_RESULT; 172 } 173 rightPair = table[c]; 174 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 175 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 176 } else { 177 rightPair = lookup(table, c); 178 } 179 if(rightPair >= MIN_SHORT) { 180 rightPair &= SHORT_PRIMARY_MASK; 181 break; 182 } else if(rightPair > variableTop) { 183 rightPair &= LONG_PRIMARY_MASK; 184 break; 185 } else { 186 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 187 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 188 rightPair = getPrimaries(variableTop, rightPair); 189 } 190 } 191 192 if(leftPair == rightPair) { 193 if(leftPair == EOS) { break; } 194 leftPair = rightPair = 0; 195 continue; 196 } 197 uint32_t leftPrimary = leftPair & 0xffff; 198 uint32_t rightPrimary = rightPair & 0xffff; 199 if(leftPrimary != rightPrimary) { 200 // Return the primary difference. 201 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 202 } 203 if(leftPair == EOS) { break; } 204 leftPair >>= 16; 205 rightPair >>= 16; 206 } 207 // In the following, we need to re-fetch each character because we did not buffer the CEs, 208 // but we know that the string is well-formed and 209 // only contains supported characters and mappings. 210 211 // We might skip the secondary level but continue with the case level 212 // which is turned on separately. 213 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 214 leftIndex = rightIndex = 0; 215 leftPair = rightPair = 0; 216 for(;;) { 217 while(leftPair == 0) { 218 if(leftIndex == leftLength) { 219 leftPair = EOS; 220 break; 221 } 222 UChar32 c = left[leftIndex++]; 223 if(c <= LATIN_MAX) { 224 leftPair = table[c]; 225 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 226 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 227 } else { 228 leftPair = lookup(table, c); 229 } 230 if(leftPair >= MIN_SHORT) { 231 leftPair = getSecondariesFromOneShortCE(leftPair); 232 break; 233 } else if(leftPair > variableTop) { 234 leftPair = COMMON_SEC_PLUS_OFFSET; 235 break; 236 } else { 237 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 238 leftPair = getSecondaries(variableTop, leftPair); 239 } 240 } 241 242 while(rightPair == 0) { 243 if(rightIndex == rightLength) { 244 rightPair = EOS; 245 break; 246 } 247 UChar32 c = right[rightIndex++]; 248 if(c <= LATIN_MAX) { 249 rightPair = table[c]; 250 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 251 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 252 } else { 253 rightPair = lookup(table, c); 254 } 255 if(rightPair >= MIN_SHORT) { 256 rightPair = getSecondariesFromOneShortCE(rightPair); 257 break; 258 } else if(rightPair > variableTop) { 259 rightPair = COMMON_SEC_PLUS_OFFSET; 260 break; 261 } else { 262 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 263 rightPair = getSecondaries(variableTop, rightPair); 264 } 265 } 266 267 if(leftPair == rightPair) { 268 if(leftPair == EOS) { break; } 269 leftPair = rightPair = 0; 270 continue; 271 } 272 uint32_t leftSecondary = leftPair & 0xffff; 273 uint32_t rightSecondary = rightPair & 0xffff; 274 if(leftSecondary != rightSecondary) { 275 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 276 // Full support for backwards secondary requires backwards contraction matching 277 // and moving backwards between merge separators. 278 return BAIL_OUT_RESULT; 279 } 280 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 281 } 282 if(leftPair == EOS) { break; } 283 leftPair >>= 16; 284 rightPair >>= 16; 285 } 286 } 287 288 if((options & CollationSettings::CASE_LEVEL) != 0) { 289 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 290 leftIndex = rightIndex = 0; 291 leftPair = rightPair = 0; 292 for(;;) { 293 while(leftPair == 0) { 294 if(leftIndex == leftLength) { 295 leftPair = EOS; 296 break; 297 } 298 UChar32 c = left[leftIndex++]; 299 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 300 if(leftPair < MIN_LONG) { 301 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 302 } 303 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 304 } 305 306 while(rightPair == 0) { 307 if(rightIndex == rightLength) { 308 rightPair = EOS; 309 break; 310 } 311 UChar32 c = right[rightIndex++]; 312 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 313 if(rightPair < MIN_LONG) { 314 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 315 } 316 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 317 } 318 319 if(leftPair == rightPair) { 320 if(leftPair == EOS) { break; } 321 leftPair = rightPair = 0; 322 continue; 323 } 324 uint32_t leftCase = leftPair & 0xffff; 325 uint32_t rightCase = rightPair & 0xffff; 326 if(leftCase != rightCase) { 327 if((options & CollationSettings::UPPER_FIRST) == 0) { 328 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 329 } else { 330 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 331 } 332 } 333 if(leftPair == EOS) { break; } 334 leftPair >>= 16; 335 rightPair >>= 16; 336 } 337 } 338 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 339 340 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 341 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 342 343 leftIndex = rightIndex = 0; 344 leftPair = rightPair = 0; 345 for(;;) { 346 while(leftPair == 0) { 347 if(leftIndex == leftLength) { 348 leftPair = EOS; 349 break; 350 } 351 UChar32 c = left[leftIndex++]; 352 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 353 if(leftPair < MIN_LONG) { 354 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 355 } 356 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 357 } 358 359 while(rightPair == 0) { 360 if(rightIndex == rightLength) { 361 rightPair = EOS; 362 break; 363 } 364 UChar32 c = right[rightIndex++]; 365 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 366 if(rightPair < MIN_LONG) { 367 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 368 } 369 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 370 } 371 372 if(leftPair == rightPair) { 373 if(leftPair == EOS) { break; } 374 leftPair = rightPair = 0; 375 continue; 376 } 377 uint32_t leftTertiary = leftPair & 0xffff; 378 uint32_t rightTertiary = rightPair & 0xffff; 379 if(leftTertiary != rightTertiary) { 380 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 381 // Pass through EOS and MERGE_WEIGHT 382 // and keep real tertiary weights larger than the MERGE_WEIGHT. 383 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 384 if(leftTertiary > MERGE_WEIGHT) { 385 leftTertiary ^= CASE_MASK; 386 } 387 if(rightTertiary > MERGE_WEIGHT) { 388 rightTertiary ^= CASE_MASK; 389 } 390 } 391 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 392 } 393 if(leftPair == EOS) { break; } 394 leftPair >>= 16; 395 rightPair >>= 16; 396 } 397 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 398 399 leftIndex = rightIndex = 0; 400 leftPair = rightPair = 0; 401 for(;;) { 402 while(leftPair == 0) { 403 if(leftIndex == leftLength) { 404 leftPair = EOS; 405 break; 406 } 407 UChar32 c = left[leftIndex++]; 408 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 409 if(leftPair < MIN_LONG) { 410 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 411 } 412 leftPair = getQuaternaries(variableTop, leftPair); 413 } 414 415 while(rightPair == 0) { 416 if(rightIndex == rightLength) { 417 rightPair = EOS; 418 break; 419 } 420 UChar32 c = right[rightIndex++]; 421 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 422 if(rightPair < MIN_LONG) { 423 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 424 } 425 rightPair = getQuaternaries(variableTop, rightPair); 426 } 427 428 if(leftPair == rightPair) { 429 if(leftPair == EOS) { break; } 430 leftPair = rightPair = 0; 431 continue; 432 } 433 uint32_t leftQuaternary = leftPair & 0xffff; 434 uint32_t rightQuaternary = rightPair & 0xffff; 435 if(leftQuaternary != rightQuaternary) { 436 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 437 } 438 if(leftPair == EOS) { break; } 439 leftPair >>= 16; 440 rightPair >>= 16; 441 } 442 return UCOL_EQUAL; 443 } 444 445 int32_t 446 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, 447 const uint8_t *left, int32_t leftLength, 448 const uint8_t *right, int32_t rightLength) { 449 // Keep compareUTF16() and compareUTF8() in sync very closely! 450 451 U_ASSERT((table[0] >> 8) == VERSION); 452 table += (table[0] & 0xff); // skip the header 453 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() 454 options &= 0xffff; // needed for CollationSettings::getStrength() to work 455 456 // Check for supported characters, fetch mini CEs, and compare primaries. 457 int32_t leftIndex = 0, rightIndex = 0; 458 /** 459 * Single mini CE or a pair. 460 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 461 * If there is only one, then it is in the lower bits, and the upper bits are 0. 462 */ 463 uint32_t leftPair = 0, rightPair = 0; 464 // Note: There is no need to assemble the code point. 465 // We only need to look up the table entry for the character, 466 // and nextPair() looks for whether c==0. 467 for(;;) { 468 // We fetch CEs until we get a non-ignorable primary or reach the end. 469 while(leftPair == 0) { 470 if(leftIndex == leftLength) { 471 leftPair = EOS; 472 break; 473 } 474 UChar32 c = left[leftIndex++]; 475 uint8_t t; 476 if(c <= 0x7f) { 477 leftPair = primaries[c]; 478 if(leftPair != 0) { break; } 479 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 480 return BAIL_OUT_RESULT; 481 } 482 leftPair = table[c]; 483 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && 484 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { 485 ++leftIndex; 486 c = ((c - 0xc2) << 6) + t; 487 leftPair = primaries[c]; 488 if(leftPair != 0) { break; } 489 leftPair = table[c]; 490 } else { 491 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); 492 } 493 if(leftPair >= MIN_SHORT) { 494 leftPair &= SHORT_PRIMARY_MASK; 495 break; 496 } else if(leftPair > variableTop) { 497 leftPair &= LONG_PRIMARY_MASK; 498 break; 499 } else { 500 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 501 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 502 leftPair = getPrimaries(variableTop, leftPair); 503 } 504 } 505 506 while(rightPair == 0) { 507 if(rightIndex == rightLength) { 508 rightPair = EOS; 509 break; 510 } 511 UChar32 c = right[rightIndex++]; 512 uint8_t t; 513 if(c <= 0x7f) { 514 rightPair = primaries[c]; 515 if(rightPair != 0) { break; } 516 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 517 return BAIL_OUT_RESULT; 518 } 519 rightPair = table[c]; 520 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && 521 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { 522 ++rightIndex; 523 c = ((c - 0xc2) << 6) + t; 524 rightPair = primaries[c]; 525 if(rightPair != 0) { break; } 526 rightPair = table[c]; 527 } else { 528 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); 529 } 530 if(rightPair >= MIN_SHORT) { 531 rightPair &= SHORT_PRIMARY_MASK; 532 break; 533 } else if(rightPair > variableTop) { 534 rightPair &= LONG_PRIMARY_MASK; 535 break; 536 } else { 537 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 538 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 539 rightPair = getPrimaries(variableTop, rightPair); 540 } 541 } 542 543 if(leftPair == rightPair) { 544 if(leftPair == EOS) { break; } 545 leftPair = rightPair = 0; 546 continue; 547 } 548 uint32_t leftPrimary = leftPair & 0xffff; 549 uint32_t rightPrimary = rightPair & 0xffff; 550 if(leftPrimary != rightPrimary) { 551 // Return the primary difference. 552 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 553 } 554 if(leftPair == EOS) { break; } 555 leftPair >>= 16; 556 rightPair >>= 16; 557 } 558 // In the following, we need to re-fetch each character because we did not buffer the CEs, 559 // but we know that the string is well-formed and 560 // only contains supported characters and mappings. 561 562 // We might skip the secondary level but continue with the case level 563 // which is turned on separately. 564 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 565 leftIndex = rightIndex = 0; 566 leftPair = rightPair = 0; 567 for(;;) { 568 while(leftPair == 0) { 569 if(leftIndex == leftLength) { 570 leftPair = EOS; 571 break; 572 } 573 UChar32 c = left[leftIndex++]; 574 if(c <= 0x7f) { 575 leftPair = table[c]; 576 } else if(c <= LATIN_MAX_UTF8_LEAD) { 577 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; 578 } else { 579 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); 580 } 581 if(leftPair >= MIN_SHORT) { 582 leftPair = getSecondariesFromOneShortCE(leftPair); 583 break; 584 } else if(leftPair > variableTop) { 585 leftPair = COMMON_SEC_PLUS_OFFSET; 586 break; 587 } else { 588 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 589 leftPair = getSecondaries(variableTop, leftPair); 590 } 591 } 592 593 while(rightPair == 0) { 594 if(rightIndex == rightLength) { 595 rightPair = EOS; 596 break; 597 } 598 UChar32 c = right[rightIndex++]; 599 if(c <= 0x7f) { 600 rightPair = table[c]; 601 } else if(c <= LATIN_MAX_UTF8_LEAD) { 602 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; 603 } else { 604 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); 605 } 606 if(rightPair >= MIN_SHORT) { 607 rightPair = getSecondariesFromOneShortCE(rightPair); 608 break; 609 } else if(rightPair > variableTop) { 610 rightPair = COMMON_SEC_PLUS_OFFSET; 611 break; 612 } else { 613 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 614 rightPair = getSecondaries(variableTop, rightPair); 615 } 616 } 617 618 if(leftPair == rightPair) { 619 if(leftPair == EOS) { break; } 620 leftPair = rightPair = 0; 621 continue; 622 } 623 uint32_t leftSecondary = leftPair & 0xffff; 624 uint32_t rightSecondary = rightPair & 0xffff; 625 if(leftSecondary != rightSecondary) { 626 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 627 // Full support for backwards secondary requires backwards contraction matching 628 // and moving backwards between merge separators. 629 return BAIL_OUT_RESULT; 630 } 631 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 632 } 633 if(leftPair == EOS) { break; } 634 leftPair >>= 16; 635 rightPair >>= 16; 636 } 637 } 638 639 if((options & CollationSettings::CASE_LEVEL) != 0) { 640 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 641 leftIndex = rightIndex = 0; 642 leftPair = rightPair = 0; 643 for(;;) { 644 while(leftPair == 0) { 645 if(leftIndex == leftLength) { 646 leftPair = EOS; 647 break; 648 } 649 UChar32 c = left[leftIndex++]; 650 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 651 if(leftPair < MIN_LONG) { 652 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 653 } 654 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 655 } 656 657 while(rightPair == 0) { 658 if(rightIndex == rightLength) { 659 rightPair = EOS; 660 break; 661 } 662 UChar32 c = right[rightIndex++]; 663 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 664 if(rightPair < MIN_LONG) { 665 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 666 } 667 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 668 } 669 670 if(leftPair == rightPair) { 671 if(leftPair == EOS) { break; } 672 leftPair = rightPair = 0; 673 continue; 674 } 675 uint32_t leftCase = leftPair & 0xffff; 676 uint32_t rightCase = rightPair & 0xffff; 677 if(leftCase != rightCase) { 678 if((options & CollationSettings::UPPER_FIRST) == 0) { 679 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 680 } else { 681 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 682 } 683 } 684 if(leftPair == EOS) { break; } 685 leftPair >>= 16; 686 rightPair >>= 16; 687 } 688 } 689 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 690 691 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 692 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 693 694 leftIndex = rightIndex = 0; 695 leftPair = rightPair = 0; 696 for(;;) { 697 while(leftPair == 0) { 698 if(leftIndex == leftLength) { 699 leftPair = EOS; 700 break; 701 } 702 UChar32 c = left[leftIndex++]; 703 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 704 if(leftPair < MIN_LONG) { 705 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 706 } 707 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 708 } 709 710 while(rightPair == 0) { 711 if(rightIndex == rightLength) { 712 rightPair = EOS; 713 break; 714 } 715 UChar32 c = right[rightIndex++]; 716 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 717 if(rightPair < MIN_LONG) { 718 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 719 } 720 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 721 } 722 723 if(leftPair == rightPair) { 724 if(leftPair == EOS) { break; } 725 leftPair = rightPair = 0; 726 continue; 727 } 728 uint32_t leftTertiary = leftPair & 0xffff; 729 uint32_t rightTertiary = rightPair & 0xffff; 730 if(leftTertiary != rightTertiary) { 731 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 732 // Pass through EOS and MERGE_WEIGHT 733 // and keep real tertiary weights larger than the MERGE_WEIGHT. 734 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 735 if(leftTertiary > MERGE_WEIGHT) { 736 leftTertiary ^= CASE_MASK; 737 } 738 if(rightTertiary > MERGE_WEIGHT) { 739 rightTertiary ^= CASE_MASK; 740 } 741 } 742 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 743 } 744 if(leftPair == EOS) { break; } 745 leftPair >>= 16; 746 rightPair >>= 16; 747 } 748 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 749 750 leftIndex = rightIndex = 0; 751 leftPair = rightPair = 0; 752 for(;;) { 753 while(leftPair == 0) { 754 if(leftIndex == leftLength) { 755 leftPair = EOS; 756 break; 757 } 758 UChar32 c = left[leftIndex++]; 759 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 760 if(leftPair < MIN_LONG) { 761 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 762 } 763 leftPair = getQuaternaries(variableTop, leftPair); 764 } 765 766 while(rightPair == 0) { 767 if(rightIndex == rightLength) { 768 rightPair = EOS; 769 break; 770 } 771 UChar32 c = right[rightIndex++]; 772 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 773 if(rightPair < MIN_LONG) { 774 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 775 } 776 rightPair = getQuaternaries(variableTop, rightPair); 777 } 778 779 if(leftPair == rightPair) { 780 if(leftPair == EOS) { break; } 781 leftPair = rightPair = 0; 782 continue; 783 } 784 uint32_t leftQuaternary = leftPair & 0xffff; 785 uint32_t rightQuaternary = rightPair & 0xffff; 786 if(leftQuaternary != rightQuaternary) { 787 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 788 } 789 if(leftPair == EOS) { break; } 790 leftPair >>= 16; 791 rightPair >>= 16; 792 } 793 return UCOL_EQUAL; 794 } 795 796 uint32_t 797 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { 798 U_ASSERT(c > LATIN_MAX); 799 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 800 return table[c - PUNCT_START + LATIN_LIMIT]; 801 } else if(c == 0xfffe) { 802 return MERGE_WEIGHT; 803 } else if(c == 0xffff) { 804 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 805 } else { 806 return BAIL_OUT; 807 } 808 } 809 810 uint32_t 811 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, 812 const uint8_t *s8, int32_t &sIndex, int32_t sLength) { 813 // The caller handled ASCII and valid/supported Latin. 814 U_ASSERT(c > 0x7f); 815 int32_t i2 = sIndex + 1; 816 if(i2 < sLength || sLength < 0) { 817 uint8_t t1 = s8[sIndex]; 818 uint8_t t2 = s8[i2]; 819 sIndex += 2; 820 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { 821 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 822 } else if(c == 0xef && t1 == 0xbf) { 823 if(t2 == 0xbe) { 824 return MERGE_WEIGHT; // U+FFFE 825 } else if(t2 == 0xbf) { 826 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 827 } 828 } 829 } 830 return BAIL_OUT; 831 } 832 833 uint32_t 834 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, 835 const uint8_t *s8, int32_t &sIndex) { 836 // The caller handled ASCII. 837 // The string is well-formed and contains only supported characters. 838 U_ASSERT(c > 0x7f); 839 if(c <= LATIN_MAX_UTF8_LEAD) { 840 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F 841 } 842 uint8_t t2 = s8[sIndex + 1]; 843 sIndex += 2; 844 if(c == 0xe2) { 845 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 846 } else if(t2 == 0xbe) { 847 return MERGE_WEIGHT; // U+FFFE 848 } else { 849 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 850 } 851 } 852 853 uint32_t 854 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, 855 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { 856 if(ce >= MIN_LONG || ce < CONTRACTION) { 857 return ce; // simple or special mini CE 858 } else if(ce >= EXPANSION) { 859 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 860 return ((uint32_t)table[index + 1] << 16) | table[index]; 861 } else /* ce >= CONTRACTION */ { 862 if(c == 0 && sLength < 0) { 863 sLength = sIndex - 1; 864 return EOS; 865 } 866 // Contraction list: Default mapping followed by 867 // 0 or more single-character contraction suffix mappings. 868 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 869 if(sIndex != sLength) { 870 // Read the next character. 871 int32_t c2; 872 int32_t nextIndex = sIndex; 873 if(s16 != NULL) { 874 c2 = s16[nextIndex++]; 875 if(c2 > LATIN_MAX) { 876 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 877 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 878 } else if(c2 == 0xfffe || c2 == 0xffff) { 879 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 880 } else { 881 return BAIL_OUT; 882 } 883 } 884 } else { 885 c2 = s8[nextIndex++]; 886 if(c2 > 0x7f) { 887 uint8_t t; 888 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && 889 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { 890 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F 891 ++nextIndex; 892 } else { 893 int32_t i2 = nextIndex + 1; 894 if(i2 < sLength || sLength < 0) { 895 if(c2 == 0xe2 && s8[nextIndex] == 0x80 && 896 0x80 <= (t = s8[i2]) && t <= 0xbf) { 897 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF 898 } else if(c2 == 0xef && s8[nextIndex] == 0xbf && 899 ((t = s8[i2]) == 0xbe || t == 0xbf)) { 900 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 901 } else { 902 return BAIL_OUT; 903 } 904 } else { 905 return BAIL_OUT; 906 } 907 nextIndex += 2; 908 } 909 } 910 } 911 if(c2 == 0 && sLength < 0) { 912 sLength = sIndex; 913 c2 = -1; 914 } 915 // Look for the next character in the contraction suffix list, 916 // which is in ascending order of single suffix characters. 917 int32_t i = index; 918 int32_t head = table[i]; // first skip the default mapping 919 int32_t x; 920 do { 921 i += head >> CONTR_LENGTH_SHIFT; 922 head = table[i]; 923 x = head & CONTR_CHAR_MASK; 924 } while(x < c2); 925 if(x == c2) { 926 index = i; 927 sIndex = nextIndex; 928 } 929 } 930 // Return the CE or CEs for the default or contraction mapping. 931 int32_t length = table[index] >> CONTR_LENGTH_SHIFT; 932 if(length == 1) { 933 return BAIL_OUT; 934 } 935 ce = table[index + 1]; 936 if(length == 2) { 937 return ce; 938 } else { 939 return ((uint32_t)table[index + 2] << 16) | ce; 940 } 941 } 942 } 943 944 uint32_t 945 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { 946 if(pair <= 0xffff) { 947 // one mini CE 948 if(pair >= MIN_SHORT) { 949 pair = getSecondariesFromOneShortCE(pair); 950 } else if(pair > variableTop) { 951 pair = COMMON_SEC_PLUS_OFFSET; 952 } else if(pair >= MIN_LONG) { 953 pair = 0; // variable 954 } 955 // else special mini CE 956 } else { 957 uint32_t ce = pair & 0xffff; 958 if(ce >= MIN_SHORT) { 959 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 960 } else if(ce > variableTop) { 961 pair = TWO_COMMON_SEC_PLUS_OFFSET; 962 } else { 963 U_ASSERT(ce >= MIN_LONG); 964 pair = 0; // variable 965 } 966 } 967 return pair; 968 } 969 970 uint32_t 971 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { 972 // Primary+caseLevel: Ignore case level weights of primary ignorables. 973 // Otherwise: Ignore case level weights of secondary ignorables. 974 // For details see the comments in the CollationCompare class. 975 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 976 if(pair <= 0xffff) { 977 // one mini CE 978 if(pair >= MIN_SHORT) { 979 // A high secondary weight means we really have two CEs, 980 // a primary CE and a secondary CE. 981 uint32_t ce = pair; 982 pair &= CASE_MASK; // explicit weight of primary CE 983 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 984 pair |= LOWER_CASE << 16; // implied weight of secondary CE 985 } 986 } else if(pair > variableTop) { 987 pair = LOWER_CASE; 988 } else if(pair >= MIN_LONG) { 989 pair = 0; // variable 990 } 991 // else special mini CE 992 } else { 993 // two mini CEs, same primary groups, neither expands like above 994 uint32_t ce = pair & 0xffff; 995 if(ce >= MIN_SHORT) { 996 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 997 pair &= CASE_MASK; 998 } else { 999 pair &= TWO_CASES_MASK; 1000 } 1001 } else if(ce > variableTop) { 1002 pair = TWO_LOWER_CASES; 1003 } else { 1004 U_ASSERT(ce >= MIN_LONG); 1005 pair = 0; // variable 1006 } 1007 } 1008 return pair; 1009 } 1010 1011 uint32_t 1012 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { 1013 if(pair <= 0xffff) { 1014 // one mini CE 1015 if(pair >= MIN_SHORT) { 1016 // A high secondary weight means we really have two CEs, 1017 // a primary CE and a secondary CE. 1018 uint32_t ce = pair; 1019 if(withCaseBits) { 1020 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 1021 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1022 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 1023 } 1024 } else { 1025 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1026 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1027 pair |= COMMON_TER_PLUS_OFFSET << 16; 1028 } 1029 } 1030 } else if(pair > variableTop) { 1031 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1032 if(withCaseBits) { 1033 pair |= LOWER_CASE; 1034 } 1035 } else if(pair >= MIN_LONG) { 1036 pair = 0; // variable 1037 } 1038 // else special mini CE 1039 } else { 1040 // two mini CEs, same primary groups, neither expands like above 1041 uint32_t ce = pair & 0xffff; 1042 if(ce >= MIN_SHORT) { 1043 if(withCaseBits) { 1044 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 1045 } else { 1046 pair &= TWO_TERTIARIES_MASK; 1047 } 1048 pair += TWO_TER_OFFSETS; 1049 } else if(ce > variableTop) { 1050 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 1051 if(withCaseBits) { 1052 pair |= TWO_LOWER_CASES; 1053 } 1054 } else { 1055 U_ASSERT(ce >= MIN_LONG); 1056 pair = 0; // variable 1057 } 1058 } 1059 return pair; 1060 } 1061 1062 uint32_t 1063 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { 1064 // Return the primary weight of a variable CE, 1065 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 1066 if(pair <= 0xffff) { 1067 // one mini CE 1068 if(pair >= MIN_SHORT) { 1069 // A high secondary weight means we really have two CEs, 1070 // a primary CE and a secondary CE. 1071 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1072 pair = TWO_SHORT_PRIMARIES_MASK; 1073 } else { 1074 pair = SHORT_PRIMARY_MASK; 1075 } 1076 } else if(pair > variableTop) { 1077 pair = SHORT_PRIMARY_MASK; 1078 } else if(pair >= MIN_LONG) { 1079 pair &= LONG_PRIMARY_MASK; // variable 1080 } 1081 // else special mini CE 1082 } else { 1083 // two mini CEs, same primary groups, neither expands like above 1084 uint32_t ce = pair & 0xffff; 1085 if(ce > variableTop) { 1086 pair = TWO_SHORT_PRIMARIES_MASK; 1087 } else { 1088 U_ASSERT(ce >= MIN_LONG); 1089 pair &= TWO_LONG_PRIMARIES_MASK; // variable 1090 } 1091 } 1092 return pair; 1093 } 1094 1095 U_NAMESPACE_END 1096 1097 #endif // !UCONFIG_NO_COLLATION 1098