1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationfastlatin.cpp 7 * 8 * created on: 2013aug18 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "collationdata.h" 18 #include "collationfastlatin.h" 19 #include "collationsettings.h" 20 #include "putilimp.h" // U_ALIGN_CODE 21 #include "uassert.h" 22 23 U_NAMESPACE_BEGIN 24 25 int32_t 26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, 27 uint16_t *primaries, int32_t capacity) { 28 const uint16_t *table = data->fastLatinTable; 29 if(table == NULL) { return -1; } 30 U_ASSERT(capacity == LATIN_LIMIT); 31 if(capacity != LATIN_LIMIT) { return -1; } 32 33 uint32_t miniVarTop; 34 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { 35 // No mini primaries are variable, set a variableTop just below the 36 // lowest long mini primary. 37 miniVarTop = MIN_LONG - 1; 38 } else { 39 int32_t headerLength = *table & 0xff; 40 int32_t i = 1 + settings.getMaxVariable(); 41 if(i >= headerLength) { 42 return -1; // variableTop >= digits, should not occur 43 } 44 miniVarTop = table[i]; 45 } 46 47 UBool digitsAreReordered = FALSE; 48 if(settings.hasReordering()) { 49 uint32_t prevStart = 0; 50 uint32_t beforeDigitStart = 0; 51 uint32_t digitStart = 0; 52 uint32_t afterDigitStart = 0; 53 for(int32_t group = UCOL_REORDER_CODE_FIRST; 54 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; 55 ++group) { 56 uint32_t start = data->getFirstPrimaryForGroup(group); 57 start = settings.reorder(start); 58 if(group == UCOL_REORDER_CODE_DIGIT) { 59 beforeDigitStart = prevStart; 60 digitStart = start; 61 } else if(start != 0) { 62 if(start < prevStart) { 63 // The permutation affects the groups up to Latin. 64 return -1; 65 } 66 // In the future, there might be a special group between digits & Latin. 67 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 68 afterDigitStart = start; 69 } 70 prevStart = start; 71 } 72 } 73 uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); 74 latinStart = settings.reorder(latinStart); 75 if(latinStart < prevStart) { 76 return -1; 77 } 78 if(afterDigitStart == 0) { 79 afterDigitStart = latinStart; 80 } 81 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 82 digitsAreReordered = TRUE; 83 } 84 } 85 86 table += (table[0] & 0xff); // skip the header 87 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { 88 uint32_t p = table[c]; 89 if(p >= MIN_SHORT) { 90 p &= SHORT_PRIMARY_MASK; 91 } else if(p > miniVarTop) { 92 p &= LONG_PRIMARY_MASK; 93 } else { 94 p = 0; 95 } 96 primaries[c] = (uint16_t)p; 97 } 98 if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { 99 // Bail out for digits. 100 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 101 } 102 103 // Shift the miniVarTop above other options. 104 return ((int32_t)miniVarTop << 16) | settings.options; 105 } 106 107 int32_t 108 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, 109 const UChar *left, int32_t leftLength, 110 const UChar *right, int32_t rightLength) { 111 // This is a modified copy of CollationCompare::compareUpToQuaternary(), 112 // optimized for common Latin text. 113 // Keep them in sync! 114 // Keep compareUTF16() and compareUTF8() in sync very closely! 115 116 U_ASSERT((table[0] >> 8) == VERSION); 117 table += (table[0] & 0xff); // skip the header 118 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() 119 options &= 0xffff; // needed for CollationSettings::getStrength() to work 120 121 // Check for supported characters, fetch mini CEs, and compare primaries. 122 U_ALIGN_CODE(16); 123 int32_t leftIndex = 0, rightIndex = 0; 124 /** 125 * Single mini CE or a pair. 126 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 127 * If there is only one, then it is in the lower bits, and the upper bits are 0. 128 */ 129 uint32_t leftPair = 0, rightPair = 0; 130 for(;;) { 131 // We fetch CEs until we get a non-ignorable primary or reach the end. 132 while(leftPair == 0) { 133 if(leftIndex == leftLength) { 134 leftPair = EOS; 135 break; 136 } 137 UChar32 c = left[leftIndex++]; 138 if(c <= LATIN_MAX) { 139 leftPair = primaries[c]; 140 if(leftPair != 0) { break; } 141 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 142 return BAIL_OUT_RESULT; 143 } 144 leftPair = table[c]; 145 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 146 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 147 } else { 148 leftPair = lookup(table, c); 149 } 150 if(leftPair >= MIN_SHORT) { 151 leftPair &= SHORT_PRIMARY_MASK; 152 break; 153 } else if(leftPair > variableTop) { 154 leftPair &= LONG_PRIMARY_MASK; 155 break; 156 } else { 157 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 158 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 159 leftPair = getPrimaries(variableTop, leftPair); 160 } 161 } 162 163 while(rightPair == 0) { 164 if(rightIndex == rightLength) { 165 rightPair = EOS; 166 break; 167 } 168 UChar32 c = right[rightIndex++]; 169 if(c <= LATIN_MAX) { 170 rightPair = primaries[c]; 171 if(rightPair != 0) { break; } 172 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 173 return BAIL_OUT_RESULT; 174 } 175 rightPair = table[c]; 176 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 177 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 178 } else { 179 rightPair = lookup(table, c); 180 } 181 if(rightPair >= MIN_SHORT) { 182 rightPair &= SHORT_PRIMARY_MASK; 183 break; 184 } else if(rightPair > variableTop) { 185 rightPair &= LONG_PRIMARY_MASK; 186 break; 187 } else { 188 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 189 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 190 rightPair = getPrimaries(variableTop, rightPair); 191 } 192 } 193 194 if(leftPair == rightPair) { 195 if(leftPair == EOS) { break; } 196 leftPair = rightPair = 0; 197 continue; 198 } 199 uint32_t leftPrimary = leftPair & 0xffff; 200 uint32_t rightPrimary = rightPair & 0xffff; 201 if(leftPrimary != rightPrimary) { 202 // Return the primary difference. 203 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 204 } 205 if(leftPair == EOS) { break; } 206 leftPair >>= 16; 207 rightPair >>= 16; 208 } 209 // In the following, we need to re-fetch each character because we did not buffer the CEs, 210 // but we know that the string is well-formed and 211 // only contains supported characters and mappings. 212 213 // We might skip the secondary level but continue with the case level 214 // which is turned on separately. 215 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 216 leftIndex = rightIndex = 0; 217 leftPair = rightPair = 0; 218 for(;;) { 219 while(leftPair == 0) { 220 if(leftIndex == leftLength) { 221 leftPair = EOS; 222 break; 223 } 224 UChar32 c = left[leftIndex++]; 225 if(c <= LATIN_MAX) { 226 leftPair = table[c]; 227 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 228 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 229 } else { 230 leftPair = lookup(table, c); 231 } 232 if(leftPair >= MIN_SHORT) { 233 leftPair = getSecondariesFromOneShortCE(leftPair); 234 break; 235 } else if(leftPair > variableTop) { 236 leftPair = COMMON_SEC_PLUS_OFFSET; 237 break; 238 } else { 239 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 240 leftPair = getSecondaries(variableTop, leftPair); 241 } 242 } 243 244 while(rightPair == 0) { 245 if(rightIndex == rightLength) { 246 rightPair = EOS; 247 break; 248 } 249 UChar32 c = right[rightIndex++]; 250 if(c <= LATIN_MAX) { 251 rightPair = table[c]; 252 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 253 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 254 } else { 255 rightPair = lookup(table, c); 256 } 257 if(rightPair >= MIN_SHORT) { 258 rightPair = getSecondariesFromOneShortCE(rightPair); 259 break; 260 } else if(rightPair > variableTop) { 261 rightPair = COMMON_SEC_PLUS_OFFSET; 262 break; 263 } else { 264 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 265 rightPair = getSecondaries(variableTop, rightPair); 266 } 267 } 268 269 if(leftPair == rightPair) { 270 if(leftPair == EOS) { break; } 271 leftPair = rightPair = 0; 272 continue; 273 } 274 uint32_t leftSecondary = leftPair & 0xffff; 275 uint32_t rightSecondary = rightPair & 0xffff; 276 if(leftSecondary != rightSecondary) { 277 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 278 // Full support for backwards secondary requires backwards contraction matching 279 // and moving backwards between merge separators. 280 return BAIL_OUT_RESULT; 281 } 282 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 283 } 284 if(leftPair == EOS) { break; } 285 leftPair >>= 16; 286 rightPair >>= 16; 287 } 288 } 289 290 if((options & CollationSettings::CASE_LEVEL) != 0) { 291 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 292 leftIndex = rightIndex = 0; 293 leftPair = rightPair = 0; 294 for(;;) { 295 while(leftPair == 0) { 296 if(leftIndex == leftLength) { 297 leftPair = EOS; 298 break; 299 } 300 UChar32 c = left[leftIndex++]; 301 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 302 if(leftPair < MIN_LONG) { 303 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 304 } 305 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 306 } 307 308 while(rightPair == 0) { 309 if(rightIndex == rightLength) { 310 rightPair = EOS; 311 break; 312 } 313 UChar32 c = right[rightIndex++]; 314 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 315 if(rightPair < MIN_LONG) { 316 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 317 } 318 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 319 } 320 321 if(leftPair == rightPair) { 322 if(leftPair == EOS) { break; } 323 leftPair = rightPair = 0; 324 continue; 325 } 326 uint32_t leftCase = leftPair & 0xffff; 327 uint32_t rightCase = rightPair & 0xffff; 328 if(leftCase != rightCase) { 329 if((options & CollationSettings::UPPER_FIRST) == 0) { 330 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 331 } else { 332 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 333 } 334 } 335 if(leftPair == EOS) { break; } 336 leftPair >>= 16; 337 rightPair >>= 16; 338 } 339 } 340 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 341 342 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 343 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 344 345 leftIndex = rightIndex = 0; 346 leftPair = rightPair = 0; 347 for(;;) { 348 while(leftPair == 0) { 349 if(leftIndex == leftLength) { 350 leftPair = EOS; 351 break; 352 } 353 UChar32 c = left[leftIndex++]; 354 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 355 if(leftPair < MIN_LONG) { 356 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 357 } 358 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 359 } 360 361 while(rightPair == 0) { 362 if(rightIndex == rightLength) { 363 rightPair = EOS; 364 break; 365 } 366 UChar32 c = right[rightIndex++]; 367 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 368 if(rightPair < MIN_LONG) { 369 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 370 } 371 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 372 } 373 374 if(leftPair == rightPair) { 375 if(leftPair == EOS) { break; } 376 leftPair = rightPair = 0; 377 continue; 378 } 379 uint32_t leftTertiary = leftPair & 0xffff; 380 uint32_t rightTertiary = rightPair & 0xffff; 381 if(leftTertiary != rightTertiary) { 382 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 383 // Pass through EOS and MERGE_WEIGHT 384 // and keep real tertiary weights larger than the MERGE_WEIGHT. 385 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 386 if(leftTertiary > MERGE_WEIGHT) { 387 leftTertiary ^= CASE_MASK; 388 } 389 if(rightTertiary > MERGE_WEIGHT) { 390 rightTertiary ^= CASE_MASK; 391 } 392 } 393 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 394 } 395 if(leftPair == EOS) { break; } 396 leftPair >>= 16; 397 rightPair >>= 16; 398 } 399 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 400 401 leftIndex = rightIndex = 0; 402 leftPair = rightPair = 0; 403 for(;;) { 404 while(leftPair == 0) { 405 if(leftIndex == leftLength) { 406 leftPair = EOS; 407 break; 408 } 409 UChar32 c = left[leftIndex++]; 410 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 411 if(leftPair < MIN_LONG) { 412 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 413 } 414 leftPair = getQuaternaries(variableTop, leftPair); 415 } 416 417 while(rightPair == 0) { 418 if(rightIndex == rightLength) { 419 rightPair = EOS; 420 break; 421 } 422 UChar32 c = right[rightIndex++]; 423 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 424 if(rightPair < MIN_LONG) { 425 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 426 } 427 rightPair = getQuaternaries(variableTop, rightPair); 428 } 429 430 if(leftPair == rightPair) { 431 if(leftPair == EOS) { break; } 432 leftPair = rightPair = 0; 433 continue; 434 } 435 uint32_t leftQuaternary = leftPair & 0xffff; 436 uint32_t rightQuaternary = rightPair & 0xffff; 437 if(leftQuaternary != rightQuaternary) { 438 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 439 } 440 if(leftPair == EOS) { break; } 441 leftPair >>= 16; 442 rightPair >>= 16; 443 } 444 return UCOL_EQUAL; 445 } 446 447 int32_t 448 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, 449 const uint8_t *left, int32_t leftLength, 450 const uint8_t *right, int32_t rightLength) { 451 // Keep compareUTF16() and compareUTF8() in sync very closely! 452 453 U_ASSERT((table[0] >> 8) == VERSION); 454 table += (table[0] & 0xff); // skip the header 455 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() 456 options &= 0xffff; // needed for CollationSettings::getStrength() to work 457 458 // Check for supported characters, fetch mini CEs, and compare primaries. 459 U_ALIGN_CODE(16); 460 int32_t leftIndex = 0, rightIndex = 0; 461 /** 462 * Single mini CE or a pair. 463 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 464 * If there is only one, then it is in the lower bits, and the upper bits are 0. 465 */ 466 uint32_t leftPair = 0, rightPair = 0; 467 // Note: There is no need to assemble the code point. 468 // We only need to look up the table entry for the character, 469 // and nextPair() looks for whether c==0. 470 for(;;) { 471 // We fetch CEs until we get a non-ignorable primary or reach the end. 472 while(leftPair == 0) { 473 if(leftIndex == leftLength) { 474 leftPair = EOS; 475 break; 476 } 477 UChar32 c = left[leftIndex++]; 478 uint8_t t; 479 if(c <= 0x7f) { 480 leftPair = primaries[c]; 481 if(leftPair != 0) { break; } 482 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 483 return BAIL_OUT_RESULT; 484 } 485 leftPair = table[c]; 486 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && 487 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { 488 ++leftIndex; 489 c = ((c - 0xc2) << 6) + t; 490 leftPair = primaries[c]; 491 if(leftPair != 0) { break; } 492 leftPair = table[c]; 493 } else { 494 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); 495 } 496 if(leftPair >= MIN_SHORT) { 497 leftPair &= SHORT_PRIMARY_MASK; 498 break; 499 } else if(leftPair > variableTop) { 500 leftPair &= LONG_PRIMARY_MASK; 501 break; 502 } else { 503 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 504 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 505 leftPair = getPrimaries(variableTop, leftPair); 506 } 507 } 508 509 while(rightPair == 0) { 510 if(rightIndex == rightLength) { 511 rightPair = EOS; 512 break; 513 } 514 UChar32 c = right[rightIndex++]; 515 uint8_t t; 516 if(c <= 0x7f) { 517 rightPair = primaries[c]; 518 if(rightPair != 0) { break; } 519 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 520 return BAIL_OUT_RESULT; 521 } 522 rightPair = table[c]; 523 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && 524 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { 525 ++rightIndex; 526 c = ((c - 0xc2) << 6) + t; 527 rightPair = primaries[c]; 528 if(rightPair != 0) { break; } 529 rightPair = table[c]; 530 } else { 531 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); 532 } 533 if(rightPair >= MIN_SHORT) { 534 rightPair &= SHORT_PRIMARY_MASK; 535 break; 536 } else if(rightPair > variableTop) { 537 rightPair &= LONG_PRIMARY_MASK; 538 break; 539 } else { 540 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 541 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 542 rightPair = getPrimaries(variableTop, rightPair); 543 } 544 } 545 546 if(leftPair == rightPair) { 547 if(leftPair == EOS) { break; } 548 leftPair = rightPair = 0; 549 continue; 550 } 551 uint32_t leftPrimary = leftPair & 0xffff; 552 uint32_t rightPrimary = rightPair & 0xffff; 553 if(leftPrimary != rightPrimary) { 554 // Return the primary difference. 555 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 556 } 557 if(leftPair == EOS) { break; } 558 leftPair >>= 16; 559 rightPair >>= 16; 560 } 561 // In the following, we need to re-fetch each character because we did not buffer the CEs, 562 // but we know that the string is well-formed and 563 // only contains supported characters and mappings. 564 565 // We might skip the secondary level but continue with the case level 566 // which is turned on separately. 567 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 568 leftIndex = rightIndex = 0; 569 leftPair = rightPair = 0; 570 for(;;) { 571 while(leftPair == 0) { 572 if(leftIndex == leftLength) { 573 leftPair = EOS; 574 break; 575 } 576 UChar32 c = left[leftIndex++]; 577 if(c <= 0x7f) { 578 leftPair = table[c]; 579 } else if(c <= LATIN_MAX_UTF8_LEAD) { 580 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; 581 } else { 582 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); 583 } 584 if(leftPair >= MIN_SHORT) { 585 leftPair = getSecondariesFromOneShortCE(leftPair); 586 break; 587 } else if(leftPair > variableTop) { 588 leftPair = COMMON_SEC_PLUS_OFFSET; 589 break; 590 } else { 591 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 592 leftPair = getSecondaries(variableTop, leftPair); 593 } 594 } 595 596 while(rightPair == 0) { 597 if(rightIndex == rightLength) { 598 rightPair = EOS; 599 break; 600 } 601 UChar32 c = right[rightIndex++]; 602 if(c <= 0x7f) { 603 rightPair = table[c]; 604 } else if(c <= LATIN_MAX_UTF8_LEAD) { 605 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; 606 } else { 607 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); 608 } 609 if(rightPair >= MIN_SHORT) { 610 rightPair = getSecondariesFromOneShortCE(rightPair); 611 break; 612 } else if(rightPair > variableTop) { 613 rightPair = COMMON_SEC_PLUS_OFFSET; 614 break; 615 } else { 616 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 617 rightPair = getSecondaries(variableTop, rightPair); 618 } 619 } 620 621 if(leftPair == rightPair) { 622 if(leftPair == EOS) { break; } 623 leftPair = rightPair = 0; 624 continue; 625 } 626 uint32_t leftSecondary = leftPair & 0xffff; 627 uint32_t rightSecondary = rightPair & 0xffff; 628 if(leftSecondary != rightSecondary) { 629 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 630 // Full support for backwards secondary requires backwards contraction matching 631 // and moving backwards between merge separators. 632 return BAIL_OUT_RESULT; 633 } 634 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 635 } 636 if(leftPair == EOS) { break; } 637 leftPair >>= 16; 638 rightPair >>= 16; 639 } 640 } 641 642 if((options & CollationSettings::CASE_LEVEL) != 0) { 643 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 644 leftIndex = rightIndex = 0; 645 leftPair = rightPair = 0; 646 for(;;) { 647 while(leftPair == 0) { 648 if(leftIndex == leftLength) { 649 leftPair = EOS; 650 break; 651 } 652 UChar32 c = left[leftIndex++]; 653 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 654 if(leftPair < MIN_LONG) { 655 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 656 } 657 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 658 } 659 660 while(rightPair == 0) { 661 if(rightIndex == rightLength) { 662 rightPair = EOS; 663 break; 664 } 665 UChar32 c = right[rightIndex++]; 666 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 667 if(rightPair < MIN_LONG) { 668 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 669 } 670 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 671 } 672 673 if(leftPair == rightPair) { 674 if(leftPair == EOS) { break; } 675 leftPair = rightPair = 0; 676 continue; 677 } 678 uint32_t leftCase = leftPair & 0xffff; 679 uint32_t rightCase = rightPair & 0xffff; 680 if(leftCase != rightCase) { 681 if((options & CollationSettings::UPPER_FIRST) == 0) { 682 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 683 } else { 684 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 685 } 686 } 687 if(leftPair == EOS) { break; } 688 leftPair >>= 16; 689 rightPair >>= 16; 690 } 691 } 692 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 693 694 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 695 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 696 697 leftIndex = rightIndex = 0; 698 leftPair = rightPair = 0; 699 for(;;) { 700 while(leftPair == 0) { 701 if(leftIndex == leftLength) { 702 leftPair = EOS; 703 break; 704 } 705 UChar32 c = left[leftIndex++]; 706 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 707 if(leftPair < MIN_LONG) { 708 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 709 } 710 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 711 } 712 713 while(rightPair == 0) { 714 if(rightIndex == rightLength) { 715 rightPair = EOS; 716 break; 717 } 718 UChar32 c = right[rightIndex++]; 719 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 720 if(rightPair < MIN_LONG) { 721 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 722 } 723 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 724 } 725 726 if(leftPair == rightPair) { 727 if(leftPair == EOS) { break; } 728 leftPair = rightPair = 0; 729 continue; 730 } 731 uint32_t leftTertiary = leftPair & 0xffff; 732 uint32_t rightTertiary = rightPair & 0xffff; 733 if(leftTertiary != rightTertiary) { 734 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 735 // Pass through EOS and MERGE_WEIGHT 736 // and keep real tertiary weights larger than the MERGE_WEIGHT. 737 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 738 if(leftTertiary > MERGE_WEIGHT) { 739 leftTertiary ^= CASE_MASK; 740 } 741 if(rightTertiary > MERGE_WEIGHT) { 742 rightTertiary ^= CASE_MASK; 743 } 744 } 745 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 746 } 747 if(leftPair == EOS) { break; } 748 leftPair >>= 16; 749 rightPair >>= 16; 750 } 751 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 752 753 leftIndex = rightIndex = 0; 754 leftPair = rightPair = 0; 755 for(;;) { 756 while(leftPair == 0) { 757 if(leftIndex == leftLength) { 758 leftPair = EOS; 759 break; 760 } 761 UChar32 c = left[leftIndex++]; 762 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 763 if(leftPair < MIN_LONG) { 764 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 765 } 766 leftPair = getQuaternaries(variableTop, leftPair); 767 } 768 769 while(rightPair == 0) { 770 if(rightIndex == rightLength) { 771 rightPair = EOS; 772 break; 773 } 774 UChar32 c = right[rightIndex++]; 775 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 776 if(rightPair < MIN_LONG) { 777 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 778 } 779 rightPair = getQuaternaries(variableTop, rightPair); 780 } 781 782 if(leftPair == rightPair) { 783 if(leftPair == EOS) { break; } 784 leftPair = rightPair = 0; 785 continue; 786 } 787 uint32_t leftQuaternary = leftPair & 0xffff; 788 uint32_t rightQuaternary = rightPair & 0xffff; 789 if(leftQuaternary != rightQuaternary) { 790 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 791 } 792 if(leftPair == EOS) { break; } 793 leftPair >>= 16; 794 rightPair >>= 16; 795 } 796 return UCOL_EQUAL; 797 } 798 799 uint32_t 800 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { 801 U_ASSERT(c > LATIN_MAX); 802 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 803 return table[c - PUNCT_START + LATIN_LIMIT]; 804 } else if(c == 0xfffe) { 805 return MERGE_WEIGHT; 806 } else if(c == 0xffff) { 807 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 808 } else { 809 return BAIL_OUT; 810 } 811 } 812 813 uint32_t 814 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, 815 const uint8_t *s8, int32_t &sIndex, int32_t sLength) { 816 // The caller handled ASCII and valid/supported Latin. 817 U_ASSERT(c > 0x7f); 818 int32_t i2 = sIndex + 1; 819 if(i2 < sLength || sLength < 0) { 820 uint8_t t1 = s8[sIndex]; 821 uint8_t t2 = s8[i2]; 822 sIndex += 2; 823 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { 824 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 825 } else if(c == 0xef && t1 == 0xbf) { 826 if(t2 == 0xbe) { 827 return MERGE_WEIGHT; // U+FFFE 828 } else if(t2 == 0xbf) { 829 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 830 } 831 } 832 } 833 return BAIL_OUT; 834 } 835 836 uint32_t 837 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, 838 const uint8_t *s8, int32_t &sIndex) { 839 // The caller handled ASCII. 840 // The string is well-formed and contains only supported characters. 841 U_ASSERT(c > 0x7f); 842 if(c <= LATIN_MAX_UTF8_LEAD) { 843 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F 844 } 845 uint8_t t2 = s8[sIndex + 1]; 846 sIndex += 2; 847 if(c == 0xe2) { 848 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 849 } else if(t2 == 0xbe) { 850 return MERGE_WEIGHT; // U+FFFE 851 } else { 852 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 853 } 854 } 855 856 uint32_t 857 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, 858 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { 859 if(ce >= MIN_LONG || ce < CONTRACTION) { 860 return ce; // simple or special mini CE 861 } else if(ce >= EXPANSION) { 862 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 863 return ((uint32_t)table[index + 1] << 16) | table[index]; 864 } else /* ce >= CONTRACTION */ { 865 if(c == 0 && sLength < 0) { 866 sLength = sIndex - 1; 867 return EOS; 868 } 869 // Contraction list: Default mapping followed by 870 // 0 or more single-character contraction suffix mappings. 871 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 872 if(sIndex != sLength) { 873 // Read the next character. 874 int32_t c2; 875 int32_t nextIndex = sIndex; 876 if(s16 != NULL) { 877 c2 = s16[nextIndex++]; 878 if(c2 > LATIN_MAX) { 879 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 880 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 881 } else if(c2 == 0xfffe || c2 == 0xffff) { 882 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 883 } else { 884 return BAIL_OUT; 885 } 886 } 887 } else { 888 c2 = s8[nextIndex++]; 889 if(c2 > 0x7f) { 890 uint8_t t; 891 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && 892 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { 893 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F 894 ++nextIndex; 895 } else { 896 int32_t i2 = nextIndex + 1; 897 if(i2 < sLength || sLength < 0) { 898 if(c2 == 0xe2 && s8[nextIndex] == 0x80 && 899 0x80 <= (t = s8[i2]) && t <= 0xbf) { 900 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF 901 } else if(c2 == 0xef && s8[nextIndex] == 0xbf && 902 ((t = s8[i2]) == 0xbe || t == 0xbf)) { 903 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 904 } else { 905 return BAIL_OUT; 906 } 907 } else { 908 return BAIL_OUT; 909 } 910 nextIndex += 2; 911 } 912 } 913 } 914 if(c2 == 0 && sLength < 0) { 915 sLength = sIndex; 916 c2 = -1; 917 } 918 // Look for the next character in the contraction suffix list, 919 // which is in ascending order of single suffix characters. 920 int32_t i = index; 921 int32_t head = table[i]; // first skip the default mapping 922 int32_t x; 923 do { 924 i += head >> CONTR_LENGTH_SHIFT; 925 head = table[i]; 926 x = head & CONTR_CHAR_MASK; 927 } while(x < c2); 928 if(x == c2) { 929 index = i; 930 sIndex = nextIndex; 931 } 932 } 933 // Return the CE or CEs for the default or contraction mapping. 934 int32_t length = table[index] >> CONTR_LENGTH_SHIFT; 935 if(length == 1) { 936 return BAIL_OUT; 937 } 938 ce = table[index + 1]; 939 if(length == 2) { 940 return ce; 941 } else { 942 return ((uint32_t)table[index + 2] << 16) | ce; 943 } 944 } 945 } 946 947 uint32_t 948 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { 949 if(pair <= 0xffff) { 950 // one mini CE 951 if(pair >= MIN_SHORT) { 952 pair = getSecondariesFromOneShortCE(pair); 953 } else if(pair > variableTop) { 954 pair = COMMON_SEC_PLUS_OFFSET; 955 } else if(pair >= MIN_LONG) { 956 pair = 0; // variable 957 } 958 // else special mini CE 959 } else { 960 uint32_t ce = pair & 0xffff; 961 if(ce >= MIN_SHORT) { 962 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 963 } else if(ce > variableTop) { 964 pair = TWO_COMMON_SEC_PLUS_OFFSET; 965 } else { 966 U_ASSERT(ce >= MIN_LONG); 967 pair = 0; // variable 968 } 969 } 970 return pair; 971 } 972 973 uint32_t 974 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { 975 // Primary+caseLevel: Ignore case level weights of primary ignorables. 976 // Otherwise: Ignore case level weights of secondary ignorables. 977 // For details see the comments in the CollationCompare class. 978 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 979 if(pair <= 0xffff) { 980 // one mini CE 981 if(pair >= MIN_SHORT) { 982 // A high secondary weight means we really have two CEs, 983 // a primary CE and a secondary CE. 984 uint32_t ce = pair; 985 pair &= CASE_MASK; // explicit weight of primary CE 986 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 987 pair |= LOWER_CASE << 16; // implied weight of secondary CE 988 } 989 } else if(pair > variableTop) { 990 pair = LOWER_CASE; 991 } else if(pair >= MIN_LONG) { 992 pair = 0; // variable 993 } 994 // else special mini CE 995 } else { 996 // two mini CEs, same primary groups, neither expands like above 997 uint32_t ce = pair & 0xffff; 998 if(ce >= MIN_SHORT) { 999 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 1000 pair &= CASE_MASK; 1001 } else { 1002 pair &= TWO_CASES_MASK; 1003 } 1004 } else if(ce > variableTop) { 1005 pair = TWO_LOWER_CASES; 1006 } else { 1007 U_ASSERT(ce >= MIN_LONG); 1008 pair = 0; // variable 1009 } 1010 } 1011 return pair; 1012 } 1013 1014 uint32_t 1015 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { 1016 if(pair <= 0xffff) { 1017 // one mini CE 1018 if(pair >= MIN_SHORT) { 1019 // A high secondary weight means we really have two CEs, 1020 // a primary CE and a secondary CE. 1021 uint32_t ce = pair; 1022 if(withCaseBits) { 1023 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 1024 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1025 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 1026 } 1027 } else { 1028 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1029 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1030 pair |= COMMON_TER_PLUS_OFFSET << 16; 1031 } 1032 } 1033 } else if(pair > variableTop) { 1034 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1035 if(withCaseBits) { 1036 pair |= LOWER_CASE; 1037 } 1038 } else if(pair >= MIN_LONG) { 1039 pair = 0; // variable 1040 } 1041 // else special mini CE 1042 } else { 1043 // two mini CEs, same primary groups, neither expands like above 1044 uint32_t ce = pair & 0xffff; 1045 if(ce >= MIN_SHORT) { 1046 if(withCaseBits) { 1047 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 1048 } else { 1049 pair &= TWO_TERTIARIES_MASK; 1050 } 1051 pair += TWO_TER_OFFSETS; 1052 } else if(ce > variableTop) { 1053 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 1054 if(withCaseBits) { 1055 pair |= TWO_LOWER_CASES; 1056 } 1057 } else { 1058 U_ASSERT(ce >= MIN_LONG); 1059 pair = 0; // variable 1060 } 1061 } 1062 return pair; 1063 } 1064 1065 uint32_t 1066 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { 1067 // Return the primary weight of a variable CE, 1068 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 1069 if(pair <= 0xffff) { 1070 // one mini CE 1071 if(pair >= MIN_SHORT) { 1072 // A high secondary weight means we really have two CEs, 1073 // a primary CE and a secondary CE. 1074 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1075 pair = TWO_SHORT_PRIMARIES_MASK; 1076 } else { 1077 pair = SHORT_PRIMARY_MASK; 1078 } 1079 } else if(pair > variableTop) { 1080 pair = SHORT_PRIMARY_MASK; 1081 } else if(pair >= MIN_LONG) { 1082 pair &= LONG_PRIMARY_MASK; // variable 1083 } 1084 // else special mini CE 1085 } else { 1086 // two mini CEs, same primary groups, neither expands like above 1087 uint32_t ce = pair & 0xffff; 1088 if(ce > variableTop) { 1089 pair = TWO_SHORT_PRIMARIES_MASK; 1090 } else { 1091 U_ASSERT(ce >= MIN_LONG); 1092 pair &= TWO_LONG_PRIMARIES_MASK; // variable 1093 } 1094 } 1095 return pair; 1096 } 1097 1098 U_NAMESPACE_END 1099 1100 #endif // !UCONFIG_NO_COLLATION 1101