1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationfastlatin.cpp 7 * 8 * created on: 2013aug18 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "collationdata.h" 18 #include "collationfastlatin.h" 19 #include "collationsettings.h" 20 #include "putilimp.h" // U_ALIGN_CODE 21 #include "uassert.h" 22 23 U_NAMESPACE_BEGIN 24 25 int32_t 26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, 27 uint16_t *primaries, int32_t capacity) { 28 const uint16_t *table = data->fastLatinTable; 29 if(table == NULL) { return -1; } 30 U_ASSERT(capacity == LATIN_LIMIT); 31 if(capacity != LATIN_LIMIT) { return -1; } 32 33 uint32_t miniVarTop; 34 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { 35 // No mini primaries are variable, set a variableTop just below the 36 // lowest long mini primary. 37 miniVarTop = MIN_LONG - 1; 38 } else { 39 uint32_t v1 = settings.variableTop >> 24; 40 int32_t headerLength = *table & 0xff; 41 int32_t i = headerLength - 1; 42 if(i <= 0 || v1 > (table[i] & 0x7f)) { 43 return -1; // variableTop >= digits, should not occur 44 } 45 while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; } 46 // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied. 47 // Shift right to make it comparable with long mini primaries in bits 15..3. 48 miniVarTop = (table[i] & 0xff80) >> 4; 49 } 50 51 const uint8_t *reorderTable = settings.reorderTable; 52 if(reorderTable != NULL) { 53 const uint16_t *scripts = data->scripts; 54 int32_t length = data->scriptsLength; 55 uint32_t prevLastByte = 0; 56 for(int32_t i = 0; i < length;) { 57 // reordered last byte of the group 58 uint32_t lastByte = reorderTable[scripts[i] & 0xff]; 59 if(lastByte < prevLastByte) { 60 // The permutation affects the groups up to Latin. 61 return -1; 62 } 63 if(scripts[i + 2] == USCRIPT_LATIN) { break; } 64 i = i + 2 + scripts[i + 1]; 65 prevLastByte = lastByte; 66 } 67 } 68 69 table += (table[0] & 0xff); // skip the header 70 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { 71 uint32_t p = table[c]; 72 if(p >= MIN_SHORT) { 73 p &= SHORT_PRIMARY_MASK; 74 } else if(p > miniVarTop) { 75 p &= LONG_PRIMARY_MASK; 76 } else { 77 p = 0; 78 } 79 primaries[c] = (uint16_t)p; 80 } 81 if((settings.options & CollationSettings::NUMERIC) != 0) { 82 // Bail out for digits. 83 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 84 } 85 86 // Shift the miniVarTop above other options. 87 return ((int32_t)miniVarTop << 16) | settings.options; 88 } 89 90 int32_t 91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, 92 const UChar *left, int32_t leftLength, 93 const UChar *right, int32_t rightLength) { 94 // This is a modified copy of CollationCompare::compareUpToQuaternary(), 95 // optimized for common Latin text. 96 // Keep them in sync! 97 // Keep compareUTF16() and compareUTF8() in sync very closely! 98 99 U_ASSERT((table[0] >> 8) == VERSION); 100 table += (table[0] & 0xff); // skip the header 101 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() 102 options &= 0xffff; // needed for CollationSettings::getStrength() to work 103 104 // Check for supported characters, fetch mini CEs, and compare primaries. 105 U_ALIGN_CODE(16); 106 int32_t leftIndex = 0, rightIndex = 0; 107 /** 108 * Single mini CE or a pair. 109 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 110 * If there is only one, then it is in the lower bits, and the upper bits are 0. 111 */ 112 uint32_t leftPair = 0, rightPair = 0; 113 for(;;) { 114 // We fetch CEs until we get a non-ignorable primary or reach the end. 115 while(leftPair == 0) { 116 if(leftIndex == leftLength) { 117 leftPair = EOS; 118 break; 119 } 120 UChar32 c = left[leftIndex++]; 121 if(c <= LATIN_MAX) { 122 leftPair = primaries[c]; 123 if(leftPair != 0) { break; } 124 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 125 return BAIL_OUT_RESULT; 126 } 127 leftPair = table[c]; 128 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 129 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 130 } else { 131 leftPair = lookup(table, c); 132 } 133 if(leftPair >= MIN_SHORT) { 134 leftPair &= SHORT_PRIMARY_MASK; 135 break; 136 } else if(leftPair > variableTop) { 137 leftPair &= LONG_PRIMARY_MASK; 138 break; 139 } else { 140 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 141 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 142 leftPair = getPrimaries(variableTop, leftPair); 143 } 144 } 145 146 while(rightPair == 0) { 147 if(rightIndex == rightLength) { 148 rightPair = EOS; 149 break; 150 } 151 UChar32 c = right[rightIndex++]; 152 if(c <= LATIN_MAX) { 153 rightPair = primaries[c]; 154 if(rightPair != 0) { break; } 155 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 156 return BAIL_OUT_RESULT; 157 } 158 rightPair = table[c]; 159 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 160 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 161 } else { 162 rightPair = lookup(table, c); 163 } 164 if(rightPair >= MIN_SHORT) { 165 rightPair &= SHORT_PRIMARY_MASK; 166 break; 167 } else if(rightPair > variableTop) { 168 rightPair &= LONG_PRIMARY_MASK; 169 break; 170 } else { 171 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 172 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 173 rightPair = getPrimaries(variableTop, rightPair); 174 } 175 } 176 177 if(leftPair == rightPair) { 178 if(leftPair == EOS) { break; } 179 leftPair = rightPair = 0; 180 continue; 181 } 182 uint32_t leftPrimary = leftPair & 0xffff; 183 uint32_t rightPrimary = rightPair & 0xffff; 184 if(leftPrimary != rightPrimary) { 185 // Return the primary difference. 186 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 187 } 188 if(leftPair == EOS) { break; } 189 leftPair >>= 16; 190 rightPair >>= 16; 191 } 192 // In the following, we need to re-fetch each character because we did not buffer the CEs, 193 // but we know that the string is well-formed and 194 // only contains supported characters and mappings. 195 196 // We might skip the secondary level but continue with the case level 197 // which is turned on separately. 198 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 199 leftIndex = rightIndex = 0; 200 leftPair = rightPair = 0; 201 for(;;) { 202 while(leftPair == 0) { 203 if(leftIndex == leftLength) { 204 leftPair = EOS; 205 break; 206 } 207 UChar32 c = left[leftIndex++]; 208 if(c <= LATIN_MAX) { 209 leftPair = table[c]; 210 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 211 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 212 } else { 213 leftPair = lookup(table, c); 214 } 215 if(leftPair >= MIN_SHORT) { 216 leftPair = getSecondariesFromOneShortCE(leftPair); 217 break; 218 } else if(leftPair > variableTop) { 219 leftPair = COMMON_SEC_PLUS_OFFSET; 220 break; 221 } else { 222 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 223 leftPair = getSecondaries(variableTop, leftPair); 224 } 225 } 226 227 while(rightPair == 0) { 228 if(rightIndex == rightLength) { 229 rightPair = EOS; 230 break; 231 } 232 UChar32 c = right[rightIndex++]; 233 if(c <= LATIN_MAX) { 234 rightPair = table[c]; 235 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 236 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 237 } else { 238 rightPair = lookup(table, c); 239 } 240 if(rightPair >= MIN_SHORT) { 241 rightPair = getSecondariesFromOneShortCE(rightPair); 242 break; 243 } else if(rightPair > variableTop) { 244 rightPair = COMMON_SEC_PLUS_OFFSET; 245 break; 246 } else { 247 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 248 rightPair = getSecondaries(variableTop, rightPair); 249 } 250 } 251 252 if(leftPair == rightPair) { 253 if(leftPair == EOS) { break; } 254 leftPair = rightPair = 0; 255 continue; 256 } 257 uint32_t leftSecondary = leftPair & 0xffff; 258 uint32_t rightSecondary = rightPair & 0xffff; 259 if(leftSecondary != rightSecondary) { 260 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 261 // Full support for backwards secondary requires backwards contraction matching 262 // and moving backwards between merge separators. 263 return BAIL_OUT_RESULT; 264 } 265 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 266 } 267 if(leftPair == EOS) { break; } 268 leftPair >>= 16; 269 rightPair >>= 16; 270 } 271 } 272 273 if((options & CollationSettings::CASE_LEVEL) != 0) { 274 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 275 leftIndex = rightIndex = 0; 276 leftPair = rightPair = 0; 277 for(;;) { 278 while(leftPair == 0) { 279 if(leftIndex == leftLength) { 280 leftPair = EOS; 281 break; 282 } 283 UChar32 c = left[leftIndex++]; 284 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 285 if(leftPair < MIN_LONG) { 286 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 287 } 288 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 289 } 290 291 while(rightPair == 0) { 292 if(rightIndex == rightLength) { 293 rightPair = EOS; 294 break; 295 } 296 UChar32 c = right[rightIndex++]; 297 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 298 if(rightPair < MIN_LONG) { 299 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 300 } 301 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 302 } 303 304 if(leftPair == rightPair) { 305 if(leftPair == EOS) { break; } 306 leftPair = rightPair = 0; 307 continue; 308 } 309 uint32_t leftCase = leftPair & 0xffff; 310 uint32_t rightCase = rightPair & 0xffff; 311 if(leftCase != rightCase) { 312 if((options & CollationSettings::UPPER_FIRST) == 0) { 313 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 314 } else { 315 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 316 } 317 } 318 if(leftPair == EOS) { break; } 319 leftPair >>= 16; 320 rightPair >>= 16; 321 } 322 } 323 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 324 325 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 326 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 327 328 leftIndex = rightIndex = 0; 329 leftPair = rightPair = 0; 330 for(;;) { 331 while(leftPair == 0) { 332 if(leftIndex == leftLength) { 333 leftPair = EOS; 334 break; 335 } 336 UChar32 c = left[leftIndex++]; 337 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 338 if(leftPair < MIN_LONG) { 339 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 340 } 341 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 342 } 343 344 while(rightPair == 0) { 345 if(rightIndex == rightLength) { 346 rightPair = EOS; 347 break; 348 } 349 UChar32 c = right[rightIndex++]; 350 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 351 if(rightPair < MIN_LONG) { 352 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 353 } 354 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 355 } 356 357 if(leftPair == rightPair) { 358 if(leftPair == EOS) { break; } 359 leftPair = rightPair = 0; 360 continue; 361 } 362 uint32_t leftTertiary = leftPair & 0xffff; 363 uint32_t rightTertiary = rightPair & 0xffff; 364 if(leftTertiary != rightTertiary) { 365 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 366 // Pass through EOS and MERGE_WEIGHT 367 // and keep real tertiary weights larger than the MERGE_WEIGHT. 368 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 369 if(leftTertiary > MERGE_WEIGHT) { 370 leftTertiary ^= CASE_MASK; 371 } 372 if(rightTertiary > MERGE_WEIGHT) { 373 rightTertiary ^= CASE_MASK; 374 } 375 } 376 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 377 } 378 if(leftPair == EOS) { break; } 379 leftPair >>= 16; 380 rightPair >>= 16; 381 } 382 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 383 384 leftIndex = rightIndex = 0; 385 leftPair = rightPair = 0; 386 for(;;) { 387 while(leftPair == 0) { 388 if(leftIndex == leftLength) { 389 leftPair = EOS; 390 break; 391 } 392 UChar32 c = left[leftIndex++]; 393 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 394 if(leftPair < MIN_LONG) { 395 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 396 } 397 leftPair = getQuaternaries(variableTop, leftPair); 398 } 399 400 while(rightPair == 0) { 401 if(rightIndex == rightLength) { 402 rightPair = EOS; 403 break; 404 } 405 UChar32 c = right[rightIndex++]; 406 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 407 if(rightPair < MIN_LONG) { 408 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 409 } 410 rightPair = getQuaternaries(variableTop, rightPair); 411 } 412 413 if(leftPair == rightPair) { 414 if(leftPair == EOS) { break; } 415 leftPair = rightPair = 0; 416 continue; 417 } 418 uint32_t leftQuaternary = leftPair & 0xffff; 419 uint32_t rightQuaternary = rightPair & 0xffff; 420 if(leftQuaternary != rightQuaternary) { 421 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 422 } 423 if(leftPair == EOS) { break; } 424 leftPair >>= 16; 425 rightPair >>= 16; 426 } 427 return UCOL_EQUAL; 428 } 429 430 int32_t 431 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, 432 const uint8_t *left, int32_t leftLength, 433 const uint8_t *right, int32_t rightLength) { 434 // Keep compareUTF16() and compareUTF8() in sync very closely! 435 436 U_ASSERT((table[0] >> 8) == VERSION); 437 table += (table[0] & 0xff); // skip the header 438 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() 439 options &= 0xffff; // needed for CollationSettings::getStrength() to work 440 441 // Check for supported characters, fetch mini CEs, and compare primaries. 442 U_ALIGN_CODE(16); 443 int32_t leftIndex = 0, rightIndex = 0; 444 /** 445 * Single mini CE or a pair. 446 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 447 * If there is only one, then it is in the lower bits, and the upper bits are 0. 448 */ 449 uint32_t leftPair = 0, rightPair = 0; 450 // Note: There is no need to assemble the code point. 451 // We only need to look up the table entry for the character, 452 // and nextPair() looks for whether c==0. 453 for(;;) { 454 // We fetch CEs until we get a non-ignorable primary or reach the end. 455 while(leftPair == 0) { 456 if(leftIndex == leftLength) { 457 leftPair = EOS; 458 break; 459 } 460 UChar32 c = left[leftIndex++]; 461 uint8_t t; 462 if(c <= 0x7f) { 463 leftPair = primaries[c]; 464 if(leftPair != 0) { break; } 465 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 466 return BAIL_OUT_RESULT; 467 } 468 leftPair = table[c]; 469 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && 470 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { 471 ++leftIndex; 472 c = ((c - 0xc2) << 6) + t; 473 leftPair = primaries[c]; 474 if(leftPair != 0) { break; } 475 leftPair = table[c]; 476 } else { 477 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); 478 } 479 if(leftPair >= MIN_SHORT) { 480 leftPair &= SHORT_PRIMARY_MASK; 481 break; 482 } else if(leftPair > variableTop) { 483 leftPair &= LONG_PRIMARY_MASK; 484 break; 485 } else { 486 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 487 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 488 leftPair = getPrimaries(variableTop, leftPair); 489 } 490 } 491 492 while(rightPair == 0) { 493 if(rightIndex == rightLength) { 494 rightPair = EOS; 495 break; 496 } 497 UChar32 c = right[rightIndex++]; 498 uint8_t t; 499 if(c <= 0x7f) { 500 rightPair = primaries[c]; 501 if(rightPair != 0) { break; } 502 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 503 return BAIL_OUT_RESULT; 504 } 505 rightPair = table[c]; 506 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && 507 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { 508 ++rightIndex; 509 c = ((c - 0xc2) << 6) + t; 510 rightPair = primaries[c]; 511 if(rightPair != 0) { break; } 512 rightPair = table[c]; 513 } else { 514 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); 515 } 516 if(rightPair >= MIN_SHORT) { 517 rightPair &= SHORT_PRIMARY_MASK; 518 break; 519 } else if(rightPair > variableTop) { 520 rightPair &= LONG_PRIMARY_MASK; 521 break; 522 } else { 523 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 524 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 525 rightPair = getPrimaries(variableTop, rightPair); 526 } 527 } 528 529 if(leftPair == rightPair) { 530 if(leftPair == EOS) { break; } 531 leftPair = rightPair = 0; 532 continue; 533 } 534 uint32_t leftPrimary = leftPair & 0xffff; 535 uint32_t rightPrimary = rightPair & 0xffff; 536 if(leftPrimary != rightPrimary) { 537 // Return the primary difference. 538 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 539 } 540 if(leftPair == EOS) { break; } 541 leftPair >>= 16; 542 rightPair >>= 16; 543 } 544 // In the following, we need to re-fetch each character because we did not buffer the CEs, 545 // but we know that the string is well-formed and 546 // only contains supported characters and mappings. 547 548 // We might skip the secondary level but continue with the case level 549 // which is turned on separately. 550 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 551 leftIndex = rightIndex = 0; 552 leftPair = rightPair = 0; 553 for(;;) { 554 while(leftPair == 0) { 555 if(leftIndex == leftLength) { 556 leftPair = EOS; 557 break; 558 } 559 UChar32 c = left[leftIndex++]; 560 if(c <= 0x7f) { 561 leftPair = table[c]; 562 } else if(c <= LATIN_MAX_UTF8_LEAD) { 563 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; 564 } else { 565 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); 566 } 567 if(leftPair >= MIN_SHORT) { 568 leftPair = getSecondariesFromOneShortCE(leftPair); 569 break; 570 } else if(leftPair > variableTop) { 571 leftPair = COMMON_SEC_PLUS_OFFSET; 572 break; 573 } else { 574 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 575 leftPair = getSecondaries(variableTop, leftPair); 576 } 577 } 578 579 while(rightPair == 0) { 580 if(rightIndex == rightLength) { 581 rightPair = EOS; 582 break; 583 } 584 UChar32 c = right[rightIndex++]; 585 if(c <= 0x7f) { 586 rightPair = table[c]; 587 } else if(c <= LATIN_MAX_UTF8_LEAD) { 588 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; 589 } else { 590 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); 591 } 592 if(rightPair >= MIN_SHORT) { 593 rightPair = getSecondariesFromOneShortCE(rightPair); 594 break; 595 } else if(rightPair > variableTop) { 596 rightPair = COMMON_SEC_PLUS_OFFSET; 597 break; 598 } else { 599 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 600 rightPair = getSecondaries(variableTop, rightPair); 601 } 602 } 603 604 if(leftPair == rightPair) { 605 if(leftPair == EOS) { break; } 606 leftPair = rightPair = 0; 607 continue; 608 } 609 uint32_t leftSecondary = leftPair & 0xffff; 610 uint32_t rightSecondary = rightPair & 0xffff; 611 if(leftSecondary != rightSecondary) { 612 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 613 // Full support for backwards secondary requires backwards contraction matching 614 // and moving backwards between merge separators. 615 return BAIL_OUT_RESULT; 616 } 617 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 618 } 619 if(leftPair == EOS) { break; } 620 leftPair >>= 16; 621 rightPair >>= 16; 622 } 623 } 624 625 if((options & CollationSettings::CASE_LEVEL) != 0) { 626 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 627 leftIndex = rightIndex = 0; 628 leftPair = rightPair = 0; 629 for(;;) { 630 while(leftPair == 0) { 631 if(leftIndex == leftLength) { 632 leftPair = EOS; 633 break; 634 } 635 UChar32 c = left[leftIndex++]; 636 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 637 if(leftPair < MIN_LONG) { 638 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 639 } 640 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 641 } 642 643 while(rightPair == 0) { 644 if(rightIndex == rightLength) { 645 rightPair = EOS; 646 break; 647 } 648 UChar32 c = right[rightIndex++]; 649 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 650 if(rightPair < MIN_LONG) { 651 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 652 } 653 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 654 } 655 656 if(leftPair == rightPair) { 657 if(leftPair == EOS) { break; } 658 leftPair = rightPair = 0; 659 continue; 660 } 661 uint32_t leftCase = leftPair & 0xffff; 662 uint32_t rightCase = rightPair & 0xffff; 663 if(leftCase != rightCase) { 664 if((options & CollationSettings::UPPER_FIRST) == 0) { 665 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 666 } else { 667 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 668 } 669 } 670 if(leftPair == EOS) { break; } 671 leftPair >>= 16; 672 rightPair >>= 16; 673 } 674 } 675 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 676 677 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 678 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 679 680 leftIndex = rightIndex = 0; 681 leftPair = rightPair = 0; 682 for(;;) { 683 while(leftPair == 0) { 684 if(leftIndex == leftLength) { 685 leftPair = EOS; 686 break; 687 } 688 UChar32 c = left[leftIndex++]; 689 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 690 if(leftPair < MIN_LONG) { 691 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 692 } 693 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 694 } 695 696 while(rightPair == 0) { 697 if(rightIndex == rightLength) { 698 rightPair = EOS; 699 break; 700 } 701 UChar32 c = right[rightIndex++]; 702 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 703 if(rightPair < MIN_LONG) { 704 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 705 } 706 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 707 } 708 709 if(leftPair == rightPair) { 710 if(leftPair == EOS) { break; } 711 leftPair = rightPair = 0; 712 continue; 713 } 714 uint32_t leftTertiary = leftPair & 0xffff; 715 uint32_t rightTertiary = rightPair & 0xffff; 716 if(leftTertiary != rightTertiary) { 717 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 718 // Pass through EOS and MERGE_WEIGHT 719 // and keep real tertiary weights larger than the MERGE_WEIGHT. 720 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 721 if(leftTertiary > MERGE_WEIGHT) { 722 leftTertiary ^= CASE_MASK; 723 } 724 if(rightTertiary > MERGE_WEIGHT) { 725 rightTertiary ^= CASE_MASK; 726 } 727 } 728 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 729 } 730 if(leftPair == EOS) { break; } 731 leftPair >>= 16; 732 rightPair >>= 16; 733 } 734 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 735 736 leftIndex = rightIndex = 0; 737 leftPair = rightPair = 0; 738 for(;;) { 739 while(leftPair == 0) { 740 if(leftIndex == leftLength) { 741 leftPair = EOS; 742 break; 743 } 744 UChar32 c = left[leftIndex++]; 745 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 746 if(leftPair < MIN_LONG) { 747 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 748 } 749 leftPair = getQuaternaries(variableTop, leftPair); 750 } 751 752 while(rightPair == 0) { 753 if(rightIndex == rightLength) { 754 rightPair = EOS; 755 break; 756 } 757 UChar32 c = right[rightIndex++]; 758 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 759 if(rightPair < MIN_LONG) { 760 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 761 } 762 rightPair = getQuaternaries(variableTop, rightPair); 763 } 764 765 if(leftPair == rightPair) { 766 if(leftPair == EOS) { break; } 767 leftPair = rightPair = 0; 768 continue; 769 } 770 uint32_t leftQuaternary = leftPair & 0xffff; 771 uint32_t rightQuaternary = rightPair & 0xffff; 772 if(leftQuaternary != rightQuaternary) { 773 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 774 } 775 if(leftPair == EOS) { break; } 776 leftPair >>= 16; 777 rightPair >>= 16; 778 } 779 return UCOL_EQUAL; 780 } 781 782 uint32_t 783 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { 784 U_ASSERT(c > LATIN_MAX); 785 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 786 return table[c - PUNCT_START + LATIN_LIMIT]; 787 } else if(c == 0xfffe) { 788 return MERGE_WEIGHT; 789 } else if(c == 0xffff) { 790 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 791 } else { 792 return BAIL_OUT; 793 } 794 } 795 796 uint32_t 797 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, 798 const uint8_t *s8, int32_t &sIndex, int32_t sLength) { 799 // The caller handled ASCII and valid/supported Latin. 800 U_ASSERT(c > 0x7f); 801 int32_t i2 = sIndex + 1; 802 if(i2 < sLength || sLength < 0) { 803 uint8_t t1 = s8[sIndex]; 804 uint8_t t2 = s8[i2]; 805 sIndex += 2; 806 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { 807 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 808 } else if(c == 0xef && t1 == 0xbf) { 809 if(t2 == 0xbe) { 810 return MERGE_WEIGHT; // U+FFFE 811 } else if(t2 == 0xbf) { 812 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 813 } 814 } 815 } 816 return BAIL_OUT; 817 } 818 819 uint32_t 820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, 821 const uint8_t *s8, int32_t &sIndex) { 822 // The caller handled ASCII. 823 // The string is well-formed and contains only supported characters. 824 U_ASSERT(c > 0x7f); 825 if(c <= LATIN_MAX_UTF8_LEAD) { 826 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F 827 } 828 uint8_t t2 = s8[sIndex + 1]; 829 sIndex += 2; 830 if(c == 0xe2) { 831 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 832 } else if(t2 == 0xbe) { 833 return MERGE_WEIGHT; // U+FFFE 834 } else { 835 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 836 } 837 } 838 839 uint32_t 840 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, 841 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { 842 if(ce >= MIN_LONG || ce < CONTRACTION) { 843 return ce; // simple or special mini CE 844 } else if(ce >= EXPANSION) { 845 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 846 return ((uint32_t)table[index + 1] << 16) | table[index]; 847 } else /* ce >= CONTRACTION */ { 848 if(c == 0 && sLength < 0) { 849 sLength = sIndex - 1; 850 return EOS; 851 } 852 // Contraction list: Default mapping followed by 853 // 0 or more single-character contraction suffix mappings. 854 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 855 if(sIndex != sLength) { 856 // Read the next character. 857 int32_t c2; 858 int32_t nextIndex = sIndex; 859 if(s16 != NULL) { 860 c2 = s16[nextIndex++]; 861 if(c2 > LATIN_MAX) { 862 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 863 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 864 } else if(c2 == 0xfffe || c2 == 0xffff) { 865 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 866 } else { 867 return BAIL_OUT; 868 } 869 } 870 } else { 871 c2 = s8[nextIndex++]; 872 if(c2 > 0x7f) { 873 uint8_t t; 874 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && 875 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { 876 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F 877 ++nextIndex; 878 } else { 879 int32_t i2 = nextIndex + 1; 880 if(i2 < sLength || sLength < 0) { 881 if(c2 == 0xe2 && s8[nextIndex] == 0x80 && 882 0x80 <= (t = s8[i2]) && t <= 0xbf) { 883 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF 884 } else if(c2 == 0xef && s8[nextIndex] == 0xbf && 885 ((t = s8[i2]) == 0xbe || t == 0xbf)) { 886 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 887 } else { 888 return BAIL_OUT; 889 } 890 } else { 891 return BAIL_OUT; 892 } 893 nextIndex += 2; 894 } 895 } 896 } 897 if(c2 == 0 && sLength < 0) { 898 sLength = sIndex; 899 c2 = -1; 900 } 901 // Look for the next character in the contraction suffix list, 902 // which is in ascending order of single suffix characters. 903 int32_t i = index; 904 int32_t head = table[i]; // first skip the default mapping 905 int32_t x; 906 do { 907 i += head >> CONTR_LENGTH_SHIFT; 908 head = table[i]; 909 x = head & CONTR_CHAR_MASK; 910 } while(x < c2); 911 if(x == c2) { 912 index = i; 913 sIndex = nextIndex; 914 } 915 } 916 // Return the CE or CEs for the default or contraction mapping. 917 int32_t length = table[index] >> CONTR_LENGTH_SHIFT; 918 if(length == 1) { 919 return BAIL_OUT; 920 } 921 ce = table[index + 1]; 922 if(length == 2) { 923 return ce; 924 } else { 925 return ((uint32_t)table[index + 2] << 16) | ce; 926 } 927 } 928 } 929 930 uint32_t 931 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { 932 if(pair <= 0xffff) { 933 // one mini CE 934 if(pair >= MIN_SHORT) { 935 pair = getSecondariesFromOneShortCE(pair); 936 } else if(pair > variableTop) { 937 pair = COMMON_SEC_PLUS_OFFSET; 938 } else if(pair >= MIN_LONG) { 939 pair = 0; // variable 940 } 941 // else special mini CE 942 } else { 943 uint32_t ce = pair & 0xffff; 944 if(ce >= MIN_SHORT) { 945 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 946 } else if(ce > variableTop) { 947 pair = TWO_COMMON_SEC_PLUS_OFFSET; 948 } else { 949 U_ASSERT(ce >= MIN_LONG); 950 pair = 0; // variable 951 } 952 } 953 return pair; 954 } 955 956 uint32_t 957 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { 958 // Primary+caseLevel: Ignore case level weights of primary ignorables. 959 // Otherwise: Ignore case level weights of secondary ignorables. 960 // For details see the comments in the CollationCompare class. 961 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 962 if(pair <= 0xffff) { 963 // one mini CE 964 if(pair >= MIN_SHORT) { 965 // A high secondary weight means we really have two CEs, 966 // a primary CE and a secondary CE. 967 uint32_t ce = pair; 968 pair &= CASE_MASK; // explicit weight of primary CE 969 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 970 pair |= LOWER_CASE << 16; // implied weight of secondary CE 971 } 972 } else if(pair > variableTop) { 973 pair = LOWER_CASE; 974 } else if(pair >= MIN_LONG) { 975 pair = 0; // variable 976 } 977 // else special mini CE 978 } else { 979 // two mini CEs, same primary groups, neither expands like above 980 uint32_t ce = pair & 0xffff; 981 if(ce >= MIN_SHORT) { 982 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 983 pair &= CASE_MASK; 984 } else { 985 pair &= TWO_CASES_MASK; 986 } 987 } else if(ce > variableTop) { 988 pair = TWO_LOWER_CASES; 989 } else { 990 U_ASSERT(ce >= MIN_LONG); 991 pair = 0; // variable 992 } 993 } 994 return pair; 995 } 996 997 uint32_t 998 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { 999 if(pair <= 0xffff) { 1000 // one mini CE 1001 if(pair >= MIN_SHORT) { 1002 // A high secondary weight means we really have two CEs, 1003 // a primary CE and a secondary CE. 1004 uint32_t ce = pair; 1005 if(withCaseBits) { 1006 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 1007 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1008 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 1009 } 1010 } else { 1011 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1012 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1013 pair |= COMMON_TER_PLUS_OFFSET << 16; 1014 } 1015 } 1016 } else if(pair > variableTop) { 1017 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1018 if(withCaseBits) { 1019 pair |= LOWER_CASE; 1020 } 1021 } else if(pair >= MIN_LONG) { 1022 pair = 0; // variable 1023 } 1024 // else special mini CE 1025 } else { 1026 // two mini CEs, same primary groups, neither expands like above 1027 uint32_t ce = pair & 0xffff; 1028 if(ce >= MIN_SHORT) { 1029 if(withCaseBits) { 1030 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 1031 } else { 1032 pair &= TWO_TERTIARIES_MASK; 1033 } 1034 pair += TWO_TER_OFFSETS; 1035 } else if(ce > variableTop) { 1036 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 1037 if(withCaseBits) { 1038 pair |= TWO_LOWER_CASES; 1039 } 1040 } else { 1041 U_ASSERT(ce >= MIN_LONG); 1042 pair = 0; // variable 1043 } 1044 } 1045 return pair; 1046 } 1047 1048 uint32_t 1049 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { 1050 // Return the primary weight of a variable CE, 1051 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 1052 if(pair <= 0xffff) { 1053 // one mini CE 1054 if(pair >= MIN_SHORT) { 1055 // A high secondary weight means we really have two CEs, 1056 // a primary CE and a secondary CE. 1057 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1058 pair = TWO_SHORT_PRIMARIES_MASK; 1059 } else { 1060 pair = SHORT_PRIMARY_MASK; 1061 } 1062 } else if(pair > variableTop) { 1063 pair = SHORT_PRIMARY_MASK; 1064 } else if(pair >= MIN_LONG) { 1065 pair &= LONG_PRIMARY_MASK; // variable 1066 } 1067 // else special mini CE 1068 } else { 1069 // two mini CEs, same primary groups, neither expands like above 1070 uint32_t ce = pair & 0xffff; 1071 if(ce > variableTop) { 1072 pair = TWO_SHORT_PRIMARIES_MASK; 1073 } else { 1074 U_ASSERT(ce >= MIN_LONG); 1075 pair &= TWO_LONG_PRIMARIES_MASK; // variable 1076 } 1077 } 1078 return pair; 1079 } 1080 1081 U_NAMESPACE_END 1082 1083 #endif // !UCONFIG_NO_COLLATION 1084