Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationfastlatin.cpp
      9 *
     10 * created on: 2013aug18
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "collationdata.h"
     20 #include "collationfastlatin.h"
     21 #include "collationsettings.h"
     22 #include "uassert.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 int32_t
     27 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
     28                                uint16_t *primaries, int32_t capacity) {
     29     const uint16_t *table = data->fastLatinTable;
     30     if(table == NULL) { return -1; }
     31     U_ASSERT(capacity == LATIN_LIMIT);
     32     if(capacity != LATIN_LIMIT) { return -1; }
     33 
     34     uint32_t miniVarTop;
     35     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
     36         // No mini primaries are variable, set a variableTop just below the
     37         // lowest long mini primary.
     38         miniVarTop = MIN_LONG - 1;
     39     } else {
     40         int32_t headerLength = *table & 0xff;
     41         int32_t i = 1 + settings.getMaxVariable();
     42         if(i >= headerLength) {
     43             return -1;  // variableTop >= digits, should not occur
     44         }
     45         miniVarTop = table[i];
     46     }
     47 
     48     UBool digitsAreReordered = FALSE;
     49     if(settings.hasReordering()) {
     50         uint32_t prevStart = 0;
     51         uint32_t beforeDigitStart = 0;
     52         uint32_t digitStart = 0;
     53         uint32_t afterDigitStart = 0;
     54         for(int32_t group = UCOL_REORDER_CODE_FIRST;
     55                 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
     56                 ++group) {
     57             uint32_t start = data->getFirstPrimaryForGroup(group);
     58             start = settings.reorder(start);
     59             if(group == UCOL_REORDER_CODE_DIGIT) {
     60                 beforeDigitStart = prevStart;
     61                 digitStart = start;
     62             } else if(start != 0) {
     63                 if(start < prevStart) {
     64                     // The permutation affects the groups up to Latin.
     65                     return -1;
     66                 }
     67                 // In the future, there might be a special group between digits & Latin.
     68                 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
     69                     afterDigitStart = start;
     70                 }
     71                 prevStart = start;
     72             }
     73         }
     74         uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
     75         latinStart = settings.reorder(latinStart);
     76         if(latinStart < prevStart) {
     77             return -1;
     78         }
     79         if(afterDigitStart == 0) {
     80             afterDigitStart = latinStart;
     81         }
     82         if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
     83             digitsAreReordered = TRUE;
     84         }
     85     }
     86 
     87     table += (table[0] & 0xff);  // skip the header
     88     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
     89         uint32_t p = table[c];
     90         if(p >= MIN_SHORT) {
     91             p &= SHORT_PRIMARY_MASK;
     92         } else if(p > miniVarTop) {
     93             p &= LONG_PRIMARY_MASK;
     94         } else {
     95             p = 0;
     96         }
     97         primaries[c] = (uint16_t)p;
     98     }
     99     if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
    100         // Bail out for digits.
    101         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
    102     }
    103 
    104     // Shift the miniVarTop above other options.
    105     return ((int32_t)miniVarTop << 16) | settings.options;
    106 }
    107 
    108 int32_t
    109 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
    110                                  const UChar *left, int32_t leftLength,
    111                                  const UChar *right, int32_t rightLength) {
    112     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
    113     // optimized for common Latin text.
    114     // Keep them in sync!
    115     // Keep compareUTF16() and compareUTF8() in sync very closely!
    116 
    117     U_ASSERT((table[0] >> 8) == VERSION);
    118     table += (table[0] & 0xff);  // skip the header
    119     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
    120     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    121 
    122     // Check for supported characters, fetch mini CEs, and compare primaries.
    123     int32_t leftIndex = 0, rightIndex = 0;
    124     /**
    125      * Single mini CE or a pair.
    126      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    127      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    128      */
    129     uint32_t leftPair = 0, rightPair = 0;
    130     for(;;) {
    131         // We fetch CEs until we get a non-ignorable primary or reach the end.
    132         while(leftPair == 0) {
    133             if(leftIndex == leftLength) {
    134                 leftPair = EOS;
    135                 break;
    136             }
    137             UChar32 c = left[leftIndex++];
    138             if(c <= LATIN_MAX) {
    139                 leftPair = primaries[c];
    140                 if(leftPair != 0) { break; }
    141                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    142                     return BAIL_OUT_RESULT;
    143                 }
    144                 leftPair = table[c];
    145             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    146                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    147             } else {
    148                 leftPair = lookup(table, c);
    149             }
    150             if(leftPair >= MIN_SHORT) {
    151                 leftPair &= SHORT_PRIMARY_MASK;
    152                 break;
    153             } else if(leftPair > variableTop) {
    154                 leftPair &= LONG_PRIMARY_MASK;
    155                 break;
    156             } else {
    157                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    158                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    159                 leftPair = getPrimaries(variableTop, leftPair);
    160             }
    161         }
    162 
    163         while(rightPair == 0) {
    164             if(rightIndex == rightLength) {
    165                 rightPair = EOS;
    166                 break;
    167             }
    168             UChar32 c = right[rightIndex++];
    169             if(c <= LATIN_MAX) {
    170                 rightPair = primaries[c];
    171                 if(rightPair != 0) { break; }
    172                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    173                     return BAIL_OUT_RESULT;
    174                 }
    175                 rightPair = table[c];
    176             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    177                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    178             } else {
    179                 rightPair = lookup(table, c);
    180             }
    181             if(rightPair >= MIN_SHORT) {
    182                 rightPair &= SHORT_PRIMARY_MASK;
    183                 break;
    184             } else if(rightPair > variableTop) {
    185                 rightPair &= LONG_PRIMARY_MASK;
    186                 break;
    187             } else {
    188                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    189                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    190                 rightPair = getPrimaries(variableTop, rightPair);
    191             }
    192         }
    193 
    194         if(leftPair == rightPair) {
    195             if(leftPair == EOS) { break; }
    196             leftPair = rightPair = 0;
    197             continue;
    198         }
    199         uint32_t leftPrimary = leftPair & 0xffff;
    200         uint32_t rightPrimary = rightPair & 0xffff;
    201         if(leftPrimary != rightPrimary) {
    202             // Return the primary difference.
    203             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    204         }
    205         if(leftPair == EOS) { break; }
    206         leftPair >>= 16;
    207         rightPair >>= 16;
    208     }
    209     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    210     // but we know that the string is well-formed and
    211     // only contains supported characters and mappings.
    212 
    213     // We might skip the secondary level but continue with the case level
    214     // which is turned on separately.
    215     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    216         leftIndex = rightIndex = 0;
    217         leftPair = rightPair = 0;
    218         for(;;) {
    219             while(leftPair == 0) {
    220                 if(leftIndex == leftLength) {
    221                     leftPair = EOS;
    222                     break;
    223                 }
    224                 UChar32 c = left[leftIndex++];
    225                 if(c <= LATIN_MAX) {
    226                     leftPair = table[c];
    227                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    228                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    229                 } else {
    230                     leftPair = lookup(table, c);
    231                 }
    232                 if(leftPair >= MIN_SHORT) {
    233                     leftPair = getSecondariesFromOneShortCE(leftPair);
    234                     break;
    235                 } else if(leftPair > variableTop) {
    236                     leftPair = COMMON_SEC_PLUS_OFFSET;
    237                     break;
    238                 } else {
    239                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    240                     leftPair = getSecondaries(variableTop, leftPair);
    241                 }
    242             }
    243 
    244             while(rightPair == 0) {
    245                 if(rightIndex == rightLength) {
    246                     rightPair = EOS;
    247                     break;
    248                 }
    249                 UChar32 c = right[rightIndex++];
    250                 if(c <= LATIN_MAX) {
    251                     rightPair = table[c];
    252                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    253                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    254                 } else {
    255                     rightPair = lookup(table, c);
    256                 }
    257                 if(rightPair >= MIN_SHORT) {
    258                     rightPair = getSecondariesFromOneShortCE(rightPair);
    259                     break;
    260                 } else if(rightPair > variableTop) {
    261                     rightPair = COMMON_SEC_PLUS_OFFSET;
    262                     break;
    263                 } else {
    264                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    265                     rightPair = getSecondaries(variableTop, rightPair);
    266                 }
    267             }
    268 
    269             if(leftPair == rightPair) {
    270                 if(leftPair == EOS) { break; }
    271                 leftPair = rightPair = 0;
    272                 continue;
    273             }
    274             uint32_t leftSecondary = leftPair & 0xffff;
    275             uint32_t rightSecondary = rightPair & 0xffff;
    276             if(leftSecondary != rightSecondary) {
    277                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    278                     // Full support for backwards secondary requires backwards contraction matching
    279                     // and moving backwards between merge separators.
    280                     return BAIL_OUT_RESULT;
    281                 }
    282                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    283             }
    284             if(leftPair == EOS) { break; }
    285             leftPair >>= 16;
    286             rightPair >>= 16;
    287         }
    288     }
    289 
    290     if((options & CollationSettings::CASE_LEVEL) != 0) {
    291         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    292         leftIndex = rightIndex = 0;
    293         leftPair = rightPair = 0;
    294         for(;;) {
    295             while(leftPair == 0) {
    296                 if(leftIndex == leftLength) {
    297                     leftPair = EOS;
    298                     break;
    299                 }
    300                 UChar32 c = left[leftIndex++];
    301                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    302                 if(leftPair < MIN_LONG) {
    303                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    304                 }
    305                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    306             }
    307 
    308             while(rightPair == 0) {
    309                 if(rightIndex == rightLength) {
    310                     rightPair = EOS;
    311                     break;
    312                 }
    313                 UChar32 c = right[rightIndex++];
    314                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    315                 if(rightPair < MIN_LONG) {
    316                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    317                 }
    318                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    319             }
    320 
    321             if(leftPair == rightPair) {
    322                 if(leftPair == EOS) { break; }
    323                 leftPair = rightPair = 0;
    324                 continue;
    325             }
    326             uint32_t leftCase = leftPair & 0xffff;
    327             uint32_t rightCase = rightPair & 0xffff;
    328             if(leftCase != rightCase) {
    329                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    330                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    331                 } else {
    332                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    333                 }
    334             }
    335             if(leftPair == EOS) { break; }
    336             leftPair >>= 16;
    337             rightPair >>= 16;
    338         }
    339     }
    340     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    341 
    342     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    343     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    344 
    345     leftIndex = rightIndex = 0;
    346     leftPair = rightPair = 0;
    347     for(;;) {
    348         while(leftPair == 0) {
    349             if(leftIndex == leftLength) {
    350                 leftPair = EOS;
    351                 break;
    352             }
    353             UChar32 c = left[leftIndex++];
    354             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    355             if(leftPair < MIN_LONG) {
    356                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    357             }
    358             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    359         }
    360 
    361         while(rightPair == 0) {
    362             if(rightIndex == rightLength) {
    363                 rightPair = EOS;
    364                 break;
    365             }
    366             UChar32 c = right[rightIndex++];
    367             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    368             if(rightPair < MIN_LONG) {
    369                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    370             }
    371             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    372         }
    373 
    374         if(leftPair == rightPair) {
    375             if(leftPair == EOS) { break; }
    376             leftPair = rightPair = 0;
    377             continue;
    378         }
    379         uint32_t leftTertiary = leftPair & 0xffff;
    380         uint32_t rightTertiary = rightPair & 0xffff;
    381         if(leftTertiary != rightTertiary) {
    382             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    383                 // Pass through EOS and MERGE_WEIGHT
    384                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    385                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    386                 if(leftTertiary > MERGE_WEIGHT) {
    387                     leftTertiary ^= CASE_MASK;
    388                 }
    389                 if(rightTertiary > MERGE_WEIGHT) {
    390                     rightTertiary ^= CASE_MASK;
    391                 }
    392             }
    393             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    394         }
    395         if(leftPair == EOS) { break; }
    396         leftPair >>= 16;
    397         rightPair >>= 16;
    398     }
    399     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    400 
    401     leftIndex = rightIndex = 0;
    402     leftPair = rightPair = 0;
    403     for(;;) {
    404         while(leftPair == 0) {
    405             if(leftIndex == leftLength) {
    406                 leftPair = EOS;
    407                 break;
    408             }
    409             UChar32 c = left[leftIndex++];
    410             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    411             if(leftPair < MIN_LONG) {
    412                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    413             }
    414             leftPair = getQuaternaries(variableTop, leftPair);
    415         }
    416 
    417         while(rightPair == 0) {
    418             if(rightIndex == rightLength) {
    419                 rightPair = EOS;
    420                 break;
    421             }
    422             UChar32 c = right[rightIndex++];
    423             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    424             if(rightPair < MIN_LONG) {
    425                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    426             }
    427             rightPair = getQuaternaries(variableTop, rightPair);
    428         }
    429 
    430         if(leftPair == rightPair) {
    431             if(leftPair == EOS) { break; }
    432             leftPair = rightPair = 0;
    433             continue;
    434         }
    435         uint32_t leftQuaternary = leftPair & 0xffff;
    436         uint32_t rightQuaternary = rightPair & 0xffff;
    437         if(leftQuaternary != rightQuaternary) {
    438             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    439         }
    440         if(leftPair == EOS) { break; }
    441         leftPair >>= 16;
    442         rightPair >>= 16;
    443     }
    444     return UCOL_EQUAL;
    445 }
    446 
    447 int32_t
    448 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
    449                                  const uint8_t *left, int32_t leftLength,
    450                                  const uint8_t *right, int32_t rightLength) {
    451     // Keep compareUTF16() and compareUTF8() in sync very closely!
    452 
    453     U_ASSERT((table[0] >> 8) == VERSION);
    454     table += (table[0] & 0xff);  // skip the header
    455     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
    456     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    457 
    458     // Check for supported characters, fetch mini CEs, and compare primaries.
    459     int32_t leftIndex = 0, rightIndex = 0;
    460     /**
    461      * Single mini CE or a pair.
    462      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    463      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    464      */
    465     uint32_t leftPair = 0, rightPair = 0;
    466     // Note: There is no need to assemble the code point.
    467     // We only need to look up the table entry for the character,
    468     // and nextPair() looks for whether c==0.
    469     for(;;) {
    470         // We fetch CEs until we get a non-ignorable primary or reach the end.
    471         while(leftPair == 0) {
    472             if(leftIndex == leftLength) {
    473                 leftPair = EOS;
    474                 break;
    475             }
    476             UChar32 c = left[leftIndex++];
    477             uint8_t t;
    478             if(c <= 0x7f) {
    479                 leftPair = primaries[c];
    480                 if(leftPair != 0) { break; }
    481                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    482                     return BAIL_OUT_RESULT;
    483                 }
    484                 leftPair = table[c];
    485             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
    486                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
    487                 ++leftIndex;
    488                 c = ((c - 0xc2) << 6) + t;
    489                 leftPair = primaries[c];
    490                 if(leftPair != 0) { break; }
    491                 leftPair = table[c];
    492             } else {
    493                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
    494             }
    495             if(leftPair >= MIN_SHORT) {
    496                 leftPair &= SHORT_PRIMARY_MASK;
    497                 break;
    498             } else if(leftPair > variableTop) {
    499                 leftPair &= LONG_PRIMARY_MASK;
    500                 break;
    501             } else {
    502                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    503                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    504                 leftPair = getPrimaries(variableTop, leftPair);
    505             }
    506         }
    507 
    508         while(rightPair == 0) {
    509             if(rightIndex == rightLength) {
    510                 rightPair = EOS;
    511                 break;
    512             }
    513             UChar32 c = right[rightIndex++];
    514             uint8_t t;
    515             if(c <= 0x7f) {
    516                 rightPair = primaries[c];
    517                 if(rightPair != 0) { break; }
    518                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    519                     return BAIL_OUT_RESULT;
    520                 }
    521                 rightPair = table[c];
    522             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
    523                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
    524                 ++rightIndex;
    525                 c = ((c - 0xc2) << 6) + t;
    526                 rightPair = primaries[c];
    527                 if(rightPair != 0) { break; }
    528                 rightPair = table[c];
    529             } else {
    530                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
    531             }
    532             if(rightPair >= MIN_SHORT) {
    533                 rightPair &= SHORT_PRIMARY_MASK;
    534                 break;
    535             } else if(rightPair > variableTop) {
    536                 rightPair &= LONG_PRIMARY_MASK;
    537                 break;
    538             } else {
    539                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    540                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    541                 rightPair = getPrimaries(variableTop, rightPair);
    542             }
    543         }
    544 
    545         if(leftPair == rightPair) {
    546             if(leftPair == EOS) { break; }
    547             leftPair = rightPair = 0;
    548             continue;
    549         }
    550         uint32_t leftPrimary = leftPair & 0xffff;
    551         uint32_t rightPrimary = rightPair & 0xffff;
    552         if(leftPrimary != rightPrimary) {
    553             // Return the primary difference.
    554             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    555         }
    556         if(leftPair == EOS) { break; }
    557         leftPair >>= 16;
    558         rightPair >>= 16;
    559     }
    560     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    561     // but we know that the string is well-formed and
    562     // only contains supported characters and mappings.
    563 
    564     // We might skip the secondary level but continue with the case level
    565     // which is turned on separately.
    566     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    567         leftIndex = rightIndex = 0;
    568         leftPair = rightPair = 0;
    569         for(;;) {
    570             while(leftPair == 0) {
    571                 if(leftIndex == leftLength) {
    572                     leftPair = EOS;
    573                     break;
    574                 }
    575                 UChar32 c = left[leftIndex++];
    576                 if(c <= 0x7f) {
    577                     leftPair = table[c];
    578                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    579                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
    580                 } else {
    581                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
    582                 }
    583                 if(leftPair >= MIN_SHORT) {
    584                     leftPair = getSecondariesFromOneShortCE(leftPair);
    585                     break;
    586                 } else if(leftPair > variableTop) {
    587                     leftPair = COMMON_SEC_PLUS_OFFSET;
    588                     break;
    589                 } else {
    590                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    591                     leftPair = getSecondaries(variableTop, leftPair);
    592                 }
    593             }
    594 
    595             while(rightPair == 0) {
    596                 if(rightIndex == rightLength) {
    597                     rightPair = EOS;
    598                     break;
    599                 }
    600                 UChar32 c = right[rightIndex++];
    601                 if(c <= 0x7f) {
    602                     rightPair = table[c];
    603                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    604                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
    605                 } else {
    606                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
    607                 }
    608                 if(rightPair >= MIN_SHORT) {
    609                     rightPair = getSecondariesFromOneShortCE(rightPair);
    610                     break;
    611                 } else if(rightPair > variableTop) {
    612                     rightPair = COMMON_SEC_PLUS_OFFSET;
    613                     break;
    614                 } else {
    615                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    616                     rightPair = getSecondaries(variableTop, rightPair);
    617                 }
    618             }
    619 
    620             if(leftPair == rightPair) {
    621                 if(leftPair == EOS) { break; }
    622                 leftPair = rightPair = 0;
    623                 continue;
    624             }
    625             uint32_t leftSecondary = leftPair & 0xffff;
    626             uint32_t rightSecondary = rightPair & 0xffff;
    627             if(leftSecondary != rightSecondary) {
    628                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    629                     // Full support for backwards secondary requires backwards contraction matching
    630                     // and moving backwards between merge separators.
    631                     return BAIL_OUT_RESULT;
    632                 }
    633                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    634             }
    635             if(leftPair == EOS) { break; }
    636             leftPair >>= 16;
    637             rightPair >>= 16;
    638         }
    639     }
    640 
    641     if((options & CollationSettings::CASE_LEVEL) != 0) {
    642         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    643         leftIndex = rightIndex = 0;
    644         leftPair = rightPair = 0;
    645         for(;;) {
    646             while(leftPair == 0) {
    647                 if(leftIndex == leftLength) {
    648                     leftPair = EOS;
    649                     break;
    650                 }
    651                 UChar32 c = left[leftIndex++];
    652                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    653                 if(leftPair < MIN_LONG) {
    654                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    655                 }
    656                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    657             }
    658 
    659             while(rightPair == 0) {
    660                 if(rightIndex == rightLength) {
    661                     rightPair = EOS;
    662                     break;
    663                 }
    664                 UChar32 c = right[rightIndex++];
    665                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    666                 if(rightPair < MIN_LONG) {
    667                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    668                 }
    669                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    670             }
    671 
    672             if(leftPair == rightPair) {
    673                 if(leftPair == EOS) { break; }
    674                 leftPair = rightPair = 0;
    675                 continue;
    676             }
    677             uint32_t leftCase = leftPair & 0xffff;
    678             uint32_t rightCase = rightPair & 0xffff;
    679             if(leftCase != rightCase) {
    680                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    681                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    682                 } else {
    683                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    684                 }
    685             }
    686             if(leftPair == EOS) { break; }
    687             leftPair >>= 16;
    688             rightPair >>= 16;
    689         }
    690     }
    691     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    692 
    693     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    694     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    695 
    696     leftIndex = rightIndex = 0;
    697     leftPair = rightPair = 0;
    698     for(;;) {
    699         while(leftPair == 0) {
    700             if(leftIndex == leftLength) {
    701                 leftPair = EOS;
    702                 break;
    703             }
    704             UChar32 c = left[leftIndex++];
    705             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    706             if(leftPair < MIN_LONG) {
    707                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    708             }
    709             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    710         }
    711 
    712         while(rightPair == 0) {
    713             if(rightIndex == rightLength) {
    714                 rightPair = EOS;
    715                 break;
    716             }
    717             UChar32 c = right[rightIndex++];
    718             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    719             if(rightPair < MIN_LONG) {
    720                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    721             }
    722             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    723         }
    724 
    725         if(leftPair == rightPair) {
    726             if(leftPair == EOS) { break; }
    727             leftPair = rightPair = 0;
    728             continue;
    729         }
    730         uint32_t leftTertiary = leftPair & 0xffff;
    731         uint32_t rightTertiary = rightPair & 0xffff;
    732         if(leftTertiary != rightTertiary) {
    733             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    734                 // Pass through EOS and MERGE_WEIGHT
    735                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    736                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    737                 if(leftTertiary > MERGE_WEIGHT) {
    738                     leftTertiary ^= CASE_MASK;
    739                 }
    740                 if(rightTertiary > MERGE_WEIGHT) {
    741                     rightTertiary ^= CASE_MASK;
    742                 }
    743             }
    744             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    745         }
    746         if(leftPair == EOS) { break; }
    747         leftPair >>= 16;
    748         rightPair >>= 16;
    749     }
    750     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    751 
    752     leftIndex = rightIndex = 0;
    753     leftPair = rightPair = 0;
    754     for(;;) {
    755         while(leftPair == 0) {
    756             if(leftIndex == leftLength) {
    757                 leftPair = EOS;
    758                 break;
    759             }
    760             UChar32 c = left[leftIndex++];
    761             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    762             if(leftPair < MIN_LONG) {
    763                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    764             }
    765             leftPair = getQuaternaries(variableTop, leftPair);
    766         }
    767 
    768         while(rightPair == 0) {
    769             if(rightIndex == rightLength) {
    770                 rightPair = EOS;
    771                 break;
    772             }
    773             UChar32 c = right[rightIndex++];
    774             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    775             if(rightPair < MIN_LONG) {
    776                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    777             }
    778             rightPair = getQuaternaries(variableTop, rightPair);
    779         }
    780 
    781         if(leftPair == rightPair) {
    782             if(leftPair == EOS) { break; }
    783             leftPair = rightPair = 0;
    784             continue;
    785         }
    786         uint32_t leftQuaternary = leftPair & 0xffff;
    787         uint32_t rightQuaternary = rightPair & 0xffff;
    788         if(leftQuaternary != rightQuaternary) {
    789             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    790         }
    791         if(leftPair == EOS) { break; }
    792         leftPair >>= 16;
    793         rightPair >>= 16;
    794     }
    795     return UCOL_EQUAL;
    796 }
    797 
    798 uint32_t
    799 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
    800     U_ASSERT(c > LATIN_MAX);
    801     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    802         return table[c - PUNCT_START + LATIN_LIMIT];
    803     } else if(c == 0xfffe) {
    804         return MERGE_WEIGHT;
    805     } else if(c == 0xffff) {
    806         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
    807     } else {
    808         return BAIL_OUT;
    809     }
    810 }
    811 
    812 uint32_t
    813 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
    814                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
    815     // The caller handled ASCII and valid/supported Latin.
    816     U_ASSERT(c > 0x7f);
    817     int32_t i2 = sIndex + 1;
    818     if(i2 < sLength || sLength < 0) {
    819         uint8_t t1 = s8[sIndex];
    820         uint8_t t2 = s8[i2];
    821         sIndex += 2;
    822         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
    823             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    824         } else if(c == 0xef && t1 == 0xbf) {
    825             if(t2 == 0xbe) {
    826                 return MERGE_WEIGHT;  // U+FFFE
    827             } else if(t2 == 0xbf) {
    828                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    829             }
    830         }
    831     }
    832     return BAIL_OUT;
    833 }
    834 
    835 uint32_t
    836 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
    837                                      const uint8_t *s8, int32_t &sIndex) {
    838     // The caller handled ASCII.
    839     // The string is well-formed and contains only supported characters.
    840     U_ASSERT(c > 0x7f);
    841     if(c <= LATIN_MAX_UTF8_LEAD) {
    842         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
    843     }
    844     uint8_t t2 = s8[sIndex + 1];
    845     sIndex += 2;
    846     if(c == 0xe2) {
    847         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    848     } else if(t2 == 0xbe) {
    849         return MERGE_WEIGHT;  // U+FFFE
    850     } else {
    851         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    852     }
    853 }
    854 
    855 uint32_t
    856 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
    857                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
    858     if(ce >= MIN_LONG || ce < CONTRACTION) {
    859         return ce;  // simple or special mini CE
    860     } else if(ce >= EXPANSION) {
    861         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    862         return ((uint32_t)table[index + 1] << 16) | table[index];
    863     } else /* ce >= CONTRACTION */ {
    864         if(c == 0 && sLength < 0) {
    865             sLength = sIndex - 1;
    866             return EOS;
    867         }
    868         // Contraction list: Default mapping followed by
    869         // 0 or more single-character contraction suffix mappings.
    870         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    871         if(sIndex != sLength) {
    872             // Read the next character.
    873             int32_t c2;
    874             int32_t nextIndex = sIndex;
    875             if(s16 != NULL) {
    876                 c2 = s16[nextIndex++];
    877                 if(c2 > LATIN_MAX) {
    878                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
    879                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
    880                     } else if(c2 == 0xfffe || c2 == 0xffff) {
    881                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    882                     } else {
    883                         return BAIL_OUT;
    884                     }
    885                 }
    886             } else {
    887                 c2 = s8[nextIndex++];
    888                 if(c2 > 0x7f) {
    889                     uint8_t t;
    890                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
    891                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
    892                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
    893                         ++nextIndex;
    894                     } else {
    895                         int32_t i2 = nextIndex + 1;
    896                         if(i2 < sLength || sLength < 0) {
    897                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
    898                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
    899                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
    900                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
    901                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
    902                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    903                             } else {
    904                                 return BAIL_OUT;
    905                             }
    906                         } else {
    907                             return BAIL_OUT;
    908                         }
    909                         nextIndex += 2;
    910                     }
    911                 }
    912             }
    913             if(c2 == 0 && sLength < 0) {
    914                 sLength = sIndex;
    915                 c2 = -1;
    916             }
    917             // Look for the next character in the contraction suffix list,
    918             // which is in ascending order of single suffix characters.
    919             int32_t i = index;
    920             int32_t head = table[i];  // first skip the default mapping
    921             int32_t x;
    922             do {
    923                 i += head >> CONTR_LENGTH_SHIFT;
    924                 head = table[i];
    925                 x = head & CONTR_CHAR_MASK;
    926             } while(x < c2);
    927             if(x == c2) {
    928                 index = i;
    929                 sIndex = nextIndex;
    930             }
    931         }
    932         // Return the CE or CEs for the default or contraction mapping.
    933         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
    934         if(length == 1) {
    935             return BAIL_OUT;
    936         }
    937         ce = table[index + 1];
    938         if(length == 2) {
    939             return ce;
    940         } else {
    941             return ((uint32_t)table[index + 2] << 16) | ce;
    942         }
    943     }
    944 }
    945 
    946 uint32_t
    947 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
    948     if(pair <= 0xffff) {
    949         // one mini CE
    950         if(pair >= MIN_SHORT) {
    951             pair = getSecondariesFromOneShortCE(pair);
    952         } else if(pair > variableTop) {
    953             pair = COMMON_SEC_PLUS_OFFSET;
    954         } else if(pair >= MIN_LONG) {
    955             pair = 0;  // variable
    956         }
    957         // else special mini CE
    958     } else {
    959         uint32_t ce = pair & 0xffff;
    960         if(ce >= MIN_SHORT) {
    961             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
    962         } else if(ce > variableTop) {
    963             pair = TWO_COMMON_SEC_PLUS_OFFSET;
    964         } else {
    965             U_ASSERT(ce >= MIN_LONG);
    966             pair = 0;  // variable
    967         }
    968     }
    969     return pair;
    970 }
    971 
    972 uint32_t
    973 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
    974     // Primary+caseLevel: Ignore case level weights of primary ignorables.
    975     // Otherwise: Ignore case level weights of secondary ignorables.
    976     // For details see the comments in the CollationCompare class.
    977     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    978     if(pair <= 0xffff) {
    979         // one mini CE
    980         if(pair >= MIN_SHORT) {
    981             // A high secondary weight means we really have two CEs,
    982             // a primary CE and a secondary CE.
    983             uint32_t ce = pair;
    984             pair &= CASE_MASK;  // explicit weight of primary CE
    985             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
    986                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
    987             }
    988         } else if(pair > variableTop) {
    989             pair = LOWER_CASE;
    990         } else if(pair >= MIN_LONG) {
    991             pair = 0;  // variable
    992         }
    993         // else special mini CE
    994     } else {
    995         // two mini CEs, same primary groups, neither expands like above
    996         uint32_t ce = pair & 0xffff;
    997         if(ce >= MIN_SHORT) {
    998             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
    999                 pair &= CASE_MASK;
   1000             } else {
   1001                 pair &= TWO_CASES_MASK;
   1002             }
   1003         } else if(ce > variableTop) {
   1004             pair = TWO_LOWER_CASES;
   1005         } else {
   1006             U_ASSERT(ce >= MIN_LONG);
   1007             pair = 0;  // variable
   1008         }
   1009     }
   1010     return pair;
   1011 }
   1012 
   1013 uint32_t
   1014 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
   1015     if(pair <= 0xffff) {
   1016         // one mini CE
   1017         if(pair >= MIN_SHORT) {
   1018             // A high secondary weight means we really have two CEs,
   1019             // a primary CE and a secondary CE.
   1020             uint32_t ce = pair;
   1021             if(withCaseBits) {
   1022                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
   1023                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1024                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
   1025                 }
   1026             } else {
   1027                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1028                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1029                     pair |= COMMON_TER_PLUS_OFFSET << 16;
   1030                 }
   1031             }
   1032         } else if(pair > variableTop) {
   1033             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1034             if(withCaseBits) {
   1035                 pair |= LOWER_CASE;
   1036             }
   1037         } else if(pair >= MIN_LONG) {
   1038             pair = 0;  // variable
   1039         }
   1040         // else special mini CE
   1041     } else {
   1042         // two mini CEs, same primary groups, neither expands like above
   1043         uint32_t ce = pair & 0xffff;
   1044         if(ce >= MIN_SHORT) {
   1045             if(withCaseBits) {
   1046                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
   1047             } else {
   1048                 pair &= TWO_TERTIARIES_MASK;
   1049             }
   1050             pair += TWO_TER_OFFSETS;
   1051         } else if(ce > variableTop) {
   1052             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
   1053             if(withCaseBits) {
   1054                 pair |= TWO_LOWER_CASES;
   1055             }
   1056         } else {
   1057             U_ASSERT(ce >= MIN_LONG);
   1058             pair = 0;  // variable
   1059         }
   1060     }
   1061     return pair;
   1062 }
   1063 
   1064 uint32_t
   1065 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
   1066     // Return the primary weight of a variable CE,
   1067     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
   1068     if(pair <= 0xffff) {
   1069         // one mini CE
   1070         if(pair >= MIN_SHORT) {
   1071             // A high secondary weight means we really have two CEs,
   1072             // a primary CE and a secondary CE.
   1073             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1074                 pair = TWO_SHORT_PRIMARIES_MASK;
   1075             } else {
   1076                 pair = SHORT_PRIMARY_MASK;
   1077             }
   1078         } else if(pair > variableTop) {
   1079             pair = SHORT_PRIMARY_MASK;
   1080         } else if(pair >= MIN_LONG) {
   1081             pair &= LONG_PRIMARY_MASK;  // variable
   1082         }
   1083         // else special mini CE
   1084     } else {
   1085         // two mini CEs, same primary groups, neither expands like above
   1086         uint32_t ce = pair & 0xffff;
   1087         if(ce > variableTop) {
   1088             pair = TWO_SHORT_PRIMARIES_MASK;
   1089         } else {
   1090             U_ASSERT(ce >= MIN_LONG);
   1091             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
   1092         }
   1093     }
   1094     return pair;
   1095 }
   1096 
   1097 U_NAMESPACE_END
   1098 
   1099 #endif  // !UCONFIG_NO_COLLATION
   1100