Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationfastlatin.cpp
      7 *
      8 * created on: 2013aug18
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "collationdata.h"
     18 #include "collationfastlatin.h"
     19 #include "collationsettings.h"
     20 #include "putilimp.h"  // U_ALIGN_CODE
     21 #include "uassert.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 int32_t
     26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
     27                                uint16_t *primaries, int32_t capacity) {
     28     const uint16_t *table = data->fastLatinTable;
     29     if(table == NULL) { return -1; }
     30     U_ASSERT(capacity == LATIN_LIMIT);
     31     if(capacity != LATIN_LIMIT) { return -1; }
     32 
     33     uint32_t miniVarTop;
     34     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
     35         // No mini primaries are variable, set a variableTop just below the
     36         // lowest long mini primary.
     37         miniVarTop = MIN_LONG - 1;
     38     } else {
     39         uint32_t v1 = settings.variableTop >> 24;
     40         int32_t headerLength = *table & 0xff;
     41         int32_t i = headerLength - 1;
     42         if(i <= 0 || v1 > (table[i] & 0x7f)) {
     43             return -1;  // variableTop >= digits, should not occur
     44         }
     45         while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
     46         // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
     47         // Shift right to make it comparable with long mini primaries in bits 15..3.
     48         miniVarTop = (table[i] & 0xff80) >> 4;
     49     }
     50 
     51     const uint8_t *reorderTable = settings.reorderTable;
     52     if(reorderTable != NULL) {
     53         const uint16_t *scripts = data->scripts;
     54         int32_t length = data->scriptsLength;
     55         uint32_t prevLastByte = 0;
     56         for(int32_t i = 0; i < length;) {
     57             // reordered last byte of the group
     58             uint32_t lastByte = reorderTable[scripts[i] & 0xff];
     59             if(lastByte < prevLastByte) {
     60                 // The permutation affects the groups up to Latin.
     61                 return -1;
     62             }
     63             if(scripts[i + 2] == USCRIPT_LATIN) { break; }
     64             i = i + 2 + scripts[i + 1];
     65             prevLastByte = lastByte;
     66         }
     67     }
     68 
     69     table += (table[0] & 0xff);  // skip the header
     70     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
     71         uint32_t p = table[c];
     72         if(p >= MIN_SHORT) {
     73             p &= SHORT_PRIMARY_MASK;
     74         } else if(p > miniVarTop) {
     75             p &= LONG_PRIMARY_MASK;
     76         } else {
     77             p = 0;
     78         }
     79         primaries[c] = (uint16_t)p;
     80     }
     81     if((settings.options & CollationSettings::NUMERIC) != 0) {
     82         // Bail out for digits.
     83         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
     84     }
     85 
     86     // Shift the miniVarTop above other options.
     87     return ((int32_t)miniVarTop << 16) | settings.options;
     88 }
     89 
     90 int32_t
     91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
     92                                  const UChar *left, int32_t leftLength,
     93                                  const UChar *right, int32_t rightLength) {
     94     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
     95     // optimized for common Latin text.
     96     // Keep them in sync!
     97     // Keep compareUTF16() and compareUTF8() in sync very closely!
     98 
     99     U_ASSERT((table[0] >> 8) == VERSION);
    100     table += (table[0] & 0xff);  // skip the header
    101     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
    102     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    103 
    104     // Check for supported characters, fetch mini CEs, and compare primaries.
    105     U_ALIGN_CODE(16);
    106     int32_t leftIndex = 0, rightIndex = 0;
    107     /**
    108      * Single mini CE or a pair.
    109      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    110      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    111      */
    112     uint32_t leftPair = 0, rightPair = 0;
    113     for(;;) {
    114         // We fetch CEs until we get a non-ignorable primary or reach the end.
    115         while(leftPair == 0) {
    116             if(leftIndex == leftLength) {
    117                 leftPair = EOS;
    118                 break;
    119             }
    120             UChar32 c = left[leftIndex++];
    121             if(c <= LATIN_MAX) {
    122                 leftPair = primaries[c];
    123                 if(leftPair != 0) { break; }
    124                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    125                     return BAIL_OUT_RESULT;
    126                 }
    127                 leftPair = table[c];
    128             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    129                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    130             } else {
    131                 leftPair = lookup(table, c);
    132             }
    133             if(leftPair >= MIN_SHORT) {
    134                 leftPair &= SHORT_PRIMARY_MASK;
    135                 break;
    136             } else if(leftPair > variableTop) {
    137                 leftPair &= LONG_PRIMARY_MASK;
    138                 break;
    139             } else {
    140                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    141                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    142                 leftPair = getPrimaries(variableTop, leftPair);
    143             }
    144         }
    145 
    146         while(rightPair == 0) {
    147             if(rightIndex == rightLength) {
    148                 rightPair = EOS;
    149                 break;
    150             }
    151             UChar32 c = right[rightIndex++];
    152             if(c <= LATIN_MAX) {
    153                 rightPair = primaries[c];
    154                 if(rightPair != 0) { break; }
    155                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    156                     return BAIL_OUT_RESULT;
    157                 }
    158                 rightPair = table[c];
    159             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    160                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    161             } else {
    162                 rightPair = lookup(table, c);
    163             }
    164             if(rightPair >= MIN_SHORT) {
    165                 rightPair &= SHORT_PRIMARY_MASK;
    166                 break;
    167             } else if(rightPair > variableTop) {
    168                 rightPair &= LONG_PRIMARY_MASK;
    169                 break;
    170             } else {
    171                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    172                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    173                 rightPair = getPrimaries(variableTop, rightPair);
    174             }
    175         }
    176 
    177         if(leftPair == rightPair) {
    178             if(leftPair == EOS) { break; }
    179             leftPair = rightPair = 0;
    180             continue;
    181         }
    182         uint32_t leftPrimary = leftPair & 0xffff;
    183         uint32_t rightPrimary = rightPair & 0xffff;
    184         if(leftPrimary != rightPrimary) {
    185             // Return the primary difference.
    186             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    187         }
    188         if(leftPair == EOS) { break; }
    189         leftPair >>= 16;
    190         rightPair >>= 16;
    191     }
    192     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    193     // but we know that the string is well-formed and
    194     // only contains supported characters and mappings.
    195 
    196     // We might skip the secondary level but continue with the case level
    197     // which is turned on separately.
    198     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    199         leftIndex = rightIndex = 0;
    200         leftPair = rightPair = 0;
    201         for(;;) {
    202             while(leftPair == 0) {
    203                 if(leftIndex == leftLength) {
    204                     leftPair = EOS;
    205                     break;
    206                 }
    207                 UChar32 c = left[leftIndex++];
    208                 if(c <= LATIN_MAX) {
    209                     leftPair = table[c];
    210                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    211                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    212                 } else {
    213                     leftPair = lookup(table, c);
    214                 }
    215                 if(leftPair >= MIN_SHORT) {
    216                     leftPair = getSecondariesFromOneShortCE(leftPair);
    217                     break;
    218                 } else if(leftPair > variableTop) {
    219                     leftPair = COMMON_SEC_PLUS_OFFSET;
    220                     break;
    221                 } else {
    222                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    223                     leftPair = getSecondaries(variableTop, leftPair);
    224                 }
    225             }
    226 
    227             while(rightPair == 0) {
    228                 if(rightIndex == rightLength) {
    229                     rightPair = EOS;
    230                     break;
    231                 }
    232                 UChar32 c = right[rightIndex++];
    233                 if(c <= LATIN_MAX) {
    234                     rightPair = table[c];
    235                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    236                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    237                 } else {
    238                     rightPair = lookup(table, c);
    239                 }
    240                 if(rightPair >= MIN_SHORT) {
    241                     rightPair = getSecondariesFromOneShortCE(rightPair);
    242                     break;
    243                 } else if(rightPair > variableTop) {
    244                     rightPair = COMMON_SEC_PLUS_OFFSET;
    245                     break;
    246                 } else {
    247                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    248                     rightPair = getSecondaries(variableTop, rightPair);
    249                 }
    250             }
    251 
    252             if(leftPair == rightPair) {
    253                 if(leftPair == EOS) { break; }
    254                 leftPair = rightPair = 0;
    255                 continue;
    256             }
    257             uint32_t leftSecondary = leftPair & 0xffff;
    258             uint32_t rightSecondary = rightPair & 0xffff;
    259             if(leftSecondary != rightSecondary) {
    260                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    261                     // Full support for backwards secondary requires backwards contraction matching
    262                     // and moving backwards between merge separators.
    263                     return BAIL_OUT_RESULT;
    264                 }
    265                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    266             }
    267             if(leftPair == EOS) { break; }
    268             leftPair >>= 16;
    269             rightPair >>= 16;
    270         }
    271     }
    272 
    273     if((options & CollationSettings::CASE_LEVEL) != 0) {
    274         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    275         leftIndex = rightIndex = 0;
    276         leftPair = rightPair = 0;
    277         for(;;) {
    278             while(leftPair == 0) {
    279                 if(leftIndex == leftLength) {
    280                     leftPair = EOS;
    281                     break;
    282                 }
    283                 UChar32 c = left[leftIndex++];
    284                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    285                 if(leftPair < MIN_LONG) {
    286                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    287                 }
    288                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    289             }
    290 
    291             while(rightPair == 0) {
    292                 if(rightIndex == rightLength) {
    293                     rightPair = EOS;
    294                     break;
    295                 }
    296                 UChar32 c = right[rightIndex++];
    297                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    298                 if(rightPair < MIN_LONG) {
    299                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    300                 }
    301                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    302             }
    303 
    304             if(leftPair == rightPair) {
    305                 if(leftPair == EOS) { break; }
    306                 leftPair = rightPair = 0;
    307                 continue;
    308             }
    309             uint32_t leftCase = leftPair & 0xffff;
    310             uint32_t rightCase = rightPair & 0xffff;
    311             if(leftCase != rightCase) {
    312                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    313                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    314                 } else {
    315                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    316                 }
    317             }
    318             if(leftPair == EOS) { break; }
    319             leftPair >>= 16;
    320             rightPair >>= 16;
    321         }
    322     }
    323     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    324 
    325     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    326     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    327 
    328     leftIndex = rightIndex = 0;
    329     leftPair = rightPair = 0;
    330     for(;;) {
    331         while(leftPair == 0) {
    332             if(leftIndex == leftLength) {
    333                 leftPair = EOS;
    334                 break;
    335             }
    336             UChar32 c = left[leftIndex++];
    337             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    338             if(leftPair < MIN_LONG) {
    339                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    340             }
    341             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    342         }
    343 
    344         while(rightPair == 0) {
    345             if(rightIndex == rightLength) {
    346                 rightPair = EOS;
    347                 break;
    348             }
    349             UChar32 c = right[rightIndex++];
    350             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    351             if(rightPair < MIN_LONG) {
    352                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    353             }
    354             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    355         }
    356 
    357         if(leftPair == rightPair) {
    358             if(leftPair == EOS) { break; }
    359             leftPair = rightPair = 0;
    360             continue;
    361         }
    362         uint32_t leftTertiary = leftPair & 0xffff;
    363         uint32_t rightTertiary = rightPair & 0xffff;
    364         if(leftTertiary != rightTertiary) {
    365             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    366                 // Pass through EOS and MERGE_WEIGHT
    367                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    368                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    369                 if(leftTertiary > MERGE_WEIGHT) {
    370                     leftTertiary ^= CASE_MASK;
    371                 }
    372                 if(rightTertiary > MERGE_WEIGHT) {
    373                     rightTertiary ^= CASE_MASK;
    374                 }
    375             }
    376             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    377         }
    378         if(leftPair == EOS) { break; }
    379         leftPair >>= 16;
    380         rightPair >>= 16;
    381     }
    382     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    383 
    384     leftIndex = rightIndex = 0;
    385     leftPair = rightPair = 0;
    386     for(;;) {
    387         while(leftPair == 0) {
    388             if(leftIndex == leftLength) {
    389                 leftPair = EOS;
    390                 break;
    391             }
    392             UChar32 c = left[leftIndex++];
    393             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    394             if(leftPair < MIN_LONG) {
    395                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    396             }
    397             leftPair = getQuaternaries(variableTop, leftPair);
    398         }
    399 
    400         while(rightPair == 0) {
    401             if(rightIndex == rightLength) {
    402                 rightPair = EOS;
    403                 break;
    404             }
    405             UChar32 c = right[rightIndex++];
    406             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    407             if(rightPair < MIN_LONG) {
    408                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    409             }
    410             rightPair = getQuaternaries(variableTop, rightPair);
    411         }
    412 
    413         if(leftPair == rightPair) {
    414             if(leftPair == EOS) { break; }
    415             leftPair = rightPair = 0;
    416             continue;
    417         }
    418         uint32_t leftQuaternary = leftPair & 0xffff;
    419         uint32_t rightQuaternary = rightPair & 0xffff;
    420         if(leftQuaternary != rightQuaternary) {
    421             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    422         }
    423         if(leftPair == EOS) { break; }
    424         leftPair >>= 16;
    425         rightPair >>= 16;
    426     }
    427     return UCOL_EQUAL;
    428 }
    429 
    430 int32_t
    431 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
    432                                  const uint8_t *left, int32_t leftLength,
    433                                  const uint8_t *right, int32_t rightLength) {
    434     // Keep compareUTF16() and compareUTF8() in sync very closely!
    435 
    436     U_ASSERT((table[0] >> 8) == VERSION);
    437     table += (table[0] & 0xff);  // skip the header
    438     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
    439     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    440 
    441     // Check for supported characters, fetch mini CEs, and compare primaries.
    442     U_ALIGN_CODE(16);
    443     int32_t leftIndex = 0, rightIndex = 0;
    444     /**
    445      * Single mini CE or a pair.
    446      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    447      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    448      */
    449     uint32_t leftPair = 0, rightPair = 0;
    450     // Note: There is no need to assemble the code point.
    451     // We only need to look up the table entry for the character,
    452     // and nextPair() looks for whether c==0.
    453     for(;;) {
    454         // We fetch CEs until we get a non-ignorable primary or reach the end.
    455         while(leftPair == 0) {
    456             if(leftIndex == leftLength) {
    457                 leftPair = EOS;
    458                 break;
    459             }
    460             UChar32 c = left[leftIndex++];
    461             uint8_t t;
    462             if(c <= 0x7f) {
    463                 leftPair = primaries[c];
    464                 if(leftPair != 0) { break; }
    465                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    466                     return BAIL_OUT_RESULT;
    467                 }
    468                 leftPair = table[c];
    469             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
    470                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
    471                 ++leftIndex;
    472                 c = ((c - 0xc2) << 6) + t;
    473                 leftPair = primaries[c];
    474                 if(leftPair != 0) { break; }
    475                 leftPair = table[c];
    476             } else {
    477                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
    478             }
    479             if(leftPair >= MIN_SHORT) {
    480                 leftPair &= SHORT_PRIMARY_MASK;
    481                 break;
    482             } else if(leftPair > variableTop) {
    483                 leftPair &= LONG_PRIMARY_MASK;
    484                 break;
    485             } else {
    486                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    487                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    488                 leftPair = getPrimaries(variableTop, leftPair);
    489             }
    490         }
    491 
    492         while(rightPair == 0) {
    493             if(rightIndex == rightLength) {
    494                 rightPair = EOS;
    495                 break;
    496             }
    497             UChar32 c = right[rightIndex++];
    498             uint8_t t;
    499             if(c <= 0x7f) {
    500                 rightPair = primaries[c];
    501                 if(rightPair != 0) { break; }
    502                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    503                     return BAIL_OUT_RESULT;
    504                 }
    505                 rightPair = table[c];
    506             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
    507                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
    508                 ++rightIndex;
    509                 c = ((c - 0xc2) << 6) + t;
    510                 rightPair = primaries[c];
    511                 if(rightPair != 0) { break; }
    512                 rightPair = table[c];
    513             } else {
    514                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
    515             }
    516             if(rightPair >= MIN_SHORT) {
    517                 rightPair &= SHORT_PRIMARY_MASK;
    518                 break;
    519             } else if(rightPair > variableTop) {
    520                 rightPair &= LONG_PRIMARY_MASK;
    521                 break;
    522             } else {
    523                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    524                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    525                 rightPair = getPrimaries(variableTop, rightPair);
    526             }
    527         }
    528 
    529         if(leftPair == rightPair) {
    530             if(leftPair == EOS) { break; }
    531             leftPair = rightPair = 0;
    532             continue;
    533         }
    534         uint32_t leftPrimary = leftPair & 0xffff;
    535         uint32_t rightPrimary = rightPair & 0xffff;
    536         if(leftPrimary != rightPrimary) {
    537             // Return the primary difference.
    538             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    539         }
    540         if(leftPair == EOS) { break; }
    541         leftPair >>= 16;
    542         rightPair >>= 16;
    543     }
    544     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    545     // but we know that the string is well-formed and
    546     // only contains supported characters and mappings.
    547 
    548     // We might skip the secondary level but continue with the case level
    549     // which is turned on separately.
    550     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    551         leftIndex = rightIndex = 0;
    552         leftPair = rightPair = 0;
    553         for(;;) {
    554             while(leftPair == 0) {
    555                 if(leftIndex == leftLength) {
    556                     leftPair = EOS;
    557                     break;
    558                 }
    559                 UChar32 c = left[leftIndex++];
    560                 if(c <= 0x7f) {
    561                     leftPair = table[c];
    562                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    563                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
    564                 } else {
    565                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
    566                 }
    567                 if(leftPair >= MIN_SHORT) {
    568                     leftPair = getSecondariesFromOneShortCE(leftPair);
    569                     break;
    570                 } else if(leftPair > variableTop) {
    571                     leftPair = COMMON_SEC_PLUS_OFFSET;
    572                     break;
    573                 } else {
    574                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    575                     leftPair = getSecondaries(variableTop, leftPair);
    576                 }
    577             }
    578 
    579             while(rightPair == 0) {
    580                 if(rightIndex == rightLength) {
    581                     rightPair = EOS;
    582                     break;
    583                 }
    584                 UChar32 c = right[rightIndex++];
    585                 if(c <= 0x7f) {
    586                     rightPair = table[c];
    587                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    588                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
    589                 } else {
    590                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
    591                 }
    592                 if(rightPair >= MIN_SHORT) {
    593                     rightPair = getSecondariesFromOneShortCE(rightPair);
    594                     break;
    595                 } else if(rightPair > variableTop) {
    596                     rightPair = COMMON_SEC_PLUS_OFFSET;
    597                     break;
    598                 } else {
    599                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    600                     rightPair = getSecondaries(variableTop, rightPair);
    601                 }
    602             }
    603 
    604             if(leftPair == rightPair) {
    605                 if(leftPair == EOS) { break; }
    606                 leftPair = rightPair = 0;
    607                 continue;
    608             }
    609             uint32_t leftSecondary = leftPair & 0xffff;
    610             uint32_t rightSecondary = rightPair & 0xffff;
    611             if(leftSecondary != rightSecondary) {
    612                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    613                     // Full support for backwards secondary requires backwards contraction matching
    614                     // and moving backwards between merge separators.
    615                     return BAIL_OUT_RESULT;
    616                 }
    617                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    618             }
    619             if(leftPair == EOS) { break; }
    620             leftPair >>= 16;
    621             rightPair >>= 16;
    622         }
    623     }
    624 
    625     if((options & CollationSettings::CASE_LEVEL) != 0) {
    626         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    627         leftIndex = rightIndex = 0;
    628         leftPair = rightPair = 0;
    629         for(;;) {
    630             while(leftPair == 0) {
    631                 if(leftIndex == leftLength) {
    632                     leftPair = EOS;
    633                     break;
    634                 }
    635                 UChar32 c = left[leftIndex++];
    636                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    637                 if(leftPair < MIN_LONG) {
    638                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    639                 }
    640                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    641             }
    642 
    643             while(rightPair == 0) {
    644                 if(rightIndex == rightLength) {
    645                     rightPair = EOS;
    646                     break;
    647                 }
    648                 UChar32 c = right[rightIndex++];
    649                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    650                 if(rightPair < MIN_LONG) {
    651                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    652                 }
    653                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    654             }
    655 
    656             if(leftPair == rightPair) {
    657                 if(leftPair == EOS) { break; }
    658                 leftPair = rightPair = 0;
    659                 continue;
    660             }
    661             uint32_t leftCase = leftPair & 0xffff;
    662             uint32_t rightCase = rightPair & 0xffff;
    663             if(leftCase != rightCase) {
    664                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    665                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    666                 } else {
    667                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    668                 }
    669             }
    670             if(leftPair == EOS) { break; }
    671             leftPair >>= 16;
    672             rightPair >>= 16;
    673         }
    674     }
    675     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    676 
    677     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    678     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    679 
    680     leftIndex = rightIndex = 0;
    681     leftPair = rightPair = 0;
    682     for(;;) {
    683         while(leftPair == 0) {
    684             if(leftIndex == leftLength) {
    685                 leftPair = EOS;
    686                 break;
    687             }
    688             UChar32 c = left[leftIndex++];
    689             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    690             if(leftPair < MIN_LONG) {
    691                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    692             }
    693             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    694         }
    695 
    696         while(rightPair == 0) {
    697             if(rightIndex == rightLength) {
    698                 rightPair = EOS;
    699                 break;
    700             }
    701             UChar32 c = right[rightIndex++];
    702             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    703             if(rightPair < MIN_LONG) {
    704                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    705             }
    706             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    707         }
    708 
    709         if(leftPair == rightPair) {
    710             if(leftPair == EOS) { break; }
    711             leftPair = rightPair = 0;
    712             continue;
    713         }
    714         uint32_t leftTertiary = leftPair & 0xffff;
    715         uint32_t rightTertiary = rightPair & 0xffff;
    716         if(leftTertiary != rightTertiary) {
    717             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    718                 // Pass through EOS and MERGE_WEIGHT
    719                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    720                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    721                 if(leftTertiary > MERGE_WEIGHT) {
    722                     leftTertiary ^= CASE_MASK;
    723                 }
    724                 if(rightTertiary > MERGE_WEIGHT) {
    725                     rightTertiary ^= CASE_MASK;
    726                 }
    727             }
    728             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    729         }
    730         if(leftPair == EOS) { break; }
    731         leftPair >>= 16;
    732         rightPair >>= 16;
    733     }
    734     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    735 
    736     leftIndex = rightIndex = 0;
    737     leftPair = rightPair = 0;
    738     for(;;) {
    739         while(leftPair == 0) {
    740             if(leftIndex == leftLength) {
    741                 leftPair = EOS;
    742                 break;
    743             }
    744             UChar32 c = left[leftIndex++];
    745             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    746             if(leftPair < MIN_LONG) {
    747                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    748             }
    749             leftPair = getQuaternaries(variableTop, leftPair);
    750         }
    751 
    752         while(rightPair == 0) {
    753             if(rightIndex == rightLength) {
    754                 rightPair = EOS;
    755                 break;
    756             }
    757             UChar32 c = right[rightIndex++];
    758             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    759             if(rightPair < MIN_LONG) {
    760                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    761             }
    762             rightPair = getQuaternaries(variableTop, rightPair);
    763         }
    764 
    765         if(leftPair == rightPair) {
    766             if(leftPair == EOS) { break; }
    767             leftPair = rightPair = 0;
    768             continue;
    769         }
    770         uint32_t leftQuaternary = leftPair & 0xffff;
    771         uint32_t rightQuaternary = rightPair & 0xffff;
    772         if(leftQuaternary != rightQuaternary) {
    773             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    774         }
    775         if(leftPair == EOS) { break; }
    776         leftPair >>= 16;
    777         rightPair >>= 16;
    778     }
    779     return UCOL_EQUAL;
    780 }
    781 
    782 uint32_t
    783 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
    784     U_ASSERT(c > LATIN_MAX);
    785     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    786         return table[c - PUNCT_START + LATIN_LIMIT];
    787     } else if(c == 0xfffe) {
    788         return MERGE_WEIGHT;
    789     } else if(c == 0xffff) {
    790         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
    791     } else {
    792         return BAIL_OUT;
    793     }
    794 }
    795 
    796 uint32_t
    797 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
    798                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
    799     // The caller handled ASCII and valid/supported Latin.
    800     U_ASSERT(c > 0x7f);
    801     int32_t i2 = sIndex + 1;
    802     if(i2 < sLength || sLength < 0) {
    803         uint8_t t1 = s8[sIndex];
    804         uint8_t t2 = s8[i2];
    805         sIndex += 2;
    806         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
    807             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    808         } else if(c == 0xef && t1 == 0xbf) {
    809             if(t2 == 0xbe) {
    810                 return MERGE_WEIGHT;  // U+FFFE
    811             } else if(t2 == 0xbf) {
    812                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    813             }
    814         }
    815     }
    816     return BAIL_OUT;
    817 }
    818 
    819 uint32_t
    820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
    821                                      const uint8_t *s8, int32_t &sIndex) {
    822     // The caller handled ASCII.
    823     // The string is well-formed and contains only supported characters.
    824     U_ASSERT(c > 0x7f);
    825     if(c <= LATIN_MAX_UTF8_LEAD) {
    826         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
    827     }
    828     uint8_t t2 = s8[sIndex + 1];
    829     sIndex += 2;
    830     if(c == 0xe2) {
    831         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    832     } else if(t2 == 0xbe) {
    833         return MERGE_WEIGHT;  // U+FFFE
    834     } else {
    835         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    836     }
    837 }
    838 
    839 uint32_t
    840 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
    841                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
    842     if(ce >= MIN_LONG || ce < CONTRACTION) {
    843         return ce;  // simple or special mini CE
    844     } else if(ce >= EXPANSION) {
    845         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    846         return ((uint32_t)table[index + 1] << 16) | table[index];
    847     } else /* ce >= CONTRACTION */ {
    848         if(c == 0 && sLength < 0) {
    849             sLength = sIndex - 1;
    850             return EOS;
    851         }
    852         // Contraction list: Default mapping followed by
    853         // 0 or more single-character contraction suffix mappings.
    854         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    855         if(sIndex != sLength) {
    856             // Read the next character.
    857             int32_t c2;
    858             int32_t nextIndex = sIndex;
    859             if(s16 != NULL) {
    860                 c2 = s16[nextIndex++];
    861                 if(c2 > LATIN_MAX) {
    862                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
    863                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
    864                     } else if(c2 == 0xfffe || c2 == 0xffff) {
    865                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    866                     } else {
    867                         return BAIL_OUT;
    868                     }
    869                 }
    870             } else {
    871                 c2 = s8[nextIndex++];
    872                 if(c2 > 0x7f) {
    873                     uint8_t t;
    874                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
    875                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
    876                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
    877                         ++nextIndex;
    878                     } else {
    879                         int32_t i2 = nextIndex + 1;
    880                         if(i2 < sLength || sLength < 0) {
    881                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
    882                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
    883                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
    884                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
    885                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
    886                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    887                             } else {
    888                                 return BAIL_OUT;
    889                             }
    890                         } else {
    891                             return BAIL_OUT;
    892                         }
    893                         nextIndex += 2;
    894                     }
    895                 }
    896             }
    897             if(c2 == 0 && sLength < 0) {
    898                 sLength = sIndex;
    899                 c2 = -1;
    900             }
    901             // Look for the next character in the contraction suffix list,
    902             // which is in ascending order of single suffix characters.
    903             int32_t i = index;
    904             int32_t head = table[i];  // first skip the default mapping
    905             int32_t x;
    906             do {
    907                 i += head >> CONTR_LENGTH_SHIFT;
    908                 head = table[i];
    909                 x = head & CONTR_CHAR_MASK;
    910             } while(x < c2);
    911             if(x == c2) {
    912                 index = i;
    913                 sIndex = nextIndex;
    914             }
    915         }
    916         // Return the CE or CEs for the default or contraction mapping.
    917         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
    918         if(length == 1) {
    919             return BAIL_OUT;
    920         }
    921         ce = table[index + 1];
    922         if(length == 2) {
    923             return ce;
    924         } else {
    925             return ((uint32_t)table[index + 2] << 16) | ce;
    926         }
    927     }
    928 }
    929 
    930 uint32_t
    931 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
    932     if(pair <= 0xffff) {
    933         // one mini CE
    934         if(pair >= MIN_SHORT) {
    935             pair = getSecondariesFromOneShortCE(pair);
    936         } else if(pair > variableTop) {
    937             pair = COMMON_SEC_PLUS_OFFSET;
    938         } else if(pair >= MIN_LONG) {
    939             pair = 0;  // variable
    940         }
    941         // else special mini CE
    942     } else {
    943         uint32_t ce = pair & 0xffff;
    944         if(ce >= MIN_SHORT) {
    945             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
    946         } else if(ce > variableTop) {
    947             pair = TWO_COMMON_SEC_PLUS_OFFSET;
    948         } else {
    949             U_ASSERT(ce >= MIN_LONG);
    950             pair = 0;  // variable
    951         }
    952     }
    953     return pair;
    954 }
    955 
    956 uint32_t
    957 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
    958     // Primary+caseLevel: Ignore case level weights of primary ignorables.
    959     // Otherwise: Ignore case level weights of secondary ignorables.
    960     // For details see the comments in the CollationCompare class.
    961     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    962     if(pair <= 0xffff) {
    963         // one mini CE
    964         if(pair >= MIN_SHORT) {
    965             // A high secondary weight means we really have two CEs,
    966             // a primary CE and a secondary CE.
    967             uint32_t ce = pair;
    968             pair &= CASE_MASK;  // explicit weight of primary CE
    969             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
    970                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
    971             }
    972         } else if(pair > variableTop) {
    973             pair = LOWER_CASE;
    974         } else if(pair >= MIN_LONG) {
    975             pair = 0;  // variable
    976         }
    977         // else special mini CE
    978     } else {
    979         // two mini CEs, same primary groups, neither expands like above
    980         uint32_t ce = pair & 0xffff;
    981         if(ce >= MIN_SHORT) {
    982             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
    983                 pair &= CASE_MASK;
    984             } else {
    985                 pair &= TWO_CASES_MASK;
    986             }
    987         } else if(ce > variableTop) {
    988             pair = TWO_LOWER_CASES;
    989         } else {
    990             U_ASSERT(ce >= MIN_LONG);
    991             pair = 0;  // variable
    992         }
    993     }
    994     return pair;
    995 }
    996 
    997 uint32_t
    998 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
    999     if(pair <= 0xffff) {
   1000         // one mini CE
   1001         if(pair >= MIN_SHORT) {
   1002             // A high secondary weight means we really have two CEs,
   1003             // a primary CE and a secondary CE.
   1004             uint32_t ce = pair;
   1005             if(withCaseBits) {
   1006                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
   1007                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1008                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
   1009                 }
   1010             } else {
   1011                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1012                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1013                     pair |= COMMON_TER_PLUS_OFFSET << 16;
   1014                 }
   1015             }
   1016         } else if(pair > variableTop) {
   1017             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1018             if(withCaseBits) {
   1019                 pair |= LOWER_CASE;
   1020             }
   1021         } else if(pair >= MIN_LONG) {
   1022             pair = 0;  // variable
   1023         }
   1024         // else special mini CE
   1025     } else {
   1026         // two mini CEs, same primary groups, neither expands like above
   1027         uint32_t ce = pair & 0xffff;
   1028         if(ce >= MIN_SHORT) {
   1029             if(withCaseBits) {
   1030                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
   1031             } else {
   1032                 pair &= TWO_TERTIARIES_MASK;
   1033             }
   1034             pair += TWO_TER_OFFSETS;
   1035         } else if(ce > variableTop) {
   1036             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
   1037             if(withCaseBits) {
   1038                 pair |= TWO_LOWER_CASES;
   1039             }
   1040         } else {
   1041             U_ASSERT(ce >= MIN_LONG);
   1042             pair = 0;  // variable
   1043         }
   1044     }
   1045     return pair;
   1046 }
   1047 
   1048 uint32_t
   1049 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
   1050     // Return the primary weight of a variable CE,
   1051     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
   1052     if(pair <= 0xffff) {
   1053         // one mini CE
   1054         if(pair >= MIN_SHORT) {
   1055             // A high secondary weight means we really have two CEs,
   1056             // a primary CE and a secondary CE.
   1057             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1058                 pair = TWO_SHORT_PRIMARIES_MASK;
   1059             } else {
   1060                 pair = SHORT_PRIMARY_MASK;
   1061             }
   1062         } else if(pair > variableTop) {
   1063             pair = SHORT_PRIMARY_MASK;
   1064         } else if(pair >= MIN_LONG) {
   1065             pair &= LONG_PRIMARY_MASK;  // variable
   1066         }
   1067         // else special mini CE
   1068     } else {
   1069         // two mini CEs, same primary groups, neither expands like above
   1070         uint32_t ce = pair & 0xffff;
   1071         if(ce > variableTop) {
   1072             pair = TWO_SHORT_PRIMARIES_MASK;
   1073         } else {
   1074             U_ASSERT(ce >= MIN_LONG);
   1075             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
   1076         }
   1077     }
   1078     return pair;
   1079 }
   1080 
   1081 U_NAMESPACE_END
   1082 
   1083 #endif  // !UCONFIG_NO_COLLATION
   1084