Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2015, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationfastlatin.cpp
      7 *
      8 * created on: 2013aug18
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "collationdata.h"
     18 #include "collationfastlatin.h"
     19 #include "collationsettings.h"
     20 #include "uassert.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 int32_t
     25 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
     26                                uint16_t *primaries, int32_t capacity) {
     27     const uint16_t *table = data->fastLatinTable;
     28     if(table == NULL) { return -1; }
     29     U_ASSERT(capacity == LATIN_LIMIT);
     30     if(capacity != LATIN_LIMIT) { return -1; }
     31 
     32     uint32_t miniVarTop;
     33     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
     34         // No mini primaries are variable, set a variableTop just below the
     35         // lowest long mini primary.
     36         miniVarTop = MIN_LONG - 1;
     37     } else {
     38         int32_t headerLength = *table & 0xff;
     39         int32_t i = 1 + settings.getMaxVariable();
     40         if(i >= headerLength) {
     41             return -1;  // variableTop >= digits, should not occur
     42         }
     43         miniVarTop = table[i];
     44     }
     45 
     46     UBool digitsAreReordered = FALSE;
     47     if(settings.hasReordering()) {
     48         uint32_t prevStart = 0;
     49         uint32_t beforeDigitStart = 0;
     50         uint32_t digitStart = 0;
     51         uint32_t afterDigitStart = 0;
     52         for(int32_t group = UCOL_REORDER_CODE_FIRST;
     53                 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
     54                 ++group) {
     55             uint32_t start = data->getFirstPrimaryForGroup(group);
     56             start = settings.reorder(start);
     57             if(group == UCOL_REORDER_CODE_DIGIT) {
     58                 beforeDigitStart = prevStart;
     59                 digitStart = start;
     60             } else if(start != 0) {
     61                 if(start < prevStart) {
     62                     // The permutation affects the groups up to Latin.
     63                     return -1;
     64                 }
     65                 // In the future, there might be a special group between digits & Latin.
     66                 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
     67                     afterDigitStart = start;
     68                 }
     69                 prevStart = start;
     70             }
     71         }
     72         uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
     73         latinStart = settings.reorder(latinStart);
     74         if(latinStart < prevStart) {
     75             return -1;
     76         }
     77         if(afterDigitStart == 0) {
     78             afterDigitStart = latinStart;
     79         }
     80         if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
     81             digitsAreReordered = TRUE;
     82         }
     83     }
     84 
     85     table += (table[0] & 0xff);  // skip the header
     86     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
     87         uint32_t p = table[c];
     88         if(p >= MIN_SHORT) {
     89             p &= SHORT_PRIMARY_MASK;
     90         } else if(p > miniVarTop) {
     91             p &= LONG_PRIMARY_MASK;
     92         } else {
     93             p = 0;
     94         }
     95         primaries[c] = (uint16_t)p;
     96     }
     97     if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
     98         // Bail out for digits.
     99         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
    100     }
    101 
    102     // Shift the miniVarTop above other options.
    103     return ((int32_t)miniVarTop << 16) | settings.options;
    104 }
    105 
    106 int32_t
    107 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
    108                                  const UChar *left, int32_t leftLength,
    109                                  const UChar *right, int32_t rightLength) {
    110     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
    111     // optimized for common Latin text.
    112     // Keep them in sync!
    113     // Keep compareUTF16() and compareUTF8() in sync very closely!
    114 
    115     U_ASSERT((table[0] >> 8) == VERSION);
    116     table += (table[0] & 0xff);  // skip the header
    117     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
    118     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    119 
    120     // Check for supported characters, fetch mini CEs, and compare primaries.
    121     int32_t leftIndex = 0, rightIndex = 0;
    122     /**
    123      * Single mini CE or a pair.
    124      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    125      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    126      */
    127     uint32_t leftPair = 0, rightPair = 0;
    128     for(;;) {
    129         // We fetch CEs until we get a non-ignorable primary or reach the end.
    130         while(leftPair == 0) {
    131             if(leftIndex == leftLength) {
    132                 leftPair = EOS;
    133                 break;
    134             }
    135             UChar32 c = left[leftIndex++];
    136             if(c <= LATIN_MAX) {
    137                 leftPair = primaries[c];
    138                 if(leftPair != 0) { break; }
    139                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    140                     return BAIL_OUT_RESULT;
    141                 }
    142                 leftPair = table[c];
    143             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    144                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    145             } else {
    146                 leftPair = lookup(table, c);
    147             }
    148             if(leftPair >= MIN_SHORT) {
    149                 leftPair &= SHORT_PRIMARY_MASK;
    150                 break;
    151             } else if(leftPair > variableTop) {
    152                 leftPair &= LONG_PRIMARY_MASK;
    153                 break;
    154             } else {
    155                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    156                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    157                 leftPair = getPrimaries(variableTop, leftPair);
    158             }
    159         }
    160 
    161         while(rightPair == 0) {
    162             if(rightIndex == rightLength) {
    163                 rightPair = EOS;
    164                 break;
    165             }
    166             UChar32 c = right[rightIndex++];
    167             if(c <= LATIN_MAX) {
    168                 rightPair = primaries[c];
    169                 if(rightPair != 0) { break; }
    170                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    171                     return BAIL_OUT_RESULT;
    172                 }
    173                 rightPair = table[c];
    174             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    175                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    176             } else {
    177                 rightPair = lookup(table, c);
    178             }
    179             if(rightPair >= MIN_SHORT) {
    180                 rightPair &= SHORT_PRIMARY_MASK;
    181                 break;
    182             } else if(rightPair > variableTop) {
    183                 rightPair &= LONG_PRIMARY_MASK;
    184                 break;
    185             } else {
    186                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    187                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    188                 rightPair = getPrimaries(variableTop, rightPair);
    189             }
    190         }
    191 
    192         if(leftPair == rightPair) {
    193             if(leftPair == EOS) { break; }
    194             leftPair = rightPair = 0;
    195             continue;
    196         }
    197         uint32_t leftPrimary = leftPair & 0xffff;
    198         uint32_t rightPrimary = rightPair & 0xffff;
    199         if(leftPrimary != rightPrimary) {
    200             // Return the primary difference.
    201             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    202         }
    203         if(leftPair == EOS) { break; }
    204         leftPair >>= 16;
    205         rightPair >>= 16;
    206     }
    207     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    208     // but we know that the string is well-formed and
    209     // only contains supported characters and mappings.
    210 
    211     // We might skip the secondary level but continue with the case level
    212     // which is turned on separately.
    213     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    214         leftIndex = rightIndex = 0;
    215         leftPair = rightPair = 0;
    216         for(;;) {
    217             while(leftPair == 0) {
    218                 if(leftIndex == leftLength) {
    219                     leftPair = EOS;
    220                     break;
    221                 }
    222                 UChar32 c = left[leftIndex++];
    223                 if(c <= LATIN_MAX) {
    224                     leftPair = table[c];
    225                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    226                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    227                 } else {
    228                     leftPair = lookup(table, c);
    229                 }
    230                 if(leftPair >= MIN_SHORT) {
    231                     leftPair = getSecondariesFromOneShortCE(leftPair);
    232                     break;
    233                 } else if(leftPair > variableTop) {
    234                     leftPair = COMMON_SEC_PLUS_OFFSET;
    235                     break;
    236                 } else {
    237                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    238                     leftPair = getSecondaries(variableTop, leftPair);
    239                 }
    240             }
    241 
    242             while(rightPair == 0) {
    243                 if(rightIndex == rightLength) {
    244                     rightPair = EOS;
    245                     break;
    246                 }
    247                 UChar32 c = right[rightIndex++];
    248                 if(c <= LATIN_MAX) {
    249                     rightPair = table[c];
    250                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    251                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    252                 } else {
    253                     rightPair = lookup(table, c);
    254                 }
    255                 if(rightPair >= MIN_SHORT) {
    256                     rightPair = getSecondariesFromOneShortCE(rightPair);
    257                     break;
    258                 } else if(rightPair > variableTop) {
    259                     rightPair = COMMON_SEC_PLUS_OFFSET;
    260                     break;
    261                 } else {
    262                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    263                     rightPair = getSecondaries(variableTop, rightPair);
    264                 }
    265             }
    266 
    267             if(leftPair == rightPair) {
    268                 if(leftPair == EOS) { break; }
    269                 leftPair = rightPair = 0;
    270                 continue;
    271             }
    272             uint32_t leftSecondary = leftPair & 0xffff;
    273             uint32_t rightSecondary = rightPair & 0xffff;
    274             if(leftSecondary != rightSecondary) {
    275                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    276                     // Full support for backwards secondary requires backwards contraction matching
    277                     // and moving backwards between merge separators.
    278                     return BAIL_OUT_RESULT;
    279                 }
    280                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    281             }
    282             if(leftPair == EOS) { break; }
    283             leftPair >>= 16;
    284             rightPair >>= 16;
    285         }
    286     }
    287 
    288     if((options & CollationSettings::CASE_LEVEL) != 0) {
    289         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    290         leftIndex = rightIndex = 0;
    291         leftPair = rightPair = 0;
    292         for(;;) {
    293             while(leftPair == 0) {
    294                 if(leftIndex == leftLength) {
    295                     leftPair = EOS;
    296                     break;
    297                 }
    298                 UChar32 c = left[leftIndex++];
    299                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    300                 if(leftPair < MIN_LONG) {
    301                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    302                 }
    303                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    304             }
    305 
    306             while(rightPair == 0) {
    307                 if(rightIndex == rightLength) {
    308                     rightPair = EOS;
    309                     break;
    310                 }
    311                 UChar32 c = right[rightIndex++];
    312                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    313                 if(rightPair < MIN_LONG) {
    314                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    315                 }
    316                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    317             }
    318 
    319             if(leftPair == rightPair) {
    320                 if(leftPair == EOS) { break; }
    321                 leftPair = rightPair = 0;
    322                 continue;
    323             }
    324             uint32_t leftCase = leftPair & 0xffff;
    325             uint32_t rightCase = rightPair & 0xffff;
    326             if(leftCase != rightCase) {
    327                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    328                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    329                 } else {
    330                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    331                 }
    332             }
    333             if(leftPair == EOS) { break; }
    334             leftPair >>= 16;
    335             rightPair >>= 16;
    336         }
    337     }
    338     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    339 
    340     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    341     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    342 
    343     leftIndex = rightIndex = 0;
    344     leftPair = rightPair = 0;
    345     for(;;) {
    346         while(leftPair == 0) {
    347             if(leftIndex == leftLength) {
    348                 leftPair = EOS;
    349                 break;
    350             }
    351             UChar32 c = left[leftIndex++];
    352             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    353             if(leftPair < MIN_LONG) {
    354                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    355             }
    356             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    357         }
    358 
    359         while(rightPair == 0) {
    360             if(rightIndex == rightLength) {
    361                 rightPair = EOS;
    362                 break;
    363             }
    364             UChar32 c = right[rightIndex++];
    365             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    366             if(rightPair < MIN_LONG) {
    367                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    368             }
    369             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    370         }
    371 
    372         if(leftPair == rightPair) {
    373             if(leftPair == EOS) { break; }
    374             leftPair = rightPair = 0;
    375             continue;
    376         }
    377         uint32_t leftTertiary = leftPair & 0xffff;
    378         uint32_t rightTertiary = rightPair & 0xffff;
    379         if(leftTertiary != rightTertiary) {
    380             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    381                 // Pass through EOS and MERGE_WEIGHT
    382                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    383                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    384                 if(leftTertiary > MERGE_WEIGHT) {
    385                     leftTertiary ^= CASE_MASK;
    386                 }
    387                 if(rightTertiary > MERGE_WEIGHT) {
    388                     rightTertiary ^= CASE_MASK;
    389                 }
    390             }
    391             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    392         }
    393         if(leftPair == EOS) { break; }
    394         leftPair >>= 16;
    395         rightPair >>= 16;
    396     }
    397     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    398 
    399     leftIndex = rightIndex = 0;
    400     leftPair = rightPair = 0;
    401     for(;;) {
    402         while(leftPair == 0) {
    403             if(leftIndex == leftLength) {
    404                 leftPair = EOS;
    405                 break;
    406             }
    407             UChar32 c = left[leftIndex++];
    408             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    409             if(leftPair < MIN_LONG) {
    410                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    411             }
    412             leftPair = getQuaternaries(variableTop, leftPair);
    413         }
    414 
    415         while(rightPair == 0) {
    416             if(rightIndex == rightLength) {
    417                 rightPair = EOS;
    418                 break;
    419             }
    420             UChar32 c = right[rightIndex++];
    421             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    422             if(rightPair < MIN_LONG) {
    423                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    424             }
    425             rightPair = getQuaternaries(variableTop, rightPair);
    426         }
    427 
    428         if(leftPair == rightPair) {
    429             if(leftPair == EOS) { break; }
    430             leftPair = rightPair = 0;
    431             continue;
    432         }
    433         uint32_t leftQuaternary = leftPair & 0xffff;
    434         uint32_t rightQuaternary = rightPair & 0xffff;
    435         if(leftQuaternary != rightQuaternary) {
    436             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    437         }
    438         if(leftPair == EOS) { break; }
    439         leftPair >>= 16;
    440         rightPair >>= 16;
    441     }
    442     return UCOL_EQUAL;
    443 }
    444 
    445 int32_t
    446 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
    447                                  const uint8_t *left, int32_t leftLength,
    448                                  const uint8_t *right, int32_t rightLength) {
    449     // Keep compareUTF16() and compareUTF8() in sync very closely!
    450 
    451     U_ASSERT((table[0] >> 8) == VERSION);
    452     table += (table[0] & 0xff);  // skip the header
    453     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
    454     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    455 
    456     // Check for supported characters, fetch mini CEs, and compare primaries.
    457     int32_t leftIndex = 0, rightIndex = 0;
    458     /**
    459      * Single mini CE or a pair.
    460      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    461      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    462      */
    463     uint32_t leftPair = 0, rightPair = 0;
    464     // Note: There is no need to assemble the code point.
    465     // We only need to look up the table entry for the character,
    466     // and nextPair() looks for whether c==0.
    467     for(;;) {
    468         // We fetch CEs until we get a non-ignorable primary or reach the end.
    469         while(leftPair == 0) {
    470             if(leftIndex == leftLength) {
    471                 leftPair = EOS;
    472                 break;
    473             }
    474             UChar32 c = left[leftIndex++];
    475             uint8_t t;
    476             if(c <= 0x7f) {
    477                 leftPair = primaries[c];
    478                 if(leftPair != 0) { break; }
    479                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    480                     return BAIL_OUT_RESULT;
    481                 }
    482                 leftPair = table[c];
    483             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
    484                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
    485                 ++leftIndex;
    486                 c = ((c - 0xc2) << 6) + t;
    487                 leftPair = primaries[c];
    488                 if(leftPair != 0) { break; }
    489                 leftPair = table[c];
    490             } else {
    491                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
    492             }
    493             if(leftPair >= MIN_SHORT) {
    494                 leftPair &= SHORT_PRIMARY_MASK;
    495                 break;
    496             } else if(leftPair > variableTop) {
    497                 leftPair &= LONG_PRIMARY_MASK;
    498                 break;
    499             } else {
    500                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    501                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    502                 leftPair = getPrimaries(variableTop, leftPair);
    503             }
    504         }
    505 
    506         while(rightPair == 0) {
    507             if(rightIndex == rightLength) {
    508                 rightPair = EOS;
    509                 break;
    510             }
    511             UChar32 c = right[rightIndex++];
    512             uint8_t t;
    513             if(c <= 0x7f) {
    514                 rightPair = primaries[c];
    515                 if(rightPair != 0) { break; }
    516                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    517                     return BAIL_OUT_RESULT;
    518                 }
    519                 rightPair = table[c];
    520             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
    521                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
    522                 ++rightIndex;
    523                 c = ((c - 0xc2) << 6) + t;
    524                 rightPair = primaries[c];
    525                 if(rightPair != 0) { break; }
    526                 rightPair = table[c];
    527             } else {
    528                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
    529             }
    530             if(rightPair >= MIN_SHORT) {
    531                 rightPair &= SHORT_PRIMARY_MASK;
    532                 break;
    533             } else if(rightPair > variableTop) {
    534                 rightPair &= LONG_PRIMARY_MASK;
    535                 break;
    536             } else {
    537                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    538                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    539                 rightPair = getPrimaries(variableTop, rightPair);
    540             }
    541         }
    542 
    543         if(leftPair == rightPair) {
    544             if(leftPair == EOS) { break; }
    545             leftPair = rightPair = 0;
    546             continue;
    547         }
    548         uint32_t leftPrimary = leftPair & 0xffff;
    549         uint32_t rightPrimary = rightPair & 0xffff;
    550         if(leftPrimary != rightPrimary) {
    551             // Return the primary difference.
    552             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    553         }
    554         if(leftPair == EOS) { break; }
    555         leftPair >>= 16;
    556         rightPair >>= 16;
    557     }
    558     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    559     // but we know that the string is well-formed and
    560     // only contains supported characters and mappings.
    561 
    562     // We might skip the secondary level but continue with the case level
    563     // which is turned on separately.
    564     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    565         leftIndex = rightIndex = 0;
    566         leftPair = rightPair = 0;
    567         for(;;) {
    568             while(leftPair == 0) {
    569                 if(leftIndex == leftLength) {
    570                     leftPair = EOS;
    571                     break;
    572                 }
    573                 UChar32 c = left[leftIndex++];
    574                 if(c <= 0x7f) {
    575                     leftPair = table[c];
    576                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    577                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
    578                 } else {
    579                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
    580                 }
    581                 if(leftPair >= MIN_SHORT) {
    582                     leftPair = getSecondariesFromOneShortCE(leftPair);
    583                     break;
    584                 } else if(leftPair > variableTop) {
    585                     leftPair = COMMON_SEC_PLUS_OFFSET;
    586                     break;
    587                 } else {
    588                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    589                     leftPair = getSecondaries(variableTop, leftPair);
    590                 }
    591             }
    592 
    593             while(rightPair == 0) {
    594                 if(rightIndex == rightLength) {
    595                     rightPair = EOS;
    596                     break;
    597                 }
    598                 UChar32 c = right[rightIndex++];
    599                 if(c <= 0x7f) {
    600                     rightPair = table[c];
    601                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    602                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
    603                 } else {
    604                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
    605                 }
    606                 if(rightPair >= MIN_SHORT) {
    607                     rightPair = getSecondariesFromOneShortCE(rightPair);
    608                     break;
    609                 } else if(rightPair > variableTop) {
    610                     rightPair = COMMON_SEC_PLUS_OFFSET;
    611                     break;
    612                 } else {
    613                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    614                     rightPair = getSecondaries(variableTop, rightPair);
    615                 }
    616             }
    617 
    618             if(leftPair == rightPair) {
    619                 if(leftPair == EOS) { break; }
    620                 leftPair = rightPair = 0;
    621                 continue;
    622             }
    623             uint32_t leftSecondary = leftPair & 0xffff;
    624             uint32_t rightSecondary = rightPair & 0xffff;
    625             if(leftSecondary != rightSecondary) {
    626                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    627                     // Full support for backwards secondary requires backwards contraction matching
    628                     // and moving backwards between merge separators.
    629                     return BAIL_OUT_RESULT;
    630                 }
    631                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    632             }
    633             if(leftPair == EOS) { break; }
    634             leftPair >>= 16;
    635             rightPair >>= 16;
    636         }
    637     }
    638 
    639     if((options & CollationSettings::CASE_LEVEL) != 0) {
    640         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    641         leftIndex = rightIndex = 0;
    642         leftPair = rightPair = 0;
    643         for(;;) {
    644             while(leftPair == 0) {
    645                 if(leftIndex == leftLength) {
    646                     leftPair = EOS;
    647                     break;
    648                 }
    649                 UChar32 c = left[leftIndex++];
    650                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    651                 if(leftPair < MIN_LONG) {
    652                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    653                 }
    654                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    655             }
    656 
    657             while(rightPair == 0) {
    658                 if(rightIndex == rightLength) {
    659                     rightPair = EOS;
    660                     break;
    661                 }
    662                 UChar32 c = right[rightIndex++];
    663                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    664                 if(rightPair < MIN_LONG) {
    665                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    666                 }
    667                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    668             }
    669 
    670             if(leftPair == rightPair) {
    671                 if(leftPair == EOS) { break; }
    672                 leftPair = rightPair = 0;
    673                 continue;
    674             }
    675             uint32_t leftCase = leftPair & 0xffff;
    676             uint32_t rightCase = rightPair & 0xffff;
    677             if(leftCase != rightCase) {
    678                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    679                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    680                 } else {
    681                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    682                 }
    683             }
    684             if(leftPair == EOS) { break; }
    685             leftPair >>= 16;
    686             rightPair >>= 16;
    687         }
    688     }
    689     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    690 
    691     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    692     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    693 
    694     leftIndex = rightIndex = 0;
    695     leftPair = rightPair = 0;
    696     for(;;) {
    697         while(leftPair == 0) {
    698             if(leftIndex == leftLength) {
    699                 leftPair = EOS;
    700                 break;
    701             }
    702             UChar32 c = left[leftIndex++];
    703             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    704             if(leftPair < MIN_LONG) {
    705                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    706             }
    707             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    708         }
    709 
    710         while(rightPair == 0) {
    711             if(rightIndex == rightLength) {
    712                 rightPair = EOS;
    713                 break;
    714             }
    715             UChar32 c = right[rightIndex++];
    716             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    717             if(rightPair < MIN_LONG) {
    718                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    719             }
    720             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    721         }
    722 
    723         if(leftPair == rightPair) {
    724             if(leftPair == EOS) { break; }
    725             leftPair = rightPair = 0;
    726             continue;
    727         }
    728         uint32_t leftTertiary = leftPair & 0xffff;
    729         uint32_t rightTertiary = rightPair & 0xffff;
    730         if(leftTertiary != rightTertiary) {
    731             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    732                 // Pass through EOS and MERGE_WEIGHT
    733                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    734                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    735                 if(leftTertiary > MERGE_WEIGHT) {
    736                     leftTertiary ^= CASE_MASK;
    737                 }
    738                 if(rightTertiary > MERGE_WEIGHT) {
    739                     rightTertiary ^= CASE_MASK;
    740                 }
    741             }
    742             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    743         }
    744         if(leftPair == EOS) { break; }
    745         leftPair >>= 16;
    746         rightPair >>= 16;
    747     }
    748     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    749 
    750     leftIndex = rightIndex = 0;
    751     leftPair = rightPair = 0;
    752     for(;;) {
    753         while(leftPair == 0) {
    754             if(leftIndex == leftLength) {
    755                 leftPair = EOS;
    756                 break;
    757             }
    758             UChar32 c = left[leftIndex++];
    759             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    760             if(leftPair < MIN_LONG) {
    761                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    762             }
    763             leftPair = getQuaternaries(variableTop, leftPair);
    764         }
    765 
    766         while(rightPair == 0) {
    767             if(rightIndex == rightLength) {
    768                 rightPair = EOS;
    769                 break;
    770             }
    771             UChar32 c = right[rightIndex++];
    772             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    773             if(rightPair < MIN_LONG) {
    774                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    775             }
    776             rightPair = getQuaternaries(variableTop, rightPair);
    777         }
    778 
    779         if(leftPair == rightPair) {
    780             if(leftPair == EOS) { break; }
    781             leftPair = rightPair = 0;
    782             continue;
    783         }
    784         uint32_t leftQuaternary = leftPair & 0xffff;
    785         uint32_t rightQuaternary = rightPair & 0xffff;
    786         if(leftQuaternary != rightQuaternary) {
    787             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    788         }
    789         if(leftPair == EOS) { break; }
    790         leftPair >>= 16;
    791         rightPair >>= 16;
    792     }
    793     return UCOL_EQUAL;
    794 }
    795 
    796 uint32_t
    797 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
    798     U_ASSERT(c > LATIN_MAX);
    799     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    800         return table[c - PUNCT_START + LATIN_LIMIT];
    801     } else if(c == 0xfffe) {
    802         return MERGE_WEIGHT;
    803     } else if(c == 0xffff) {
    804         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
    805     } else {
    806         return BAIL_OUT;
    807     }
    808 }
    809 
    810 uint32_t
    811 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
    812                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
    813     // The caller handled ASCII and valid/supported Latin.
    814     U_ASSERT(c > 0x7f);
    815     int32_t i2 = sIndex + 1;
    816     if(i2 < sLength || sLength < 0) {
    817         uint8_t t1 = s8[sIndex];
    818         uint8_t t2 = s8[i2];
    819         sIndex += 2;
    820         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
    821             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    822         } else if(c == 0xef && t1 == 0xbf) {
    823             if(t2 == 0xbe) {
    824                 return MERGE_WEIGHT;  // U+FFFE
    825             } else if(t2 == 0xbf) {
    826                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    827             }
    828         }
    829     }
    830     return BAIL_OUT;
    831 }
    832 
    833 uint32_t
    834 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
    835                                      const uint8_t *s8, int32_t &sIndex) {
    836     // The caller handled ASCII.
    837     // The string is well-formed and contains only supported characters.
    838     U_ASSERT(c > 0x7f);
    839     if(c <= LATIN_MAX_UTF8_LEAD) {
    840         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
    841     }
    842     uint8_t t2 = s8[sIndex + 1];
    843     sIndex += 2;
    844     if(c == 0xe2) {
    845         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    846     } else if(t2 == 0xbe) {
    847         return MERGE_WEIGHT;  // U+FFFE
    848     } else {
    849         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    850     }
    851 }
    852 
    853 uint32_t
    854 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
    855                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
    856     if(ce >= MIN_LONG || ce < CONTRACTION) {
    857         return ce;  // simple or special mini CE
    858     } else if(ce >= EXPANSION) {
    859         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    860         return ((uint32_t)table[index + 1] << 16) | table[index];
    861     } else /* ce >= CONTRACTION */ {
    862         if(c == 0 && sLength < 0) {
    863             sLength = sIndex - 1;
    864             return EOS;
    865         }
    866         // Contraction list: Default mapping followed by
    867         // 0 or more single-character contraction suffix mappings.
    868         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    869         if(sIndex != sLength) {
    870             // Read the next character.
    871             int32_t c2;
    872             int32_t nextIndex = sIndex;
    873             if(s16 != NULL) {
    874                 c2 = s16[nextIndex++];
    875                 if(c2 > LATIN_MAX) {
    876                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
    877                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
    878                     } else if(c2 == 0xfffe || c2 == 0xffff) {
    879                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    880                     } else {
    881                         return BAIL_OUT;
    882                     }
    883                 }
    884             } else {
    885                 c2 = s8[nextIndex++];
    886                 if(c2 > 0x7f) {
    887                     uint8_t t;
    888                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
    889                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
    890                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
    891                         ++nextIndex;
    892                     } else {
    893                         int32_t i2 = nextIndex + 1;
    894                         if(i2 < sLength || sLength < 0) {
    895                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
    896                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
    897                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
    898                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
    899                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
    900                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    901                             } else {
    902                                 return BAIL_OUT;
    903                             }
    904                         } else {
    905                             return BAIL_OUT;
    906                         }
    907                         nextIndex += 2;
    908                     }
    909                 }
    910             }
    911             if(c2 == 0 && sLength < 0) {
    912                 sLength = sIndex;
    913                 c2 = -1;
    914             }
    915             // Look for the next character in the contraction suffix list,
    916             // which is in ascending order of single suffix characters.
    917             int32_t i = index;
    918             int32_t head = table[i];  // first skip the default mapping
    919             int32_t x;
    920             do {
    921                 i += head >> CONTR_LENGTH_SHIFT;
    922                 head = table[i];
    923                 x = head & CONTR_CHAR_MASK;
    924             } while(x < c2);
    925             if(x == c2) {
    926                 index = i;
    927                 sIndex = nextIndex;
    928             }
    929         }
    930         // Return the CE or CEs for the default or contraction mapping.
    931         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
    932         if(length == 1) {
    933             return BAIL_OUT;
    934         }
    935         ce = table[index + 1];
    936         if(length == 2) {
    937             return ce;
    938         } else {
    939             return ((uint32_t)table[index + 2] << 16) | ce;
    940         }
    941     }
    942 }
    943 
    944 uint32_t
    945 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
    946     if(pair <= 0xffff) {
    947         // one mini CE
    948         if(pair >= MIN_SHORT) {
    949             pair = getSecondariesFromOneShortCE(pair);
    950         } else if(pair > variableTop) {
    951             pair = COMMON_SEC_PLUS_OFFSET;
    952         } else if(pair >= MIN_LONG) {
    953             pair = 0;  // variable
    954         }
    955         // else special mini CE
    956     } else {
    957         uint32_t ce = pair & 0xffff;
    958         if(ce >= MIN_SHORT) {
    959             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
    960         } else if(ce > variableTop) {
    961             pair = TWO_COMMON_SEC_PLUS_OFFSET;
    962         } else {
    963             U_ASSERT(ce >= MIN_LONG);
    964             pair = 0;  // variable
    965         }
    966     }
    967     return pair;
    968 }
    969 
    970 uint32_t
    971 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
    972     // Primary+caseLevel: Ignore case level weights of primary ignorables.
    973     // Otherwise: Ignore case level weights of secondary ignorables.
    974     // For details see the comments in the CollationCompare class.
    975     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    976     if(pair <= 0xffff) {
    977         // one mini CE
    978         if(pair >= MIN_SHORT) {
    979             // A high secondary weight means we really have two CEs,
    980             // a primary CE and a secondary CE.
    981             uint32_t ce = pair;
    982             pair &= CASE_MASK;  // explicit weight of primary CE
    983             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
    984                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
    985             }
    986         } else if(pair > variableTop) {
    987             pair = LOWER_CASE;
    988         } else if(pair >= MIN_LONG) {
    989             pair = 0;  // variable
    990         }
    991         // else special mini CE
    992     } else {
    993         // two mini CEs, same primary groups, neither expands like above
    994         uint32_t ce = pair & 0xffff;
    995         if(ce >= MIN_SHORT) {
    996             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
    997                 pair &= CASE_MASK;
    998             } else {
    999                 pair &= TWO_CASES_MASK;
   1000             }
   1001         } else if(ce > variableTop) {
   1002             pair = TWO_LOWER_CASES;
   1003         } else {
   1004             U_ASSERT(ce >= MIN_LONG);
   1005             pair = 0;  // variable
   1006         }
   1007     }
   1008     return pair;
   1009 }
   1010 
   1011 uint32_t
   1012 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
   1013     if(pair <= 0xffff) {
   1014         // one mini CE
   1015         if(pair >= MIN_SHORT) {
   1016             // A high secondary weight means we really have two CEs,
   1017             // a primary CE and a secondary CE.
   1018             uint32_t ce = pair;
   1019             if(withCaseBits) {
   1020                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
   1021                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1022                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
   1023                 }
   1024             } else {
   1025                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1026                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1027                     pair |= COMMON_TER_PLUS_OFFSET << 16;
   1028                 }
   1029             }
   1030         } else if(pair > variableTop) {
   1031             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1032             if(withCaseBits) {
   1033                 pair |= LOWER_CASE;
   1034             }
   1035         } else if(pair >= MIN_LONG) {
   1036             pair = 0;  // variable
   1037         }
   1038         // else special mini CE
   1039     } else {
   1040         // two mini CEs, same primary groups, neither expands like above
   1041         uint32_t ce = pair & 0xffff;
   1042         if(ce >= MIN_SHORT) {
   1043             if(withCaseBits) {
   1044                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
   1045             } else {
   1046                 pair &= TWO_TERTIARIES_MASK;
   1047             }
   1048             pair += TWO_TER_OFFSETS;
   1049         } else if(ce > variableTop) {
   1050             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
   1051             if(withCaseBits) {
   1052                 pair |= TWO_LOWER_CASES;
   1053             }
   1054         } else {
   1055             U_ASSERT(ce >= MIN_LONG);
   1056             pair = 0;  // variable
   1057         }
   1058     }
   1059     return pair;
   1060 }
   1061 
   1062 uint32_t
   1063 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
   1064     // Return the primary weight of a variable CE,
   1065     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
   1066     if(pair <= 0xffff) {
   1067         // one mini CE
   1068         if(pair >= MIN_SHORT) {
   1069             // A high secondary weight means we really have two CEs,
   1070             // a primary CE and a secondary CE.
   1071             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1072                 pair = TWO_SHORT_PRIMARIES_MASK;
   1073             } else {
   1074                 pair = SHORT_PRIMARY_MASK;
   1075             }
   1076         } else if(pair > variableTop) {
   1077             pair = SHORT_PRIMARY_MASK;
   1078         } else if(pair >= MIN_LONG) {
   1079             pair &= LONG_PRIMARY_MASK;  // variable
   1080         }
   1081         // else special mini CE
   1082     } else {
   1083         // two mini CEs, same primary groups, neither expands like above
   1084         uint32_t ce = pair & 0xffff;
   1085         if(ce > variableTop) {
   1086             pair = TWO_SHORT_PRIMARIES_MASK;
   1087         } else {
   1088             U_ASSERT(ce >= MIN_LONG);
   1089             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
   1090         }
   1091     }
   1092     return pair;
   1093 }
   1094 
   1095 U_NAMESPACE_END
   1096 
   1097 #endif  // !UCONFIG_NO_COLLATION
   1098