Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2015, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationfastlatin.cpp
      7 *
      8 * created on: 2013aug18
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "collationdata.h"
     18 #include "collationfastlatin.h"
     19 #include "collationsettings.h"
     20 #include "putilimp.h"  // U_ALIGN_CODE
     21 #include "uassert.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 int32_t
     26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
     27                                uint16_t *primaries, int32_t capacity) {
     28     const uint16_t *table = data->fastLatinTable;
     29     if(table == NULL) { return -1; }
     30     U_ASSERT(capacity == LATIN_LIMIT);
     31     if(capacity != LATIN_LIMIT) { return -1; }
     32 
     33     uint32_t miniVarTop;
     34     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
     35         // No mini primaries are variable, set a variableTop just below the
     36         // lowest long mini primary.
     37         miniVarTop = MIN_LONG - 1;
     38     } else {
     39         int32_t headerLength = *table & 0xff;
     40         int32_t i = 1 + settings.getMaxVariable();
     41         if(i >= headerLength) {
     42             return -1;  // variableTop >= digits, should not occur
     43         }
     44         miniVarTop = table[i];
     45     }
     46 
     47     UBool digitsAreReordered = FALSE;
     48     if(settings.hasReordering()) {
     49         uint32_t prevStart = 0;
     50         uint32_t beforeDigitStart = 0;
     51         uint32_t digitStart = 0;
     52         uint32_t afterDigitStart = 0;
     53         for(int32_t group = UCOL_REORDER_CODE_FIRST;
     54                 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
     55                 ++group) {
     56             uint32_t start = data->getFirstPrimaryForGroup(group);
     57             start = settings.reorder(start);
     58             if(group == UCOL_REORDER_CODE_DIGIT) {
     59                 beforeDigitStart = prevStart;
     60                 digitStart = start;
     61             } else if(start != 0) {
     62                 if(start < prevStart) {
     63                     // The permutation affects the groups up to Latin.
     64                     return -1;
     65                 }
     66                 // In the future, there might be a special group between digits & Latin.
     67                 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
     68                     afterDigitStart = start;
     69                 }
     70                 prevStart = start;
     71             }
     72         }
     73         uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
     74         latinStart = settings.reorder(latinStart);
     75         if(latinStart < prevStart) {
     76             return -1;
     77         }
     78         if(afterDigitStart == 0) {
     79             afterDigitStart = latinStart;
     80         }
     81         if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
     82             digitsAreReordered = TRUE;
     83         }
     84     }
     85 
     86     table += (table[0] & 0xff);  // skip the header
     87     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
     88         uint32_t p = table[c];
     89         if(p >= MIN_SHORT) {
     90             p &= SHORT_PRIMARY_MASK;
     91         } else if(p > miniVarTop) {
     92             p &= LONG_PRIMARY_MASK;
     93         } else {
     94             p = 0;
     95         }
     96         primaries[c] = (uint16_t)p;
     97     }
     98     if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
     99         // Bail out for digits.
    100         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
    101     }
    102 
    103     // Shift the miniVarTop above other options.
    104     return ((int32_t)miniVarTop << 16) | settings.options;
    105 }
    106 
    107 int32_t
    108 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
    109                                  const UChar *left, int32_t leftLength,
    110                                  const UChar *right, int32_t rightLength) {
    111     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
    112     // optimized for common Latin text.
    113     // Keep them in sync!
    114     // Keep compareUTF16() and compareUTF8() in sync very closely!
    115 
    116     U_ASSERT((table[0] >> 8) == VERSION);
    117     table += (table[0] & 0xff);  // skip the header
    118     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
    119     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    120 
    121     // Check for supported characters, fetch mini CEs, and compare primaries.
    122     U_ALIGN_CODE(16);
    123     int32_t leftIndex = 0, rightIndex = 0;
    124     /**
    125      * Single mini CE or a pair.
    126      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    127      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    128      */
    129     uint32_t leftPair = 0, rightPair = 0;
    130     for(;;) {
    131         // We fetch CEs until we get a non-ignorable primary or reach the end.
    132         while(leftPair == 0) {
    133             if(leftIndex == leftLength) {
    134                 leftPair = EOS;
    135                 break;
    136             }
    137             UChar32 c = left[leftIndex++];
    138             if(c <= LATIN_MAX) {
    139                 leftPair = primaries[c];
    140                 if(leftPair != 0) { break; }
    141                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    142                     return BAIL_OUT_RESULT;
    143                 }
    144                 leftPair = table[c];
    145             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    146                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    147             } else {
    148                 leftPair = lookup(table, c);
    149             }
    150             if(leftPair >= MIN_SHORT) {
    151                 leftPair &= SHORT_PRIMARY_MASK;
    152                 break;
    153             } else if(leftPair > variableTop) {
    154                 leftPair &= LONG_PRIMARY_MASK;
    155                 break;
    156             } else {
    157                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    158                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    159                 leftPair = getPrimaries(variableTop, leftPair);
    160             }
    161         }
    162 
    163         while(rightPair == 0) {
    164             if(rightIndex == rightLength) {
    165                 rightPair = EOS;
    166                 break;
    167             }
    168             UChar32 c = right[rightIndex++];
    169             if(c <= LATIN_MAX) {
    170                 rightPair = primaries[c];
    171                 if(rightPair != 0) { break; }
    172                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    173                     return BAIL_OUT_RESULT;
    174                 }
    175                 rightPair = table[c];
    176             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    177                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    178             } else {
    179                 rightPair = lookup(table, c);
    180             }
    181             if(rightPair >= MIN_SHORT) {
    182                 rightPair &= SHORT_PRIMARY_MASK;
    183                 break;
    184             } else if(rightPair > variableTop) {
    185                 rightPair &= LONG_PRIMARY_MASK;
    186                 break;
    187             } else {
    188                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    189                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    190                 rightPair = getPrimaries(variableTop, rightPair);
    191             }
    192         }
    193 
    194         if(leftPair == rightPair) {
    195             if(leftPair == EOS) { break; }
    196             leftPair = rightPair = 0;
    197             continue;
    198         }
    199         uint32_t leftPrimary = leftPair & 0xffff;
    200         uint32_t rightPrimary = rightPair & 0xffff;
    201         if(leftPrimary != rightPrimary) {
    202             // Return the primary difference.
    203             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    204         }
    205         if(leftPair == EOS) { break; }
    206         leftPair >>= 16;
    207         rightPair >>= 16;
    208     }
    209     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    210     // but we know that the string is well-formed and
    211     // only contains supported characters and mappings.
    212 
    213     // We might skip the secondary level but continue with the case level
    214     // which is turned on separately.
    215     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    216         leftIndex = rightIndex = 0;
    217         leftPair = rightPair = 0;
    218         for(;;) {
    219             while(leftPair == 0) {
    220                 if(leftIndex == leftLength) {
    221                     leftPair = EOS;
    222                     break;
    223                 }
    224                 UChar32 c = left[leftIndex++];
    225                 if(c <= LATIN_MAX) {
    226                     leftPair = table[c];
    227                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    228                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    229                 } else {
    230                     leftPair = lookup(table, c);
    231                 }
    232                 if(leftPair >= MIN_SHORT) {
    233                     leftPair = getSecondariesFromOneShortCE(leftPair);
    234                     break;
    235                 } else if(leftPair > variableTop) {
    236                     leftPair = COMMON_SEC_PLUS_OFFSET;
    237                     break;
    238                 } else {
    239                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    240                     leftPair = getSecondaries(variableTop, leftPair);
    241                 }
    242             }
    243 
    244             while(rightPair == 0) {
    245                 if(rightIndex == rightLength) {
    246                     rightPair = EOS;
    247                     break;
    248                 }
    249                 UChar32 c = right[rightIndex++];
    250                 if(c <= LATIN_MAX) {
    251                     rightPair = table[c];
    252                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    253                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    254                 } else {
    255                     rightPair = lookup(table, c);
    256                 }
    257                 if(rightPair >= MIN_SHORT) {
    258                     rightPair = getSecondariesFromOneShortCE(rightPair);
    259                     break;
    260                 } else if(rightPair > variableTop) {
    261                     rightPair = COMMON_SEC_PLUS_OFFSET;
    262                     break;
    263                 } else {
    264                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    265                     rightPair = getSecondaries(variableTop, rightPair);
    266                 }
    267             }
    268 
    269             if(leftPair == rightPair) {
    270                 if(leftPair == EOS) { break; }
    271                 leftPair = rightPair = 0;
    272                 continue;
    273             }
    274             uint32_t leftSecondary = leftPair & 0xffff;
    275             uint32_t rightSecondary = rightPair & 0xffff;
    276             if(leftSecondary != rightSecondary) {
    277                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    278                     // Full support for backwards secondary requires backwards contraction matching
    279                     // and moving backwards between merge separators.
    280                     return BAIL_OUT_RESULT;
    281                 }
    282                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    283             }
    284             if(leftPair == EOS) { break; }
    285             leftPair >>= 16;
    286             rightPair >>= 16;
    287         }
    288     }
    289 
    290     if((options & CollationSettings::CASE_LEVEL) != 0) {
    291         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    292         leftIndex = rightIndex = 0;
    293         leftPair = rightPair = 0;
    294         for(;;) {
    295             while(leftPair == 0) {
    296                 if(leftIndex == leftLength) {
    297                     leftPair = EOS;
    298                     break;
    299                 }
    300                 UChar32 c = left[leftIndex++];
    301                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    302                 if(leftPair < MIN_LONG) {
    303                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    304                 }
    305                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    306             }
    307 
    308             while(rightPair == 0) {
    309                 if(rightIndex == rightLength) {
    310                     rightPair = EOS;
    311                     break;
    312                 }
    313                 UChar32 c = right[rightIndex++];
    314                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    315                 if(rightPair < MIN_LONG) {
    316                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    317                 }
    318                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    319             }
    320 
    321             if(leftPair == rightPair) {
    322                 if(leftPair == EOS) { break; }
    323                 leftPair = rightPair = 0;
    324                 continue;
    325             }
    326             uint32_t leftCase = leftPair & 0xffff;
    327             uint32_t rightCase = rightPair & 0xffff;
    328             if(leftCase != rightCase) {
    329                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    330                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    331                 } else {
    332                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    333                 }
    334             }
    335             if(leftPair == EOS) { break; }
    336             leftPair >>= 16;
    337             rightPair >>= 16;
    338         }
    339     }
    340     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    341 
    342     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    343     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    344 
    345     leftIndex = rightIndex = 0;
    346     leftPair = rightPair = 0;
    347     for(;;) {
    348         while(leftPair == 0) {
    349             if(leftIndex == leftLength) {
    350                 leftPair = EOS;
    351                 break;
    352             }
    353             UChar32 c = left[leftIndex++];
    354             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    355             if(leftPair < MIN_LONG) {
    356                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    357             }
    358             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    359         }
    360 
    361         while(rightPair == 0) {
    362             if(rightIndex == rightLength) {
    363                 rightPair = EOS;
    364                 break;
    365             }
    366             UChar32 c = right[rightIndex++];
    367             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    368             if(rightPair < MIN_LONG) {
    369                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    370             }
    371             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    372         }
    373 
    374         if(leftPair == rightPair) {
    375             if(leftPair == EOS) { break; }
    376             leftPair = rightPair = 0;
    377             continue;
    378         }
    379         uint32_t leftTertiary = leftPair & 0xffff;
    380         uint32_t rightTertiary = rightPair & 0xffff;
    381         if(leftTertiary != rightTertiary) {
    382             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    383                 // Pass through EOS and MERGE_WEIGHT
    384                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    385                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    386                 if(leftTertiary > MERGE_WEIGHT) {
    387                     leftTertiary ^= CASE_MASK;
    388                 }
    389                 if(rightTertiary > MERGE_WEIGHT) {
    390                     rightTertiary ^= CASE_MASK;
    391                 }
    392             }
    393             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    394         }
    395         if(leftPair == EOS) { break; }
    396         leftPair >>= 16;
    397         rightPair >>= 16;
    398     }
    399     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    400 
    401     leftIndex = rightIndex = 0;
    402     leftPair = rightPair = 0;
    403     for(;;) {
    404         while(leftPair == 0) {
    405             if(leftIndex == leftLength) {
    406                 leftPair = EOS;
    407                 break;
    408             }
    409             UChar32 c = left[leftIndex++];
    410             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    411             if(leftPair < MIN_LONG) {
    412                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
    413             }
    414             leftPair = getQuaternaries(variableTop, leftPair);
    415         }
    416 
    417         while(rightPair == 0) {
    418             if(rightIndex == rightLength) {
    419                 rightPair = EOS;
    420                 break;
    421             }
    422             UChar32 c = right[rightIndex++];
    423             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    424             if(rightPair < MIN_LONG) {
    425                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
    426             }
    427             rightPair = getQuaternaries(variableTop, rightPair);
    428         }
    429 
    430         if(leftPair == rightPair) {
    431             if(leftPair == EOS) { break; }
    432             leftPair = rightPair = 0;
    433             continue;
    434         }
    435         uint32_t leftQuaternary = leftPair & 0xffff;
    436         uint32_t rightQuaternary = rightPair & 0xffff;
    437         if(leftQuaternary != rightQuaternary) {
    438             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    439         }
    440         if(leftPair == EOS) { break; }
    441         leftPair >>= 16;
    442         rightPair >>= 16;
    443     }
    444     return UCOL_EQUAL;
    445 }
    446 
    447 int32_t
    448 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
    449                                  const uint8_t *left, int32_t leftLength,
    450                                  const uint8_t *right, int32_t rightLength) {
    451     // Keep compareUTF16() and compareUTF8() in sync very closely!
    452 
    453     U_ASSERT((table[0] >> 8) == VERSION);
    454     table += (table[0] & 0xff);  // skip the header
    455     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
    456     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    457 
    458     // Check for supported characters, fetch mini CEs, and compare primaries.
    459     U_ALIGN_CODE(16);
    460     int32_t leftIndex = 0, rightIndex = 0;
    461     /**
    462      * Single mini CE or a pair.
    463      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    464      * If there is only one, then it is in the lower bits, and the upper bits are 0.
    465      */
    466     uint32_t leftPair = 0, rightPair = 0;
    467     // Note: There is no need to assemble the code point.
    468     // We only need to look up the table entry for the character,
    469     // and nextPair() looks for whether c==0.
    470     for(;;) {
    471         // We fetch CEs until we get a non-ignorable primary or reach the end.
    472         while(leftPair == 0) {
    473             if(leftIndex == leftLength) {
    474                 leftPair = EOS;
    475                 break;
    476             }
    477             UChar32 c = left[leftIndex++];
    478             uint8_t t;
    479             if(c <= 0x7f) {
    480                 leftPair = primaries[c];
    481                 if(leftPair != 0) { break; }
    482                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    483                     return BAIL_OUT_RESULT;
    484                 }
    485                 leftPair = table[c];
    486             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
    487                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
    488                 ++leftIndex;
    489                 c = ((c - 0xc2) << 6) + t;
    490                 leftPair = primaries[c];
    491                 if(leftPair != 0) { break; }
    492                 leftPair = table[c];
    493             } else {
    494                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
    495             }
    496             if(leftPair >= MIN_SHORT) {
    497                 leftPair &= SHORT_PRIMARY_MASK;
    498                 break;
    499             } else if(leftPair > variableTop) {
    500                 leftPair &= LONG_PRIMARY_MASK;
    501                 break;
    502             } else {
    503                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    504                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    505                 leftPair = getPrimaries(variableTop, leftPair);
    506             }
    507         }
    508 
    509         while(rightPair == 0) {
    510             if(rightIndex == rightLength) {
    511                 rightPair = EOS;
    512                 break;
    513             }
    514             UChar32 c = right[rightIndex++];
    515             uint8_t t;
    516             if(c <= 0x7f) {
    517                 rightPair = primaries[c];
    518                 if(rightPair != 0) { break; }
    519                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    520                     return BAIL_OUT_RESULT;
    521                 }
    522                 rightPair = table[c];
    523             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
    524                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
    525                 ++rightIndex;
    526                 c = ((c - 0xc2) << 6) + t;
    527                 rightPair = primaries[c];
    528                 if(rightPair != 0) { break; }
    529                 rightPair = table[c];
    530             } else {
    531                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
    532             }
    533             if(rightPair >= MIN_SHORT) {
    534                 rightPair &= SHORT_PRIMARY_MASK;
    535                 break;
    536             } else if(rightPair > variableTop) {
    537                 rightPair &= LONG_PRIMARY_MASK;
    538                 break;
    539             } else {
    540                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    541                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    542                 rightPair = getPrimaries(variableTop, rightPair);
    543             }
    544         }
    545 
    546         if(leftPair == rightPair) {
    547             if(leftPair == EOS) { break; }
    548             leftPair = rightPair = 0;
    549             continue;
    550         }
    551         uint32_t leftPrimary = leftPair & 0xffff;
    552         uint32_t rightPrimary = rightPair & 0xffff;
    553         if(leftPrimary != rightPrimary) {
    554             // Return the primary difference.
    555             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    556         }
    557         if(leftPair == EOS) { break; }
    558         leftPair >>= 16;
    559         rightPair >>= 16;
    560     }
    561     // In the following, we need to re-fetch each character because we did not buffer the CEs,
    562     // but we know that the string is well-formed and
    563     // only contains supported characters and mappings.
    564 
    565     // We might skip the secondary level but continue with the case level
    566     // which is turned on separately.
    567     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    568         leftIndex = rightIndex = 0;
    569         leftPair = rightPair = 0;
    570         for(;;) {
    571             while(leftPair == 0) {
    572                 if(leftIndex == leftLength) {
    573                     leftPair = EOS;
    574                     break;
    575                 }
    576                 UChar32 c = left[leftIndex++];
    577                 if(c <= 0x7f) {
    578                     leftPair = table[c];
    579                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    580                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
    581                 } else {
    582                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
    583                 }
    584                 if(leftPair >= MIN_SHORT) {
    585                     leftPair = getSecondariesFromOneShortCE(leftPair);
    586                     break;
    587                 } else if(leftPair > variableTop) {
    588                     leftPair = COMMON_SEC_PLUS_OFFSET;
    589                     break;
    590                 } else {
    591                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    592                     leftPair = getSecondaries(variableTop, leftPair);
    593                 }
    594             }
    595 
    596             while(rightPair == 0) {
    597                 if(rightIndex == rightLength) {
    598                     rightPair = EOS;
    599                     break;
    600                 }
    601                 UChar32 c = right[rightIndex++];
    602                 if(c <= 0x7f) {
    603                     rightPair = table[c];
    604                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
    605                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
    606                 } else {
    607                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
    608                 }
    609                 if(rightPair >= MIN_SHORT) {
    610                     rightPair = getSecondariesFromOneShortCE(rightPair);
    611                     break;
    612                 } else if(rightPair > variableTop) {
    613                     rightPair = COMMON_SEC_PLUS_OFFSET;
    614                     break;
    615                 } else {
    616                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    617                     rightPair = getSecondaries(variableTop, rightPair);
    618                 }
    619             }
    620 
    621             if(leftPair == rightPair) {
    622                 if(leftPair == EOS) { break; }
    623                 leftPair = rightPair = 0;
    624                 continue;
    625             }
    626             uint32_t leftSecondary = leftPair & 0xffff;
    627             uint32_t rightSecondary = rightPair & 0xffff;
    628             if(leftSecondary != rightSecondary) {
    629                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    630                     // Full support for backwards secondary requires backwards contraction matching
    631                     // and moving backwards between merge separators.
    632                     return BAIL_OUT_RESULT;
    633                 }
    634                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    635             }
    636             if(leftPair == EOS) { break; }
    637             leftPair >>= 16;
    638             rightPair >>= 16;
    639         }
    640     }
    641 
    642     if((options & CollationSettings::CASE_LEVEL) != 0) {
    643         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    644         leftIndex = rightIndex = 0;
    645         leftPair = rightPair = 0;
    646         for(;;) {
    647             while(leftPair == 0) {
    648                 if(leftIndex == leftLength) {
    649                     leftPair = EOS;
    650                     break;
    651                 }
    652                 UChar32 c = left[leftIndex++];
    653                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    654                 if(leftPair < MIN_LONG) {
    655                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    656                 }
    657                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    658             }
    659 
    660             while(rightPair == 0) {
    661                 if(rightIndex == rightLength) {
    662                     rightPair = EOS;
    663                     break;
    664                 }
    665                 UChar32 c = right[rightIndex++];
    666                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    667                 if(rightPair < MIN_LONG) {
    668                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    669                 }
    670                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    671             }
    672 
    673             if(leftPair == rightPair) {
    674                 if(leftPair == EOS) { break; }
    675                 leftPair = rightPair = 0;
    676                 continue;
    677             }
    678             uint32_t leftCase = leftPair & 0xffff;
    679             uint32_t rightCase = rightPair & 0xffff;
    680             if(leftCase != rightCase) {
    681                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    682                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    683                 } else {
    684                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    685                 }
    686             }
    687             if(leftPair == EOS) { break; }
    688             leftPair >>= 16;
    689             rightPair >>= 16;
    690         }
    691     }
    692     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    693 
    694     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    695     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    696 
    697     leftIndex = rightIndex = 0;
    698     leftPair = rightPair = 0;
    699     for(;;) {
    700         while(leftPair == 0) {
    701             if(leftIndex == leftLength) {
    702                 leftPair = EOS;
    703                 break;
    704             }
    705             UChar32 c = left[leftIndex++];
    706             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    707             if(leftPair < MIN_LONG) {
    708                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    709             }
    710             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    711         }
    712 
    713         while(rightPair == 0) {
    714             if(rightIndex == rightLength) {
    715                 rightPair = EOS;
    716                 break;
    717             }
    718             UChar32 c = right[rightIndex++];
    719             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    720             if(rightPair < MIN_LONG) {
    721                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    722             }
    723             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    724         }
    725 
    726         if(leftPair == rightPair) {
    727             if(leftPair == EOS) { break; }
    728             leftPair = rightPair = 0;
    729             continue;
    730         }
    731         uint32_t leftTertiary = leftPair & 0xffff;
    732         uint32_t rightTertiary = rightPair & 0xffff;
    733         if(leftTertiary != rightTertiary) {
    734             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    735                 // Pass through EOS and MERGE_WEIGHT
    736                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
    737                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    738                 if(leftTertiary > MERGE_WEIGHT) {
    739                     leftTertiary ^= CASE_MASK;
    740                 }
    741                 if(rightTertiary > MERGE_WEIGHT) {
    742                     rightTertiary ^= CASE_MASK;
    743                 }
    744             }
    745             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    746         }
    747         if(leftPair == EOS) { break; }
    748         leftPair >>= 16;
    749         rightPair >>= 16;
    750     }
    751     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    752 
    753     leftIndex = rightIndex = 0;
    754     leftPair = rightPair = 0;
    755     for(;;) {
    756         while(leftPair == 0) {
    757             if(leftIndex == leftLength) {
    758                 leftPair = EOS;
    759                 break;
    760             }
    761             UChar32 c = left[leftIndex++];
    762             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    763             if(leftPair < MIN_LONG) {
    764                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
    765             }
    766             leftPair = getQuaternaries(variableTop, leftPair);
    767         }
    768 
    769         while(rightPair == 0) {
    770             if(rightIndex == rightLength) {
    771                 rightPair = EOS;
    772                 break;
    773             }
    774             UChar32 c = right[rightIndex++];
    775             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    776             if(rightPair < MIN_LONG) {
    777                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
    778             }
    779             rightPair = getQuaternaries(variableTop, rightPair);
    780         }
    781 
    782         if(leftPair == rightPair) {
    783             if(leftPair == EOS) { break; }
    784             leftPair = rightPair = 0;
    785             continue;
    786         }
    787         uint32_t leftQuaternary = leftPair & 0xffff;
    788         uint32_t rightQuaternary = rightPair & 0xffff;
    789         if(leftQuaternary != rightQuaternary) {
    790             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    791         }
    792         if(leftPair == EOS) { break; }
    793         leftPair >>= 16;
    794         rightPair >>= 16;
    795     }
    796     return UCOL_EQUAL;
    797 }
    798 
    799 uint32_t
    800 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
    801     U_ASSERT(c > LATIN_MAX);
    802     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    803         return table[c - PUNCT_START + LATIN_LIMIT];
    804     } else if(c == 0xfffe) {
    805         return MERGE_WEIGHT;
    806     } else if(c == 0xffff) {
    807         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
    808     } else {
    809         return BAIL_OUT;
    810     }
    811 }
    812 
    813 uint32_t
    814 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
    815                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
    816     // The caller handled ASCII and valid/supported Latin.
    817     U_ASSERT(c > 0x7f);
    818     int32_t i2 = sIndex + 1;
    819     if(i2 < sLength || sLength < 0) {
    820         uint8_t t1 = s8[sIndex];
    821         uint8_t t2 = s8[i2];
    822         sIndex += 2;
    823         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
    824             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    825         } else if(c == 0xef && t1 == 0xbf) {
    826             if(t2 == 0xbe) {
    827                 return MERGE_WEIGHT;  // U+FFFE
    828             } else if(t2 == 0xbf) {
    829                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    830             }
    831         }
    832     }
    833     return BAIL_OUT;
    834 }
    835 
    836 uint32_t
    837 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
    838                                      const uint8_t *s8, int32_t &sIndex) {
    839     // The caller handled ASCII.
    840     // The string is well-formed and contains only supported characters.
    841     U_ASSERT(c > 0x7f);
    842     if(c <= LATIN_MAX_UTF8_LEAD) {
    843         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
    844     }
    845     uint8_t t2 = s8[sIndex + 1];
    846     sIndex += 2;
    847     if(c == 0xe2) {
    848         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    849     } else if(t2 == 0xbe) {
    850         return MERGE_WEIGHT;  // U+FFFE
    851     } else {
    852         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    853     }
    854 }
    855 
    856 uint32_t
    857 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
    858                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
    859     if(ce >= MIN_LONG || ce < CONTRACTION) {
    860         return ce;  // simple or special mini CE
    861     } else if(ce >= EXPANSION) {
    862         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    863         return ((uint32_t)table[index + 1] << 16) | table[index];
    864     } else /* ce >= CONTRACTION */ {
    865         if(c == 0 && sLength < 0) {
    866             sLength = sIndex - 1;
    867             return EOS;
    868         }
    869         // Contraction list: Default mapping followed by
    870         // 0 or more single-character contraction suffix mappings.
    871         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    872         if(sIndex != sLength) {
    873             // Read the next character.
    874             int32_t c2;
    875             int32_t nextIndex = sIndex;
    876             if(s16 != NULL) {
    877                 c2 = s16[nextIndex++];
    878                 if(c2 > LATIN_MAX) {
    879                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
    880                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
    881                     } else if(c2 == 0xfffe || c2 == 0xffff) {
    882                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    883                     } else {
    884                         return BAIL_OUT;
    885                     }
    886                 }
    887             } else {
    888                 c2 = s8[nextIndex++];
    889                 if(c2 > 0x7f) {
    890                     uint8_t t;
    891                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
    892                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
    893                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
    894                         ++nextIndex;
    895                     } else {
    896                         int32_t i2 = nextIndex + 1;
    897                         if(i2 < sLength || sLength < 0) {
    898                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
    899                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
    900                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
    901                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
    902                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
    903                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    904                             } else {
    905                                 return BAIL_OUT;
    906                             }
    907                         } else {
    908                             return BAIL_OUT;
    909                         }
    910                         nextIndex += 2;
    911                     }
    912                 }
    913             }
    914             if(c2 == 0 && sLength < 0) {
    915                 sLength = sIndex;
    916                 c2 = -1;
    917             }
    918             // Look for the next character in the contraction suffix list,
    919             // which is in ascending order of single suffix characters.
    920             int32_t i = index;
    921             int32_t head = table[i];  // first skip the default mapping
    922             int32_t x;
    923             do {
    924                 i += head >> CONTR_LENGTH_SHIFT;
    925                 head = table[i];
    926                 x = head & CONTR_CHAR_MASK;
    927             } while(x < c2);
    928             if(x == c2) {
    929                 index = i;
    930                 sIndex = nextIndex;
    931             }
    932         }
    933         // Return the CE or CEs for the default or contraction mapping.
    934         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
    935         if(length == 1) {
    936             return BAIL_OUT;
    937         }
    938         ce = table[index + 1];
    939         if(length == 2) {
    940             return ce;
    941         } else {
    942             return ((uint32_t)table[index + 2] << 16) | ce;
    943         }
    944     }
    945 }
    946 
    947 uint32_t
    948 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
    949     if(pair <= 0xffff) {
    950         // one mini CE
    951         if(pair >= MIN_SHORT) {
    952             pair = getSecondariesFromOneShortCE(pair);
    953         } else if(pair > variableTop) {
    954             pair = COMMON_SEC_PLUS_OFFSET;
    955         } else if(pair >= MIN_LONG) {
    956             pair = 0;  // variable
    957         }
    958         // else special mini CE
    959     } else {
    960         uint32_t ce = pair & 0xffff;
    961         if(ce >= MIN_SHORT) {
    962             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
    963         } else if(ce > variableTop) {
    964             pair = TWO_COMMON_SEC_PLUS_OFFSET;
    965         } else {
    966             U_ASSERT(ce >= MIN_LONG);
    967             pair = 0;  // variable
    968         }
    969     }
    970     return pair;
    971 }
    972 
    973 uint32_t
    974 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
    975     // Primary+caseLevel: Ignore case level weights of primary ignorables.
    976     // Otherwise: Ignore case level weights of secondary ignorables.
    977     // For details see the comments in the CollationCompare class.
    978     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    979     if(pair <= 0xffff) {
    980         // one mini CE
    981         if(pair >= MIN_SHORT) {
    982             // A high secondary weight means we really have two CEs,
    983             // a primary CE and a secondary CE.
    984             uint32_t ce = pair;
    985             pair &= CASE_MASK;  // explicit weight of primary CE
    986             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
    987                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
    988             }
    989         } else if(pair > variableTop) {
    990             pair = LOWER_CASE;
    991         } else if(pair >= MIN_LONG) {
    992             pair = 0;  // variable
    993         }
    994         // else special mini CE
    995     } else {
    996         // two mini CEs, same primary groups, neither expands like above
    997         uint32_t ce = pair & 0xffff;
    998         if(ce >= MIN_SHORT) {
    999             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
   1000                 pair &= CASE_MASK;
   1001             } else {
   1002                 pair &= TWO_CASES_MASK;
   1003             }
   1004         } else if(ce > variableTop) {
   1005             pair = TWO_LOWER_CASES;
   1006         } else {
   1007             U_ASSERT(ce >= MIN_LONG);
   1008             pair = 0;  // variable
   1009         }
   1010     }
   1011     return pair;
   1012 }
   1013 
   1014 uint32_t
   1015 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
   1016     if(pair <= 0xffff) {
   1017         // one mini CE
   1018         if(pair >= MIN_SHORT) {
   1019             // A high secondary weight means we really have two CEs,
   1020             // a primary CE and a secondary CE.
   1021             uint32_t ce = pair;
   1022             if(withCaseBits) {
   1023                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
   1024                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1025                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
   1026                 }
   1027             } else {
   1028                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1029                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1030                     pair |= COMMON_TER_PLUS_OFFSET << 16;
   1031                 }
   1032             }
   1033         } else if(pair > variableTop) {
   1034             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1035             if(withCaseBits) {
   1036                 pair |= LOWER_CASE;
   1037             }
   1038         } else if(pair >= MIN_LONG) {
   1039             pair = 0;  // variable
   1040         }
   1041         // else special mini CE
   1042     } else {
   1043         // two mini CEs, same primary groups, neither expands like above
   1044         uint32_t ce = pair & 0xffff;
   1045         if(ce >= MIN_SHORT) {
   1046             if(withCaseBits) {
   1047                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
   1048             } else {
   1049                 pair &= TWO_TERTIARIES_MASK;
   1050             }
   1051             pair += TWO_TER_OFFSETS;
   1052         } else if(ce > variableTop) {
   1053             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
   1054             if(withCaseBits) {
   1055                 pair |= TWO_LOWER_CASES;
   1056             }
   1057         } else {
   1058             U_ASSERT(ce >= MIN_LONG);
   1059             pair = 0;  // variable
   1060         }
   1061     }
   1062     return pair;
   1063 }
   1064 
   1065 uint32_t
   1066 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
   1067     // Return the primary weight of a variable CE,
   1068     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
   1069     if(pair <= 0xffff) {
   1070         // one mini CE
   1071         if(pair >= MIN_SHORT) {
   1072             // A high secondary weight means we really have two CEs,
   1073             // a primary CE and a secondary CE.
   1074             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1075                 pair = TWO_SHORT_PRIMARIES_MASK;
   1076             } else {
   1077                 pair = SHORT_PRIMARY_MASK;
   1078             }
   1079         } else if(pair > variableTop) {
   1080             pair = SHORT_PRIMARY_MASK;
   1081         } else if(pair >= MIN_LONG) {
   1082             pair &= LONG_PRIMARY_MASK;  // variable
   1083         }
   1084         // else special mini CE
   1085     } else {
   1086         // two mini CEs, same primary groups, neither expands like above
   1087         uint32_t ce = pair & 0xffff;
   1088         if(ce > variableTop) {
   1089             pair = TWO_SHORT_PRIMARIES_MASK;
   1090         } else {
   1091             U_ASSERT(ce >= MIN_LONG);
   1092             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
   1093         }
   1094     }
   1095     return pair;
   1096 }
   1097 
   1098 U_NAMESPACE_END
   1099 
   1100 #endif  // !UCONFIG_NO_COLLATION
   1101