Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 1996-2015, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationcompare.cpp
      7 *
      8 * created on: 2012feb14 with new and old collation code
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "cmemory.h"
     18 #include "collation.h"
     19 #include "collationcompare.h"
     20 #include "collationiterator.h"
     21 #include "collationsettings.h"
     22 #include "uassert.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 UCollationResult
     27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
     28                                         const CollationSettings &settings,
     29                                         UErrorCode &errorCode) {
     30     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
     31 
     32     int32_t options = settings.options;
     33     uint32_t variableTop;
     34     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
     35         variableTop = 0;
     36     } else {
     37         // +1 so that we can use "<" and primary ignorables test out early.
     38         variableTop = settings.variableTop + 1;
     39     }
     40     UBool anyVariable = FALSE;
     41 
     42     // Fetch CEs, compare primaries, store secondary & tertiary weights.
     43     for(;;) {
     44         // We fetch CEs until we get a non-ignorable primary or reach the end.
     45         uint32_t leftPrimary;
     46         do {
     47             int64_t ce = left.nextCE(errorCode);
     48             leftPrimary = (uint32_t)(ce >> 32);
     49             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     50                 // Variable CE, shift it to quaternary level.
     51                 // Ignore all following primary ignorables, and shift further variable CEs.
     52                 anyVariable = TRUE;
     53                 do {
     54                     // Store only the primary of the variable CE.
     55                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     56                     for(;;) {
     57                         ce = left.nextCE(errorCode);
     58                         leftPrimary = (uint32_t)(ce >> 32);
     59                         if(leftPrimary == 0) {
     60                             left.setCurrentCE(0);
     61                         } else {
     62                             break;
     63                         }
     64                     }
     65                 } while(leftPrimary < variableTop &&
     66                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     67             }
     68         } while(leftPrimary == 0);
     69 
     70         uint32_t rightPrimary;
     71         do {
     72             int64_t ce = right.nextCE(errorCode);
     73             rightPrimary = (uint32_t)(ce >> 32);
     74             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     75                 // Variable CE, shift it to quaternary level.
     76                 // Ignore all following primary ignorables, and shift further variable CEs.
     77                 anyVariable = TRUE;
     78                 do {
     79                     // Store only the primary of the variable CE.
     80                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     81                     for(;;) {
     82                         ce = right.nextCE(errorCode);
     83                         rightPrimary = (uint32_t)(ce >> 32);
     84                         if(rightPrimary == 0) {
     85                             right.setCurrentCE(0);
     86                         } else {
     87                             break;
     88                         }
     89                     }
     90                 } while(rightPrimary < variableTop &&
     91                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     92             }
     93         } while(rightPrimary == 0);
     94 
     95         if(leftPrimary != rightPrimary) {
     96             // Return the primary difference, with script reordering.
     97             if(settings.hasReordering()) {
     98                 leftPrimary = settings.reorder(leftPrimary);
     99                 rightPrimary = settings.reorder(rightPrimary);
    100             }
    101             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    102         }
    103         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
    104     }
    105     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    106 
    107     // Compare the buffered secondary & tertiary weights.
    108     // We might skip the secondary level but continue with the case level
    109     // which is turned on separately.
    110     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    111         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
    112             int32_t leftIndex = 0;
    113             int32_t rightIndex = 0;
    114             for(;;) {
    115                 uint32_t leftSecondary;
    116                 do {
    117                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
    118                 } while(leftSecondary == 0);
    119 
    120                 uint32_t rightSecondary;
    121                 do {
    122                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
    123                 } while(rightSecondary == 0);
    124 
    125                 if(leftSecondary != rightSecondary) {
    126                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    127                 }
    128                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
    129             }
    130         } else {
    131             // The backwards secondary level compares secondary weights backwards
    132             // within segments separated by the merge separator (U+FFFE, weight 02).
    133             int32_t leftStart = 0;
    134             int32_t rightStart = 0;
    135             for(;;) {
    136                 // Find the merge separator or the NO_CE terminator.
    137                 uint32_t p;
    138                 int32_t leftLimit = leftStart;
    139                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
    140                             Collation::MERGE_SEPARATOR_PRIMARY ||
    141                         p == 0) {
    142                     ++leftLimit;
    143                 }
    144                 int32_t rightLimit = rightStart;
    145                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
    146                             Collation::MERGE_SEPARATOR_PRIMARY ||
    147                         p == 0) {
    148                     ++rightLimit;
    149                 }
    150 
    151                 // Compare the segments.
    152                 int32_t leftIndex = leftLimit;
    153                 int32_t rightIndex = rightLimit;
    154                 for(;;) {
    155                     int32_t leftSecondary = 0;
    156                     while(leftSecondary == 0 && leftIndex > leftStart) {
    157                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
    158                     }
    159 
    160                     int32_t rightSecondary = 0;
    161                     while(rightSecondary == 0 && rightIndex > rightStart) {
    162                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
    163                     }
    164 
    165                     if(leftSecondary != rightSecondary) {
    166                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    167                     }
    168                     if(leftSecondary == 0) { break; }
    169                 }
    170 
    171                 // Did we reach the end of either string?
    172                 // Both strings have the same number of merge separators,
    173                 // or else there would have been a primary-level difference.
    174                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
    175                 if(p == Collation::NO_CE_PRIMARY) { break; }
    176                 // Skip both merge separators and continue.
    177                 leftStart = leftLimit + 1;
    178                 rightStart = rightLimit + 1;
    179             }
    180         }
    181     }
    182 
    183     if((options & CollationSettings::CASE_LEVEL) != 0) {
    184         int32_t strength = CollationSettings::getStrength(options);
    185         int32_t leftIndex = 0;
    186         int32_t rightIndex = 0;
    187         for(;;) {
    188             uint32_t leftCase, leftLower32, rightCase;
    189             if(strength == UCOL_PRIMARY) {
    190                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
    191                 // Otherwise we would get a-umlaut > a
    192                 // which is not desirable for accent-insensitive sorting.
    193                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
    194                 // with only primary weights.
    195                 int64_t ce;
    196                 do {
    197                     ce = left.getCE(leftIndex++);
    198                     leftCase = (uint32_t)ce;
    199                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
    200                 leftLower32 = leftCase;
    201                 leftCase &= 0xc000;
    202 
    203                 do {
    204                     ce = right.getCE(rightIndex++);
    205                     rightCase = (uint32_t)ce;
    206                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
    207                 rightCase &= 0xc000;
    208             } else {
    209                 // Secondary+caseLevel: By analogy with the above,
    210                 // ignore case level weights of secondary ignorables.
    211                 //
    212                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
    213                 // to keep tertiary+caseFirst well-formed.
    214                 //
    215                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
    216                 // Otherwise a tertiary CE's uppercase would be no greater than
    217                 // a primary/secondary CE's uppercase.
    218                 // (See UCA well-formedness condition 2.)
    219                 // We could construct a special case weight higher than uppercase,
    220                 // but it's simpler to always ignore case weights of secondary ignorables,
    221                 // turning 0.0.ut into 0.0.0.t.
    222                 // (See LDML Collation, Case Parameters.)
    223                 do {
    224                     leftCase = (uint32_t)left.getCE(leftIndex++);
    225                 } while(leftCase <= 0xffff);
    226                 leftLower32 = leftCase;
    227                 leftCase &= 0xc000;
    228 
    229                 do {
    230                     rightCase = (uint32_t)right.getCE(rightIndex++);
    231                 } while(rightCase <= 0xffff);
    232                 rightCase &= 0xc000;
    233             }
    234 
    235             // No need to handle NO_CE and MERGE_SEPARATOR specially:
    236             // There is one case weight for each previous-level weight,
    237             // so level length differences were handled there.
    238             if(leftCase != rightCase) {
    239                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    240                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    241                 } else {
    242                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    243                 }
    244             }
    245             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
    246         }
    247     }
    248     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    249 
    250     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
    251 
    252     int32_t leftIndex = 0;
    253     int32_t rightIndex = 0;
    254     uint32_t anyQuaternaries = 0;
    255     for(;;) {
    256         uint32_t leftLower32, leftTertiary;
    257         do {
    258             leftLower32 = (uint32_t)left.getCE(leftIndex++);
    259             anyQuaternaries |= leftLower32;
    260             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    261                      (leftLower32 & 0xc0c0) == 0);
    262             leftTertiary = leftLower32 & tertiaryMask;
    263         } while(leftTertiary == 0);
    264 
    265         uint32_t rightLower32, rightTertiary;
    266         do {
    267             rightLower32 = (uint32_t)right.getCE(rightIndex++);
    268             anyQuaternaries |= rightLower32;
    269             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    270                      (rightLower32 & 0xc0c0) == 0);
    271             rightTertiary = rightLower32 & tertiaryMask;
    272         } while(rightTertiary == 0);
    273 
    274         if(leftTertiary != rightTertiary) {
    275             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    276                 // Pass through NO_CE and keep real tertiary weights larger than that.
    277                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
    278                 // to keep tertiary CEs well-formed.
    279                 // Their case+tertiary weights must be greater than those of
    280                 // primary and secondary CEs.
    281                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
    282                     if(leftLower32 > 0xffff) {
    283                         leftTertiary ^= 0xc000;
    284                     } else {
    285                         leftTertiary += 0x4000;
    286                     }
    287                 }
    288                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
    289                     if(rightLower32 > 0xffff) {
    290                         rightTertiary ^= 0xc000;
    291                     } else {
    292                         rightTertiary += 0x4000;
    293                     }
    294                 }
    295             }
    296             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    297         }
    298         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
    299     }
    300     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    301 
    302     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
    303         // If there are no "variable" CEs and no non-zero quaternary weights,
    304         // then there are no quaternary differences.
    305         return UCOL_EQUAL;
    306     }
    307 
    308     leftIndex = 0;
    309     rightIndex = 0;
    310     for(;;) {
    311         uint32_t leftQuaternary;
    312         do {
    313             int64_t ce = left.getCE(leftIndex++);
    314             leftQuaternary = (uint32_t)ce & 0xffff;
    315             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
    316                 // Variable primary or completely ignorable or NO_CE.
    317                 leftQuaternary = (uint32_t)(ce >> 32);
    318             } else {
    319                 // Regular CE, not tertiary ignorable.
    320                 // Preserve the quaternary weight in bits 7..6.
    321                 leftQuaternary |= 0xffffff3f;
    322             }
    323         } while(leftQuaternary == 0);
    324 
    325         uint32_t rightQuaternary;
    326         do {
    327             int64_t ce = right.getCE(rightIndex++);
    328             rightQuaternary = (uint32_t)ce & 0xffff;
    329             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
    330                 // Variable primary or completely ignorable or NO_CE.
    331                 rightQuaternary = (uint32_t)(ce >> 32);
    332             } else {
    333                 // Regular CE, not tertiary ignorable.
    334                 // Preserve the quaternary weight in bits 7..6.
    335                 rightQuaternary |= 0xffffff3f;
    336             }
    337         } while(rightQuaternary == 0);
    338 
    339         if(leftQuaternary != rightQuaternary) {
    340             // Return the difference, with script reordering.
    341             if(settings.hasReordering()) {
    342                 leftQuaternary = settings.reorder(leftQuaternary);
    343                 rightQuaternary = settings.reorder(rightQuaternary);
    344             }
    345             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    346         }
    347         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
    348     }
    349     return UCOL_EQUAL;
    350 }
    351 
    352 U_NAMESPACE_END
    353 
    354 #endif  // !UCONFIG_NO_COLLATION
    355