Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 1996-2015, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationcompare.cpp
      7 *
      8 * created on: 2012feb14 with new and old collation code
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "cmemory.h"
     18 #include "collation.h"
     19 #include "collationcompare.h"
     20 #include "collationiterator.h"
     21 #include "collationsettings.h"
     22 #include "uassert.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 UCollationResult
     27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
     28                                         const CollationSettings &settings,
     29                                         UErrorCode &errorCode) {
     30     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
     31 
     32     int32_t options = settings.options;
     33     uint32_t variableTop;
     34     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
     35         variableTop = 0;
     36     } else {
     37         // +1 so that we can use "<" and primary ignorables test out early.
     38         variableTop = settings.variableTop + 1;
     39     }
     40     UBool anyVariable = FALSE;
     41 
     42     // Fetch CEs, compare primaries, store secondary & tertiary weights.
     43     U_ALIGN_CODE(16);
     44     for(;;) {
     45         // We fetch CEs until we get a non-ignorable primary or reach the end.
     46         uint32_t leftPrimary;
     47         do {
     48             int64_t ce = left.nextCE(errorCode);
     49             leftPrimary = (uint32_t)(ce >> 32);
     50             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     51                 // Variable CE, shift it to quaternary level.
     52                 // Ignore all following primary ignorables, and shift further variable CEs.
     53                 anyVariable = TRUE;
     54                 do {
     55                     // Store only the primary of the variable CE.
     56                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     57                     for(;;) {
     58                         ce = left.nextCE(errorCode);
     59                         leftPrimary = (uint32_t)(ce >> 32);
     60                         if(leftPrimary == 0) {
     61                             left.setCurrentCE(0);
     62                         } else {
     63                             break;
     64                         }
     65                     }
     66                 } while(leftPrimary < variableTop &&
     67                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     68             }
     69         } while(leftPrimary == 0);
     70 
     71         uint32_t rightPrimary;
     72         do {
     73             int64_t ce = right.nextCE(errorCode);
     74             rightPrimary = (uint32_t)(ce >> 32);
     75             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     76                 // Variable CE, shift it to quaternary level.
     77                 // Ignore all following primary ignorables, and shift further variable CEs.
     78                 anyVariable = TRUE;
     79                 do {
     80                     // Store only the primary of the variable CE.
     81                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     82                     for(;;) {
     83                         ce = right.nextCE(errorCode);
     84                         rightPrimary = (uint32_t)(ce >> 32);
     85                         if(rightPrimary == 0) {
     86                             right.setCurrentCE(0);
     87                         } else {
     88                             break;
     89                         }
     90                     }
     91                 } while(rightPrimary < variableTop &&
     92                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     93             }
     94         } while(rightPrimary == 0);
     95 
     96         if(leftPrimary != rightPrimary) {
     97             // Return the primary difference, with script reordering.
     98             if(settings.hasReordering()) {
     99                 leftPrimary = settings.reorder(leftPrimary);
    100                 rightPrimary = settings.reorder(rightPrimary);
    101             }
    102             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    103         }
    104         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
    105     }
    106     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    107 
    108     // Compare the buffered secondary & tertiary weights.
    109     // We might skip the secondary level but continue with the case level
    110     // which is turned on separately.
    111     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    112         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
    113             int32_t leftIndex = 0;
    114             int32_t rightIndex = 0;
    115             for(;;) {
    116                 uint32_t leftSecondary;
    117                 do {
    118                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
    119                 } while(leftSecondary == 0);
    120 
    121                 uint32_t rightSecondary;
    122                 do {
    123                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
    124                 } while(rightSecondary == 0);
    125 
    126                 if(leftSecondary != rightSecondary) {
    127                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    128                 }
    129                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
    130             }
    131         } else {
    132             // The backwards secondary level compares secondary weights backwards
    133             // within segments separated by the merge separator (U+FFFE, weight 02).
    134             int32_t leftStart = 0;
    135             int32_t rightStart = 0;
    136             for(;;) {
    137                 // Find the merge separator or the NO_CE terminator.
    138                 uint32_t p;
    139                 int32_t leftLimit = leftStart;
    140                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
    141                             Collation::MERGE_SEPARATOR_PRIMARY ||
    142                         p == 0) {
    143                     ++leftLimit;
    144                 }
    145                 int32_t rightLimit = rightStart;
    146                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
    147                             Collation::MERGE_SEPARATOR_PRIMARY ||
    148                         p == 0) {
    149                     ++rightLimit;
    150                 }
    151 
    152                 // Compare the segments.
    153                 int32_t leftIndex = leftLimit;
    154                 int32_t rightIndex = rightLimit;
    155                 for(;;) {
    156                     int32_t leftSecondary = 0;
    157                     while(leftSecondary == 0 && leftIndex > leftStart) {
    158                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
    159                     }
    160 
    161                     int32_t rightSecondary = 0;
    162                     while(rightSecondary == 0 && rightIndex > rightStart) {
    163                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
    164                     }
    165 
    166                     if(leftSecondary != rightSecondary) {
    167                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    168                     }
    169                     if(leftSecondary == 0) { break; }
    170                 }
    171 
    172                 // Did we reach the end of either string?
    173                 // Both strings have the same number of merge separators,
    174                 // or else there would have been a primary-level difference.
    175                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
    176                 if(p == Collation::NO_CE_PRIMARY) { break; }
    177                 // Skip both merge separators and continue.
    178                 leftStart = leftLimit + 1;
    179                 rightStart = rightLimit + 1;
    180             }
    181         }
    182     }
    183 
    184     if((options & CollationSettings::CASE_LEVEL) != 0) {
    185         int32_t strength = CollationSettings::getStrength(options);
    186         int32_t leftIndex = 0;
    187         int32_t rightIndex = 0;
    188         for(;;) {
    189             uint32_t leftCase, leftLower32, rightCase;
    190             if(strength == UCOL_PRIMARY) {
    191                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
    192                 // Otherwise we would get a-umlaut > a
    193                 // which is not desirable for accent-insensitive sorting.
    194                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
    195                 // with only primary weights.
    196                 int64_t ce;
    197                 do {
    198                     ce = left.getCE(leftIndex++);
    199                     leftCase = (uint32_t)ce;
    200                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
    201                 leftLower32 = leftCase;
    202                 leftCase &= 0xc000;
    203 
    204                 do {
    205                     ce = right.getCE(rightIndex++);
    206                     rightCase = (uint32_t)ce;
    207                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
    208                 rightCase &= 0xc000;
    209             } else {
    210                 // Secondary+caseLevel: By analogy with the above,
    211                 // ignore case level weights of secondary ignorables.
    212                 //
    213                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
    214                 // to keep tertiary+caseFirst well-formed.
    215                 //
    216                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
    217                 // Otherwise a tertiary CE's uppercase would be no greater than
    218                 // a primary/secondary CE's uppercase.
    219                 // (See UCA well-formedness condition 2.)
    220                 // We could construct a special case weight higher than uppercase,
    221                 // but it's simpler to always ignore case weights of secondary ignorables,
    222                 // turning 0.0.ut into 0.0.0.t.
    223                 // (See LDML Collation, Case Parameters.)
    224                 do {
    225                     leftCase = (uint32_t)left.getCE(leftIndex++);
    226                 } while(leftCase <= 0xffff);
    227                 leftLower32 = leftCase;
    228                 leftCase &= 0xc000;
    229 
    230                 do {
    231                     rightCase = (uint32_t)right.getCE(rightIndex++);
    232                 } while(rightCase <= 0xffff);
    233                 rightCase &= 0xc000;
    234             }
    235 
    236             // No need to handle NO_CE and MERGE_SEPARATOR specially:
    237             // There is one case weight for each previous-level weight,
    238             // so level length differences were handled there.
    239             if(leftCase != rightCase) {
    240                 if((options & CollationSettings::UPPER_FIRST) == 0) {
    241                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    242                 } else {
    243                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    244                 }
    245             }
    246             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
    247         }
    248     }
    249     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    250 
    251     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
    252 
    253     int32_t leftIndex = 0;
    254     int32_t rightIndex = 0;
    255     uint32_t anyQuaternaries = 0;
    256     for(;;) {
    257         uint32_t leftLower32, leftTertiary;
    258         do {
    259             leftLower32 = (uint32_t)left.getCE(leftIndex++);
    260             anyQuaternaries |= leftLower32;
    261             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    262                      (leftLower32 & 0xc0c0) == 0);
    263             leftTertiary = leftLower32 & tertiaryMask;
    264         } while(leftTertiary == 0);
    265 
    266         uint32_t rightLower32, rightTertiary;
    267         do {
    268             rightLower32 = (uint32_t)right.getCE(rightIndex++);
    269             anyQuaternaries |= rightLower32;
    270             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    271                      (rightLower32 & 0xc0c0) == 0);
    272             rightTertiary = rightLower32 & tertiaryMask;
    273         } while(rightTertiary == 0);
    274 
    275         if(leftTertiary != rightTertiary) {
    276             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    277                 // Pass through NO_CE and keep real tertiary weights larger than that.
    278                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
    279                 // to keep tertiary CEs well-formed.
    280                 // Their case+tertiary weights must be greater than those of
    281                 // primary and secondary CEs.
    282                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
    283                     if(leftLower32 > 0xffff) {
    284                         leftTertiary ^= 0xc000;
    285                     } else {
    286                         leftTertiary += 0x4000;
    287                     }
    288                 }
    289                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
    290                     if(rightLower32 > 0xffff) {
    291                         rightTertiary ^= 0xc000;
    292                     } else {
    293                         rightTertiary += 0x4000;
    294                     }
    295                 }
    296             }
    297             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    298         }
    299         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
    300     }
    301     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    302 
    303     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
    304         // If there are no "variable" CEs and no non-zero quaternary weights,
    305         // then there are no quaternary differences.
    306         return UCOL_EQUAL;
    307     }
    308 
    309     leftIndex = 0;
    310     rightIndex = 0;
    311     for(;;) {
    312         uint32_t leftQuaternary;
    313         do {
    314             int64_t ce = left.getCE(leftIndex++);
    315             leftQuaternary = (uint32_t)ce & 0xffff;
    316             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
    317                 // Variable primary or completely ignorable or NO_CE.
    318                 leftQuaternary = (uint32_t)(ce >> 32);
    319             } else {
    320                 // Regular CE, not tertiary ignorable.
    321                 // Preserve the quaternary weight in bits 7..6.
    322                 leftQuaternary |= 0xffffff3f;
    323             }
    324         } while(leftQuaternary == 0);
    325 
    326         uint32_t rightQuaternary;
    327         do {
    328             int64_t ce = right.getCE(rightIndex++);
    329             rightQuaternary = (uint32_t)ce & 0xffff;
    330             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
    331                 // Variable primary or completely ignorable or NO_CE.
    332                 rightQuaternary = (uint32_t)(ce >> 32);
    333             } else {
    334                 // Regular CE, not tertiary ignorable.
    335                 // Preserve the quaternary weight in bits 7..6.
    336                 rightQuaternary |= 0xffffff3f;
    337             }
    338         } while(rightQuaternary == 0);
    339 
    340         if(leftQuaternary != rightQuaternary) {
    341             // Return the difference, with script reordering.
    342             if(settings.hasReordering()) {
    343                 leftQuaternary = settings.reorder(leftQuaternary);
    344                 rightQuaternary = settings.reorder(rightQuaternary);
    345             }
    346             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    347         }
    348         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
    349     }
    350     return UCOL_EQUAL;
    351 }
    352 
    353 U_NAMESPACE_END
    354 
    355 #endif  // !UCONFIG_NO_COLLATION
    356