Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2015, International Business Machines
      6  * Corporation and others.  All Rights Reserved.
      7  *******************************************************************************
      8  * CollationCompare.java, ported from collationcompare.h/.cpp
      9  *
     10  * C++ version created on: 2012feb14 with new and old collation code
     11  * created by: Markus W. Scherer
     12  */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import com.ibm.icu.text.Collator;
     17 
     18 public final class CollationCompare /* all static */ {
     19     public static int compareUpToQuaternary(CollationIterator left, CollationIterator right,
     20             CollationSettings settings) {
     21         int options = settings.options;
     22         long variableTop;
     23         if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
     24             variableTop = 0;
     25         } else {
     26             // +1 so that we can use "<" and primary ignorables test out early.
     27             variableTop = settings.variableTop + 1;
     28         }
     29         boolean anyVariable = false;
     30 
     31         // Fetch CEs, compare primaries, store secondary & tertiary weights.
     32         for (;;) {
     33             // We fetch CEs until we get a non-ignorable primary or reach the end.
     34             long leftPrimary;
     35             do {
     36                 long ce = left.nextCE();
     37                 leftPrimary = ce >>> 32;
     38                 if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
     39                     // Variable CE, shift it to quaternary level.
     40                     // Ignore all following primary ignorables, and shift further variable CEs.
     41                     anyVariable = true;
     42                     do {
     43                         // Store only the primary of the variable CE.
     44                         left.setCurrentCE(ce & 0xffffffff00000000L);
     45                         for (;;) {
     46                             ce = left.nextCE();
     47                             leftPrimary = ce >>> 32;
     48                             if (leftPrimary == 0) {
     49                                 left.setCurrentCE(0);
     50                             } else {
     51                                 break;
     52                             }
     53                         }
     54                     } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
     55                 }
     56             } while (leftPrimary == 0);
     57 
     58             long rightPrimary;
     59             do {
     60                 long ce = right.nextCE();
     61                 rightPrimary = ce >>> 32;
     62                 if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
     63                     // Variable CE, shift it to quaternary level.
     64                     // Ignore all following primary ignorables, and shift further variable CEs.
     65                     anyVariable = true;
     66                     do {
     67                         // Store only the primary of the variable CE.
     68                         right.setCurrentCE(ce & 0xffffffff00000000L);
     69                         for (;;) {
     70                             ce = right.nextCE();
     71                             rightPrimary = ce >>> 32;
     72                             if (rightPrimary == 0) {
     73                                 right.setCurrentCE(0);
     74                             } else {
     75                                 break;
     76                             }
     77                         }
     78                     } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
     79                 }
     80             } while (rightPrimary == 0);
     81 
     82             if (leftPrimary != rightPrimary) {
     83                 // Return the primary difference, with script reordering.
     84                 if (settings.hasReordering()) {
     85                     leftPrimary = settings.reorder(leftPrimary);
     86                     rightPrimary = settings.reorder(rightPrimary);
     87                 }
     88                 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
     89             }
     90             if (leftPrimary == Collation.NO_CE_PRIMARY) {
     91                 break;
     92             }
     93         }
     94 
     95         // Compare the buffered secondary & tertiary weights.
     96         // We might skip the secondary level but continue with the case level
     97         // which is turned on separately.
     98         if (CollationSettings.getStrength(options) >= Collator.SECONDARY) {
     99             if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
    100                 int leftIndex = 0;
    101                 int rightIndex = 0;
    102                 for (;;) {
    103                     int leftSecondary;
    104                     do {
    105                         leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16;
    106                     } while (leftSecondary == 0);
    107 
    108                     int rightSecondary;
    109                     do {
    110                         rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16;
    111                     } while (rightSecondary == 0);
    112 
    113                     if (leftSecondary != rightSecondary) {
    114                         return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
    115                     }
    116                     if (leftSecondary == Collation.NO_CE_WEIGHT16) {
    117                         break;
    118                     }
    119                 }
    120             } else {
    121                 // The backwards secondary level compares secondary weights backwards
    122                 // within segments separated by the merge separator (U+FFFE, weight 02).
    123                 int leftStart = 0;
    124                 int rightStart = 0;
    125                 for (;;) {
    126                     // Find the merge separator or the NO_CE terminator.
    127                     long p;
    128                     int leftLimit = leftStart;
    129                     while ((p = left.getCE(leftLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
    130                             || p == 0) {
    131                         ++leftLimit;
    132                     }
    133                     int rightLimit = rightStart;
    134                     while ((p = right.getCE(rightLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
    135                             || p == 0) {
    136                         ++rightLimit;
    137                     }
    138 
    139                     // Compare the segments.
    140                     int leftIndex = leftLimit;
    141                     int rightIndex = rightLimit;
    142                     for (;;) {
    143                         int leftSecondary = 0;
    144                         while (leftSecondary == 0 && leftIndex > leftStart) {
    145                             leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16;
    146                         }
    147 
    148                         int rightSecondary = 0;
    149                         while (rightSecondary == 0 && rightIndex > rightStart) {
    150                             rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16;
    151                         }
    152 
    153                         if (leftSecondary != rightSecondary) {
    154                             return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
    155                         }
    156                         if (leftSecondary == 0) {
    157                             break;
    158                         }
    159                     }
    160 
    161                     // Did we reach the end of either string?
    162                     // Both strings have the same number of merge separators,
    163                     // or else there would have been a primary-level difference.
    164                     assert (left.getCE(leftLimit) == right.getCE(rightLimit));
    165                     if (p == Collation.NO_CE_PRIMARY) {
    166                         break;
    167                     }
    168                     // Skip both merge separators and continue.
    169                     leftStart = leftLimit + 1;
    170                     rightStart = rightLimit + 1;
    171                 }
    172             }
    173         }
    174 
    175         if ((options & CollationSettings.CASE_LEVEL) != 0) {
    176             int strength = CollationSettings.getStrength(options);
    177             int leftIndex = 0;
    178             int rightIndex = 0;
    179             for (;;) {
    180                 int leftCase, leftLower32, rightCase;
    181                 if (strength == Collator.PRIMARY) {
    182                     // Primary+caseLevel: Ignore case level weights of primary ignorables.
    183                     // Otherwise we would get a-umlaut > a
    184                     // which is not desirable for accent-insensitive sorting.
    185                     // Check for (lower 32 bits) == 0 as well because variable CEs are stored
    186                     // with only primary weights.
    187                     long ce;
    188                     do {
    189                         ce = left.getCE(leftIndex++);
    190                         leftCase = (int) ce;
    191                     } while ((ce >>> 32) == 0 || leftCase == 0);
    192                     leftLower32 = leftCase;
    193                     leftCase &= 0xc000;
    194 
    195                     do {
    196                         ce = right.getCE(rightIndex++);
    197                         rightCase = (int) ce;
    198                     } while ((ce >>> 32) == 0 || rightCase == 0);
    199                     rightCase &= 0xc000;
    200                 } else {
    201                     // Secondary+caseLevel: By analogy with the above,
    202                     // ignore case level weights of secondary ignorables.
    203                     //
    204                     // Note: A tertiary CE has uppercase case bits (0.0.ut)
    205                     // to keep tertiary+caseFirst well-formed.
    206                     //
    207                     // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
    208                     // Otherwise a tertiary CE's uppercase would be no greater than
    209                     // a primary/secondary CE's uppercase.
    210                     // (See UCA well-formedness condition 2.)
    211                     // We could construct a special case weight higher than uppercase,
    212                     // but it's simpler to always ignore case weights of secondary ignorables,
    213                     // turning 0.0.ut into 0.0.0.t.
    214                     // (See LDML Collation, Case Parameters.)
    215                     do {
    216                         leftCase = (int) left.getCE(leftIndex++);
    217                     } while ((leftCase & 0xffff0000) == 0);
    218                     leftLower32 = leftCase;
    219                     leftCase &= 0xc000;
    220 
    221                     do {
    222                         rightCase = (int) right.getCE(rightIndex++);
    223                     } while ((rightCase & 0xffff0000) == 0);
    224                     rightCase &= 0xc000;
    225                 }
    226 
    227                 // No need to handle NO_CE and MERGE_SEPARATOR specially:
    228                 // There is one case weight for each previous-level weight,
    229                 // so level length differences were handled there.
    230                 if (leftCase != rightCase) {
    231                     if ((options & CollationSettings.UPPER_FIRST) == 0) {
    232                         return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
    233                     } else {
    234                         return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
    235                     }
    236                 }
    237                 if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) {
    238                     break;
    239                 }
    240             }
    241         }
    242         if (CollationSettings.getStrength(options) <= Collator.SECONDARY) {
    243             return Collation.EQUAL;
    244         }
    245 
    246         int tertiaryMask = CollationSettings.getTertiaryMask(options);
    247 
    248         int leftIndex = 0;
    249         int rightIndex = 0;
    250         int anyQuaternaries = 0;
    251         for (;;) {
    252             int leftLower32, leftTertiary;
    253             do {
    254                 leftLower32 = (int) left.getCE(leftIndex++);
    255                 anyQuaternaries |= leftLower32;
    256                 assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0);
    257                 leftTertiary = leftLower32 & tertiaryMask;
    258             } while (leftTertiary == 0);
    259 
    260             int rightLower32, rightTertiary;
    261             do {
    262                 rightLower32 = (int) right.getCE(rightIndex++);
    263                 anyQuaternaries |= rightLower32;
    264                 assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0);
    265                 rightTertiary = rightLower32 & tertiaryMask;
    266             } while (rightTertiary == 0);
    267 
    268             if (leftTertiary != rightTertiary) {
    269                 if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
    270                     // Pass through NO_CE and keep real tertiary weights larger than that.
    271                     // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
    272                     // to keep tertiary CEs well-formed.
    273                     // Their case+tertiary weights must be greater than those of
    274                     // primary and secondary CEs.
    275                     if (leftTertiary > Collation.NO_CE_WEIGHT16) {
    276                         if ((leftLower32 & 0xffff0000) != 0) {
    277                             leftTertiary ^= 0xc000;
    278                         } else {
    279                             leftTertiary += 0x4000;
    280                         }
    281                     }
    282                     if (rightTertiary > Collation.NO_CE_WEIGHT16) {
    283                         if ((rightLower32 & 0xffff0000) != 0) {
    284                             rightTertiary ^= 0xc000;
    285                         } else {
    286                             rightTertiary += 0x4000;
    287                         }
    288                     }
    289                 }
    290                 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
    291             }
    292             if (leftTertiary == Collation.NO_CE_WEIGHT16) {
    293                 break;
    294             }
    295         }
    296         if (CollationSettings.getStrength(options) <= Collator.TERTIARY) {
    297             return Collation.EQUAL;
    298         }
    299 
    300         if (!anyVariable && (anyQuaternaries & 0xc0) == 0) {
    301             // If there are no "variable" CEs and no non-zero quaternary weights,
    302             // then there are no quaternary differences.
    303             return Collation.EQUAL;
    304         }
    305 
    306         leftIndex = 0;
    307         rightIndex = 0;
    308         for (;;) {
    309             long leftQuaternary;
    310             do {
    311                 long ce = left.getCE(leftIndex++);
    312                 leftQuaternary = ce & 0xffff;
    313                 if (leftQuaternary <= Collation.NO_CE_WEIGHT16) {
    314                     // Variable primary or completely ignorable or NO_CE.
    315                     leftQuaternary = ce >>> 32;
    316                 } else {
    317                     // Regular CE, not tertiary ignorable.
    318                     // Preserve the quaternary weight in bits 7..6.
    319                     leftQuaternary |= 0xffffff3fL;
    320                 }
    321             } while (leftQuaternary == 0);
    322 
    323             long rightQuaternary;
    324             do {
    325                 long ce = right.getCE(rightIndex++);
    326                 rightQuaternary = ce & 0xffff;
    327                 if (rightQuaternary <= Collation.NO_CE_WEIGHT16) {
    328                     // Variable primary or completely ignorable or NO_CE.
    329                     rightQuaternary = ce >>> 32;
    330                 } else {
    331                     // Regular CE, not tertiary ignorable.
    332                     // Preserve the quaternary weight in bits 7..6.
    333                     rightQuaternary |= 0xffffff3fL;
    334                 }
    335             } while (rightQuaternary == 0);
    336 
    337             if (leftQuaternary != rightQuaternary) {
    338                 // Return the difference, with script reordering.
    339                 if (settings.hasReordering()) {
    340                     leftQuaternary = settings.reorder(leftQuaternary);
    341                     rightQuaternary = settings.reorder(rightQuaternary);
    342                 }
    343                 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
    344             }
    345             if (leftQuaternary == Collation.NO_CE_PRIMARY) {
    346                 break;
    347             }
    348         }
    349         return Collation.EQUAL;
    350     }
    351 }
    352