Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationrootelements.cpp
      9 *
     10 * created on: 2013mar05
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "collation.h"
     19 #include "collationrootelements.h"
     20 #include "uassert.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 int64_t
     25 CollationRootElements::lastCEWithPrimaryBefore(uint32_t p) const {
     26     if(p == 0) { return 0; }
     27     U_ASSERT(p > elements[elements[IX_FIRST_PRIMARY_INDEX]]);
     28     int32_t index = findP(p);
     29     uint32_t q = elements[index];
     30     uint32_t secTer;
     31     if(p == (q & 0xffffff00)) {
     32         // p == elements[index] is a root primary. Find the CE before it.
     33         // We must not be in a primary range.
     34         U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
     35         secTer = elements[index - 1];
     36         if((secTer & SEC_TER_DELTA_FLAG) == 0) {
     37             // Primary CE just before p.
     38             p = secTer & 0xffffff00;
     39             secTer = Collation::COMMON_SEC_AND_TER_CE;
     40         } else {
     41             // secTer = last secondary & tertiary for the previous primary
     42             index -= 2;
     43             for(;;) {
     44                 p = elements[index];
     45                 if((p & SEC_TER_DELTA_FLAG) == 0) {
     46                     p &= 0xffffff00;
     47                     break;
     48                 }
     49                 --index;
     50             }
     51         }
     52     } else {
     53         // p > elements[index] which is the previous primary.
     54         // Find the last secondary & tertiary weights for it.
     55         p = q & 0xffffff00;
     56         secTer = Collation::COMMON_SEC_AND_TER_CE;
     57         for(;;) {
     58             q = elements[++index];
     59             if((q & SEC_TER_DELTA_FLAG) == 0) {
     60                 // We must not be in a primary range.
     61                 U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
     62                 break;
     63             }
     64             secTer = q;
     65         }
     66     }
     67     return ((int64_t)p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
     68 }
     69 
     70 int64_t
     71 CollationRootElements::firstCEWithPrimaryAtLeast(uint32_t p) const {
     72     if(p == 0) { return 0; }
     73     int32_t index = findP(p);
     74     if(p != (elements[index] & 0xffffff00)) {
     75         for(;;) {
     76             p = elements[++index];
     77             if((p & SEC_TER_DELTA_FLAG) == 0) {
     78                 // First primary after p. We must not be in a primary range.
     79                 U_ASSERT((p & PRIMARY_STEP_MASK) == 0);
     80                 break;
     81             }
     82         }
     83     }
     84     // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0.
     85     return ((int64_t)p << 32) | Collation::COMMON_SEC_AND_TER_CE;
     86 }
     87 
     88 uint32_t
     89 CollationRootElements::getPrimaryBefore(uint32_t p, UBool isCompressible) const {
     90     int32_t index = findPrimary(p);
     91     int32_t step;
     92     uint32_t q = elements[index];
     93     if(p == (q & 0xffffff00)) {
     94         // Found p itself. Return the previous primary.
     95         // See if p is at the end of a previous range.
     96         step = (int32_t)q & PRIMARY_STEP_MASK;
     97         if(step == 0) {
     98             // p is not at the end of a range. Look for the previous primary.
     99             do {
    100                 p = elements[--index];
    101             } while((p & SEC_TER_DELTA_FLAG) != 0);
    102             return p & 0xffffff00;
    103         }
    104     } else {
    105         // p is in a range, and not at the start.
    106         uint32_t nextElement = elements[index + 1];
    107         U_ASSERT(isEndOfPrimaryRange(nextElement));
    108         step = (int32_t)nextElement & PRIMARY_STEP_MASK;
    109     }
    110     // Return the previous range primary.
    111     if((p & 0xffff) == 0) {
    112         return Collation::decTwoBytePrimaryByOneStep(p, isCompressible, step);
    113     } else {
    114         return Collation::decThreeBytePrimaryByOneStep(p, isCompressible, step);
    115     }
    116 }
    117 
    118 uint32_t
    119 CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
    120     int32_t index;
    121     uint32_t previousSec, sec;
    122     if(p == 0) {
    123         index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
    124         // Gap at the beginning of the secondary CE range.
    125         previousSec = 0;
    126         sec = elements[index] >> 16;
    127     } else {
    128         index = findPrimary(p) + 1;
    129         previousSec = Collation::BEFORE_WEIGHT16;
    130         sec = getFirstSecTerForPrimary(index) >> 16;
    131     }
    132     U_ASSERT(s >= sec);
    133     while(s > sec) {
    134         previousSec = sec;
    135         U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0);
    136         sec = elements[index++] >> 16;
    137     }
    138     U_ASSERT(sec == s);
    139     return previousSec;
    140 }
    141 
    142 uint32_t
    143 CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const {
    144     U_ASSERT((t & ~Collation::ONLY_TERTIARY_MASK) == 0);
    145     int32_t index;
    146     uint32_t previousTer, secTer;
    147     if(p == 0) {
    148         if(s == 0) {
    149             index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
    150             // Gap at the beginning of the tertiary CE range.
    151             previousTer = 0;
    152         } else {
    153             index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
    154             previousTer = Collation::BEFORE_WEIGHT16;
    155         }
    156         secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
    157     } else {
    158         index = findPrimary(p) + 1;
    159         previousTer = Collation::BEFORE_WEIGHT16;
    160         secTer = getFirstSecTerForPrimary(index);
    161     }
    162     uint32_t st = (s << 16) | t;
    163     while(st > secTer) {
    164         if((secTer >> 16) == s) { previousTer = secTer; }
    165         U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0);
    166         secTer = elements[index++] & ~SEC_TER_DELTA_FLAG;
    167     }
    168     U_ASSERT(secTer == st);
    169     return previousTer & 0xffff;
    170 }
    171 
    172 uint32_t
    173 CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const {
    174     U_ASSERT(p == (elements[index] & 0xffffff00) || isEndOfPrimaryRange(elements[index + 1]));
    175     uint32_t q = elements[++index];
    176     int32_t step;
    177     if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int32_t)q & PRIMARY_STEP_MASK) != 0) {
    178         // Return the next primary in this range.
    179         if((p & 0xffff) == 0) {
    180             return Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
    181         } else {
    182             return Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
    183         }
    184     } else {
    185         // Return the next primary in the list.
    186         while((q & SEC_TER_DELTA_FLAG) != 0) {
    187             q = elements[++index];
    188         }
    189         U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
    190         return q;
    191     }
    192 }
    193 
    194 uint32_t
    195 CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const {
    196     uint32_t secTer;
    197     uint32_t secLimit;
    198     if(index == 0) {
    199         // primary = 0
    200         U_ASSERT(s != 0);
    201         index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
    202         secTer = elements[index];
    203         // Gap at the end of the secondary CE range.
    204         secLimit = 0x10000;
    205     } else {
    206         U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
    207         secTer = getFirstSecTerForPrimary(index + 1);
    208         // If this is an explicit sec/ter unit, then it will be read once more.
    209         // Gap for secondaries of primary CEs.
    210         secLimit = getSecondaryBoundary();
    211     }
    212     for(;;) {
    213         uint32_t sec = secTer >> 16;
    214         if(sec > s) { return sec; }
    215         secTer = elements[++index];
    216         if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
    217     }
    218 }
    219 
    220 uint32_t
    221 CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const {
    222     uint32_t secTer;
    223     uint32_t terLimit;
    224     if(index == 0) {
    225         // primary = 0
    226         if(s == 0) {
    227             U_ASSERT(t != 0);
    228             index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
    229             // Gap at the end of the tertiary CE range.
    230             terLimit = 0x4000;
    231         } else {
    232             index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
    233             // Gap for tertiaries of primary/secondary CEs.
    234             terLimit = getTertiaryBoundary();
    235         }
    236         secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
    237     } else {
    238         U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
    239         secTer = getFirstSecTerForPrimary(index + 1);
    240         // If this is an explicit sec/ter unit, then it will be read once more.
    241         terLimit = getTertiaryBoundary();
    242     }
    243     uint32_t st = (s << 16) | t;
    244     for(;;) {
    245         if(secTer > st) {
    246             U_ASSERT((secTer >> 16) == s);
    247             return secTer & 0xffff;
    248         }
    249         secTer = elements[++index];
    250         // No tertiary greater than t for this primary+secondary.
    251         if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
    252         secTer &= ~SEC_TER_DELTA_FLAG;
    253     }
    254 }
    255 
    256 uint32_t
    257 CollationRootElements::getFirstSecTerForPrimary(int32_t index) const {
    258     uint32_t secTer = elements[index];
    259     if((secTer & SEC_TER_DELTA_FLAG) == 0) {
    260         // No sec/ter delta.
    261         return Collation::COMMON_SEC_AND_TER_CE;
    262     }
    263     secTer &= ~SEC_TER_DELTA_FLAG;
    264     if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
    265         // Implied sec/ter.
    266         return Collation::COMMON_SEC_AND_TER_CE;
    267     }
    268     // Explicit sec/ter below common/common.
    269     return secTer;
    270 }
    271 
    272 int32_t
    273 CollationRootElements::findPrimary(uint32_t p) const {
    274     // Requirement: p must occur as a root primary.
    275     U_ASSERT((p & 0xff) == 0);  // at most a 3-byte primary
    276     int32_t index = findP(p);
    277     // If p is in a range, then we just assume that p is an actual primary in this range.
    278     // (Too cumbersome/expensive to check.)
    279     // Otherwise, it must be an exact match.
    280     U_ASSERT(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00));
    281     return index;
    282 }
    283 
    284 int32_t
    285 CollationRootElements::findP(uint32_t p) const {
    286     // p need not occur as a root primary.
    287     // For example, it might be a reordering group boundary.
    288     U_ASSERT((p >> 24) != Collation::UNASSIGNED_IMPLICIT_BYTE);
    289     // modified binary search
    290     int32_t start = (int32_t)elements[IX_FIRST_PRIMARY_INDEX];
    291     U_ASSERT(p >= elements[start]);
    292     int32_t limit = length - 1;
    293     U_ASSERT(elements[limit] >= PRIMARY_SENTINEL);
    294     U_ASSERT(p < elements[limit]);
    295     while((start + 1) < limit) {
    296         // Invariant: elements[start] and elements[limit] are primaries,
    297         // and elements[start]<=p<=elements[limit].
    298         int32_t i = (start + limit) / 2;
    299         uint32_t q = elements[i];
    300         if((q & SEC_TER_DELTA_FLAG) != 0) {
    301             // Find the next primary.
    302             int32_t j = i + 1;
    303             for(;;) {
    304                 if(j == limit) { break; }
    305                 q = elements[j];
    306                 if((q & SEC_TER_DELTA_FLAG) == 0) {
    307                     i = j;
    308                     break;
    309                 }
    310                 ++j;
    311             }
    312             if((q & SEC_TER_DELTA_FLAG) != 0) {
    313                 // Find the preceding primary.
    314                 j = i - 1;
    315                 for(;;) {
    316                     if(j == start) { break; }
    317                     q = elements[j];
    318                     if((q & SEC_TER_DELTA_FLAG) == 0) {
    319                         i = j;
    320                         break;
    321                     }
    322                     --j;
    323                 }
    324                 if((q & SEC_TER_DELTA_FLAG) != 0) {
    325                     // No primary between start and limit.
    326                     break;
    327                 }
    328             }
    329         }
    330         if(p < (q & 0xffffff00)) {  // Reset the "step" bits of a range end primary.
    331             limit = i;
    332         } else {
    333             start = i;
    334         }
    335     }
    336     return start;
    337 }
    338 
    339 U_NAMESPACE_END
    340 
    341 #endif  // !UCONFIG_NO_COLLATION
    342