Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdata.cpp
      9 *
     10 * created on: 2012jul28
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/uscript.h"
     21 #include "cmemory.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 #include "uassert.h"
     25 #include "utrie2.h"
     26 #include "uvectr32.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 uint32_t
     31 CollationData::getIndirectCE32(uint32_t ce32) const {
     32     U_ASSERT(Collation::isSpecialCE32(ce32));
     33     int32_t tag = Collation::tagFromCE32(ce32);
     34     if(tag == Collation::DIGIT_TAG) {
     35         // Fetch the non-numeric-collation CE32.
     36         ce32 = ce32s[Collation::indexFromCE32(ce32)];
     37     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
     38         ce32 = Collation::UNASSIGNED_CE32;
     39     } else if(tag == Collation::U0000_TAG) {
     40         // Fetch the normal ce32 for U+0000.
     41         ce32 = ce32s[0];
     42     }
     43     return ce32;
     44 }
     45 
     46 uint32_t
     47 CollationData::getFinalCE32(uint32_t ce32) const {
     48     if(Collation::isSpecialCE32(ce32)) {
     49         ce32 = getIndirectCE32(ce32);
     50     }
     51     return ce32;
     52 }
     53 
     54 int64_t
     55 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
     56     if(U_FAILURE(errorCode)) { return 0; }
     57     // Keep parallel with CollationDataBuilder::getSingleCE().
     58     const CollationData *d;
     59     uint32_t ce32 = getCE32(c);
     60     if(ce32 == Collation::FALLBACK_CE32) {
     61         d = base;
     62         ce32 = base->getCE32(c);
     63     } else {
     64         d = this;
     65     }
     66     while(Collation::isSpecialCE32(ce32)) {
     67         switch(Collation::tagFromCE32(ce32)) {
     68         case Collation::LATIN_EXPANSION_TAG:
     69         case Collation::BUILDER_DATA_TAG:
     70         case Collation::PREFIX_TAG:
     71         case Collation::CONTRACTION_TAG:
     72         case Collation::HANGUL_TAG:
     73         case Collation::LEAD_SURROGATE_TAG:
     74             errorCode = U_UNSUPPORTED_ERROR;
     75             return 0;
     76         case Collation::FALLBACK_TAG:
     77         case Collation::RESERVED_TAG_3:
     78             errorCode = U_INTERNAL_PROGRAM_ERROR;
     79             return 0;
     80         case Collation::LONG_PRIMARY_TAG:
     81             return Collation::ceFromLongPrimaryCE32(ce32);
     82         case Collation::LONG_SECONDARY_TAG:
     83             return Collation::ceFromLongSecondaryCE32(ce32);
     84         case Collation::EXPANSION32_TAG:
     85             if(Collation::lengthFromCE32(ce32) == 1) {
     86                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
     87                 break;
     88             } else {
     89                 errorCode = U_UNSUPPORTED_ERROR;
     90                 return 0;
     91             }
     92         case Collation::EXPANSION_TAG: {
     93             if(Collation::lengthFromCE32(ce32) == 1) {
     94                 return d->ces[Collation::indexFromCE32(ce32)];
     95             } else {
     96                 errorCode = U_UNSUPPORTED_ERROR;
     97                 return 0;
     98             }
     99         }
    100         case Collation::DIGIT_TAG:
    101             // Fetch the non-numeric-collation CE32 and continue.
    102             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
    103             break;
    104         case Collation::U0000_TAG:
    105             U_ASSERT(c == 0);
    106             // Fetch the normal ce32 for U+0000 and continue.
    107             ce32 = d->ce32s[0];
    108             break;
    109         case Collation::OFFSET_TAG:
    110             return d->getCEFromOffsetCE32(c, ce32);
    111         case Collation::IMPLICIT_TAG:
    112             return Collation::unassignedCEFromCodePoint(c);
    113         }
    114     }
    115     return Collation::ceFromSimpleCE32(ce32);
    116 }
    117 
    118 uint32_t
    119 CollationData::getFirstPrimaryForGroup(int32_t script) const {
    120     int32_t index = getScriptIndex(script);
    121     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
    122 }
    123 
    124 uint32_t
    125 CollationData::getLastPrimaryForGroup(int32_t script) const {
    126     int32_t index = getScriptIndex(script);
    127     if(index == 0) {
    128         return 0;
    129     }
    130     uint32_t limit = scriptStarts[index + 1];
    131     return (limit << 16) - 1;
    132 }
    133 
    134 int32_t
    135 CollationData::getGroupForPrimary(uint32_t p) const {
    136     p >>= 16;
    137     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
    138         return -1;
    139     }
    140     int32_t index = 1;
    141     while(p >= scriptStarts[index + 1]) { ++index; }
    142     for(int32_t i = 0; i < numScripts; ++i) {
    143         if(scriptsIndex[i] == index) {
    144             return i;
    145         }
    146     }
    147     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    148         if(scriptsIndex[numScripts + i] == index) {
    149             return UCOL_REORDER_CODE_FIRST + i;
    150         }
    151     }
    152     return -1;
    153 }
    154 
    155 int32_t
    156 CollationData::getScriptIndex(int32_t script) const {
    157     if(script < 0) {
    158         return 0;
    159     } else if(script < numScripts) {
    160         return scriptsIndex[script];
    161     } else if(script < UCOL_REORDER_CODE_FIRST) {
    162         return 0;
    163     } else {
    164         script -= UCOL_REORDER_CODE_FIRST;
    165         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
    166             return scriptsIndex[numScripts + script];
    167         } else {
    168             return 0;
    169         }
    170     }
    171 }
    172 
    173 int32_t
    174 CollationData::getEquivalentScripts(int32_t script,
    175                                     int32_t dest[], int32_t capacity,
    176                                     UErrorCode &errorCode) const {
    177     if(U_FAILURE(errorCode)) { return 0; }
    178     int32_t index = getScriptIndex(script);
    179     if(index == 0) { return 0; }
    180     if(script >= UCOL_REORDER_CODE_FIRST) {
    181         // Special groups have no aliases.
    182         if(capacity > 0) {
    183             dest[0] = script;
    184         } else {
    185             errorCode = U_BUFFER_OVERFLOW_ERROR;
    186         }
    187         return 1;
    188     }
    189 
    190     int32_t length = 0;
    191     for(int32_t i = 0; i < numScripts; ++i) {
    192         if(scriptsIndex[i] == index) {
    193             if(length < capacity) {
    194                 dest[length] = i;
    195             }
    196             ++length;
    197         }
    198     }
    199     if(length > capacity) {
    200         errorCode = U_BUFFER_OVERFLOW_ERROR;
    201     }
    202     return length;
    203 }
    204 
    205 void
    206 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    207                                  UVector32 &ranges, UErrorCode &errorCode) const {
    208     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
    209 }
    210 
    211 void
    212 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    213                                  UBool latinMustMove,
    214                                  UVector32 &ranges, UErrorCode &errorCode) const {
            // Computes the reordering ranges for the script/reorder codes in
            // reorder[0..length-1] and stores them in `ranges`, one element per range
            // (see the encoding described before the final loop).
            //
            // `ranges` is emptied first. It stays empty when the list is empty or
            // consists only of USCRIPT_UNKNOWN, and nothing has been added to it yet
            // when one of the error returns below is taken.
            //
            // latinMustMove: when FALSE, and Latin is the first code with no special
            // reorder codes in the list, the gap before Latin is skipped so that Latin
            // keeps its lead bytes; if that makes the result overflow, the function
            // calls itself once more with TRUE.
            //
            // Errors set in errorCode:
            // - U_ILLEGAL_ARGUMENT_ERROR: UCOL_REORDER_CODE_DEFAULT in the list,
            //   a second USCRIPT_UNKNOWN, or a duplicate/equivalent script.
            // - U_BUFFER_OVERFLOW_ERROR: more primary lead bytes needed than available.
    215     if(U_FAILURE(errorCode)) { return; }
    216     ranges.removeAllElements();
    217     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
    218         return;
    219     }
    220 
    221     // Maps each script-or-group range to a new lead byte.
            // 0 means "no lead byte assigned yet"; 0xff marks a reserved "don't care" range.
    222     uint8_t table[MAX_NUM_SCRIPT_RANGES];
    223     uprv_memset(table, 0, sizeof(table));
    224 
    225     {
    226         // Set "don't care" values for reserved ranges.
    227         int32_t index = scriptsIndex[
    228                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
    229         if(index != 0) {
    230             table[index] = 0xff;
    231         }
    232         index = scriptsIndex[
    233                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
    234         if(index != 0) {
    235             table[index] = 0xff;
    236         }
    237     }
    238 
    239     // Never reorder special low and high primary lead bytes.
            // lowStart is where the next range placed from the bottom begins;
            // highLimit is the limit of the next range placed from the top.
    240     U_ASSERT(scriptStartsLength >= 2);
    241     U_ASSERT(scriptStarts[0] == 0);
    242     int32_t lowStart = scriptStarts[1];
    243     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
    244     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
    245     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
    246 
    247     // Get the set of special reorder codes in the input list.
    248     // This supports a fixed number of special reorder codes;
    249     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
            // Bit i of `specials` is set when (UCOL_REORDER_CODE_FIRST + i) is in the list.
    250     uint32_t specials = 0;
    251     for(int32_t i = 0; i < length; ++i) {
    252         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
    253         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
    254             specials |= (uint32_t)1 << reorderCode;
    255         }
    256     }
    257 
    258     // Start the reordering with the special low reorder codes that do not occur in the input.
    259     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    260         int32_t index = scriptsIndex[numScripts + i];
    261         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
    262             lowStart = addLowScriptRange(table, index, lowStart);
    263         }
    264     }
    265 
    266     // Skip the reserved range before Latin if Latin is the first script,
    267     // so that we do not move it unnecessarily.
            // skippedReserved remembers how much was skipped, for the overflow check below.
    268     int32_t skippedReserved = 0;
    269     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
    270         int32_t index = scriptsIndex[USCRIPT_LATIN];
    271         U_ASSERT(index != 0);
    272         int32_t start = scriptStarts[index];
    273         U_ASSERT(lowStart <= start);
    274         skippedReserved = start - lowStart;
    275         lowStart = start;
    276     }
    277 
    278     // Reorder according to the input scripts, continuing from the bottom of the primary range.
    279     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
    280     UBool hasReorderToEnd = FALSE;
    281     for(int32_t i = 0; i < length;) {
    282         int32_t script = reorder[i++];
    283         if(script == USCRIPT_UNKNOWN) {
    284             // Put the remaining scripts at the top.
                    // The codes after USCRIPT_UNKNOWN are consumed from the end of the list
                    // backwards (length is decremented), each placed below the previous one.
    285             hasReorderToEnd = TRUE;
    286             while(i < length) {
    287                 script = reorder[--length];
    288                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
    289                         script == UCOL_REORDER_CODE_DEFAULT) {
    290                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    291                     return;
    292                 }
    293                 int32_t index = getScriptIndex(script);
                        // Codes for which getScriptIndex() returns 0 are ignored.
    294                 if(index == 0) { continue; }
    295                 if(table[index] != 0) {  // Duplicate or equivalent script.
    296                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    297                     return;
    298                 }
    299                 highLimit = addHighScriptRange(table, index, highLimit);
    300             }
    301             break;
    302         }
    303         if(script == UCOL_REORDER_CODE_DEFAULT) {
    304             // The default code must be the only one in the list, and that is handled by the caller.
    305             // Otherwise it must not be used.
    306             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    307             return;
    308         }
    309         int32_t index = getScriptIndex(script);
                // Codes for which getScriptIndex() returns 0 are ignored.
    310         if(index == 0) { continue; }
    311         if(table[index] != 0) {  // Duplicate or equivalent script.
    312             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    313             return;
    314         }
    315         lowStart = addLowScriptRange(table, index, lowStart);
    316     }
    317 
    318     // Put all remaining scripts into the middle.
            // Only ranges that still have table[i] == 0 are placed here; this leaves
            // the 0xff "don't care" entries untouched.
    319     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
    320         int32_t leadByte = table[i];
    321         if(leadByte != 0) { continue; }
    322         int32_t start = scriptStarts[i];
    323         if(!hasReorderToEnd && start > lowStart) {
    324             // No need to move this script.
    325             lowStart = start;
    326         }
    327         lowStart = addLowScriptRange(table, i, lowStart);
    328     }
    329     if(lowStart > highLimit) {
                // The ranges placed from the bottom ran past those placed from the top.
                // Check whether it would fit without the skipped gap; only the
                // lead-byte bits (0xff00) of skippedReserved are counted.
    330         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
    331             // Try not skipping the before-Latin reserved range.
                    // originalLength is passed because length may have been decremented above.
    332             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
    333             return;
    334         }
    335         // We need more primary lead bytes than available, despite the reserved ranges.
    336         errorCode = U_BUFFER_OVERFLOW_ERROR;
    337         return;
    338     }
    339 
    340     // Turn lead bytes into a list of (limit, offset) pairs.
    341     // Encode each pair in one list element:
    342     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
            // The inner loop advances i over consecutive ranges that have the same offset
            // (new lead byte minus original lead byte); "don't care" ranges join
            // whichever run is current.
    343     int32_t offset = 0;
    344     for(int32_t i = 1;; ++i) {
    345         int32_t nextOffset = offset;
    346         while(i < scriptStartsLength - 1) {
    347             int32_t newLeadByte = table[i];
    348             if(newLeadByte == 0xff) {
    349                 // "Don't care" lead byte for reserved range, continue with current offset.
    350             } else {
    351                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
    352                 if(nextOffset != offset) { break; }
    353             }
    354             ++i;
    355         }
                // Emit the run that ends at scriptStarts[i], except for a final run
                // whose offset is 0.
    356         if(offset != 0 || i < scriptStartsLength - 1) {
    357             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
    358         }
    359         if(i == scriptStartsLength - 1) { break; }
    360         offset = nextOffset;
    361     }
    362 }
    363 
    364 int32_t
    365 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
    366     int32_t start = scriptStarts[index];
    367     if((start & 0xff) < (lowStart & 0xff)) {
    368         lowStart += 0x100;
    369     }
    370     table[index] = (uint8_t)(lowStart >> 8);
    371     int32_t limit = scriptStarts[index + 1];
    372     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
    373     return lowStart;
    374 }
    375 
    376 int32_t
    377 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
    378     int32_t limit = scriptStarts[index + 1];
    379     if((limit & 0xff) > (highLimit & 0xff)) {
    380         highLimit -= 0x100;
    381     }
    382     int32_t start = scriptStarts[index];
    383     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
    384     table[index] = (uint8_t)(highLimit >> 8);
    385     return highLimit;
    386 }
    387 
    388 U_NAMESPACE_END
    389 
    390 #endif  // !UCONFIG_NO_COLLATION
    391