Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2015, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationdata.cpp
      7 *
      8 * created on: 2012jul28
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "unicode/udata.h"
     18 #include "unicode/uscript.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "uassert.h"
     23 #include "utrie2.h"
     24 #include "uvectr32.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 uint32_t
     29 CollationData::getIndirectCE32(uint32_t ce32) const {
     30     U_ASSERT(Collation::isSpecialCE32(ce32));
     31     int32_t tag = Collation::tagFromCE32(ce32);
     32     if(tag == Collation::DIGIT_TAG) {
     33         // Fetch the non-numeric-collation CE32.
     34         ce32 = ce32s[Collation::indexFromCE32(ce32)];
     35     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
     36         ce32 = Collation::UNASSIGNED_CE32;
     37     } else if(tag == Collation::U0000_TAG) {
     38         // Fetch the normal ce32 for U+0000.
     39         ce32 = ce32s[0];
     40     }
     41     return ce32;
     42 }
     43 
     44 uint32_t
     45 CollationData::getFinalCE32(uint32_t ce32) const {
     46     if(Collation::isSpecialCE32(ce32)) {
     47         ce32 = getIndirectCE32(ce32);
     48     }
     49     return ce32;
     50 }
     51 
     52 int64_t
     53 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
     54     if(U_FAILURE(errorCode)) { return 0; }
     55     // Keep parallel with CollationDataBuilder::getSingleCE().
     56     const CollationData *d;
     57     uint32_t ce32 = getCE32(c);
     58     if(ce32 == Collation::FALLBACK_CE32) {
     59         d = base;
     60         ce32 = base->getCE32(c);
     61     } else {
     62         d = this;
     63     }
     64     while(Collation::isSpecialCE32(ce32)) {
     65         switch(Collation::tagFromCE32(ce32)) {
     66         case Collation::LATIN_EXPANSION_TAG:
     67         case Collation::BUILDER_DATA_TAG:
     68         case Collation::PREFIX_TAG:
     69         case Collation::CONTRACTION_TAG:
     70         case Collation::HANGUL_TAG:
     71         case Collation::LEAD_SURROGATE_TAG:
     72             errorCode = U_UNSUPPORTED_ERROR;
     73             return 0;
     74         case Collation::FALLBACK_TAG:
     75         case Collation::RESERVED_TAG_3:
     76             errorCode = U_INTERNAL_PROGRAM_ERROR;
     77             return 0;
     78         case Collation::LONG_PRIMARY_TAG:
     79             return Collation::ceFromLongPrimaryCE32(ce32);
     80         case Collation::LONG_SECONDARY_TAG:
     81             return Collation::ceFromLongSecondaryCE32(ce32);
     82         case Collation::EXPANSION32_TAG:
     83             if(Collation::lengthFromCE32(ce32) == 1) {
     84                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
     85                 break;
     86             } else {
     87                 errorCode = U_UNSUPPORTED_ERROR;
     88                 return 0;
     89             }
     90         case Collation::EXPANSION_TAG: {
     91             if(Collation::lengthFromCE32(ce32) == 1) {
     92                 return d->ces[Collation::indexFromCE32(ce32)];
     93             } else {
     94                 errorCode = U_UNSUPPORTED_ERROR;
     95                 return 0;
     96             }
     97         }
     98         case Collation::DIGIT_TAG:
     99             // Fetch the non-numeric-collation CE32 and continue.
    100             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
    101             break;
    102         case Collation::U0000_TAG:
    103             U_ASSERT(c == 0);
    104             // Fetch the normal ce32 for U+0000 and continue.
    105             ce32 = d->ce32s[0];
    106             break;
    107         case Collation::OFFSET_TAG:
    108             return d->getCEFromOffsetCE32(c, ce32);
    109         case Collation::IMPLICIT_TAG:
    110             return Collation::unassignedCEFromCodePoint(c);
    111         }
    112     }
    113     return Collation::ceFromSimpleCE32(ce32);
    114 }
    115 
    116 uint32_t
    117 CollationData::getFirstPrimaryForGroup(int32_t script) const {
    118     int32_t index = getScriptIndex(script);
    119     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
    120 }
    121 
    122 uint32_t
    123 CollationData::getLastPrimaryForGroup(int32_t script) const {
    124     int32_t index = getScriptIndex(script);
    125     if(index == 0) {
    126         return 0;
    127     }
    128     uint32_t limit = scriptStarts[index + 1];
    129     return (limit << 16) - 1;
    130 }
    131 
    132 int32_t
    133 CollationData::getGroupForPrimary(uint32_t p) const {
    134     p >>= 16;
    135     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
    136         return -1;
    137     }
    138     int32_t index = 1;
    139     while(p >= scriptStarts[index + 1]) { ++index; }
    140     for(int32_t i = 0; i < numScripts; ++i) {
    141         if(scriptsIndex[i] == index) {
    142             return i;
    143         }
    144     }
    145     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    146         if(scriptsIndex[numScripts + i] == index) {
    147             return UCOL_REORDER_CODE_FIRST + i;
    148         }
    149     }
    150     return -1;
    151 }
    152 
    153 int32_t
    154 CollationData::getScriptIndex(int32_t script) const {
    155     if(script < 0) {
    156         return 0;
    157     } else if(script < numScripts) {
    158         return scriptsIndex[script];
    159     } else if(script < UCOL_REORDER_CODE_FIRST) {
    160         return 0;
    161     } else {
    162         script -= UCOL_REORDER_CODE_FIRST;
    163         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
    164             return scriptsIndex[numScripts + script];
    165         } else {
    166             return 0;
    167         }
    168     }
    169 }
    170 
    171 int32_t
    172 CollationData::getEquivalentScripts(int32_t script,
    173                                     int32_t dest[], int32_t capacity,
    174                                     UErrorCode &errorCode) const {
    175     if(U_FAILURE(errorCode)) { return 0; }
    176     int32_t index = getScriptIndex(script);
    177     if(index == 0) { return 0; }
    178     if(script >= UCOL_REORDER_CODE_FIRST) {
    179         // Special groups have no aliases.
    180         if(capacity > 0) {
    181             dest[0] = script;
    182         } else {
    183             errorCode = U_BUFFER_OVERFLOW_ERROR;
    184         }
    185         return 1;
    186     }
    187 
    188     int32_t length = 0;
    189     for(int32_t i = 0; i < numScripts; ++i) {
    190         if(scriptsIndex[i] == index) {
    191             if(length < capacity) {
    192                 dest[length] = i;
    193             }
    194             ++length;
    195         }
    196     }
    197     if(length > capacity) {
    198         errorCode = U_BUFFER_OVERFLOW_ERROR;
    199     }
    200     return length;
    201 }
    202 
    203 void
    204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    205                                  UVector32 &ranges, UErrorCode &errorCode) const {
    206     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
    207 }
    208 
    209 void
    210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    211                                  UBool latinMustMove,
    212                                  UVector32 &ranges, UErrorCode &errorCode) const {
    213     if(U_FAILURE(errorCode)) { return; }
    214     ranges.removeAllElements();
    215     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
    216         return;
    217     }
    218 
    219     // Maps each script-or-group range to a new lead byte.
    220     uint8_t table[MAX_NUM_SCRIPT_RANGES];
    221     uprv_memset(table, 0, sizeof(table));
    222 
    223     {
    224         // Set "don't care" values for reserved ranges.
    225         int32_t index = scriptsIndex[
    226                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
    227         if(index != 0) {
    228             table[index] = 0xff;
    229         }
    230         index = scriptsIndex[
    231                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
    232         if(index != 0) {
    233             table[index] = 0xff;
    234         }
    235     }
    236 
    237     // Never reorder special low and high primary lead bytes.
    238     U_ASSERT(scriptStartsLength >= 2);
    239     U_ASSERT(scriptStarts[0] == 0);
    240     int32_t lowStart = scriptStarts[1];
    241     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
    242     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
    243     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
    244 
    245     // Get the set of special reorder codes in the input list.
    246     // This supports a fixed number of special reorder codes;
    247     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
    248     uint32_t specials = 0;
    249     for(int32_t i = 0; i < length; ++i) {
    250         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
    251         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
    252             specials |= (uint32_t)1 << reorderCode;
    253         }
    254     }
    255 
    256     // Start the reordering with the special low reorder codes that do not occur in the input.
    257     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    258         int32_t index = scriptsIndex[numScripts + i];
    259         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
    260             lowStart = addLowScriptRange(table, index, lowStart);
    261         }
    262     }
    263 
    264     // Skip the reserved range before Latin if Latin is the first script,
    265     // so that we do not move it unnecessarily.
    266     int32_t skippedReserved = 0;
    267     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
    268         int32_t index = scriptsIndex[USCRIPT_LATIN];
    269         U_ASSERT(index != 0);
    270         int32_t start = scriptStarts[index];
    271         U_ASSERT(lowStart <= start);
    272         skippedReserved = start - lowStart;
    273         lowStart = start;
    274     }
    275 
    276     // Reorder according to the input scripts, continuing from the bottom of the primary range.
    277     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
    278     UBool hasReorderToEnd = FALSE;
    279     for(int32_t i = 0; i < length;) {
    280         int32_t script = reorder[i++];
    281         if(script == USCRIPT_UNKNOWN) {
    282             // Put the remaining scripts at the top.
    283             hasReorderToEnd = TRUE;
    284             while(i < length) {
    285                 script = reorder[--length];
    286                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
    287                         script == UCOL_REORDER_CODE_DEFAULT) {
    288                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    289                     return;
    290                 }
    291                 int32_t index = getScriptIndex(script);
    292                 if(index == 0) { continue; }
    293                 if(table[index] != 0) {  // Duplicate or equivalent script.
    294                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    295                     return;
    296                 }
    297                 highLimit = addHighScriptRange(table, index, highLimit);
    298             }
    299             break;
    300         }
    301         if(script == UCOL_REORDER_CODE_DEFAULT) {
    302             // The default code must be the only one in the list, and that is handled by the caller.
    303             // Otherwise it must not be used.
    304             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    305             return;
    306         }
    307         int32_t index = getScriptIndex(script);
    308         if(index == 0) { continue; }
    309         if(table[index] != 0) {  // Duplicate or equivalent script.
    310             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    311             return;
    312         }
    313         lowStart = addLowScriptRange(table, index, lowStart);
    314     }
    315 
    316     // Put all remaining scripts into the middle.
    317     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
    318         int32_t leadByte = table[i];
    319         if(leadByte != 0) { continue; }
    320         int32_t start = scriptStarts[i];
    321         if(!hasReorderToEnd && start > lowStart) {
    322             // No need to move this script.
    323             lowStart = start;
    324         }
    325         lowStart = addLowScriptRange(table, i, lowStart);
    326     }
    327     if(lowStart > highLimit) {
    328         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
    329             // Try not skipping the before-Latin reserved range.
    330             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
    331             return;
    332         }
    333         // We need more primary lead bytes than available, despite the reserved ranges.
    334         errorCode = U_BUFFER_OVERFLOW_ERROR;
    335         return;
    336     }
    337 
    338     // Turn lead bytes into a list of (limit, offset) pairs.
    339     // Encode each pair in one list element:
    340     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
    341     int32_t offset = 0;
    342     for(int32_t i = 1;; ++i) {
    343         int32_t nextOffset = offset;
    344         while(i < scriptStartsLength - 1) {
    345             int32_t newLeadByte = table[i];
    346             if(newLeadByte == 0xff) {
    347                 // "Don't care" lead byte for reserved range, continue with current offset.
    348             } else {
    349                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
    350                 if(nextOffset != offset) { break; }
    351             }
    352             ++i;
    353         }
    354         if(offset != 0 || i < scriptStartsLength - 1) {
    355             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
    356         }
    357         if(i == scriptStartsLength - 1) { break; }
    358         offset = nextOffset;
    359     }
    360 }
    361 
    362 int32_t
    363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
    364     int32_t start = scriptStarts[index];
    365     if((start & 0xff) < (lowStart & 0xff)) {
    366         lowStart += 0x100;
    367     }
    368     table[index] = (uint8_t)(lowStart >> 8);
    369     int32_t limit = scriptStarts[index + 1];
    370     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
    371     return lowStart;
    372 }
    373 
    374 int32_t
    375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
    376     int32_t limit = scriptStarts[index + 1];
    377     if((limit & 0xff) > (highLimit & 0xff)) {
    378         highLimit -= 0x100;
    379     }
    380     int32_t start = scriptStarts[index];
    381     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
    382     table[index] = (uint8_t)(highLimit >> 8);
    383     return highLimit;
    384 }
    385 
    386 U_NAMESPACE_END
    387 
    388 #endif  // !UCONFIG_NO_COLLATION
    389