Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationbasedatabuilder.cpp
      7 *
      8 * created on: 2012aug11
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/localpointer.h"
     17 #include "unicode/ucharstriebuilder.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/utf16.h"
     21 #include "collation.h"
     22 #include "collationbasedatabuilder.h"
     23 #include "collationdata.h"
     24 #include "collationdatabuilder.h"
     25 #include "collationrootelements.h"
     26 #include "normalizer2impl.h"
     27 #include "uassert.h"
     28 #include "utrie2.h"
     29 #include "uvectr32.h"
     30 #include "uvectr64.h"
     31 #include "uvector.h"
     32 
     33 U_NAMESPACE_BEGIN
     34 
     35 namespace {
     36 
     37 /**
     38  * Compare two signed int64_t values as if they were unsigned.
     39  */
     40 int32_t
     41 compareInt64AsUnsigned(int64_t a, int64_t b) {
     42     if((uint64_t)a < (uint64_t)b) {
     43         return -1;
     44     } else if((uint64_t)a > (uint64_t)b) {
     45         return 1;
     46     } else {
     47         return 0;
     48     }
     49 }
     50 
     51 // TODO: Try to merge this with the binarySearch in alphaindex.cpp.
     52 /**
     53  * Like Java Collections.binarySearch(List, String, Comparator).
     54  *
     55  * @return the index>=0 where the item was found,
     56  *         or the index<0 for inserting the string at ~index in sorted order
     57  */
     58 int32_t
     59 binarySearch(const UVector64 &list, int64_t ce) {
     60     if (list.size() == 0) { return ~0; }
     61     int32_t start = 0;
     62     int32_t limit = list.size();
     63     for (;;) {
     64         int32_t i = (start + limit) / 2;
     65         int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
     66         if (cmp == 0) {
     67             return i;
     68         } else if (cmp < 0) {
     69             if (i == start) {
     70                 return ~start;  // insert ce before i
     71             }
     72             limit = i;
     73         } else {
     74             if (i == start) {
     75                 return ~(start + 1);  // insert ce after i
     76             }
     77             start = i;
     78         }
     79     }
     80 }
     81 
     82 }  // namespace
     83 
     84 CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
     85         : CollationDataBuilder(errorCode),
     86           numericPrimary(0x12000000),
     87           firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
     88           rootElements(errorCode) {
     89 }
     90 
     91 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
     92 }
     93 
     94 void
     95 CollationBaseDataBuilder::init(UErrorCode &errorCode) {
     96     if(U_FAILURE(errorCode)) { return; }
     97     if(trie != NULL) {
     98         errorCode = U_INVALID_STATE_ERROR;
     99         return;
    100     }
    101 
    102     // Not compressible:
    103     // - digits
    104     // - Latin
    105     // - Hani
    106     // - trail weights
    107     // Some scripts are compressible, some are not.
    108     uprv_memset(compressibleBytes, FALSE, 256);
    109     compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE;
    110 
    111     // For a base, the default is to compute an unassigned-character implicit CE.
    112     // This includes surrogate code points; see the last option in
    113     // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
    114     trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
    115 
    116     // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
    117     for(UChar32 c = 0; c < 0x180; ++c) {
    118         utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
    119     }
    120 
    121     utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
    122     // No root element for the merge separator which has 02 weights.
    123     // Some code assumes that the root first primary CE is the "space first primary"
    124     // from FractionalUCA.txt.
    125 
    126     uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
    127     utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
    128 
    129     // Add a mapping for the first-unassigned boundary,
    130     // which is the AlphabeticIndex overflow boundary.
    131     UnicodeString s((UChar)0xfdd1);  // Script boundary contractions start with U+FDD1.
    132     s.append((UChar)0xfdd0);  // Zzzz script sample character U+FDD0.
    133     int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
    134     add(UnicodeString(), s, &ce, 1, errorCode);
    135 
    136     // Add a tailoring boundary, but not a mapping, for [first trailing].
    137     ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
    138     rootElements.addElement(ce, errorCode);
    139 
    140     // U+FFFD maps to a CE with the third-highest primary weight,
    141     // for predictable handling of ill-formed UTF-8.
    142     uint32_t ce32 = Collation::FFFD_CE32;
    143     utrie2_set32(trie, 0xfffd, ce32, &errorCode);
    144     addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
    145 
    146     // U+FFFF maps to a CE with the highest primary weight.
    147     ce32 = Collation::MAX_REGULAR_CE32;
    148     utrie2_set32(trie, 0xffff, ce32, &errorCode);
    149     addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
    150 }
    151 
    152 void
    153 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
    154                                         UErrorCode &errorCode) {
    155     if(U_FAILURE(errorCode) || length == 0) { return; }
    156     if((length & 1) != 0) {  // incomplete start/end pairs
    157         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    158         return;
    159     }
    160     if(isAssigned(0x4e00)) {  // already set
    161         errorCode = U_INVALID_STATE_ERROR;
    162         return;
    163     }
    164     int32_t numHanCodePoints = 0;
    165     for(int32_t i = 0; i < length; i += 2) {
    166         UChar32 start = ranges[i];
    167         UChar32 end = ranges[i + 1];
    168         numHanCodePoints += end - start + 1;
    169     }
    170     // Multiply the number of code points by (gap+1).
    171     // Add hanStep+2 for tailoring after the last Han character.
    172     int32_t gap = 1;
    173     hanStep = gap + 1;
    174     int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
    175     // Numbers of Han primaries per lead byte determined by
    176     // numbers of 2nd (not compressible) times 3rd primary byte values.
    177     int32_t numHanPerLeadByte = 254 * 254;
    178     int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
    179     uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
    180     hanPrimary |= 0x20200;
    181     firstHanPrimary = hanPrimary;
    182     for(int32_t i = 0; i < length; i += 2) {
    183         UChar32 start = ranges[i];
    184         UChar32 end = ranges[i + 1];
    185         hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
    186     }
    187     // One past the actual last one, but that is harmless for tailoring.
    188     // It saves us from subtracting "hanStep" and handling underflows.
    189     lastHanPrimary = hanPrimary;
    190 }
    191 
    192 UBool
    193 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
    194     return compressibleBytes[b];
    195 }
    196 
    197 void
    198 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
    199     compressibleBytes[b] = TRUE;
    200 }
    201 
    202 int32_t
    203 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
    204     if((p1 & 0xff000000) == (p2 & 0xff000000)) {
    205         // Same lead bytes.
    206         return (int32_t)(p2 - p1) >> 16;
    207     } else {
    208         int32_t linear1;
    209         int32_t linear2;
    210         int32_t factor;
    211         if(isCompressible) {
    212             // Second byte for compressible lead byte: 251 bytes 04..FE
    213             linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
    214             linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
    215             factor = 251;
    216         } else {
    217             // Second byte for incompressible lead byte: 254 bytes 02..FF
    218             linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
    219             linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
    220             factor = 254;
    221         }
    222         linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
    223         linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
    224         return linear2 - linear1;
    225     }
    226 }
    227 
    228 int32_t
    229 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
    230     if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
    231         // Same first two bytes.
    232         return (int32_t)(p2 - p1) >> 8;
    233     } else {
    234         // Third byte: 254 bytes 02..FF
    235         int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
    236         int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
    237         int32_t factor;
    238         if(isCompressible) {
    239             // Second byte for compressible lead byte: 251 bytes 04..FE
    240             linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
    241             linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
    242             factor = 251 * 254;
    243         } else {
    244             // Second byte for incompressible lead byte: 254 bytes 02..FF
    245             linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
    246             linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
    247             factor = 254 * 254;
    248         }
    249         linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
    250         linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
    251         return linear2 - linear1;
    252     }
    253 }
    254 
    255 uint32_t
    256 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
    257     addRootElements(ces, cesLength, errorCode);
    258     return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
    259 }
    260 
    261 void
    262 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
    263                                           UErrorCode &errorCode) {
    264     if(U_FAILURE(errorCode)) { return; }
    265     for(int32_t i = 0; i < cesLength; ++i) {
    266         addRootElement(ces[i], errorCode);
    267     }
    268 }
    269 
    270 void
    271 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
    272     if(U_FAILURE(errorCode) || ce == 0) { return; }
    273     // Remove case bits.
    274     ce &= INT64_C(0xffffffffffff3fff);
    275     U_ASSERT((ce & 0xc0) == 0);  // quaternary==0
    276     // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
    277     // We will add it later, as part of the Han ranges.
    278     uint32_t p = (uint32_t)(ce >> 32);
    279     uint32_t secTer = (uint32_t)ce;
    280     if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
    281         if(firstHanPrimary <= p && p <= lastHanPrimary) {
    282             return;
    283         }
    284     } else {
    285         // Check that secondary and tertiary weights are >= "common".
    286         uint32_t s = secTer >> 16;
    287         uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
    288         if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) {
    289             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    290             return;
    291         }
    292     }
    293     // Check that primaries have at most 3 bytes.
    294     if((p & 0xff) != 0) {
    295         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    296         return;
    297     }
    298     int32_t i = binarySearch(rootElements, ce);
    299     if(i < 0) {
    300         rootElements.insertElementAt(ce, ~i, errorCode);
    301     }
    302 }
    303 
    304 void
    305 CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
    306                                              const UnicodeString &groupScripts,
    307                                              UErrorCode &errorCode) {
    308     if(U_FAILURE(errorCode)) { return; }
    309     if(groupScripts.isEmpty()) {
    310         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    311         return;
    312     }
    313     if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
    314         // Zzzz must not occur.
    315         // It is the code used in the API to separate low and high scripts.
    316         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    317         return;
    318     }
    319     // Note: We are mostly trusting the input data,
    320     // rather than verifying that reordering groups do not intersect
    321     // with their lead byte ranges nor their sets of scripts,
    322     // and that all script codes are valid.
    323     scripts.append((UChar)((firstByte << 8) | lastByte));
    324     scripts.append((UChar)groupScripts.length());
    325     scripts.append(groupScripts);
    326 }
    327 
    328 void
    329 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
    330     buildMappings(data, errorCode);
    331     data.numericPrimary = numericPrimary;
    332     data.compressibleBytes = compressibleBytes;
    333     data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
    334     data.scriptsLength = scripts.length();
    335     buildFastLatinTable(data, errorCode);
    336 }
    337 
    338 void
    339 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
    340     if(U_FAILURE(errorCode)) { return; }
    341     uint32_t nextHanPrimary = firstHanPrimary;  // Set to 0xffffffff after the last Han range.
    342     uint32_t prevPrimary = 0;  // Start with primary ignorable CEs.
    343     UBool tryRange = FALSE;
    344     for(int32_t i = 0; i < rootElements.size(); ++i) {
    345         int64_t ce = rootElements.elementAti(i);
    346         uint32_t p = (uint32_t)(ce >> 32);
    347         uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
    348         if(p != prevPrimary) {
    349             U_ASSERT((p & 0xff) == 0);
    350             int32_t end;
    351             if(p >= nextHanPrimary) {
    352                 // Add a Han primary weight or range.
    353                 // We omitted them initially, and omitted all CEs with Han primaries
    354                 // and common secondary/tertiary weights.
    355                 U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
    356                 if(p == nextHanPrimary) {
    357                     // One single Han primary with non-common secondary/tertiary weights.
    358                     table.addElement((int32_t)p, errorCode);
    359                     if(p < lastHanPrimary) {
    360                         // Prepare for the next Han range.
    361                         nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
    362                     } else {
    363                         // p is the last Han primary.
    364                         nextHanPrimary = 0xffffffff;
    365                     }
    366                 } else {
    367                     // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
    368                     table.addElement((int32_t)nextHanPrimary, errorCode);
    369                     if(nextHanPrimary == lastHanPrimary) {
    370                         // nextHanPrimary == lastHanPrimary < p
    371                         // We just wrote the single last Han primary.
    372                         nextHanPrimary = 0xffffffff;
    373                     } else if(p < lastHanPrimary) {
    374                         // nextHanPrimary < p < lastHanPrimary
    375                         // End the Han range on p, prepare for the next range.
    376                         table.addElement((int32_t)p | hanStep, errorCode);
    377                         nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
    378                     } else if(p == lastHanPrimary) {
    379                         // nextHanPrimary < p == lastHanPrimary
    380                         // End the last Han range on p.
    381                         table.addElement((int32_t)p | hanStep, errorCode);
    382                         nextHanPrimary = 0xffffffff;
    383                     } else {
    384                         // nextHanPrimary < lastHanPrimary < p
    385                         // End the last Han range, then write p.
    386                         table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
    387                         nextHanPrimary = 0xffffffff;
    388                         table.addElement((int32_t)p, errorCode);
    389                     }
    390                 }
    391             } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
    392                     (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
    393                 // Multiple CEs with only common secondary/tertiary weights were
    394                 // combined into a primary range.
    395                 // The range end was written, ending with the primary of rootElements[end].
    396                 ce = rootElements.elementAti(end);
    397                 p = (uint32_t)(ce >> 32);
    398                 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
    399                 i = end;
    400             } else {
    401                 // Write the primary weight of a normal CE.
    402                 table.addElement((int32_t)p, errorCode);
    403             }
    404             prevPrimary = p;
    405         }
    406         if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
    407             // The common secondar/tertiary weights are implied in the primary unit.
    408             // If there is no intervening delta unit, then we will try to combine
    409             // the next several primaries into a range.
    410             tryRange = TRUE;
    411         } else {
    412             // For each new set of secondary/tertiary weights we write a delta unit.
    413             table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
    414             tryRange = FALSE;
    415         }
    416     }
    417 
    418     // Limit sentinel for root elements.
    419     // This allows us to reduce range checks at runtime.
    420     table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
    421 }
    422 
    423 int32_t
    424 CollationBaseDataBuilder::writeRootElementsRange(
    425         uint32_t prevPrimary, uint32_t p, int32_t i,
    426         UVector32 &table, UErrorCode &errorCode) {
    427     if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
    428     U_ASSERT(prevPrimary < p);
    429     // No ranges of single-byte primaries.
    430     if((p & prevPrimary & 0xff0000) == 0) { return 0; }
    431     // Lead bytes of compressible primaries must match.
    432     UBool isCompressible = isCompressiblePrimary(p);
    433     if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
    434             (p & 0xff000000) != (prevPrimary & 0xff000000)) {
    435         return 0;
    436     }
    437     // Number of bytes in the primaries.
    438     UBool twoBytes;
    439     // Number of primaries from prevPrimary to p.
    440     int32_t step;
    441     if((p & 0xff00) == 0) {
    442         // 2-byte primary
    443         if((prevPrimary & 0xff00) != 0) { return 0; }  // length mismatch
    444         twoBytes = TRUE;
    445         step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
    446     } else {
    447         // 3-byte primary
    448         if((prevPrimary & 0xff00) == 0) { return 0; }  // length mismatch
    449         twoBytes = FALSE;
    450         step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
    451     }
    452     if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
    453     // See if there are more than two CEs with primaries increasing by "step"
    454     // and with only common secondary/tertiary weights on all but the last one.
    455     int32_t end = 0;  // Initially 0: No range for just two primaries.
    456     for(;;) {
    457         prevPrimary = p;
    458         // Calculate which primary we expect next.
    459         uint32_t nextPrimary;  // = p + step
    460         if(twoBytes) {
    461             nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
    462         } else {
    463             nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
    464         }
    465         // Fetch the actual next CE.
    466         int64_t ce = rootElements.elementAti(i);
    467         p = (uint32_t)(ce >> 32);
    468         uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
    469         // Does this primary increase by "step" from the last one?
    470         if(p != nextPrimary ||
    471                 // Do not cross into a new lead byte if either is compressible.
    472                 ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
    473                     (isCompressible || isCompressiblePrimary(p)))) {
    474             // The range ends with the previous CE.
    475             p = prevPrimary;
    476             break;
    477         }
    478         // Extend the range to include this primary.
    479         end = i++;
    480         // This primary is the last in the range if it has non-common weights
    481         // or if we are at the end of the list.
    482         if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
    483     }
    484     if(end != 0) {
    485         table.addElement((int32_t)p | step, errorCode);
    486     }
    487     return end;
    488 }
    489 
    490 U_NAMESPACE_END
    491 
    492 #endif  // !UCONFIG_NO_COLLATION
    493