Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationdatareader.cpp
      7 *
      8 * created on: 2013feb07
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "unicode/udata.h"
     18 #include "unicode/uscript.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "collationdatareader.h"
     23 #include "collationfastlatin.h"
     24 #include "collationkeys.h"
     25 #include "collationrootelements.h"
     26 #include "collationsettings.h"
     27 #include "collationtailoring.h"
     28 #include "normalizer2impl.h"
     29 #include "uassert.h"
     30 #include "ucmndata.h"
     31 #include "utrie2.h"
     32 
// Number of elements in a true C array (not a pointer), cast to int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 namespace {
     38 
     39 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
     40     return (i < length) ? indexes[i] : -1;
     41 }
     42 
     43 }  // namespace
     44 
     45 void
     46 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
     47                           CollationTailoring &tailoring, UErrorCode &errorCode) {
     48     if(U_FAILURE(errorCode)) { return; }
     49     if(base != NULL) {
     50         if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
     51             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     52             return;
     53         }
     54         const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
     55         if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
     56                 isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
     57             errorCode = U_INVALID_FORMAT_ERROR;
     58             return;
     59         }
     60         if(base->getUCAVersion() != tailoring.getUCAVersion()) {
     61             errorCode = U_COLLATOR_VERSION_MISMATCH;
     62             return;
     63         }
     64         int32_t headerLength = header->dataHeader.headerSize;
     65         inBytes += headerLength;
     66         if(inLength >= 0) {
     67             inLength -= headerLength;
     68         }
     69     }
     70 
     71     if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
     72         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     73         return;
     74     }
     75     const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
     76     int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
     77     if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
     78         errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
     79         return;
     80     }
     81 
     82     // Assume that the tailoring data is in initial state,
     83     // with NULL pointers and 0 lengths.
     84 
     85     // Set pointers to non-empty data parts.
     86     // Do this in order of their byte offsets. (Should help porting to Java.)
     87 
     88     int32_t index;  // one of the indexes[] slots
     89     int32_t offset;  // byte offset for the index part
     90     int32_t length;  // number of bytes in the index part
     91 
     92     if(indexesLength > IX_TOTAL_SIZE) {
     93         length = inIndexes[IX_TOTAL_SIZE];
     94     } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
     95         length = inIndexes[indexesLength - 1];
     96     } else {
     97         length = 0;  // only indexes, and inLength was already checked for them
     98     }
     99     if(0 <= inLength && inLength < length) {
    100         errorCode = U_INVALID_FORMAT_ERROR;
    101         return;
    102     }
    103 
    104     const CollationData *baseData = base == NULL ? NULL : base->data;
    105     const int32_t *reorderCodes = NULL;
    106     int32_t reorderCodesLength = 0;
    107     index = IX_REORDER_CODES_OFFSET;
    108     offset = getIndex(inIndexes, indexesLength, index);
    109     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    110     if(length >= 4) {
    111         if(baseData == NULL) {
    112             // We assume for collation settings that
    113             // the base data does not have a reordering.
    114             errorCode = U_INVALID_FORMAT_ERROR;
    115             return;
    116         }
    117         reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
    118         reorderCodesLength = length / 4;
    119     }
    120 
    121     // There should be a reorder table only if there are reorder codes.
    122     // However, when there are reorder codes the reorder table may be omitted to reduce
    123     // the data size.
    124     const uint8_t *reorderTable = NULL;
    125     index = IX_REORDER_TABLE_OFFSET;
    126     offset = getIndex(inIndexes, indexesLength, index);
    127     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    128     if(length >= 256) {
    129         if(reorderCodesLength == 0) {
    130             errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
    131             return;
    132         }
    133         reorderTable = inBytes + offset;
    134     } else {
    135         // If we have reorder codes, then build the reorderTable at the end,
    136         // when the CollationData is otherwise complete.
    137     }
    138 
    139     if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
    140         errorCode = U_INVALID_FORMAT_ERROR;
    141         return;
    142     }
    143     CollationData *data = NULL;  // Remains NULL if there are no mappings.
    144 
    145     index = IX_TRIE_OFFSET;
    146     offset = getIndex(inIndexes, indexesLength, index);
    147     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    148     if(length >= 8) {
    149         if(!tailoring.ensureOwnedData(errorCode)) { return; }
    150         data = tailoring.ownedData;
    151         data->base = baseData;
    152         data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
    153         data->trie = tailoring.trie = utrie2_openFromSerialized(
    154             UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
    155             &errorCode);
    156         if(U_FAILURE(errorCode)) { return; }
    157     } else if(baseData != NULL) {
    158         // Use the base data. Only the settings are tailored.
    159         tailoring.data = baseData;
    160     } else {
    161         errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
    162         return;
    163     }
    164 
    165     index = IX_CES_OFFSET;
    166     offset = getIndex(inIndexes, indexesLength, index);
    167     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    168     if(length >= 8) {
    169         if(data == NULL) {
    170             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
    171             return;
    172         }
    173         data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
    174         data->cesLength = length / 8;
    175     }
    176 
    177     index = IX_CE32S_OFFSET;
    178     offset = getIndex(inIndexes, indexesLength, index);
    179     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    180     if(length >= 4) {
    181         if(data == NULL) {
    182             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
    183             return;
    184         }
    185         data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
    186         data->ce32sLength = length / 4;
    187     }
    188 
    189     int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
    190     if(jamoCE32sStart >= 0) {
    191         if(data == NULL || data->ce32s == NULL) {
    192             errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
    193             return;
    194         }
    195         data->jamoCE32s = data->ce32s + jamoCE32sStart;
    196     } else if(data == NULL) {
    197         // Nothing to do.
    198     } else if(baseData != NULL) {
    199         data->jamoCE32s = baseData->jamoCE32s;
    200     } else {
    201         errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
    202         return;
    203     }
    204 
    205     index = IX_ROOT_ELEMENTS_OFFSET;
    206     offset = getIndex(inIndexes, indexesLength, index);
    207     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    208     if(length >= 4) {
    209         length /= 4;
    210         if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
    211             errorCode = U_INVALID_FORMAT_ERROR;
    212             return;
    213         }
    214         data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
    215         data->rootElementsLength = length;
    216         uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
    217         if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
    218             errorCode = U_INVALID_FORMAT_ERROR;
    219             return;
    220         }
    221         uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
    222         if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
    223             // [fixed last secondary common byte] is too low,
    224             // and secondary weights would collide with compressed common secondaries.
    225             errorCode = U_INVALID_FORMAT_ERROR;
    226             return;
    227         }
    228     }
    229 
    230     index = IX_CONTEXTS_OFFSET;
    231     offset = getIndex(inIndexes, indexesLength, index);
    232     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    233     if(length >= 2) {
    234         if(data == NULL) {
    235             errorCode = U_INVALID_FORMAT_ERROR;  // Tailored contexts without tailored trie.
    236             return;
    237         }
    238         data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
    239         data->contextsLength = length / 2;
    240     }
    241 
    242     index = IX_UNSAFE_BWD_OFFSET;
    243     offset = getIndex(inIndexes, indexesLength, index);
    244     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    245     if(length >= 2) {
    246         if(data == NULL) {
    247             errorCode = U_INVALID_FORMAT_ERROR;
    248             return;
    249         }
    250         if(baseData == NULL) {
    251             // Create the unsafe-backward set for the root collator.
    252             // Include all non-zero combining marks and trail surrogates.
    253             // We do this at load time, rather than at build time,
    254             // to simplify Unicode version bootstrapping:
    255             // The root data builder only needs the new FractionalUCA.txt data,
    256             // but it need not be built with a version of ICU already updated to
    257             // the corresponding new Unicode Character Database.
    258             //
    259             // The following is an optimized version of
    260             // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
    261             // It is faster and requires fewer code dependencies.
    262             tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
    263             if(tailoring.unsafeBackwardSet == NULL) {
    264                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    265                 return;
    266             }
    267             data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
    268         } else {
    269             // Clone the root collator's set contents.
    270             tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
    271                 baseData->unsafeBackwardSet->cloneAsThawed());
    272             if(tailoring.unsafeBackwardSet == NULL) {
    273                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    274                 return;
    275             }
    276         }
    277         // Add the ranges from the data file to the unsafe-backward set.
    278         USerializedSet sset;
    279         const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
    280         if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
    281             errorCode = U_INVALID_FORMAT_ERROR;
    282             return;
    283         }
    284         int32_t count = uset_getSerializedRangeCount(&sset);
    285         for(int32_t i = 0; i < count; ++i) {
    286             UChar32 start, end;
    287             uset_getSerializedRange(&sset, i, &start, &end);
    288             tailoring.unsafeBackwardSet->add(start, end);
    289         }
    290         // Mark each lead surrogate as "unsafe"
    291         // if any of its 1024 associated supplementary code points is "unsafe".
    292         UChar32 c = 0x10000;
    293         for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
    294             if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
    295                 tailoring.unsafeBackwardSet->add(lead);
    296             }
    297         }
    298         tailoring.unsafeBackwardSet->freeze();
    299         data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
    300     } else if(data == NULL) {
    301         // Nothing to do.
    302     } else if(baseData != NULL) {
    303         // No tailoring-specific data: Alias the root collator's set.
    304         data->unsafeBackwardSet = baseData->unsafeBackwardSet;
    305     } else {
    306         errorCode = U_INVALID_FORMAT_ERROR;  // No unsafeBackwardSet.
    307         return;
    308     }
    309 
    310     // If the fast Latin format version is different,
    311     // or the version is set to 0 for "no fast Latin table",
    312     // then just always use the normal string comparison path.
    313     if(data != NULL) {
    314         data->fastLatinTable = NULL;
    315         data->fastLatinTableLength = 0;
    316         if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
    317             index = IX_FAST_LATIN_TABLE_OFFSET;
    318             offset = getIndex(inIndexes, indexesLength, index);
    319             length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    320             if(length >= 2) {
    321                 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
    322                 data->fastLatinTableLength = length / 2;
    323                 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
    324                     errorCode = U_INVALID_FORMAT_ERROR;  // header vs. table version mismatch
    325                     return;
    326                 }
    327             } else if(baseData != NULL) {
    328                 data->fastLatinTable = baseData->fastLatinTable;
    329                 data->fastLatinTableLength = baseData->fastLatinTableLength;
    330             }
    331         }
    332     }
    333 
    334     index = IX_SCRIPTS_OFFSET;
    335     offset = getIndex(inIndexes, indexesLength, index);
    336     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    337     if(length >= 2) {
    338         if(data == NULL) {
    339             errorCode = U_INVALID_FORMAT_ERROR;
    340             return;
    341         }
    342         data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
    343         data->scriptsLength = length / 2;
    344     } else if(data == NULL) {
    345         // Nothing to do.
    346     } else if(baseData != NULL) {
    347         data->scripts = baseData->scripts;
    348         data->scriptsLength = baseData->scriptsLength;
    349     }
    350 
    351     index = IX_COMPRESSIBLE_BYTES_OFFSET;
    352     offset = getIndex(inIndexes, indexesLength, index);
    353     length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    354     if(length >= 256) {
    355         if(data == NULL) {
    356             errorCode = U_INVALID_FORMAT_ERROR;
    357             return;
    358         }
    359         data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
    360     } else if(data == NULL) {
    361         // Nothing to do.
    362     } else if(baseData != NULL) {
    363         data->compressibleBytes = baseData->compressibleBytes;
    364     } else {
    365         errorCode = U_INVALID_FORMAT_ERROR;  // No compressibleBytes[].
    366         return;
    367     }
    368 
    369     const CollationSettings &ts = *tailoring.settings;
    370     int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
    371     uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
    372     int32_t fastLatinOptions = CollationFastLatin::getOptions(
    373             tailoring.data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries));
    374     if(options == ts.options && ts.variableTop != 0 &&
    375             reorderCodesLength == ts.reorderCodesLength &&
    376             uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
    377             fastLatinOptions == ts.fastLatinOptions &&
    378             (fastLatinOptions < 0 ||
    379                 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
    380                             sizeof(fastLatinPrimaries)) == 0)) {
    381         return;
    382     }
    383 
    384     CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
    385     if(settings == NULL) {
    386         errorCode = U_MEMORY_ALLOCATION_ERROR;
    387         return;
    388     }
    389     settings->options = options;
    390     // Set variableTop from options and scripts data.
    391     settings->variableTop = tailoring.data->getLastPrimaryForGroup(
    392             UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
    393     if(settings->variableTop == 0) {
    394         errorCode = U_INVALID_FORMAT_ERROR;
    395         return;
    396     }
    397 
    398     if(reorderCodesLength == 0 || reorderTable != NULL) {
    399         settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
    400     } else {
    401         uint8_t table[256];
    402         baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
    403         if(U_FAILURE(errorCode)) { return; }
    404         if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
    405             errorCode = U_MEMORY_ALLOCATION_ERROR;
    406             return;
    407         }
    408     }
    409 
    410     settings->fastLatinOptions = CollationFastLatin::getOptions(
    411         tailoring.data, *settings,
    412         settings->fastLatinPrimaries, LENGTHOF(settings->fastLatinPrimaries));
    413 }
    414 
    415 UBool U_CALLCONV
    416 CollationDataReader::isAcceptable(void *context,
    417                                   const char * /* type */, const char * /*name*/,
    418                                   const UDataInfo *pInfo) {
    419     if(
    420         pInfo->size >= 20 &&
    421         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
    422         pInfo->charsetFamily == U_CHARSET_FAMILY &&
    423         pInfo->dataFormat[0] == 0x55 &&  // dataFormat="UCol"
    424         pInfo->dataFormat[1] == 0x43 &&
    425         pInfo->dataFormat[2] == 0x6f &&
    426         pInfo->dataFormat[3] == 0x6c &&
    427         pInfo->formatVersion[0] == 4
    428     ) {
    429         UVersionInfo *version = static_cast<UVersionInfo *>(context);
    430         if(version != NULL) {
    431             uprv_memcpy(version, pInfo->dataVersion, 4);
    432         }
    433         return TRUE;
    434     } else {
    435         return FALSE;
    436     }
    437 }
    438 
    439 U_NAMESPACE_END
    440 
    441 #endif  // !UCONFIG_NO_COLLATION
    442