Home | History | Annotate | Download | only in i18n
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatawriter.cpp
      9 *
     10 * created on: 2013aug06
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/uniset.h"
     21 #include "cmemory.h"
     22 #include "collationdata.h"
     23 #include "collationdatabuilder.h"
     24 #include "collationdatareader.h"
     25 #include "collationdatawriter.h"
     26 #include "collationfastlatin.h"
     27 #include "collationsettings.h"
     28 #include "collationtailoring.h"
     29 #include "uassert.h"
     30 #include "ucmndata.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 uint8_t *
     35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
     36     if(U_FAILURE(errorCode)) { return NULL; }
     37     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
     38     if(buffer.isNull()) {
     39         errorCode = U_MEMORY_ALLOCATION_ERROR;
     40         return NULL;
     41     }
     42     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
     43     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
     44         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
     45             errorCode = U_MEMORY_ALLOCATION_ERROR;
     46             return NULL;
     47         }
     48         errorCode = U_ZERO_ERROR;
     49         length = cloneBinary(buffer.getAlias(), length, errorCode);
     50     }
     51     if(U_FAILURE(errorCode)) { return NULL; }
     52     return buffer.orphan();
     53 }
     54 
     55 int32_t
     56 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
     57     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
     58     return CollationDataWriter::writeTailoring(
     59             *tailoring, *settings, indexes, dest, capacity,
     60             errorCode);
     61 }
     62 
     63 static const UDataInfo dataInfo = {
     64     sizeof(UDataInfo),
     65     0,
     66 
     67     U_IS_BIG_ENDIAN,
     68     U_CHARSET_FAMILY,
     69     U_SIZEOF_UCHAR,
     70     0,
     71 
     72     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
     73     { 5, 0, 0, 0 },                     // formatVersion
     74     { 6, 3, 0, 0 }                      // dataVersion
     75 };
     76 
     77 int32_t
     78 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
     79                                const void *rootElements, int32_t rootElementsLength,
     80                                int32_t indexes[], uint8_t *dest, int32_t capacity,
     81                                UErrorCode &errorCode) {
     82     return write(TRUE, NULL,
     83                  data, settings,
     84                  rootElements, rootElementsLength,
     85                  indexes, dest, capacity, errorCode);
     86 }
     87 
     88 int32_t
     89 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
     90                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
     91                                     UErrorCode &errorCode) {
     92     return write(FALSE, t.version,
     93                  *t.data, settings,
     94                  NULL, 0,
     95                  indexes, dest, capacity, errorCode);
     96 }
     97 
     98 int32_t
     99 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
    100                            const CollationData &data, const CollationSettings &settings,
    101                            const void *rootElements, int32_t rootElementsLength,
    102                            int32_t indexes[], uint8_t *dest, int32_t capacity,
    103                            UErrorCode &errorCode) {
    104     if(U_FAILURE(errorCode)) { return 0; }
    105     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
    106         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    107         return 0;
    108     }
    109 
    110     // Figure out which data items to write before settling on
    111     // the indexes length and writing offsets.
    112     // For any data item, we need to write the start and limit offsets,
    113     // so the indexes length must be at least index-of-start-offset + 2.
    114     int32_t indexesLength;
    115     UBool hasMappings;
    116     UnicodeSet unsafeBackwardSet;
    117     const CollationData *baseData = data.base;
    118 
    119     int32_t fastLatinVersion;
    120     if(data.fastLatinTable != NULL) {
    121         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
    122     } else {
    123         fastLatinVersion = 0;
    124     }
    125     int32_t fastLatinTableLength = 0;
    126 
    127     if(isBase) {
    128         // For the root collator, we write an even number of indexes
    129         // so that we start with an 8-aligned offset.
    130         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
    131         U_ASSERT(settings.reorderCodesLength == 0);
    132         hasMappings = TRUE;
    133         unsafeBackwardSet = *data.unsafeBackwardSet;
    134         fastLatinTableLength = data.fastLatinTableLength;
    135     } else if(baseData == NULL) {
    136         hasMappings = FALSE;
    137         if(settings.reorderCodesLength == 0) {
    138             // only options
    139             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
    140         } else {
    141             // only options, reorder codes, and the reorder table
    142             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
    143         }
    144     } else {
    145         hasMappings = TRUE;
    146         // Tailored mappings, and what else?
    147         // Check in ascending order of optional tailoring data items.
    148         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
    149         if(data.contextsLength != 0) {
    150             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
    151         }
    152         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
    153         if(!unsafeBackwardSet.isEmpty()) {
    154             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
    155         }
    156         if(data.fastLatinTable != baseData->fastLatinTable) {
    157             fastLatinTableLength = data.fastLatinTableLength;
    158             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
    159         }
    160     }
    161 
    162     UVector32 codesAndRanges(errorCode);
    163     const int32_t *reorderCodes = settings.reorderCodes;
    164     int32_t reorderCodesLength = settings.reorderCodesLength;
    165     if(settings.hasReordering() &&
    166             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
    167         // Rebuild the full list of reorder ranges.
    168         // The list in the settings is truncated for efficiency.
    169         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
    170         // Write the codes, then the ranges.
    171         for(int32_t i = 0; i < reorderCodesLength; ++i) {
    172             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
    173         }
    174         if(U_FAILURE(errorCode)) { return 0; }
    175         reorderCodes = codesAndRanges.getBuffer();
    176         reorderCodesLength = codesAndRanges.size();
    177     }
    178 
    179     int32_t headerSize;
    180     if(isBase) {
    181         headerSize = 0;  // udata_create() writes the header
    182     } else {
    183         DataHeader header;
    184         header.dataHeader.magic1 = 0xda;
    185         header.dataHeader.magic2 = 0x27;
    186         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
    187         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
    188         headerSize = (int32_t)sizeof(header);
    189         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
    190         if(hasMappings && data.cesLength != 0) {
    191             // Sum of the sizes of the data items which are
    192             // not automatically multiples of 8 bytes and which are placed before the CEs.
    193             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
    194             if((sum & 7) != 0) {
    195                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
    196                 // We add to the header size here.
    197                 // Alternatively, we could increment the indexesLength
    198                 // or add a few bytes to the reorderTable.
    199                 headerSize += 4;
    200             }
    201         }
    202         header.dataHeader.headerSize = (uint16_t)headerSize;
    203         if(headerSize <= capacity) {
    204             uprv_memcpy(dest, &header, sizeof(header));
    205             // Write 00 bytes so that the padding is not mistaken for a copyright string.
    206             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
    207             dest += headerSize;
    208             capacity -= headerSize;
    209         } else {
    210             dest = NULL;
    211             capacity = 0;
    212         }
    213     }
    214 
    215     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
    216     U_ASSERT((settings.options & ~0xffff) == 0);
    217     indexes[CollationDataReader::IX_OPTIONS] =
    218             data.numericPrimary | fastLatinVersion | settings.options;
    219     indexes[CollationDataReader::IX_RESERVED2] = 0;
    220     indexes[CollationDataReader::IX_RESERVED3] = 0;
    221 
    222     // Byte offsets of data items all start from the start of the indexes.
    223     // We add the headerSize at the very end.
    224     int32_t totalSize = indexesLength * 4;
    225 
    226     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
    227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
    228     } else {
    229         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
    230     }
    231 
    232     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
    233     totalSize += reorderCodesLength * 4;
    234 
    235     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
    236     if(settings.reorderTable != NULL) {
    237         totalSize += 256;
    238     }
    239 
    240     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
    241     if(hasMappings) {
    242         UErrorCode errorCode2 = U_ZERO_ERROR;
    243         int32_t length;
    244         if(totalSize < capacity) {
    245             length = utrie2_serialize(data.trie, dest + totalSize,
    246                                       capacity - totalSize, &errorCode2);
    247         } else {
    248             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
    249         }
    250         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
    251             errorCode = errorCode2;
    252             return 0;
    253         }
    254         // The trie size should be a multiple of 8 bytes due to the way
    255         // compactIndex2(UNewTrie2 *trie) currently works.
    256         U_ASSERT((length & 7) == 0);
    257         totalSize += length;
    258     }
    259 
    260     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
    261     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
    262     if(hasMappings && data.cesLength != 0) {
    263         U_ASSERT(((headerSize + totalSize) & 7) == 0);
    264         totalSize += data.cesLength * 8;
    265     }
    266 
    267     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
    268     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
    269     if(hasMappings) {
    270         totalSize += data.ce32sLength * 4;
    271     }
    272 
    273     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
    274     totalSize += rootElementsLength * 4;
    275 
    276     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
    277     if(hasMappings) {
    278         totalSize += data.contextsLength * 2;
    279     }
    280 
    281     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
    282     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
    283         UErrorCode errorCode2 = U_ZERO_ERROR;
    284         int32_t length;
    285         if(totalSize < capacity) {
    286             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
    287             length = unsafeBackwardSet.serialize(
    288                     p, (capacity - totalSize) / 2, errorCode2);
    289         } else {
    290             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
    291         }
    292         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
    293             errorCode = errorCode2;
    294             return 0;
    295         }
    296         totalSize += length * 2;
    297     }
    298 
    299     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
    300     totalSize += fastLatinTableLength * 2;
    301 
    302     UnicodeString scripts;
    303     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
    304     if(isBase) {
    305         scripts.append((UChar)data.numScripts);
    306         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
    307         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
    308         totalSize += scripts.length() * 2;
    309     }
    310 
    311     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
    312     if(isBase) {
    313         totalSize += 256;
    314     }
    315 
    316     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
    317     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
    318 
    319     if(totalSize > capacity) {
    320         errorCode = U_BUFFER_OVERFLOW_ERROR;
    321         return headerSize + totalSize;
    322     }
    323 
    324     uprv_memcpy(dest, indexes, indexesLength * 4);
    325     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
    326     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
    327     // The trie has already been serialized into the dest buffer.
    328     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
    329     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
    330     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
    331     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
    332     // The unsafeBackwardSet has already been serialized into the dest buffer.
    333     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
    334     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
    335     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
    336 
    337     return headerSize + totalSize;
    338 }
    339 
    340 void
    341 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
    342                               const void *src, uint8_t *dest) {
    343     int32_t start = indexes[startIndex];
    344     int32_t limit = indexes[startIndex + 1];
    345     if(start < limit) {
    346         uprv_memcpy(dest + start, src, limit - start);
    347     }
    348 }
    349 
    350 U_NAMESPACE_END
    351 
    352 #endif  // !UCONFIG_NO_COLLATION
    353