/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatawriter.cpp
*
* created on: 2013aug06
* created by: Markus W. Scherer
*/
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/tblcoll.h"
     17 #include "unicode/udata.h"
     18 #include "unicode/uniset.h"
     19 #include "cmemory.h"
     20 #include "collationdata.h"
     21 #include "collationdatabuilder.h"
     22 #include "collationdatareader.h"
     23 #include "collationdatawriter.h"
     24 #include "collationfastlatin.h"
     25 #include "collationsettings.h"
     26 #include "collationtailoring.h"
     27 #include "uassert.h"
     28 #include "ucmndata.h"
     29 
     30 U_NAMESPACE_BEGIN
     31 
     32 uint8_t *
     33 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
     34     if(U_FAILURE(errorCode)) { return NULL; }
     35     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
     36     if(buffer.isNull()) {
     37         errorCode = U_MEMORY_ALLOCATION_ERROR;
     38         return NULL;
     39     }
     40     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
     41     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
     42         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
     43             errorCode = U_MEMORY_ALLOCATION_ERROR;
     44             return NULL;
     45         }
     46         errorCode = U_ZERO_ERROR;
     47         length = cloneBinary(buffer.getAlias(), length, errorCode);
     48     }
     49     if(U_FAILURE(errorCode)) { return NULL; }
     50     return buffer.orphan();
     51 }
     52 
     53 int32_t
     54 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
     55     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
     56     return CollationDataWriter::writeTailoring(
     57             *tailoring, *settings, indexes, dest, capacity,
     58             errorCode);
     59 }
     60 
     61 static const UDataInfo dataInfo = {
     62     sizeof(UDataInfo),
     63     0,
     64 
     65     U_IS_BIG_ENDIAN,
     66     U_CHARSET_FAMILY,
     67     U_SIZEOF_UCHAR,
     68     0,
     69 
     70     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
     71     { 5, 0, 0, 0 },                     // formatVersion
     72     { 6, 3, 0, 0 }                      // dataVersion
     73 };
     74 
     75 int32_t
     76 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
     77                                const void *rootElements, int32_t rootElementsLength,
     78                                int32_t indexes[], uint8_t *dest, int32_t capacity,
     79                                UErrorCode &errorCode) {
     80     return write(TRUE, NULL,
     81                  data, settings,
     82                  rootElements, rootElementsLength,
     83                  indexes, dest, capacity, errorCode);
     84 }
     85 
     86 int32_t
     87 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
     88                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
     89                                     UErrorCode &errorCode) {
     90     return write(FALSE, t.version,
     91                  *t.data, settings,
     92                  NULL, 0,
     93                  indexes, dest, capacity, errorCode);
     94 }
     95 
     96 int32_t
     97 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
     98                            const CollationData &data, const CollationSettings &settings,
     99                            const void *rootElements, int32_t rootElementsLength,
    100                            int32_t indexes[], uint8_t *dest, int32_t capacity,
    101                            UErrorCode &errorCode) {
    102     if(U_FAILURE(errorCode)) { return 0; }
    103     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
    104         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    105         return 0;
    106     }
    107 
    108     // Figure out which data items to write before settling on
    109     // the indexes length and writing offsets.
    110     // For any data item, we need to write the start and limit offsets,
    111     // so the indexes length must be at least index-of-start-offset + 2.
    112     int32_t indexesLength;
    113     UBool hasMappings;
    114     UnicodeSet unsafeBackwardSet;
    115     const CollationData *baseData = data.base;
    116 
    117     int32_t fastLatinVersion;
    118     if(data.fastLatinTable != NULL) {
    119         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
    120     } else {
    121         fastLatinVersion = 0;
    122     }
    123     int32_t fastLatinTableLength = 0;
    124 
    125     if(isBase) {
    126         // For the root collator, we write an even number of indexes
    127         // so that we start with an 8-aligned offset.
    128         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
    129         U_ASSERT(settings.reorderCodesLength == 0);
    130         hasMappings = TRUE;
    131         unsafeBackwardSet = *data.unsafeBackwardSet;
    132         fastLatinTableLength = data.fastLatinTableLength;
    133     } else if(baseData == NULL) {
    134         hasMappings = FALSE;
    135         if(settings.reorderCodesLength == 0) {
    136             // only options
    137             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
    138         } else {
    139             // only options, reorder codes, and the reorder table
    140             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
    141         }
    142     } else {
    143         hasMappings = TRUE;
    144         // Tailored mappings, and what else?
    145         // Check in ascending order of optional tailoring data items.
    146         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
    147         if(data.contextsLength != 0) {
    148             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
    149         }
    150         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
    151         if(!unsafeBackwardSet.isEmpty()) {
    152             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
    153         }
    154         if(data.fastLatinTable != baseData->fastLatinTable) {
    155             fastLatinTableLength = data.fastLatinTableLength;
    156             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
    157         }
    158     }
    159 
    160     UVector32 codesAndRanges(errorCode);
    161     const int32_t *reorderCodes = settings.reorderCodes;
    162     int32_t reorderCodesLength = settings.reorderCodesLength;
    163     if(settings.hasReordering() &&
    164             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
    165         // Rebuild the full list of reorder ranges.
    166         // The list in the settings is truncated for efficiency.
    167         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
    168         // Write the codes, then the ranges.
    169         for(int32_t i = 0; i < reorderCodesLength; ++i) {
    170             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
    171         }
    172         if(U_FAILURE(errorCode)) { return 0; }
    173         reorderCodes = codesAndRanges.getBuffer();
    174         reorderCodesLength = codesAndRanges.size();
    175     }
    176 
    177     int32_t headerSize;
    178     if(isBase) {
    179         headerSize = 0;  // udata_create() writes the header
    180     } else {
    181         DataHeader header;
    182         header.dataHeader.magic1 = 0xda;
    183         header.dataHeader.magic2 = 0x27;
    184         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
    185         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
    186         headerSize = (int32_t)sizeof(header);
    187         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
    188         if(hasMappings && data.cesLength != 0) {
    189             // Sum of the sizes of the data items which are
    190             // not automatically multiples of 8 bytes and which are placed before the CEs.
    191             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
    192             if((sum & 7) != 0) {
    193                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
    194                 // We add to the header size here.
    195                 // Alternatively, we could increment the indexesLength
    196                 // or add a few bytes to the reorderTable.
    197                 headerSize += 4;
    198             }
    199         }
    200         header.dataHeader.headerSize = (uint16_t)headerSize;
    201         if(headerSize <= capacity) {
    202             uprv_memcpy(dest, &header, sizeof(header));
    203             // Write 00 bytes so that the padding is not mistaken for a copyright string.
    204             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
    205             dest += headerSize;
    206             capacity -= headerSize;
    207         } else {
    208             dest = NULL;
    209             capacity = 0;
    210         }
    211     }
    212 
    213     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
    214     U_ASSERT((settings.options & ~0xffff) == 0);
    215     indexes[CollationDataReader::IX_OPTIONS] =
    216             data.numericPrimary | fastLatinVersion | settings.options;
    217     indexes[CollationDataReader::IX_RESERVED2] = 0;
    218     indexes[CollationDataReader::IX_RESERVED3] = 0;
    219 
    220     // Byte offsets of data items all start from the start of the indexes.
    221     // We add the headerSize at the very end.
    222     int32_t totalSize = indexesLength * 4;
    223 
    224     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
    225         indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
    226     } else {
    227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
    228     }
    229 
    230     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
    231     totalSize += reorderCodesLength * 4;
    232 
    233     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
    234     if(settings.reorderTable != NULL) {
    235         totalSize += 256;
    236     }
    237 
    238     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
    239     if(hasMappings) {
    240         UErrorCode errorCode2 = U_ZERO_ERROR;
    241         int32_t length;
    242         if(totalSize < capacity) {
    243             length = utrie2_serialize(data.trie, dest + totalSize,
    244                                       capacity - totalSize, &errorCode2);
    245         } else {
    246             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
    247         }
    248         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
    249             errorCode = errorCode2;
    250             return 0;
    251         }
    252         // The trie size should be a multiple of 8 bytes due to the way
    253         // compactIndex2(UNewTrie2 *trie) currently works.
    254         U_ASSERT((length & 7) == 0);
    255         totalSize += length;
    256     }
    257 
    258     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
    259     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
    260     if(hasMappings && data.cesLength != 0) {
    261         U_ASSERT(((headerSize + totalSize) & 7) == 0);
    262         totalSize += data.cesLength * 8;
    263     }
    264 
    265     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
    266     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
    267     if(hasMappings) {
    268         totalSize += data.ce32sLength * 4;
    269     }
    270 
    271     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
    272     totalSize += rootElementsLength * 4;
    273 
    274     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
    275     if(hasMappings) {
    276         totalSize += data.contextsLength * 2;
    277     }
    278 
    279     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
    280     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
    281         UErrorCode errorCode2 = U_ZERO_ERROR;
    282         int32_t length;
    283         if(totalSize < capacity) {
    284             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
    285             length = unsafeBackwardSet.serialize(
    286                     p, (capacity - totalSize) / 2, errorCode2);
    287         } else {
    288             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
    289         }
    290         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
    291             errorCode = errorCode2;
    292             return 0;
    293         }
    294         totalSize += length * 2;
    295     }
    296 
    297     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
    298     totalSize += fastLatinTableLength * 2;
    299 
    300     UnicodeString scripts;
    301     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
    302     if(isBase) {
    303         scripts.append((UChar)data.numScripts);
    304         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
    305         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
    306         totalSize += scripts.length() * 2;
    307     }
    308 
    309     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
    310     if(isBase) {
    311         totalSize += 256;
    312     }
    313 
    314     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
    315     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
    316 
    317     if(totalSize > capacity) {
    318         errorCode = U_BUFFER_OVERFLOW_ERROR;
    319         return headerSize + totalSize;
    320     }
    321 
    322     uprv_memcpy(dest, indexes, indexesLength * 4);
    323     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
    324     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
    325     // The trie has already been serialized into the dest buffer.
    326     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
    327     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
    328     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
    329     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
    330     // The unsafeBackwardSet has already been serialized into the dest buffer.
    331     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
    332     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
    333     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
    334 
    335     return headerSize + totalSize;
    336 }
    337 
    338 void
    339 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
    340                               const void *src, uint8_t *dest) {
    341     int32_t start = indexes[startIndex];
    342     int32_t limit = indexes[startIndex + 1];
    343     if(start < limit) {
    344         uprv_memcpy(dest + start, src, limit - start);
    345     }
    346 }
    347 
    348 U_NAMESPACE_END
    349 
    350 #endif  // !UCONFIG_NO_COLLATION
    351