1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdatawriter.cpp 9 * 10 * created on: 2013aug06 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/tblcoll.h" 19 #include "unicode/udata.h" 20 #include "unicode/uniset.h" 21 #include "cmemory.h" 22 #include "collationdata.h" 23 #include "collationdatabuilder.h" 24 #include "collationdatareader.h" 25 #include "collationdatawriter.h" 26 #include "collationfastlatin.h" 27 #include "collationsettings.h" 28 #include "collationtailoring.h" 29 #include "uassert.h" 30 #include "ucmndata.h" 31 32 U_NAMESPACE_BEGIN 33 34 uint8_t * 35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { 36 if(U_FAILURE(errorCode)) { return NULL; } 37 LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); 38 if(buffer.isNull()) { 39 errorCode = U_MEMORY_ALLOCATION_ERROR; 40 return NULL; 41 } 42 length = cloneBinary(buffer.getAlias(), 20000, errorCode); 43 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 44 if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { 45 errorCode = U_MEMORY_ALLOCATION_ERROR; 46 return NULL; 47 } 48 errorCode = U_ZERO_ERROR; 49 length = cloneBinary(buffer.getAlias(), length, errorCode); 50 } 51 if(U_FAILURE(errorCode)) { return NULL; } 52 return buffer.orphan(); 53 } 54 55 int32_t 56 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { 57 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; 58 return CollationDataWriter::writeTailoring( 59 *tailoring, *settings, indexes, dest, capacity, 60 errorCode); 61 } 62 63 static const UDataInfo dataInfo = { 64 sizeof(UDataInfo), 65 0, 66 67 U_IS_BIG_ENDIAN, 68 U_CHARSET_FAMILY, 69 U_SIZEOF_UCHAR, 70 0, 71 72 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" 73 { 5, 0, 0, 0 }, // formatVersion 74 { 6, 3, 0, 0 } // dataVersion 75 }; 76 77 int32_t 78 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, 79 const void *rootElements, int32_t rootElementsLength, 80 int32_t indexes[], uint8_t *dest, int32_t capacity, 81 UErrorCode &errorCode) { 82 return write(TRUE, NULL, 83 data, settings, 84 rootElements, rootElementsLength, 85 indexes, dest, capacity, errorCode); 86 } 87 88 int32_t 89 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, 90 int32_t indexes[], uint8_t *dest, int32_t capacity, 91 UErrorCode &errorCode) { 92 return write(FALSE, t.version, 93 *t.data, settings, 94 NULL, 0, 95 indexes, dest, capacity, errorCode); 96 } 97 98 int32_t 99 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, 100 const CollationData &data, const CollationSettings &settings, 101 const void *rootElements, int32_t rootElementsLength, 102 int32_t indexes[], uint8_t *dest, int32_t capacity, 103 UErrorCode &errorCode) { 104 if(U_FAILURE(errorCode)) { return 0; } 105 if(capacity < 0 || (capacity > 0 && dest == NULL)) { 106 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 107 return 0; 108 } 109 110 // Figure out which data items to write before settling on 111 // the indexes length and writing offsets. 112 // For any data item, we need to write the start and limit offsets, 113 // so the indexes length must be at least index-of-start-offset + 2. 114 int32_t indexesLength; 115 UBool hasMappings; 116 UnicodeSet unsafeBackwardSet; 117 const CollationData *baseData = data.base; 118 119 int32_t fastLatinVersion; 120 if(data.fastLatinTable != NULL) { 121 fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; 122 } else { 123 fastLatinVersion = 0; 124 } 125 int32_t fastLatinTableLength = 0; 126 127 if(isBase) { 128 // For the root collator, we write an even number of indexes 129 // so that we start with an 8-aligned offset. 130 indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; 131 U_ASSERT(settings.reorderCodesLength == 0); 132 hasMappings = TRUE; 133 unsafeBackwardSet = *data.unsafeBackwardSet; 134 fastLatinTableLength = data.fastLatinTableLength; 135 } else if(baseData == NULL) { 136 hasMappings = FALSE; 137 if(settings.reorderCodesLength == 0) { 138 // only options 139 indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here 140 } else { 141 // only options, reorder codes, and the reorder table 142 indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; 143 } 144 } else { 145 hasMappings = TRUE; 146 // Tailored mappings, and what else? 147 // Check in ascending order of optional tailoring data items. 148 indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; 149 if(data.contextsLength != 0) { 150 indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; 151 } 152 unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); 153 if(!unsafeBackwardSet.isEmpty()) { 154 indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; 155 } 156 if(data.fastLatinTable != baseData->fastLatinTable) { 157 fastLatinTableLength = data.fastLatinTableLength; 158 indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; 159 } 160 } 161 162 UVector32 codesAndRanges(errorCode); 163 const int32_t *reorderCodes = settings.reorderCodes; 164 int32_t reorderCodesLength = settings.reorderCodesLength; 165 if(settings.hasReordering() && 166 CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { 167 // Rebuild the full list of reorder ranges. 168 // The list in the settings is truncated for efficiency. 169 data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); 170 // Write the codes, then the ranges. 171 for(int32_t i = 0; i < reorderCodesLength; ++i) { 172 codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); 173 } 174 if(U_FAILURE(errorCode)) { return 0; } 175 reorderCodes = codesAndRanges.getBuffer(); 176 reorderCodesLength = codesAndRanges.size(); 177 } 178 179 int32_t headerSize; 180 if(isBase) { 181 headerSize = 0; // udata_create() writes the header 182 } else { 183 DataHeader header; 184 header.dataHeader.magic1 = 0xda; 185 header.dataHeader.magic2 = 0x27; 186 uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); 187 uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); 188 headerSize = (int32_t)sizeof(header); 189 U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes 190 if(hasMappings && data.cesLength != 0) { 191 // Sum of the sizes of the data items which are 192 // not automatically multiples of 8 bytes and which are placed before the CEs. 193 int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; 194 if((sum & 7) != 0) { 195 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. 196 // We add to the header size here. 197 // Alternatively, we could increment the indexesLength 198 // or add a few bytes to the reorderTable. 199 headerSize += 4; 200 } 201 } 202 header.dataHeader.headerSize = (uint16_t)headerSize; 203 if(headerSize <= capacity) { 204 uprv_memcpy(dest, &header, sizeof(header)); 205 // Write 00 bytes so that the padding is not mistaken for a copyright string. 206 uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); 207 dest += headerSize; 208 capacity -= headerSize; 209 } else { 210 dest = NULL; 211 capacity = 0; 212 } 213 } 214 215 indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; 216 U_ASSERT((settings.options & ~0xffff) == 0); 217 indexes[CollationDataReader::IX_OPTIONS] = 218 data.numericPrimary | fastLatinVersion | settings.options; 219 indexes[CollationDataReader::IX_RESERVED2] = 0; 220 indexes[CollationDataReader::IX_RESERVED3] = 0; 221 222 // Byte offsets of data items all start from the start of the indexes. 223 // We add the headerSize at the very end. 224 int32_t totalSize = indexesLength * 4; 225 226 if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { 227 indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s); 228 } else { 229 indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; 230 } 231 232 indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; 233 totalSize += reorderCodesLength * 4; 234 235 indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; 236 if(settings.reorderTable != NULL) { 237 totalSize += 256; 238 } 239 240 indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; 241 if(hasMappings) { 242 UErrorCode errorCode2 = U_ZERO_ERROR; 243 int32_t length; 244 if(totalSize < capacity) { 245 length = utrie2_serialize(data.trie, dest + totalSize, 246 capacity - totalSize, &errorCode2); 247 } else { 248 length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); 249 } 250 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 251 errorCode = errorCode2; 252 return 0; 253 } 254 // The trie size should be a multiple of 8 bytes due to the way 255 // compactIndex2(UNewTrie2 *trie) currently works. 256 U_ASSERT((length & 7) == 0); 257 totalSize += length; 258 } 259 260 indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; 261 indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; 262 if(hasMappings && data.cesLength != 0) { 263 U_ASSERT(((headerSize + totalSize) & 7) == 0); 264 totalSize += data.cesLength * 8; 265 } 266 267 indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; 268 indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; 269 if(hasMappings) { 270 totalSize += data.ce32sLength * 4; 271 } 272 273 indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; 274 totalSize += rootElementsLength * 4; 275 276 indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; 277 if(hasMappings) { 278 totalSize += data.contextsLength * 2; 279 } 280 281 indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; 282 if(hasMappings && !unsafeBackwardSet.isEmpty()) { 283 UErrorCode errorCode2 = U_ZERO_ERROR; 284 int32_t length; 285 if(totalSize < capacity) { 286 uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); 287 length = unsafeBackwardSet.serialize( 288 p, (capacity - totalSize) / 2, errorCode2); 289 } else { 290 length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); 291 } 292 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 293 errorCode = errorCode2; 294 return 0; 295 } 296 totalSize += length * 2; 297 } 298 299 indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; 300 totalSize += fastLatinTableLength * 2; 301 302 UnicodeString scripts; 303 indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; 304 if(isBase) { 305 scripts.append((UChar)data.numScripts); 306 scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16); 307 scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength); 308 totalSize += scripts.length() * 2; 309 } 310 311 indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; 312 if(isBase) { 313 totalSize += 256; 314 } 315 316 indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; 317 indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; 318 319 if(totalSize > capacity) { 320 errorCode = U_BUFFER_OVERFLOW_ERROR; 321 return headerSize + totalSize; 322 } 323 324 uprv_memcpy(dest, indexes, indexesLength * 4); 325 copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); 326 copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); 327 // The trie has already been serialized into the dest buffer. 328 copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); 329 copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); 330 copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); 331 copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); 332 // The unsafeBackwardSet has already been serialized into the dest buffer. 333 copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); 334 copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); 335 copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); 336 337 return headerSize + totalSize; 338 } 339 340 void 341 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, 342 const void *src, uint8_t *dest) { 343 int32_t start = indexes[startIndex]; 344 int32_t limit = indexes[startIndex + 1]; 345 if(start < limit) { 346 uprv_memcpy(dest + start, src, limit - start); 347 } 348 } 349 350 U_NAMESPACE_END 351 352 #endif // !UCONFIG_NO_COLLATION 353