1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationdatawriter.cpp 7 * 8 * created on: 2013aug06 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/tblcoll.h" 17 #include "unicode/udata.h" 18 #include "unicode/uniset.h" 19 #include "cmemory.h" 20 #include "collationdata.h" 21 #include "collationdatabuilder.h" 22 #include "collationdatareader.h" 23 #include "collationdatawriter.h" 24 #include "collationfastlatin.h" 25 #include "collationsettings.h" 26 #include "collationtailoring.h" 27 #include "uassert.h" 28 #include "ucmndata.h" 29 30 U_NAMESPACE_BEGIN 31 32 uint8_t * 33 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { 34 if(U_FAILURE(errorCode)) { return NULL; } 35 LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); 36 if(buffer.isNull()) { 37 errorCode = U_MEMORY_ALLOCATION_ERROR; 38 return NULL; 39 } 40 length = cloneBinary(buffer.getAlias(), 20000, errorCode); 41 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 42 if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { 43 errorCode = U_MEMORY_ALLOCATION_ERROR; 44 return NULL; 45 } 46 errorCode = U_ZERO_ERROR; 47 length = cloneBinary(buffer.getAlias(), length, errorCode); 48 } 49 if(U_FAILURE(errorCode)) { return NULL; } 50 return buffer.orphan(); 51 } 52 53 int32_t 54 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { 55 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; 56 return CollationDataWriter::writeTailoring( 57 *tailoring, *settings, indexes, dest, capacity, 58 errorCode); 59 } 60 61 static const UDataInfo dataInfo = { 62 sizeof(UDataInfo), 63 0, 64 65 U_IS_BIG_ENDIAN, 66 U_CHARSET_FAMILY, 67 U_SIZEOF_UCHAR, 68 0, 69 70 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" 71 { 5, 0, 0, 0 }, // formatVersion 72 { 6, 3, 0, 0 } // dataVersion 73 }; 74 75 int32_t 76 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, 77 const void *rootElements, int32_t rootElementsLength, 78 int32_t indexes[], uint8_t *dest, int32_t capacity, 79 UErrorCode &errorCode) { 80 return write(TRUE, NULL, 81 data, settings, 82 rootElements, rootElementsLength, 83 indexes, dest, capacity, errorCode); 84 } 85 86 int32_t 87 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, 88 int32_t indexes[], uint8_t *dest, int32_t capacity, 89 UErrorCode &errorCode) { 90 return write(FALSE, t.version, 91 *t.data, settings, 92 NULL, 0, 93 indexes, dest, capacity, errorCode); 94 } 95 96 int32_t 97 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, 98 const CollationData &data, const CollationSettings &settings, 99 const void *rootElements, int32_t rootElementsLength, 100 int32_t indexes[], uint8_t *dest, int32_t capacity, 101 UErrorCode &errorCode) { 102 if(U_FAILURE(errorCode)) { return 0; } 103 if(capacity < 0 || (capacity > 0 && dest == NULL)) { 104 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 105 return 0; 106 } 107 108 // Figure out which data items to write before settling on 109 // the indexes length and writing offsets. 110 // For any data item, we need to write the start and limit offsets, 111 // so the indexes length must be at least index-of-start-offset + 2. 112 int32_t indexesLength; 113 UBool hasMappings; 114 UnicodeSet unsafeBackwardSet; 115 const CollationData *baseData = data.base; 116 117 int32_t fastLatinVersion; 118 if(data.fastLatinTable != NULL) { 119 fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; 120 } else { 121 fastLatinVersion = 0; 122 } 123 int32_t fastLatinTableLength = 0; 124 125 if(isBase) { 126 // For the root collator, we write an even number of indexes 127 // so that we start with an 8-aligned offset. 128 indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; 129 U_ASSERT(settings.reorderCodesLength == 0); 130 hasMappings = TRUE; 131 unsafeBackwardSet = *data.unsafeBackwardSet; 132 fastLatinTableLength = data.fastLatinTableLength; 133 } else if(baseData == NULL) { 134 hasMappings = FALSE; 135 if(settings.reorderCodesLength == 0) { 136 // only options 137 indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here 138 } else { 139 // only options, reorder codes, and the reorder table 140 indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; 141 } 142 } else { 143 hasMappings = TRUE; 144 // Tailored mappings, and what else? 145 // Check in ascending order of optional tailoring data items. 146 indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; 147 if(data.contextsLength != 0) { 148 indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; 149 } 150 unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); 151 if(!unsafeBackwardSet.isEmpty()) { 152 indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; 153 } 154 if(data.fastLatinTable != baseData->fastLatinTable) { 155 fastLatinTableLength = data.fastLatinTableLength; 156 indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; 157 } 158 } 159 160 UVector32 codesAndRanges(errorCode); 161 const int32_t *reorderCodes = settings.reorderCodes; 162 int32_t reorderCodesLength = settings.reorderCodesLength; 163 if(settings.hasReordering() && 164 CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { 165 // Rebuild the full list of reorder ranges. 166 // The list in the settings is truncated for efficiency. 167 data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); 168 // Write the codes, then the ranges. 169 for(int32_t i = 0; i < reorderCodesLength; ++i) { 170 codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); 171 } 172 if(U_FAILURE(errorCode)) { return 0; } 173 reorderCodes = codesAndRanges.getBuffer(); 174 reorderCodesLength = codesAndRanges.size(); 175 } 176 177 int32_t headerSize; 178 if(isBase) { 179 headerSize = 0; // udata_create() writes the header 180 } else { 181 DataHeader header; 182 header.dataHeader.magic1 = 0xda; 183 header.dataHeader.magic2 = 0x27; 184 uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); 185 uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); 186 headerSize = (int32_t)sizeof(header); 187 U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes 188 if(hasMappings && data.cesLength != 0) { 189 // Sum of the sizes of the data items which are 190 // not automatically multiples of 8 bytes and which are placed before the CEs. 191 int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; 192 if((sum & 7) != 0) { 193 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. 194 // We add to the header size here. 195 // Alternatively, we could increment the indexesLength 196 // or add a few bytes to the reorderTable. 197 headerSize += 4; 198 } 199 } 200 header.dataHeader.headerSize = (uint16_t)headerSize; 201 if(headerSize <= capacity) { 202 uprv_memcpy(dest, &header, sizeof(header)); 203 // Write 00 bytes so that the padding is not mistaken for a copyright string. 204 uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); 205 dest += headerSize; 206 capacity -= headerSize; 207 } else { 208 dest = NULL; 209 capacity = 0; 210 } 211 } 212 213 indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; 214 U_ASSERT((settings.options & ~0xffff) == 0); 215 indexes[CollationDataReader::IX_OPTIONS] = 216 data.numericPrimary | fastLatinVersion | settings.options; 217 indexes[CollationDataReader::IX_RESERVED2] = 0; 218 indexes[CollationDataReader::IX_RESERVED3] = 0; 219 220 // Byte offsets of data items all start from the start of the indexes. 221 // We add the headerSize at the very end. 222 int32_t totalSize = indexesLength * 4; 223 224 if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { 225 indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s; 226 } else { 227 indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; 228 } 229 230 indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; 231 totalSize += reorderCodesLength * 4; 232 233 indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; 234 if(settings.reorderTable != NULL) { 235 totalSize += 256; 236 } 237 238 indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; 239 if(hasMappings) { 240 UErrorCode errorCode2 = U_ZERO_ERROR; 241 int32_t length; 242 if(totalSize < capacity) { 243 length = utrie2_serialize(data.trie, dest + totalSize, 244 capacity - totalSize, &errorCode2); 245 } else { 246 length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); 247 } 248 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 249 errorCode = errorCode2; 250 return 0; 251 } 252 // The trie size should be a multiple of 8 bytes due to the way 253 // compactIndex2(UNewTrie2 *trie) currently works. 254 U_ASSERT((length & 7) == 0); 255 totalSize += length; 256 } 257 258 indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; 259 indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; 260 if(hasMappings && data.cesLength != 0) { 261 U_ASSERT(((headerSize + totalSize) & 7) == 0); 262 totalSize += data.cesLength * 8; 263 } 264 265 indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; 266 indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; 267 if(hasMappings) { 268 totalSize += data.ce32sLength * 4; 269 } 270 271 indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; 272 totalSize += rootElementsLength * 4; 273 274 indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; 275 if(hasMappings) { 276 totalSize += data.contextsLength * 2; 277 } 278 279 indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; 280 if(hasMappings && !unsafeBackwardSet.isEmpty()) { 281 UErrorCode errorCode2 = U_ZERO_ERROR; 282 int32_t length; 283 if(totalSize < capacity) { 284 uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); 285 length = unsafeBackwardSet.serialize( 286 p, (capacity - totalSize) / 2, errorCode2); 287 } else { 288 length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); 289 } 290 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 291 errorCode = errorCode2; 292 return 0; 293 } 294 totalSize += length * 2; 295 } 296 297 indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; 298 totalSize += fastLatinTableLength * 2; 299 300 UnicodeString scripts; 301 indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; 302 if(isBase) { 303 scripts.append((UChar)data.numScripts); 304 scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16); 305 scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength); 306 totalSize += scripts.length() * 2; 307 } 308 309 indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; 310 if(isBase) { 311 totalSize += 256; 312 } 313 314 indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; 315 indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; 316 317 if(totalSize > capacity) { 318 errorCode = U_BUFFER_OVERFLOW_ERROR; 319 return headerSize + totalSize; 320 } 321 322 uprv_memcpy(dest, indexes, indexesLength * 4); 323 copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); 324 copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); 325 // The trie has already been serialized into the dest buffer. 326 copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); 327 copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); 328 copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); 329 copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); 330 // The unsafeBackwardSet has already been serialized into the dest buffer. 331 copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); 332 copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); 333 copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); 334 335 return headerSize + totalSize; 336 } 337 338 void 339 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, 340 const void *src, uint8_t *dest) { 341 int32_t start = indexes[startIndex]; 342 int32_t limit = indexes[startIndex + 1]; 343 if(start < limit) { 344 uprv_memcpy(dest + start, src, limit - start); 345 } 346 } 347 348 U_NAMESPACE_END 349 350 #endif // !UCONFIG_NO_COLLATION 351