1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdata.cpp 9 * 10 * created on: 2012jul28 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/ucol.h" 19 #include "unicode/udata.h" 20 #include "unicode/uscript.h" 21 #include "cmemory.h" 22 #include "collation.h" 23 #include "collationdata.h" 24 #include "uassert.h" 25 #include "utrie2.h" 26 #include "uvectr32.h" 27 28 U_NAMESPACE_BEGIN 29 30 uint32_t 31 CollationData::getIndirectCE32(uint32_t ce32) const { 32 U_ASSERT(Collation::isSpecialCE32(ce32)); 33 int32_t tag = Collation::tagFromCE32(ce32); 34 if(tag == Collation::DIGIT_TAG) { 35 // Fetch the non-numeric-collation CE32. 36 ce32 = ce32s[Collation::indexFromCE32(ce32)]; 37 } else if(tag == Collation::LEAD_SURROGATE_TAG) { 38 ce32 = Collation::UNASSIGNED_CE32; 39 } else if(tag == Collation::U0000_TAG) { 40 // Fetch the normal ce32 for U+0000. 41 ce32 = ce32s[0]; 42 } 43 return ce32; 44 } 45 46 uint32_t 47 CollationData::getFinalCE32(uint32_t ce32) const { 48 if(Collation::isSpecialCE32(ce32)) { 49 ce32 = getIndirectCE32(ce32); 50 } 51 return ce32; 52 } 53 54 int64_t 55 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const { 56 if(U_FAILURE(errorCode)) { return 0; } 57 // Keep parallel with CollationDataBuilder::getSingleCE(). 58 const CollationData *d; 59 uint32_t ce32 = getCE32(c); 60 if(ce32 == Collation::FALLBACK_CE32) { 61 d = base; 62 ce32 = base->getCE32(c); 63 } else { 64 d = this; 65 } 66 while(Collation::isSpecialCE32(ce32)) { 67 switch(Collation::tagFromCE32(ce32)) { 68 case Collation::LATIN_EXPANSION_TAG: 69 case Collation::BUILDER_DATA_TAG: 70 case Collation::PREFIX_TAG: 71 case Collation::CONTRACTION_TAG: 72 case Collation::HANGUL_TAG: 73 case Collation::LEAD_SURROGATE_TAG: 74 errorCode = U_UNSUPPORTED_ERROR; 75 return 0; 76 case Collation::FALLBACK_TAG: 77 case Collation::RESERVED_TAG_3: 78 errorCode = U_INTERNAL_PROGRAM_ERROR; 79 return 0; 80 case Collation::LONG_PRIMARY_TAG: 81 return Collation::ceFromLongPrimaryCE32(ce32); 82 case Collation::LONG_SECONDARY_TAG: 83 return Collation::ceFromLongSecondaryCE32(ce32); 84 case Collation::EXPANSION32_TAG: 85 if(Collation::lengthFromCE32(ce32) == 1) { 86 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; 87 break; 88 } else { 89 errorCode = U_UNSUPPORTED_ERROR; 90 return 0; 91 } 92 case Collation::EXPANSION_TAG: { 93 if(Collation::lengthFromCE32(ce32) == 1) { 94 return d->ces[Collation::indexFromCE32(ce32)]; 95 } else { 96 errorCode = U_UNSUPPORTED_ERROR; 97 return 0; 98 } 99 } 100 case Collation::DIGIT_TAG: 101 // Fetch the non-numeric-collation CE32 and continue. 102 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; 103 break; 104 case Collation::U0000_TAG: 105 U_ASSERT(c == 0); 106 // Fetch the normal ce32 for U+0000 and continue. 107 ce32 = d->ce32s[0]; 108 break; 109 case Collation::OFFSET_TAG: 110 return d->getCEFromOffsetCE32(c, ce32); 111 case Collation::IMPLICIT_TAG: 112 return Collation::unassignedCEFromCodePoint(c); 113 } 114 } 115 return Collation::ceFromSimpleCE32(ce32); 116 } 117 118 uint32_t 119 CollationData::getFirstPrimaryForGroup(int32_t script) const { 120 int32_t index = getScriptIndex(script); 121 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16; 122 } 123 124 uint32_t 125 CollationData::getLastPrimaryForGroup(int32_t script) const { 126 int32_t index = getScriptIndex(script); 127 if(index == 0) { 128 return 0; 129 } 130 uint32_t limit = scriptStarts[index + 1]; 131 return (limit << 16) - 1; 132 } 133 134 int32_t 135 CollationData::getGroupForPrimary(uint32_t p) const { 136 p >>= 16; 137 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) { 138 return -1; 139 } 140 int32_t index = 1; 141 while(p >= scriptStarts[index + 1]) { ++index; } 142 for(int32_t i = 0; i < numScripts; ++i) { 143 if(scriptsIndex[i] == index) { 144 return i; 145 } 146 } 147 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 148 if(scriptsIndex[numScripts + i] == index) { 149 return UCOL_REORDER_CODE_FIRST + i; 150 } 151 } 152 return -1; 153 } 154 155 int32_t 156 CollationData::getScriptIndex(int32_t script) const { 157 if(script < 0) { 158 return 0; 159 } else if(script < numScripts) { 160 return scriptsIndex[script]; 161 } else if(script < UCOL_REORDER_CODE_FIRST) { 162 return 0; 163 } else { 164 script -= UCOL_REORDER_CODE_FIRST; 165 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { 166 return scriptsIndex[numScripts + script]; 167 } else { 168 return 0; 169 } 170 } 171 } 172 173 int32_t 174 CollationData::getEquivalentScripts(int32_t script, 175 int32_t dest[], int32_t capacity, 176 UErrorCode &errorCode) const { 177 if(U_FAILURE(errorCode)) { return 0; } 178 int32_t index = getScriptIndex(script); 179 if(index == 0) { return 0; } 180 if(script >= UCOL_REORDER_CODE_FIRST) { 181 // Special groups have no aliases. 182 if(capacity > 0) { 183 dest[0] = script; 184 } else { 185 errorCode = U_BUFFER_OVERFLOW_ERROR; 186 } 187 return 1; 188 } 189 190 int32_t length = 0; 191 for(int32_t i = 0; i < numScripts; ++i) { 192 if(scriptsIndex[i] == index) { 193 if(length < capacity) { 194 dest[length] = i; 195 } 196 ++length; 197 } 198 } 199 if(length > capacity) { 200 errorCode = U_BUFFER_OVERFLOW_ERROR; 201 } 202 return length; 203 } 204 205 void 206 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, 207 UVector32 &ranges, UErrorCode &errorCode) const { 208 makeReorderRanges(reorder, length, FALSE, ranges, errorCode); 209 } 210 211 void 212 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, 213 UBool latinMustMove, 214 UVector32 &ranges, UErrorCode &errorCode) const { 215 if(U_FAILURE(errorCode)) { return; } 216 ranges.removeAllElements(); 217 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) { 218 return; 219 } 220 221 // Maps each script-or-group range to a new lead byte. 222 uint8_t table[MAX_NUM_SCRIPT_RANGES]; 223 uprv_memset(table, 0, sizeof(table)); 224 225 { 226 // Set "don't care" values for reserved ranges. 227 int32_t index = scriptsIndex[ 228 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST]; 229 if(index != 0) { 230 table[index] = 0xff; 231 } 232 index = scriptsIndex[ 233 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST]; 234 if(index != 0) { 235 table[index] = 0xff; 236 } 237 } 238 239 // Never reorder special low and high primary lead bytes. 240 U_ASSERT(scriptStartsLength >= 2); 241 U_ASSERT(scriptStarts[0] == 0); 242 int32_t lowStart = scriptStarts[1]; 243 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8)); 244 int32_t highLimit = scriptStarts[scriptStartsLength - 1]; 245 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8)); 246 247 // Get the set of special reorder codes in the input list. 248 // This supports a fixed number of special reorder codes; 249 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. 250 uint32_t specials = 0; 251 for(int32_t i = 0; i < length; ++i) { 252 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; 253 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { 254 specials |= (uint32_t)1 << reorderCode; 255 } 256 } 257 258 // Start the reordering with the special low reorder codes that do not occur in the input. 259 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 260 int32_t index = scriptsIndex[numScripts + i]; 261 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) { 262 lowStart = addLowScriptRange(table, index, lowStart); 263 } 264 } 265 266 // Skip the reserved range before Latin if Latin is the first script, 267 // so that we do not move it unnecessarily. 268 int32_t skippedReserved = 0; 269 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) { 270 int32_t index = scriptsIndex[USCRIPT_LATIN]; 271 U_ASSERT(index != 0); 272 int32_t start = scriptStarts[index]; 273 U_ASSERT(lowStart <= start); 274 skippedReserved = start - lowStart; 275 lowStart = start; 276 } 277 278 // Reorder according to the input scripts, continuing from the bottom of the primary range. 279 int32_t originalLength = length; // length will be decremented if "others" is in the list. 280 UBool hasReorderToEnd = FALSE; 281 for(int32_t i = 0; i < length;) { 282 int32_t script = reorder[i++]; 283 if(script == USCRIPT_UNKNOWN) { 284 // Put the remaining scripts at the top. 285 hasReorderToEnd = TRUE; 286 while(i < length) { 287 script = reorder[--length]; 288 if(script == USCRIPT_UNKNOWN || // Must occur at most once. 289 script == UCOL_REORDER_CODE_DEFAULT) { 290 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 291 return; 292 } 293 int32_t index = getScriptIndex(script); 294 if(index == 0) { continue; } 295 if(table[index] != 0) { // Duplicate or equivalent script. 296 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 297 return; 298 } 299 highLimit = addHighScriptRange(table, index, highLimit); 300 } 301 break; 302 } 303 if(script == UCOL_REORDER_CODE_DEFAULT) { 304 // The default code must be the only one in the list, and that is handled by the caller. 305 // Otherwise it must not be used. 306 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 307 return; 308 } 309 int32_t index = getScriptIndex(script); 310 if(index == 0) { continue; } 311 if(table[index] != 0) { // Duplicate or equivalent script. 312 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 313 return; 314 } 315 lowStart = addLowScriptRange(table, index, lowStart); 316 } 317 318 // Put all remaining scripts into the middle. 319 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) { 320 int32_t leadByte = table[i]; 321 if(leadByte != 0) { continue; } 322 int32_t start = scriptStarts[i]; 323 if(!hasReorderToEnd && start > lowStart) { 324 // No need to move this script. 325 lowStart = start; 326 } 327 lowStart = addLowScriptRange(table, i, lowStart); 328 } 329 if(lowStart > highLimit) { 330 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { 331 // Try not skipping the before-Latin reserved range. 332 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode); 333 return; 334 } 335 // We need more primary lead bytes than available, despite the reserved ranges. 336 errorCode = U_BUFFER_OVERFLOW_ERROR; 337 return; 338 } 339 340 // Turn lead bytes into a list of (limit, offset) pairs. 341 // Encode each pair in one list element: 342 // Upper 16 bits = limit, lower 16 = signed lead byte offset. 343 int32_t offset = 0; 344 for(int32_t i = 1;; ++i) { 345 int32_t nextOffset = offset; 346 while(i < scriptStartsLength - 1) { 347 int32_t newLeadByte = table[i]; 348 if(newLeadByte == 0xff) { 349 // "Don't care" lead byte for reserved range, continue with current offset. 350 } else { 351 nextOffset = newLeadByte - (scriptStarts[i] >> 8); 352 if(nextOffset != offset) { break; } 353 } 354 ++i; 355 } 356 if(offset != 0 || i < scriptStartsLength - 1) { 357 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode); 358 } 359 if(i == scriptStartsLength - 1) { break; } 360 offset = nextOffset; 361 } 362 } 363 364 int32_t 365 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const { 366 int32_t start = scriptStarts[index]; 367 if((start & 0xff) < (lowStart & 0xff)) { 368 lowStart += 0x100; 369 } 370 table[index] = (uint8_t)(lowStart >> 8); 371 int32_t limit = scriptStarts[index + 1]; 372 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); 373 return lowStart; 374 } 375 376 int32_t 377 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const { 378 int32_t limit = scriptStarts[index + 1]; 379 if((limit & 0xff) > (highLimit & 0xff)) { 380 highLimit -= 0x100; 381 } 382 int32_t start = scriptStarts[index]; 383 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); 384 table[index] = (uint8_t)(highLimit >> 8); 385 return highLimit; 386 } 387 388 U_NAMESPACE_END 389 390 #endif // !UCONFIG_NO_COLLATION 391