1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationdata.cpp 7 * 8 * created on: 2012jul28 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "unicode/udata.h" 18 #include "unicode/uscript.h" 19 #include "cmemory.h" 20 #include "collation.h" 21 #include "collationdata.h" 22 #include "uassert.h" 23 #include "utrie2.h" 24 #include "uvectr32.h" 25 26 U_NAMESPACE_BEGIN 27 28 uint32_t 29 CollationData::getIndirectCE32(uint32_t ce32) const { 30 U_ASSERT(Collation::isSpecialCE32(ce32)); 31 int32_t tag = Collation::tagFromCE32(ce32); 32 if(tag == Collation::DIGIT_TAG) { 33 // Fetch the non-numeric-collation CE32. 34 ce32 = ce32s[Collation::indexFromCE32(ce32)]; 35 } else if(tag == Collation::LEAD_SURROGATE_TAG) { 36 ce32 = Collation::UNASSIGNED_CE32; 37 } else if(tag == Collation::U0000_TAG) { 38 // Fetch the normal ce32 for U+0000. 39 ce32 = ce32s[0]; 40 } 41 return ce32; 42 } 43 44 uint32_t 45 CollationData::getFinalCE32(uint32_t ce32) const { 46 if(Collation::isSpecialCE32(ce32)) { 47 ce32 = getIndirectCE32(ce32); 48 } 49 return ce32; 50 } 51 52 int64_t 53 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const { 54 if(U_FAILURE(errorCode)) { return 0; } 55 // Keep parallel with CollationDataBuilder::getSingleCE(). 56 const CollationData *d; 57 uint32_t ce32 = getCE32(c); 58 if(ce32 == Collation::FALLBACK_CE32) { 59 d = base; 60 ce32 = base->getCE32(c); 61 } else { 62 d = this; 63 } 64 while(Collation::isSpecialCE32(ce32)) { 65 switch(Collation::tagFromCE32(ce32)) { 66 case Collation::LATIN_EXPANSION_TAG: 67 case Collation::BUILDER_DATA_TAG: 68 case Collation::PREFIX_TAG: 69 case Collation::CONTRACTION_TAG: 70 case Collation::HANGUL_TAG: 71 case Collation::LEAD_SURROGATE_TAG: 72 errorCode = U_UNSUPPORTED_ERROR; 73 return 0; 74 case Collation::FALLBACK_TAG: 75 case Collation::RESERVED_TAG_3: 76 errorCode = U_INTERNAL_PROGRAM_ERROR; 77 return 0; 78 case Collation::LONG_PRIMARY_TAG: 79 return Collation::ceFromLongPrimaryCE32(ce32); 80 case Collation::LONG_SECONDARY_TAG: 81 return Collation::ceFromLongSecondaryCE32(ce32); 82 case Collation::EXPANSION32_TAG: 83 if(Collation::lengthFromCE32(ce32) == 1) { 84 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; 85 break; 86 } else { 87 errorCode = U_UNSUPPORTED_ERROR; 88 return 0; 89 } 90 case Collation::EXPANSION_TAG: { 91 if(Collation::lengthFromCE32(ce32) == 1) { 92 return d->ces[Collation::indexFromCE32(ce32)]; 93 } else { 94 errorCode = U_UNSUPPORTED_ERROR; 95 return 0; 96 } 97 } 98 case Collation::DIGIT_TAG: 99 // Fetch the non-numeric-collation CE32 and continue. 100 ce32 = d->ce32s[Collation::indexFromCE32(ce32)]; 101 break; 102 case Collation::U0000_TAG: 103 U_ASSERT(c == 0); 104 // Fetch the normal ce32 for U+0000 and continue. 105 ce32 = d->ce32s[0]; 106 break; 107 case Collation::OFFSET_TAG: 108 return d->getCEFromOffsetCE32(c, ce32); 109 case Collation::IMPLICIT_TAG: 110 return Collation::unassignedCEFromCodePoint(c); 111 } 112 } 113 return Collation::ceFromSimpleCE32(ce32); 114 } 115 116 uint32_t 117 CollationData::getFirstPrimaryForGroup(int32_t script) const { 118 int32_t index = getScriptIndex(script); 119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16; 120 } 121 122 uint32_t 123 CollationData::getLastPrimaryForGroup(int32_t script) const { 124 int32_t index = getScriptIndex(script); 125 if(index == 0) { 126 return 0; 127 } 128 uint32_t limit = scriptStarts[index + 1]; 129 return (limit << 16) - 1; 130 } 131 132 int32_t 133 CollationData::getGroupForPrimary(uint32_t p) const { 134 p >>= 16; 135 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) { 136 return -1; 137 } 138 int32_t index = 1; 139 while(p >= scriptStarts[index + 1]) { ++index; } 140 for(int32_t i = 0; i < numScripts; ++i) { 141 if(scriptsIndex[i] == index) { 142 return i; 143 } 144 } 145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 146 if(scriptsIndex[numScripts + i] == index) { 147 return UCOL_REORDER_CODE_FIRST + i; 148 } 149 } 150 return -1; 151 } 152 153 int32_t 154 CollationData::getScriptIndex(int32_t script) const { 155 if(script < 0) { 156 return 0; 157 } else if(script < numScripts) { 158 return scriptsIndex[script]; 159 } else if(script < UCOL_REORDER_CODE_FIRST) { 160 return 0; 161 } else { 162 script -= UCOL_REORDER_CODE_FIRST; 163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { 164 return scriptsIndex[numScripts + script]; 165 } else { 166 return 0; 167 } 168 } 169 } 170 171 int32_t 172 CollationData::getEquivalentScripts(int32_t script, 173 int32_t dest[], int32_t capacity, 174 UErrorCode &errorCode) const { 175 if(U_FAILURE(errorCode)) { return 0; } 176 int32_t index = getScriptIndex(script); 177 if(index == 0) { return 0; } 178 if(script >= UCOL_REORDER_CODE_FIRST) { 179 // Special groups have no aliases. 180 if(capacity > 0) { 181 dest[0] = script; 182 } else { 183 errorCode = U_BUFFER_OVERFLOW_ERROR; 184 } 185 return 1; 186 } 187 188 int32_t length = 0; 189 for(int32_t i = 0; i < numScripts; ++i) { 190 if(scriptsIndex[i] == index) { 191 if(length < capacity) { 192 dest[length] = i; 193 } 194 ++length; 195 } 196 } 197 if(length > capacity) { 198 errorCode = U_BUFFER_OVERFLOW_ERROR; 199 } 200 return length; 201 } 202 203 void 204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, 205 UVector32 &ranges, UErrorCode &errorCode) const { 206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode); 207 } 208 209 void 210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, 211 UBool latinMustMove, 212 UVector32 &ranges, UErrorCode &errorCode) const { 213 if(U_FAILURE(errorCode)) { return; } 214 ranges.removeAllElements(); 215 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) { 216 return; 217 } 218 219 // Maps each script-or-group range to a new lead byte. 220 uint8_t table[MAX_NUM_SCRIPT_RANGES]; 221 uprv_memset(table, 0, sizeof(table)); 222 223 { 224 // Set "don't care" values for reserved ranges. 225 int32_t index = scriptsIndex[ 226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST]; 227 if(index != 0) { 228 table[index] = 0xff; 229 } 230 index = scriptsIndex[ 231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST]; 232 if(index != 0) { 233 table[index] = 0xff; 234 } 235 } 236 237 // Never reorder special low and high primary lead bytes. 238 U_ASSERT(scriptStartsLength >= 2); 239 U_ASSERT(scriptStarts[0] == 0); 240 int32_t lowStart = scriptStarts[1]; 241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8)); 242 int32_t highLimit = scriptStarts[scriptStartsLength - 1]; 243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8)); 244 245 // Get the set of special reorder codes in the input list. 246 // This supports a fixed number of special reorder codes; 247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. 248 uint32_t specials = 0; 249 for(int32_t i = 0; i < length; ++i) { 250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; 251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { 252 specials |= (uint32_t)1 << reorderCode; 253 } 254 } 255 256 // Start the reordering with the special low reorder codes that do not occur in the input. 257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { 258 int32_t index = scriptsIndex[numScripts + i]; 259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) { 260 lowStart = addLowScriptRange(table, index, lowStart); 261 } 262 } 263 264 // Skip the reserved range before Latin if Latin is the first script, 265 // so that we do not move it unnecessarily. 266 int32_t skippedReserved = 0; 267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) { 268 int32_t index = scriptsIndex[USCRIPT_LATIN]; 269 U_ASSERT(index != 0); 270 int32_t start = scriptStarts[index]; 271 U_ASSERT(lowStart <= start); 272 skippedReserved = start - lowStart; 273 lowStart = start; 274 } 275 276 // Reorder according to the input scripts, continuing from the bottom of the primary range. 277 int32_t originalLength = length; // length will be decremented if "others" is in the list. 278 UBool hasReorderToEnd = FALSE; 279 for(int32_t i = 0; i < length;) { 280 int32_t script = reorder[i++]; 281 if(script == USCRIPT_UNKNOWN) { 282 // Put the remaining scripts at the top. 283 hasReorderToEnd = TRUE; 284 while(i < length) { 285 script = reorder[--length]; 286 if(script == USCRIPT_UNKNOWN || // Must occur at most once. 287 script == UCOL_REORDER_CODE_DEFAULT) { 288 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 289 return; 290 } 291 int32_t index = getScriptIndex(script); 292 if(index == 0) { continue; } 293 if(table[index] != 0) { // Duplicate or equivalent script. 294 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 295 return; 296 } 297 highLimit = addHighScriptRange(table, index, highLimit); 298 } 299 break; 300 } 301 if(script == UCOL_REORDER_CODE_DEFAULT) { 302 // The default code must be the only one in the list, and that is handled by the caller. 303 // Otherwise it must not be used. 304 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 305 return; 306 } 307 int32_t index = getScriptIndex(script); 308 if(index == 0) { continue; } 309 if(table[index] != 0) { // Duplicate or equivalent script. 310 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 311 return; 312 } 313 lowStart = addLowScriptRange(table, index, lowStart); 314 } 315 316 // Put all remaining scripts into the middle. 317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) { 318 int32_t leadByte = table[i]; 319 if(leadByte != 0) { continue; } 320 int32_t start = scriptStarts[i]; 321 if(!hasReorderToEnd && start > lowStart) { 322 // No need to move this script. 323 lowStart = start; 324 } 325 lowStart = addLowScriptRange(table, i, lowStart); 326 } 327 if(lowStart > highLimit) { 328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { 329 // Try not skipping the before-Latin reserved range. 330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode); 331 return; 332 } 333 // We need more primary lead bytes than available, despite the reserved ranges. 334 errorCode = U_BUFFER_OVERFLOW_ERROR; 335 return; 336 } 337 338 // Turn lead bytes into a list of (limit, offset) pairs. 339 // Encode each pair in one list element: 340 // Upper 16 bits = limit, lower 16 = signed lead byte offset. 341 int32_t offset = 0; 342 for(int32_t i = 1;; ++i) { 343 int32_t nextOffset = offset; 344 while(i < scriptStartsLength - 1) { 345 int32_t newLeadByte = table[i]; 346 if(newLeadByte == 0xff) { 347 // "Don't care" lead byte for reserved range, continue with current offset. 348 } else { 349 nextOffset = newLeadByte - (scriptStarts[i] >> 8); 350 if(nextOffset != offset) { break; } 351 } 352 ++i; 353 } 354 if(offset != 0 || i < scriptStartsLength - 1) { 355 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode); 356 } 357 if(i == scriptStartsLength - 1) { break; } 358 offset = nextOffset; 359 } 360 } 361 362 int32_t 363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const { 364 int32_t start = scriptStarts[index]; 365 if((start & 0xff) < (lowStart & 0xff)) { 366 lowStart += 0x100; 367 } 368 table[index] = (uint8_t)(lowStart >> 8); 369 int32_t limit = scriptStarts[index + 1]; 370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); 371 return lowStart; 372 } 373 374 int32_t 375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const { 376 int32_t limit = scriptStarts[index + 1]; 377 if((limit & 0xff) > (highLimit & 0xff)) { 378 highLimit -= 0x100; 379 } 380 int32_t start = scriptStarts[index]; 381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); 382 table[index] = (uint8_t)(highLimit >> 8); 383 return highLimit; 384 } 385 386 U_NAMESPACE_END 387 388 #endif // !UCONFIG_NO_COLLATION 389