1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationsets.cpp 7 * 8 * created on: 2013feb09 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucharstrie.h" 17 #include "unicode/uniset.h" 18 #include "unicode/unistr.h" 19 #include "unicode/ustringtrie.h" 20 #include "collation.h" 21 #include "collationdata.h" 22 #include "collationsets.h" 23 #include "normalizer2impl.h" 24 #include "uassert.h" 25 #include "utf16collationiterator.h" 26 #include "utrie2.h" 27 28 U_NAMESPACE_BEGIN 29 30 U_CDECL_BEGIN 31 32 static UBool U_CALLCONV 33 enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { 34 if(ce32 == Collation::FALLBACK_CE32) { 35 return TRUE; // fallback to base, not tailored 36 } 37 TailoredSet *ts = (TailoredSet *)context; 38 return ts->handleCE32(start, end, ce32); 39 } 40 41 U_CDECL_END 42 43 void 44 TailoredSet::forData(const CollationData *d, UErrorCode &ec) { 45 if(U_FAILURE(ec)) { return; } 46 errorCode = ec; // Preserve info & warning codes. 47 data = d; 48 baseData = d->base; 49 U_ASSERT(baseData != NULL); 50 utrie2_enum(data->trie, NULL, enumTailoredRange, this); 51 ec = errorCode; 52 } 53 54 UBool 55 TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { 56 U_ASSERT(ce32 != Collation::FALLBACK_CE32); 57 if(Collation::isSpecialCE32(ce32)) { 58 ce32 = data->getIndirectCE32(ce32); 59 if(ce32 == Collation::FALLBACK_CE32) { 60 return U_SUCCESS(errorCode); 61 } 62 } 63 do { 64 uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start)); 65 // Do not just continue if ce32 == baseCE32 because 66 // contractions and expansions in different data objects 67 // normally differ even if they have the same data offsets. 68 if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) { 69 // fastpath 70 if(ce32 != baseCE32) { 71 tailored->add(start); 72 } 73 } else { 74 compare(start, ce32, baseCE32); 75 } 76 } while(++start <= end); 77 return U_SUCCESS(errorCode); 78 } 79 80 void 81 TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) { 82 if(Collation::isPrefixCE32(ce32)) { 83 const UChar *p = data->contexts + Collation::indexFromCE32(ce32); 84 ce32 = data->getFinalCE32(CollationData::readCE32(p)); 85 if(Collation::isPrefixCE32(baseCE32)) { 86 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); 87 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); 88 comparePrefixes(c, p + 2, q + 2); 89 } else { 90 addPrefixes(data, c, p + 2); 91 } 92 } else if(Collation::isPrefixCE32(baseCE32)) { 93 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); 94 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); 95 addPrefixes(baseData, c, q + 2); 96 } 97 98 if(Collation::isContractionCE32(ce32)) { 99 const UChar *p = data->contexts + Collation::indexFromCE32(ce32); 100 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 101 ce32 = Collation::NO_CE32; 102 } else { 103 ce32 = data->getFinalCE32(CollationData::readCE32(p)); 104 } 105 if(Collation::isContractionCE32(baseCE32)) { 106 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); 107 if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 108 baseCE32 = Collation::NO_CE32; 109 } else { 110 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); 111 } 112 compareContractions(c, p + 2, q + 2); 113 } else { 114 addContractions(c, p + 2); 115 } 116 } else if(Collation::isContractionCE32(baseCE32)) { 117 const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); 118 baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); 119 addContractions(c, q + 2); 120 } 121 122 int32_t tag; 123 if(Collation::isSpecialCE32(ce32)) { 124 tag = Collation::tagFromCE32(ce32); 125 U_ASSERT(tag != Collation::PREFIX_TAG); 126 U_ASSERT(tag != Collation::CONTRACTION_TAG); 127 // Currently, the tailoring data builder does not write offset tags. 128 // They might be useful for saving space, 129 // but they would complicate the builder, 130 // and in tailorings we assume that performance of tailored characters is more important. 131 U_ASSERT(tag != Collation::OFFSET_TAG); 132 } else { 133 tag = -1; 134 } 135 int32_t baseTag; 136 if(Collation::isSpecialCE32(baseCE32)) { 137 baseTag = Collation::tagFromCE32(baseCE32); 138 U_ASSERT(baseTag != Collation::PREFIX_TAG); 139 U_ASSERT(baseTag != Collation::CONTRACTION_TAG); 140 } else { 141 baseTag = -1; 142 } 143 144 // Non-contextual mappings, expansions, etc. 145 if(baseTag == Collation::OFFSET_TAG) { 146 // We might be comparing a tailoring CE which is a copy of 147 // a base offset-tag CE, via the [optimize [set]] syntax 148 // or when a single-character mapping was copied for tailored contractions. 149 // Offset tags always result in long-primary CEs, 150 // with common secondary/tertiary weights. 151 if(!Collation::isLongPrimaryCE32(ce32)) { 152 add(c); 153 return; 154 } 155 int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)]; 156 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); 157 if(Collation::primaryFromLongPrimaryCE32(ce32) != p) { 158 add(c); 159 return; 160 } 161 } 162 163 if(tag != baseTag) { 164 add(c); 165 return; 166 } 167 168 if(tag == Collation::EXPANSION32_TAG) { 169 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); 170 int32_t length = Collation::lengthFromCE32(ce32); 171 172 const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32); 173 int32_t baseLength = Collation::lengthFromCE32(baseCE32); 174 175 if(length != baseLength) { 176 add(c); 177 return; 178 } 179 for(int32_t i = 0; i < length; ++i) { 180 if(ce32s[i] != baseCE32s[i]) { 181 add(c); 182 break; 183 } 184 } 185 } else if(tag == Collation::EXPANSION_TAG) { 186 const int64_t *ces = data->ces + Collation::indexFromCE32(ce32); 187 int32_t length = Collation::lengthFromCE32(ce32); 188 189 const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32); 190 int32_t baseLength = Collation::lengthFromCE32(baseCE32); 191 192 if(length != baseLength) { 193 add(c); 194 return; 195 } 196 for(int32_t i = 0; i < length; ++i) { 197 if(ces[i] != baseCEs[i]) { 198 add(c); 199 break; 200 } 201 } 202 } else if(tag == Collation::HANGUL_TAG) { 203 UChar jamos[3]; 204 int32_t length = Hangul::decompose(c, jamos); 205 if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) || 206 (length == 3 && tailored->contains(jamos[2]))) { 207 add(c); 208 } 209 } else if(ce32 != baseCE32) { 210 add(c); 211 } 212 } 213 214 void 215 TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) { 216 // Parallel iteration over prefixes of both tables. 217 UCharsTrie::Iterator prefixes(p, 0, errorCode); 218 UCharsTrie::Iterator basePrefixes(q, 0, errorCode); 219 const UnicodeString *tp = NULL; // Tailoring prefix. 220 const UnicodeString *bp = NULL; // Base prefix. 221 // Use a string with a U+FFFF as the limit sentinel. 222 // U+FFFF is untailorable and will not occur in prefixes. 223 UnicodeString none((UChar)0xffff); 224 for(;;) { 225 if(tp == NULL) { 226 if(prefixes.next(errorCode)) { 227 tp = &prefixes.getString(); 228 } else { 229 tp = &none; 230 } 231 } 232 if(bp == NULL) { 233 if(basePrefixes.next(errorCode)) { 234 bp = &basePrefixes.getString(); 235 } else { 236 bp = &none; 237 } 238 } 239 if(tp == &none && bp == &none) { break; } 240 int32_t cmp = tp->compare(*bp); 241 if(cmp < 0) { 242 // tp occurs in the tailoring but not in the base. 243 addPrefix(data, *tp, c, (uint32_t)prefixes.getValue()); 244 tp = NULL; 245 } else if(cmp > 0) { 246 // bp occurs in the base but not in the tailoring. 247 addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue()); 248 bp = NULL; 249 } else { 250 setPrefix(*tp); 251 compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue()); 252 resetPrefix(); 253 tp = NULL; 254 bp = NULL; 255 } 256 } 257 } 258 259 void 260 TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) { 261 // Parallel iteration over suffixes of both tables. 262 UCharsTrie::Iterator suffixes(p, 0, errorCode); 263 UCharsTrie::Iterator baseSuffixes(q, 0, errorCode); 264 const UnicodeString *ts = NULL; // Tailoring suffix. 265 const UnicodeString *bs = NULL; // Base suffix. 266 // Use a string with two U+FFFF as the limit sentinel. 267 // U+FFFF is untailorable and will not occur in contractions except maybe 268 // as a single suffix character for a root-collator boundary contraction. 269 UnicodeString none((UChar)0xffff); 270 none.append((UChar)0xffff); 271 for(;;) { 272 if(ts == NULL) { 273 if(suffixes.next(errorCode)) { 274 ts = &suffixes.getString(); 275 } else { 276 ts = &none; 277 } 278 } 279 if(bs == NULL) { 280 if(baseSuffixes.next(errorCode)) { 281 bs = &baseSuffixes.getString(); 282 } else { 283 bs = &none; 284 } 285 } 286 if(ts == &none && bs == &none) { break; } 287 int32_t cmp = ts->compare(*bs); 288 if(cmp < 0) { 289 // ts occurs in the tailoring but not in the base. 290 addSuffix(c, *ts); 291 ts = NULL; 292 } else if(cmp > 0) { 293 // bs occurs in the base but not in the tailoring. 294 addSuffix(c, *bs); 295 bs = NULL; 296 } else { 297 suffix = ts; 298 compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue()); 299 suffix = NULL; 300 ts = NULL; 301 bs = NULL; 302 } 303 } 304 } 305 306 void 307 TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) { 308 UCharsTrie::Iterator prefixes(p, 0, errorCode); 309 while(prefixes.next(errorCode)) { 310 addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue()); 311 } 312 } 313 314 void 315 TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) { 316 setPrefix(pfx); 317 ce32 = d->getFinalCE32(ce32); 318 if(Collation::isContractionCE32(ce32)) { 319 const UChar *p = d->contexts + Collation::indexFromCE32(ce32); 320 addContractions(c, p + 2); 321 } 322 tailored->add(UnicodeString(unreversedPrefix).append(c)); 323 resetPrefix(); 324 } 325 326 void 327 TailoredSet::addContractions(UChar32 c, const UChar *p) { 328 UCharsTrie::Iterator suffixes(p, 0, errorCode); 329 while(suffixes.next(errorCode)) { 330 addSuffix(c, suffixes.getString()); 331 } 332 } 333 334 void 335 TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) { 336 tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx)); 337 } 338 339 void 340 TailoredSet::add(UChar32 c) { 341 if(unreversedPrefix.isEmpty() && suffix == NULL) { 342 tailored->add(c); 343 } else { 344 UnicodeString s(unreversedPrefix); 345 s.append(c); 346 if(suffix != NULL) { 347 s.append(*suffix); 348 } 349 tailored->add(s); 350 } 351 } 352 353 ContractionsAndExpansions::CESink::~CESink() {} 354 355 U_CDECL_BEGIN 356 357 static UBool U_CALLCONV 358 enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { 359 ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context; 360 if(cne->checkTailored == 0) { 361 // There is no tailoring. 362 // No need to collect nor check the tailored set. 363 } else if(cne->checkTailored < 0) { 364 // Collect the set of code points with mappings in the tailoring data. 365 if(ce32 == Collation::FALLBACK_CE32) { 366 return TRUE; // fallback to base, not tailored 367 } else { 368 cne->tailored.add(start, end); 369 } 370 // checkTailored > 0: Exclude tailored ranges from the base data enumeration. 371 } else if(start == end) { 372 if(cne->tailored.contains(start)) { 373 return TRUE; 374 } 375 } else if(cne->tailored.containsSome(start, end)) { 376 cne->ranges.set(start, end).removeAll(cne->tailored); 377 int32_t count = cne->ranges.getRangeCount(); 378 for(int32_t i = 0; i < count; ++i) { 379 cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32); 380 } 381 return U_SUCCESS(cne->errorCode); 382 } 383 cne->handleCE32(start, end, ce32); 384 return U_SUCCESS(cne->errorCode); 385 } 386 387 U_CDECL_END 388 389 void 390 ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) { 391 if(U_FAILURE(ec)) { return; } 392 errorCode = ec; // Preserve info & warning codes. 393 // Add all from the data, can be tailoring or base. 394 if(d->base != NULL) { 395 checkTailored = -1; 396 } 397 data = d; 398 utrie2_enum(data->trie, NULL, enumCnERange, this); 399 if(d->base == NULL || U_FAILURE(errorCode)) { 400 ec = errorCode; 401 return; 402 } 403 // Add all from the base data but only for un-tailored code points. 404 tailored.freeze(); 405 checkTailored = 1; 406 data = d->base; 407 utrie2_enum(data->trie, NULL, enumCnERange, this); 408 ec = errorCode; 409 } 410 411 void 412 ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) { 413 if(U_FAILURE(ec)) { return; } 414 errorCode = ec; // Preserve info & warning codes. 415 uint32_t ce32 = d->getCE32(c); 416 if(ce32 == Collation::FALLBACK_CE32) { 417 d = d->base; 418 ce32 = d->getCE32(c); 419 } 420 data = d; 421 handleCE32(c, c, ce32); 422 ec = errorCode; 423 } 424 425 void 426 ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { 427 for(;;) { 428 if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) { 429 // !isSpecialCE32() 430 if(sink != NULL) { 431 sink->handleCE(Collation::ceFromSimpleCE32(ce32)); 432 } 433 return; 434 } 435 switch(Collation::tagFromCE32(ce32)) { 436 case Collation::FALLBACK_TAG: 437 return; 438 case Collation::RESERVED_TAG_3: 439 case Collation::BUILDER_DATA_TAG: 440 case Collation::LEAD_SURROGATE_TAG: 441 if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } 442 return; 443 case Collation::LONG_PRIMARY_TAG: 444 if(sink != NULL) { 445 sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32)); 446 } 447 return; 448 case Collation::LONG_SECONDARY_TAG: 449 if(sink != NULL) { 450 sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32)); 451 } 452 return; 453 case Collation::LATIN_EXPANSION_TAG: 454 if(sink != NULL) { 455 ces[0] = Collation::latinCE0FromCE32(ce32); 456 ces[1] = Collation::latinCE1FromCE32(ce32); 457 sink->handleExpansion(ces, 2); 458 } 459 // Optimization: If we have a prefix, 460 // then the relevant strings have been added already. 461 if(unreversedPrefix.isEmpty()) { 462 addExpansions(start, end); 463 } 464 return; 465 case Collation::EXPANSION32_TAG: 466 if(sink != NULL) { 467 const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); 468 int32_t length = Collation::lengthFromCE32(ce32); 469 for(int32_t i = 0; i < length; ++i) { 470 ces[i] = Collation::ceFromCE32(*ce32s++); 471 } 472 sink->handleExpansion(ces, length); 473 } 474 // Optimization: If we have a prefix, 475 // then the relevant strings have been added already. 476 if(unreversedPrefix.isEmpty()) { 477 addExpansions(start, end); 478 } 479 return; 480 case Collation::EXPANSION_TAG: 481 if(sink != NULL) { 482 int32_t length = Collation::lengthFromCE32(ce32); 483 sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length); 484 } 485 // Optimization: If we have a prefix, 486 // then the relevant strings have been added already. 487 if(unreversedPrefix.isEmpty()) { 488 addExpansions(start, end); 489 } 490 return; 491 case Collation::PREFIX_TAG: 492 handlePrefixes(start, end, ce32); 493 return; 494 case Collation::CONTRACTION_TAG: 495 handleContractions(start, end, ce32); 496 return; 497 case Collation::DIGIT_TAG: 498 // Fetch the non-numeric-collation CE32 and continue. 499 ce32 = data->ce32s[Collation::indexFromCE32(ce32)]; 500 break; 501 case Collation::U0000_TAG: 502 U_ASSERT(start == 0 && end == 0); 503 // Fetch the normal ce32 for U+0000 and continue. 504 ce32 = data->ce32s[0]; 505 break; 506 case Collation::HANGUL_TAG: 507 if(sink != NULL) { 508 // TODO: This should be optimized, 509 // especially if [start..end] is the complete Hangul range. (assert that) 510 UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL); 511 UChar hangul[1] = { 0 }; 512 for(UChar32 c = start; c <= end; ++c) { 513 hangul[0] = (UChar)c; 514 iter.setText(hangul, hangul + 1); 515 int32_t length = iter.fetchCEs(errorCode); 516 if(U_FAILURE(errorCode)) { return; } 517 // Ignore the terminating non-CE. 518 U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE); 519 sink->handleExpansion(iter.getCEs(), length - 1); 520 } 521 } 522 // Optimization: If we have a prefix, 523 // then the relevant strings have been added already. 524 if(unreversedPrefix.isEmpty()) { 525 addExpansions(start, end); 526 } 527 return; 528 case Collation::OFFSET_TAG: 529 // Currently no need to send offset CEs to the sink. 530 return; 531 case Collation::IMPLICIT_TAG: 532 // Currently no need to send implicit CEs to the sink. 533 return; 534 } 535 } 536 } 537 538 void 539 ContractionsAndExpansions::handlePrefixes( 540 UChar32 start, UChar32 end, uint32_t ce32) { 541 const UChar *p = data->contexts + Collation::indexFromCE32(ce32); 542 ce32 = CollationData::readCE32(p); // Default if no prefix match. 543 handleCE32(start, end, ce32); 544 if(!addPrefixes) { return; } 545 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); 546 while(prefixes.next(errorCode)) { 547 setPrefix(prefixes.getString()); 548 // Prefix/pre-context mappings are special kinds of contractions 549 // that always yield expansions. 550 addStrings(start, end, contractions); 551 addStrings(start, end, expansions); 552 handleCE32(start, end, (uint32_t)prefixes.getValue()); 553 } 554 resetPrefix(); 555 } 556 557 void 558 ContractionsAndExpansions::handleContractions( 559 UChar32 start, UChar32 end, uint32_t ce32) { 560 const UChar *p = data->contexts + Collation::indexFromCE32(ce32); 561 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 562 // No match on the single code point. 563 // We are underneath a prefix, and the default mapping is just 564 // a fallback to the mappings for a shorter prefix. 565 U_ASSERT(!unreversedPrefix.isEmpty()); 566 } else { 567 ce32 = CollationData::readCE32(p); // Default if no suffix match. 568 U_ASSERT(!Collation::isContractionCE32(ce32)); 569 handleCE32(start, end, ce32); 570 } 571 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); 572 while(suffixes.next(errorCode)) { 573 suffix = &suffixes.getString(); 574 addStrings(start, end, contractions); 575 if(!unreversedPrefix.isEmpty()) { 576 addStrings(start, end, expansions); 577 } 578 handleCE32(start, end, (uint32_t)suffixes.getValue()); 579 } 580 suffix = NULL; 581 } 582 583 void 584 ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) { 585 if(unreversedPrefix.isEmpty() && suffix == NULL) { 586 if(expansions != NULL) { 587 expansions->add(start, end); 588 } 589 } else { 590 addStrings(start, end, expansions); 591 } 592 } 593 594 void 595 ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) { 596 if(set == NULL) { return; } 597 UnicodeString s(unreversedPrefix); 598 do { 599 s.append(start); 600 if(suffix != NULL) { 601 s.append(*suffix); 602 } 603 set->add(s); 604 s.truncate(unreversedPrefix.length()); 605 } while(++start <= end); 606 } 607 608 U_NAMESPACE_END 609 610 #endif // !UCONFIG_NO_COLLATION 611