1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 /* 11 * File coleitr.cpp 12 * 13 * Created by: Helena Shih 14 * 15 * Modification History: 16 * 17 * Date Name Description 18 * 19 * 6/23/97 helena Adding comments to make code more readable. 20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java 21 * 12/10/99 aliu Ported Thai collation support from Java. 22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) 23 * 02/19/01 swquek Removed CollationElementIterator() since it is 24 * private constructor and no calls are made to it 25 * 2012-2014 markus Rewritten in C++ again. 26 */ 27 28 #include "unicode/utypes.h" 29 30 #if !UCONFIG_NO_COLLATION 31 32 #include "unicode/coleitr.h" 33 #include "unicode/tblcoll.h" 34 #include "unicode/ustring.h" 35 #include "cmemory.h" 36 #include "collation.h" 37 #include "collationdata.h" 38 #include "collationiterator.h" 39 #include "collationsets.h" 40 #include "collationtailoring.h" 41 #include "uassert.h" 42 #include "uhash.h" 43 #include "utf16collationiterator.h" 44 #include "uvectr32.h" 45 46 /* Constants --------------------------------------------------------------- */ 47 48 U_NAMESPACE_BEGIN 49 50 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) 51 52 /* CollationElementIterator public constructor/destructor ------------------ */ 53 54 CollationElementIterator::CollationElementIterator( 55 const CollationElementIterator& other) 56 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { 57 *this = other; 58 } 59 60 CollationElementIterator::~CollationElementIterator() 61 { 62 delete iter_; 63 delete offsets_; 64 } 65 66 /* CollationElementIterator public methods --------------------------------- */ 67 68 namespace { 69 70 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { 71 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); 72 } 73 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { 74 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); 75 } 76 UBool ceNeedsTwoParts(int64_t ce) { 77 return (ce & INT64_C(0xffff00ff003f)) != 0; 78 } 79 80 } // namespace 81 82 int32_t CollationElementIterator::getOffset() const 83 { 84 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { 85 // CollationIterator::previousCE() decrements the CEs length 86 // while it pops CEs from its internal buffer. 87 int32_t i = iter_->getCEsLength(); 88 if (otherHalf_ != 0) { 89 // Return the trailing CE offset while we are in the middle of a 64-bit CE. 90 ++i; 91 } 92 U_ASSERT(i < offsets_->size()); 93 return offsets_->elementAti(i); 94 } 95 return iter_->getOffset(); 96 } 97 98 /** 99 * Get the ordering priority of the next character in the string. 100 * @return the next character's ordering. Returns NULLORDER if an error has 101 * occured or if the end of string has been reached 102 */ 103 int32_t CollationElementIterator::next(UErrorCode& status) 104 { 105 if (U_FAILURE(status)) { return NULLORDER; } 106 if (dir_ > 1) { 107 // Continue forward iteration. Test this first. 108 if (otherHalf_ != 0) { 109 uint32_t oh = otherHalf_; 110 otherHalf_ = 0; 111 return oh; 112 } 113 } else if (dir_ == 1) { 114 // next() after setOffset() 115 dir_ = 2; 116 } else if (dir_ == 0) { 117 // The iter_ is already reset to the start of the text. 118 dir_ = 2; 119 } else /* dir_ < 0 */ { 120 // illegal change of direction 121 status = U_INVALID_STATE_ERROR; 122 return NULLORDER; 123 } 124 // No need to keep all CEs in the buffer when we iterate. 125 iter_->clearCEsIfNoneRemaining(); 126 int64_t ce = iter_->nextCE(status); 127 if (ce == Collation::NO_CE) { return NULLORDER; } 128 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 129 uint32_t p = (uint32_t)(ce >> 32); 130 uint32_t lower32 = (uint32_t)ce; 131 uint32_t firstHalf = getFirstHalf(p, lower32); 132 uint32_t secondHalf = getSecondHalf(p, lower32); 133 if (secondHalf != 0) { 134 otherHalf_ = secondHalf | 0xc0; // continuation CE 135 } 136 return firstHalf; 137 } 138 139 UBool CollationElementIterator::operator!=( 140 const CollationElementIterator& other) const 141 { 142 return !(*this == other); 143 } 144 145 UBool CollationElementIterator::operator==( 146 const CollationElementIterator& that) const 147 { 148 if (this == &that) { 149 return TRUE; 150 } 151 152 return 153 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && 154 otherHalf_ == that.otherHalf_ && 155 normalizeDir() == that.normalizeDir() && 156 string_ == that.string_ && 157 *iter_ == *that.iter_; 158 } 159 160 /** 161 * Get the ordering priority of the previous collation element in the string. 162 * @param status the error code status. 163 * @return the previous element's ordering. Returns NULLORDER if an error has 164 * occured or if the start of string has been reached. 165 */ 166 int32_t CollationElementIterator::previous(UErrorCode& status) 167 { 168 if (U_FAILURE(status)) { return NULLORDER; } 169 if (dir_ < 0) { 170 // Continue backwards iteration. Test this first. 171 if (otherHalf_ != 0) { 172 uint32_t oh = otherHalf_; 173 otherHalf_ = 0; 174 return oh; 175 } 176 } else if (dir_ == 0) { 177 iter_->resetToOffset(string_.length()); 178 dir_ = -1; 179 } else if (dir_ == 1) { 180 // previous() after setOffset() 181 dir_ = -1; 182 } else /* dir_ > 1 */ { 183 // illegal change of direction 184 status = U_INVALID_STATE_ERROR; 185 return NULLORDER; 186 } 187 if (offsets_ == NULL) { 188 offsets_ = new UVector32(status); 189 if (offsets_ == NULL) { 190 status = U_MEMORY_ALLOCATION_ERROR; 191 return NULLORDER; 192 } 193 } 194 // If we already have expansion CEs, then we also have offsets. 195 // Otherwise remember the trailing offset in case we need to 196 // write offsets for an artificial expansion. 197 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; 198 int64_t ce = iter_->previousCE(*offsets_, status); 199 if (ce == Collation::NO_CE) { return NULLORDER; } 200 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 201 uint32_t p = (uint32_t)(ce >> 32); 202 uint32_t lower32 = (uint32_t)ce; 203 uint32_t firstHalf = getFirstHalf(p, lower32); 204 uint32_t secondHalf = getSecondHalf(p, lower32); 205 if (secondHalf != 0) { 206 if (offsets_->isEmpty()) { 207 // When we convert a single 64-bit CE into two 32-bit CEs, 208 // we need to make this artificial expansion behave like a normal expansion. 209 // See CollationIterator::previousCE(). 210 offsets_->addElement(iter_->getOffset(), status); 211 offsets_->addElement(limitOffset, status); 212 } 213 otherHalf_ = firstHalf; 214 return secondHalf | 0xc0; // continuation CE 215 } 216 return firstHalf; 217 } 218 219 /** 220 * Resets the cursor to the beginning of the string. 221 */ 222 void CollationElementIterator::reset() 223 { 224 iter_ ->resetToOffset(0); 225 otherHalf_ = 0; 226 dir_ = 0; 227 } 228 229 void CollationElementIterator::setOffset(int32_t newOffset, 230 UErrorCode& status) 231 { 232 if (U_FAILURE(status)) { return; } 233 if (0 < newOffset && newOffset < string_.length()) { 234 int32_t offset = newOffset; 235 do { 236 UChar c = string_.charAt(offset); 237 if (!rbc_->isUnsafe(c) || 238 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { 239 break; 240 } 241 // Back up to before this unsafe character. 242 --offset; 243 } while (offset > 0); 244 if (offset < newOffset) { 245 // We might have backed up more than necessary. 246 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, 247 // but for text "chu" setOffset(2) should remain at 2 248 // although we initially back up to offset 0. 249 // Find the last safe offset no greater than newOffset by iterating forward. 250 int32_t lastSafeOffset = offset; 251 do { 252 iter_->resetToOffset(lastSafeOffset); 253 do { 254 iter_->nextCE(status); 255 if (U_FAILURE(status)) { return; } 256 } while ((offset = iter_->getOffset()) == lastSafeOffset); 257 if (offset <= newOffset) { 258 lastSafeOffset = offset; 259 } 260 } while (offset < newOffset); 261 newOffset = lastSafeOffset; 262 } 263 } 264 iter_->resetToOffset(newOffset); 265 otherHalf_ = 0; 266 dir_ = 1; 267 } 268 269 /** 270 * Sets the source to the new source string. 271 */ 272 void CollationElementIterator::setText(const UnicodeString& source, 273 UErrorCode& status) 274 { 275 if (U_FAILURE(status)) { 276 return; 277 } 278 279 string_ = source; 280 const UChar *s = string_.getBuffer(); 281 CollationIterator *newIter; 282 UBool numeric = rbc_->settings->isNumeric(); 283 if (rbc_->settings->dontCheckFCD()) { 284 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 285 } else { 286 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 287 } 288 if (newIter == NULL) { 289 status = U_MEMORY_ALLOCATION_ERROR; 290 return; 291 } 292 delete iter_; 293 iter_ = newIter; 294 otherHalf_ = 0; 295 dir_ = 0; 296 } 297 298 // Sets the source to the new character iterator. 299 void CollationElementIterator::setText(CharacterIterator& source, 300 UErrorCode& status) 301 { 302 if (U_FAILURE(status)) 303 return; 304 305 source.getText(string_); 306 setText(string_, status); 307 } 308 309 int32_t CollationElementIterator::strengthOrder(int32_t order) const 310 { 311 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); 312 // Mask off the unwanted differences. 313 if (s == UCOL_PRIMARY) { 314 order &= 0xffff0000; 315 } 316 else if (s == UCOL_SECONDARY) { 317 order &= 0xffffff00; 318 } 319 320 return order; 321 } 322 323 /* CollationElementIterator private constructors/destructors --------------- */ 324 325 /** 326 * This is the "real" constructor for this class; it constructs an iterator 327 * over the source text using the specified collator 328 */ 329 CollationElementIterator::CollationElementIterator( 330 const UnicodeString &source, 331 const RuleBasedCollator *coll, 332 UErrorCode &status) 333 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { 334 setText(source, status); 335 } 336 337 /** 338 * This is the "real" constructor for this class; it constructs an iterator over 339 * the source text using the specified collator 340 */ 341 CollationElementIterator::CollationElementIterator( 342 const CharacterIterator &source, 343 const RuleBasedCollator *coll, 344 UErrorCode &status) 345 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { 346 // We only call source.getText() which should be const anyway. 347 setText(const_cast<CharacterIterator &>(source), status); 348 } 349 350 /* CollationElementIterator private methods -------------------------------- */ 351 352 const CollationElementIterator& CollationElementIterator::operator=( 353 const CollationElementIterator& other) 354 { 355 if (this == &other) { 356 return *this; 357 } 358 359 CollationIterator *newIter; 360 const FCDUTF16CollationIterator *otherFCDIter = 361 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); 362 if(otherFCDIter != NULL) { 363 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); 364 } else { 365 const UTF16CollationIterator *otherIter = 366 dynamic_cast<const UTF16CollationIterator *>(other.iter_); 367 if(otherIter != NULL) { 368 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); 369 } else { 370 newIter = NULL; 371 } 372 } 373 if(newIter != NULL) { 374 delete iter_; 375 iter_ = newIter; 376 rbc_ = other.rbc_; 377 otherHalf_ = other.otherHalf_; 378 dir_ = other.dir_; 379 380 string_ = other.string_; 381 } 382 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { 383 UErrorCode errorCode = U_ZERO_ERROR; 384 if(offsets_ == NULL) { 385 offsets_ = new UVector32(other.offsets_->size(), errorCode); 386 } 387 if(offsets_ != NULL) { 388 offsets_->assign(*other.offsets_, errorCode); 389 } 390 } 391 return *this; 392 } 393 394 namespace { 395 396 class MaxExpSink : public ContractionsAndExpansions::CESink { 397 public: 398 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} 399 virtual ~MaxExpSink(); 400 virtual void handleCE(int64_t /*ce*/) {} 401 virtual void handleExpansion(const int64_t ces[], int32_t length) { 402 if (length <= 1) { 403 // We do not need to add single CEs into the map. 404 return; 405 } 406 int32_t count = 0; // number of CE "halves" 407 for (int32_t i = 0; i < length; ++i) { 408 count += ceNeedsTwoParts(ces[i]) ? 2 : 1; 409 } 410 // last "half" of the last CE 411 int64_t ce = ces[length - 1]; 412 uint32_t p = (uint32_t)(ce >> 32); 413 uint32_t lower32 = (uint32_t)ce; 414 uint32_t lastHalf = getSecondHalf(p, lower32); 415 if (lastHalf == 0) { 416 lastHalf = getFirstHalf(p, lower32); 417 U_ASSERT(lastHalf != 0); 418 } else { 419 lastHalf |= 0xc0; // old-style continuation CE 420 } 421 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { 422 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); 423 } 424 } 425 426 private: 427 UHashtable *maxExpansions; 428 UErrorCode &errorCode; 429 }; 430 431 MaxExpSink::~MaxExpSink() {} 432 433 } // namespace 434 435 UHashtable * 436 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { 437 if (U_FAILURE(errorCode)) { return NULL; } 438 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, 439 uhash_compareLong, &errorCode); 440 if (U_FAILURE(errorCode)) { return NULL; } 441 MaxExpSink sink(maxExpansions, errorCode); 442 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); 443 if (U_FAILURE(errorCode)) { 444 uhash_close(maxExpansions); 445 return NULL; 446 } 447 return maxExpansions; 448 } 449 450 int32_t 451 CollationElementIterator::getMaxExpansion(int32_t order) const { 452 return getMaxExpansion(rbc_->tailoring->maxExpansions, order); 453 } 454 455 int32_t 456 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { 457 if (order == 0) { return 1; } 458 int32_t max; 459 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { 460 return max; 461 } 462 if ((order & 0xc0) == 0xc0) { 463 // old-style continuation CE 464 return 2; 465 } else { 466 return 1; 467 } 468 } 469 470 U_NAMESPACE_END 471 472 #endif /* #if !UCONFIG_NO_COLLATION */ 473