1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 /* 9 * File coleitr.cpp 10 * 11 * Created by: Helena Shih 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 17 * 6/23/97 helena Adding comments to make code more readable. 18 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java 19 * 12/10/99 aliu Ported Thai collation support from Java. 20 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) 21 * 02/19/01 swquek Removed CollationElementIterator() since it is 22 * private constructor and no calls are made to it 23 * 2012-2014 markus Rewritten in C++ again. 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_COLLATION 29 30 #include "unicode/coleitr.h" 31 #include "unicode/tblcoll.h" 32 #include "unicode/ustring.h" 33 #include "cmemory.h" 34 #include "collation.h" 35 #include "collationdata.h" 36 #include "collationiterator.h" 37 #include "collationsets.h" 38 #include "collationtailoring.h" 39 #include "uassert.h" 40 #include "uhash.h" 41 #include "utf16collationiterator.h" 42 #include "uvectr32.h" 43 44 /* Constants --------------------------------------------------------------- */ 45 46 U_NAMESPACE_BEGIN 47 48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) 49 50 /* CollationElementIterator public constructor/destructor ------------------ */ 51 52 CollationElementIterator::CollationElementIterator( 53 const CollationElementIterator& other) 54 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { 55 *this = other; 56 } 57 58 CollationElementIterator::~CollationElementIterator() 59 { 60 delete iter_; 61 delete offsets_; 62 } 63 64 /* CollationElementIterator public methods --------------------------------- */ 65 66 namespace { 67 68 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { 69 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); 70 } 71 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { 72 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); 73 } 74 UBool ceNeedsTwoParts(int64_t ce) { 75 return (ce & INT64_C(0xffff00ff003f)) != 0; 76 } 77 78 } // namespace 79 80 int32_t CollationElementIterator::getOffset() const 81 { 82 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { 83 // CollationIterator::previousCE() decrements the CEs length 84 // while it pops CEs from its internal buffer. 85 int32_t i = iter_->getCEsLength(); 86 if (otherHalf_ != 0) { 87 // Return the trailing CE offset while we are in the middle of a 64-bit CE. 88 ++i; 89 } 90 U_ASSERT(i < offsets_->size()); 91 return offsets_->elementAti(i); 92 } 93 return iter_->getOffset(); 94 } 95 96 /** 97 * Get the ordering priority of the next character in the string. 98 * @return the next character's ordering. Returns NULLORDER if an error has 99 * occured or if the end of string has been reached 100 */ 101 int32_t CollationElementIterator::next(UErrorCode& status) 102 { 103 if (U_FAILURE(status)) { return NULLORDER; } 104 if (dir_ > 1) { 105 // Continue forward iteration. Test this first. 106 if (otherHalf_ != 0) { 107 uint32_t oh = otherHalf_; 108 otherHalf_ = 0; 109 return oh; 110 } 111 } else if (dir_ == 1) { 112 // next() after setOffset() 113 dir_ = 2; 114 } else if (dir_ == 0) { 115 // The iter_ is already reset to the start of the text. 116 dir_ = 2; 117 } else /* dir_ < 0 */ { 118 // illegal change of direction 119 status = U_INVALID_STATE_ERROR; 120 return NULLORDER; 121 } 122 // No need to keep all CEs in the buffer when we iterate. 123 iter_->clearCEsIfNoneRemaining(); 124 int64_t ce = iter_->nextCE(status); 125 if (ce == Collation::NO_CE) { return NULLORDER; } 126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 127 uint32_t p = (uint32_t)(ce >> 32); 128 uint32_t lower32 = (uint32_t)ce; 129 uint32_t firstHalf = getFirstHalf(p, lower32); 130 uint32_t secondHalf = getSecondHalf(p, lower32); 131 if (secondHalf != 0) { 132 otherHalf_ = secondHalf | 0xc0; // continuation CE 133 } 134 return firstHalf; 135 } 136 137 UBool CollationElementIterator::operator!=( 138 const CollationElementIterator& other) const 139 { 140 return !(*this == other); 141 } 142 143 UBool CollationElementIterator::operator==( 144 const CollationElementIterator& that) const 145 { 146 if (this == &that) { 147 return TRUE; 148 } 149 150 return 151 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && 152 otherHalf_ == that.otherHalf_ && 153 normalizeDir() == that.normalizeDir() && 154 string_ == that.string_ && 155 *iter_ == *that.iter_; 156 } 157 158 /** 159 * Get the ordering priority of the previous collation element in the string. 160 * @param status the error code status. 161 * @return the previous element's ordering. Returns NULLORDER if an error has 162 * occured or if the start of string has been reached. 163 */ 164 int32_t CollationElementIterator::previous(UErrorCode& status) 165 { 166 if (U_FAILURE(status)) { return NULLORDER; } 167 if (dir_ < 0) { 168 // Continue backwards iteration. Test this first. 169 if (otherHalf_ != 0) { 170 uint32_t oh = otherHalf_; 171 otherHalf_ = 0; 172 return oh; 173 } 174 } else if (dir_ == 0) { 175 iter_->resetToOffset(string_.length()); 176 dir_ = -1; 177 } else if (dir_ == 1) { 178 // previous() after setOffset() 179 dir_ = -1; 180 } else /* dir_ > 1 */ { 181 // illegal change of direction 182 status = U_INVALID_STATE_ERROR; 183 return NULLORDER; 184 } 185 if (offsets_ == NULL) { 186 offsets_ = new UVector32(status); 187 if (offsets_ == NULL) { 188 status = U_MEMORY_ALLOCATION_ERROR; 189 return NULLORDER; 190 } 191 } 192 // If we already have expansion CEs, then we also have offsets. 193 // Otherwise remember the trailing offset in case we need to 194 // write offsets for an artificial expansion. 195 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; 196 int64_t ce = iter_->previousCE(*offsets_, status); 197 if (ce == Collation::NO_CE) { return NULLORDER; } 198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. 199 uint32_t p = (uint32_t)(ce >> 32); 200 uint32_t lower32 = (uint32_t)ce; 201 uint32_t firstHalf = getFirstHalf(p, lower32); 202 uint32_t secondHalf = getSecondHalf(p, lower32); 203 if (secondHalf != 0) { 204 if (offsets_->isEmpty()) { 205 // When we convert a single 64-bit CE into two 32-bit CEs, 206 // we need to make this artificial expansion behave like a normal expansion. 207 // See CollationIterator::previousCE(). 208 offsets_->addElement(iter_->getOffset(), status); 209 offsets_->addElement(limitOffset, status); 210 } 211 otherHalf_ = firstHalf; 212 return secondHalf | 0xc0; // continuation CE 213 } 214 return firstHalf; 215 } 216 217 /** 218 * Resets the cursor to the beginning of the string. 219 */ 220 void CollationElementIterator::reset() 221 { 222 iter_ ->resetToOffset(0); 223 otherHalf_ = 0; 224 dir_ = 0; 225 } 226 227 void CollationElementIterator::setOffset(int32_t newOffset, 228 UErrorCode& status) 229 { 230 if (U_FAILURE(status)) { return; } 231 if (0 < newOffset && newOffset < string_.length()) { 232 int32_t offset = newOffset; 233 do { 234 UChar c = string_.charAt(offset); 235 if (!rbc_->isUnsafe(c) || 236 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { 237 break; 238 } 239 // Back up to before this unsafe character. 240 --offset; 241 } while (offset > 0); 242 if (offset < newOffset) { 243 // We might have backed up more than necessary. 244 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, 245 // but for text "chu" setOffset(2) should remain at 2 246 // although we initially back up to offset 0. 247 // Find the last safe offset no greater than newOffset by iterating forward. 248 int32_t lastSafeOffset = offset; 249 do { 250 iter_->resetToOffset(lastSafeOffset); 251 do { 252 iter_->nextCE(status); 253 if (U_FAILURE(status)) { return; } 254 } while ((offset = iter_->getOffset()) == lastSafeOffset); 255 if (offset <= newOffset) { 256 lastSafeOffset = offset; 257 } 258 } while (offset < newOffset); 259 newOffset = lastSafeOffset; 260 } 261 } 262 iter_->resetToOffset(newOffset); 263 otherHalf_ = 0; 264 dir_ = 1; 265 } 266 267 /** 268 * Sets the source to the new source string. 269 */ 270 void CollationElementIterator::setText(const UnicodeString& source, 271 UErrorCode& status) 272 { 273 if (U_FAILURE(status)) { 274 return; 275 } 276 277 string_ = source; 278 const UChar *s = string_.getBuffer(); 279 CollationIterator *newIter; 280 UBool numeric = rbc_->settings->isNumeric(); 281 if (rbc_->settings->dontCheckFCD()) { 282 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 283 } else { 284 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); 285 } 286 if (newIter == NULL) { 287 status = U_MEMORY_ALLOCATION_ERROR; 288 return; 289 } 290 delete iter_; 291 iter_ = newIter; 292 otherHalf_ = 0; 293 dir_ = 0; 294 } 295 296 // Sets the source to the new character iterator. 297 void CollationElementIterator::setText(CharacterIterator& source, 298 UErrorCode& status) 299 { 300 if (U_FAILURE(status)) 301 return; 302 303 source.getText(string_); 304 setText(string_, status); 305 } 306 307 int32_t CollationElementIterator::strengthOrder(int32_t order) const 308 { 309 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); 310 // Mask off the unwanted differences. 311 if (s == UCOL_PRIMARY) { 312 order &= 0xffff0000; 313 } 314 else if (s == UCOL_SECONDARY) { 315 order &= 0xffffff00; 316 } 317 318 return order; 319 } 320 321 /* CollationElementIterator private constructors/destructors --------------- */ 322 323 /** 324 * This is the "real" constructor for this class; it constructs an iterator 325 * over the source text using the specified collator 326 */ 327 CollationElementIterator::CollationElementIterator( 328 const UnicodeString &source, 329 const RuleBasedCollator *coll, 330 UErrorCode &status) 331 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { 332 setText(source, status); 333 } 334 335 /** 336 * This is the "real" constructor for this class; it constructs an iterator over 337 * the source text using the specified collator 338 */ 339 CollationElementIterator::CollationElementIterator( 340 const CharacterIterator &source, 341 const RuleBasedCollator *coll, 342 UErrorCode &status) 343 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { 344 // We only call source.getText() which should be const anyway. 345 setText(const_cast<CharacterIterator &>(source), status); 346 } 347 348 /* CollationElementIterator private methods -------------------------------- */ 349 350 const CollationElementIterator& CollationElementIterator::operator=( 351 const CollationElementIterator& other) 352 { 353 if (this == &other) { 354 return *this; 355 } 356 357 CollationIterator *newIter; 358 const FCDUTF16CollationIterator *otherFCDIter = 359 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); 360 if(otherFCDIter != NULL) { 361 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); 362 } else { 363 const UTF16CollationIterator *otherIter = 364 dynamic_cast<const UTF16CollationIterator *>(other.iter_); 365 if(otherIter != NULL) { 366 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); 367 } else { 368 newIter = NULL; 369 } 370 } 371 if(newIter != NULL) { 372 delete iter_; 373 iter_ = newIter; 374 rbc_ = other.rbc_; 375 otherHalf_ = other.otherHalf_; 376 dir_ = other.dir_; 377 378 string_ = other.string_; 379 } 380 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { 381 UErrorCode errorCode = U_ZERO_ERROR; 382 if(offsets_ == NULL) { 383 offsets_ = new UVector32(other.offsets_->size(), errorCode); 384 } 385 if(offsets_ != NULL) { 386 offsets_->assign(*other.offsets_, errorCode); 387 } 388 } 389 return *this; 390 } 391 392 namespace { 393 394 class MaxExpSink : public ContractionsAndExpansions::CESink { 395 public: 396 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} 397 virtual ~MaxExpSink(); 398 virtual void handleCE(int64_t /*ce*/) {} 399 virtual void handleExpansion(const int64_t ces[], int32_t length) { 400 if (length <= 1) { 401 // We do not need to add single CEs into the map. 402 return; 403 } 404 int32_t count = 0; // number of CE "halves" 405 for (int32_t i = 0; i < length; ++i) { 406 count += ceNeedsTwoParts(ces[i]) ? 2 : 1; 407 } 408 // last "half" of the last CE 409 int64_t ce = ces[length - 1]; 410 uint32_t p = (uint32_t)(ce >> 32); 411 uint32_t lower32 = (uint32_t)ce; 412 uint32_t lastHalf = getSecondHalf(p, lower32); 413 if (lastHalf == 0) { 414 lastHalf = getFirstHalf(p, lower32); 415 U_ASSERT(lastHalf != 0); 416 } else { 417 lastHalf |= 0xc0; // old-style continuation CE 418 } 419 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { 420 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); 421 } 422 } 423 424 private: 425 UHashtable *maxExpansions; 426 UErrorCode &errorCode; 427 }; 428 429 MaxExpSink::~MaxExpSink() {} 430 431 } // namespace 432 433 UHashtable * 434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { 435 if (U_FAILURE(errorCode)) { return NULL; } 436 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, 437 uhash_compareLong, &errorCode); 438 if (U_FAILURE(errorCode)) { return NULL; } 439 MaxExpSink sink(maxExpansions, errorCode); 440 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); 441 if (U_FAILURE(errorCode)) { 442 uhash_close(maxExpansions); 443 return NULL; 444 } 445 return maxExpansions; 446 } 447 448 int32_t 449 CollationElementIterator::getMaxExpansion(int32_t order) const { 450 return getMaxExpansion(rbc_->tailoring->maxExpansions, order); 451 } 452 453 int32_t 454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { 455 if (order == 0) { return 1; } 456 int32_t max; 457 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { 458 return max; 459 } 460 if ((order & 0xc0) == 0xc0) { 461 // old-style continuation CE 462 return 2; 463 } else { 464 return 1; 465 } 466 } 467 468 U_NAMESPACE_END 469 470 #endif /* #if !UCONFIG_NO_COLLATION */ 471