1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * utf16collationiterator.cpp 7 * 8 * created on: 2010oct27 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "charstr.h" 17 #include "cmemory.h" 18 #include "collation.h" 19 #include "collationdata.h" 20 #include "collationfcd.h" 21 #include "collationiterator.h" 22 #include "normalizer2impl.h" 23 #include "uassert.h" 24 #include "utf16collationiterator.h" 25 26 U_NAMESPACE_BEGIN 27 28 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other, 29 const UChar *newText) 30 : CollationIterator(other), 31 start(newText), 32 pos(newText + (other.pos - other.start)), 33 limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) { 34 } 35 36 UTF16CollationIterator::~UTF16CollationIterator() {} 37 38 UBool 39 UTF16CollationIterator::operator==(const CollationIterator &other) const { 40 if(!CollationIterator::operator==(other)) { return FALSE; } 41 const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other); 42 // Compare the iterator state but not the text: Assume that the caller does that. 43 return (pos - start) == (o.pos - o.start); 44 } 45 46 void 47 UTF16CollationIterator::resetToOffset(int32_t newOffset) { 48 reset(); 49 pos = start + newOffset; 50 } 51 52 int32_t 53 UTF16CollationIterator::getOffset() const { 54 return (int32_t)(pos - start); 55 } 56 57 uint32_t 58 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 59 if(pos == limit) { 60 c = U_SENTINEL; 61 return Collation::FALLBACK_CE32; 62 } 63 c = *pos++; 64 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 65 } 66 67 UChar 68 UTF16CollationIterator::handleGetTrailSurrogate() { 69 if(pos == limit) { return 0; } 70 UChar trail; 71 if(U16_IS_TRAIL(trail = *pos)) { ++pos; } 72 return trail; 73 } 74 75 UBool 76 UTF16CollationIterator::foundNULTerminator() { 77 if(limit == NULL) { 78 limit = --pos; 79 return TRUE; 80 } else { 81 return FALSE; 82 } 83 } 84 85 UChar32 86 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 87 if(pos == limit) { 88 return U_SENTINEL; 89 } 90 UChar32 c = *pos; 91 if(c == 0 && limit == NULL) { 92 limit = pos; 93 return U_SENTINEL; 94 } 95 ++pos; 96 UChar trail; 97 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { 98 ++pos; 99 return U16_GET_SUPPLEMENTARY(c, trail); 100 } else { 101 return c; 102 } 103 } 104 105 UChar32 106 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 107 if(pos == start) { 108 return U_SENTINEL; 109 } 110 UChar32 c = *--pos; 111 UChar lead; 112 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { 113 --pos; 114 return U16_GET_SUPPLEMENTARY(lead, c); 115 } else { 116 return c; 117 } 118 } 119 120 void 121 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 122 while(num > 0 && pos != limit) { 123 UChar32 c = *pos; 124 if(c == 0 && limit == NULL) { 125 limit = pos; 126 break; 127 } 128 ++pos; 129 --num; 130 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) { 131 ++pos; 132 } 133 } 134 } 135 136 void 137 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 138 while(num > 0 && pos != start) { 139 UChar32 c = *--pos; 140 --num; 141 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) { 142 --pos; 143 } 144 } 145 } 146 147 // FCDUTF16CollationIterator ----------------------------------------------- *** 148 149 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other, 150 const UChar *newText) 151 : UTF16CollationIterator(other), 152 rawStart(newText), 153 segmentStart(newText + (other.segmentStart - other.rawStart)), 154 segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)), 155 rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)), 156 nfcImpl(other.nfcImpl), 157 normalized(other.normalized), 158 checkDir(other.checkDir) { 159 if(checkDir != 0 || other.start == other.segmentStart) { 160 start = newText + (other.start - other.rawStart); 161 pos = newText + (other.pos - other.rawStart); 162 limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart); 163 } else { 164 start = normalized.getBuffer(); 165 pos = start + (other.pos - other.start); 166 limit = start + normalized.length(); 167 } 168 } 169 170 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {} 171 172 UBool 173 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const { 174 // Skip the UTF16CollationIterator and call its parent. 175 if(!CollationIterator::operator==(other)) { return FALSE; } 176 const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other); 177 // Compare the iterator state but not the text: Assume that the caller does that. 178 if(checkDir != o.checkDir) { return FALSE; } 179 if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; } 180 if(checkDir != 0 || start == segmentStart) { 181 return (pos - rawStart) == (o.pos - o.rawStart); 182 } else { 183 return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) && 184 (pos - start) == (o.pos - o.start); 185 } 186 } 187 188 void 189 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) { 190 reset(); 191 start = segmentStart = pos = rawStart + newOffset; 192 limit = rawLimit; 193 checkDir = 1; 194 } 195 196 int32_t 197 FCDUTF16CollationIterator::getOffset() const { 198 if(checkDir != 0 || start == segmentStart) { 199 return (int32_t)(pos - rawStart); 200 } else if(pos == start) { 201 return (int32_t)(segmentStart - rawStart); 202 } else { 203 return (int32_t)(segmentLimit - rawStart); 204 } 205 } 206 207 uint32_t 208 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 209 for(;;) { 210 if(checkDir > 0) { 211 if(pos == limit) { 212 c = U_SENTINEL; 213 return Collation::FALLBACK_CE32; 214 } 215 c = *pos++; 216 if(CollationFCD::hasTccc(c)) { 217 if(CollationFCD::maybeTibetanCompositeVowel(c) || 218 (pos != limit && CollationFCD::hasLccc(*pos))) { 219 --pos; 220 if(!nextSegment(errorCode)) { 221 c = U_SENTINEL; 222 return Collation::FALLBACK_CE32; 223 } 224 c = *pos++; 225 } 226 } 227 break; 228 } else if(checkDir == 0 && pos != limit) { 229 c = *pos++; 230 break; 231 } else { 232 switchToForward(); 233 } 234 } 235 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 236 } 237 238 UBool 239 FCDUTF16CollationIterator::foundNULTerminator() { 240 if(limit == NULL) { 241 limit = rawLimit = --pos; 242 return TRUE; 243 } else { 244 return FALSE; 245 } 246 } 247 248 UChar32 249 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) { 250 UChar32 c; 251 for(;;) { 252 if(checkDir > 0) { 253 if(pos == limit) { 254 return U_SENTINEL; 255 } 256 c = *pos++; 257 if(CollationFCD::hasTccc(c)) { 258 if(CollationFCD::maybeTibetanCompositeVowel(c) || 259 (pos != limit && CollationFCD::hasLccc(*pos))) { 260 --pos; 261 if(!nextSegment(errorCode)) { 262 return U_SENTINEL; 263 } 264 c = *pos++; 265 } 266 } else if(c == 0 && limit == NULL) { 267 limit = rawLimit = --pos; 268 return U_SENTINEL; 269 } 270 break; 271 } else if(checkDir == 0 && pos != limit) { 272 c = *pos++; 273 break; 274 } else { 275 switchToForward(); 276 } 277 } 278 UChar trail; 279 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { 280 ++pos; 281 return U16_GET_SUPPLEMENTARY(c, trail); 282 } else { 283 return c; 284 } 285 } 286 287 UChar32 288 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) { 289 UChar32 c; 290 for(;;) { 291 if(checkDir < 0) { 292 if(pos == start) { 293 return U_SENTINEL; 294 } 295 c = *--pos; 296 if(CollationFCD::hasLccc(c)) { 297 if(CollationFCD::maybeTibetanCompositeVowel(c) || 298 (pos != start && CollationFCD::hasTccc(*(pos - 1)))) { 299 ++pos; 300 if(!previousSegment(errorCode)) { 301 return U_SENTINEL; 302 } 303 c = *--pos; 304 } 305 } 306 break; 307 } else if(checkDir == 0 && pos != start) { 308 c = *--pos; 309 break; 310 } else { 311 switchToBackward(); 312 } 313 } 314 UChar lead; 315 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { 316 --pos; 317 return U16_GET_SUPPLEMENTARY(lead, c); 318 } else { 319 return c; 320 } 321 } 322 323 void 324 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 325 // Specify the class to avoid a virtual-function indirection. 326 // In Java, we would declare this class final. 327 while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) { 328 --num; 329 } 330 } 331 332 void 333 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 334 // Specify the class to avoid a virtual-function indirection. 335 // In Java, we would declare this class final. 336 while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) { 337 --num; 338 } 339 } 340 341 void 342 FCDUTF16CollationIterator::switchToForward() { 343 U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit)); 344 if(checkDir < 0) { 345 // Turn around from backward checking. 346 start = segmentStart = pos; 347 if(pos == segmentLimit) { 348 limit = rawLimit; 349 checkDir = 1; // Check forward. 350 } else { // pos < segmentLimit 351 checkDir = 0; // Stay in FCD segment. 352 } 353 } else { 354 // Reached the end of the FCD segment. 355 if(start == segmentStart) { 356 // The input text segment is FCD, extend it forward. 357 } else { 358 // The input text segment needed to be normalized. 359 // Switch to checking forward from it. 360 pos = start = segmentStart = segmentLimit; 361 // Note: If this segment is at the end of the input text, 362 // then it might help to return FALSE to indicate that, so that 363 // we do not have to re-check and normalize when we turn around and go backwards. 364 // However, that would complicate the call sites for an optimization of an unusual case. 365 } 366 limit = rawLimit; 367 checkDir = 1; 368 } 369 } 370 371 UBool 372 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) { 373 if(U_FAILURE(errorCode)) { return FALSE; } 374 U_ASSERT(checkDir > 0 && pos != limit); 375 // The input text [segmentStart..pos[ passes the FCD check. 376 const UChar *p = pos; 377 uint8_t prevCC = 0; 378 for(;;) { 379 // Fetch the next character's fcd16 value. 380 const UChar *q = p; 381 uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit); 382 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 383 if(leadCC == 0 && q != pos) { 384 // FCD boundary before the [q, p[ character. 385 limit = segmentLimit = q; 386 break; 387 } 388 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 389 // Fails FCD check. Find the next FCD boundary and normalize. 390 do { 391 q = p; 392 } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff); 393 if(!normalize(pos, q, errorCode)) { return FALSE; } 394 pos = start; 395 break; 396 } 397 prevCC = (uint8_t)fcd16; 398 if(p == rawLimit || prevCC == 0) { 399 // FCD boundary after the last character. 400 limit = segmentLimit = p; 401 break; 402 } 403 } 404 U_ASSERT(pos != limit); 405 checkDir = 0; 406 return TRUE; 407 } 408 409 void 410 FCDUTF16CollationIterator::switchToBackward() { 411 U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start)); 412 if(checkDir > 0) { 413 // Turn around from forward checking. 414 limit = segmentLimit = pos; 415 if(pos == segmentStart) { 416 start = rawStart; 417 checkDir = -1; // Check backward. 418 } else { // pos > segmentStart 419 checkDir = 0; // Stay in FCD segment. 420 } 421 } else { 422 // Reached the start of the FCD segment. 423 if(start == segmentStart) { 424 // The input text segment is FCD, extend it backward. 425 } else { 426 // The input text segment needed to be normalized. 427 // Switch to checking backward from it. 428 pos = limit = segmentLimit = segmentStart; 429 } 430 start = rawStart; 431 checkDir = -1; 432 } 433 } 434 435 UBool 436 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) { 437 if(U_FAILURE(errorCode)) { return FALSE; } 438 U_ASSERT(checkDir < 0 && pos != start); 439 // The input text [pos..segmentLimit[ passes the FCD check. 440 const UChar *p = pos; 441 uint8_t nextCC = 0; 442 for(;;) { 443 // Fetch the previous character's fcd16 value. 444 const UChar *q = p; 445 uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p); 446 uint8_t trailCC = (uint8_t)fcd16; 447 if(trailCC == 0 && q != pos) { 448 // FCD boundary after the [p, q[ character. 449 start = segmentStart = q; 450 break; 451 } 452 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 453 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 454 // Fails FCD check. Find the previous FCD boundary and normalize. 455 do { 456 q = p; 457 } while(fcd16 > 0xff && p != rawStart && 458 (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0); 459 if(!normalize(q, pos, errorCode)) { return FALSE; } 460 pos = limit; 461 break; 462 } 463 nextCC = (uint8_t)(fcd16 >> 8); 464 if(p == rawStart || nextCC == 0) { 465 // FCD boundary before the following character. 466 start = segmentStart = p; 467 break; 468 } 469 } 470 U_ASSERT(pos != start); 471 checkDir = 0; 472 return TRUE; 473 } 474 475 UBool 476 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) { 477 // NFD without argument checking. 478 U_ASSERT(U_SUCCESS(errorCode)); 479 nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode); 480 if(U_FAILURE(errorCode)) { return FALSE; } 481 // Switch collation processing into the FCD buffer 482 // with the result of normalizing [segmentStart, segmentLimit[. 483 segmentStart = from; 484 segmentLimit = to; 485 start = normalized.getBuffer(); 486 limit = start + normalized.length(); 487 return TRUE; 488 } 489 490 U_NAMESPACE_END 491 492 #endif // !UCONFIG_NO_COLLATION 493