1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * utf16collationiterator.cpp 9 * 10 * created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "charstr.h" 19 #include "cmemory.h" 20 #include "collation.h" 21 #include "collationdata.h" 22 #include "collationfcd.h" 23 #include "collationiterator.h" 24 #include "normalizer2impl.h" 25 #include "uassert.h" 26 #include "utf16collationiterator.h" 27 28 U_NAMESPACE_BEGIN 29 30 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other, 31 const UChar *newText) 32 : CollationIterator(other), 33 start(newText), 34 pos(newText + (other.pos - other.start)), 35 limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) { 36 } 37 38 UTF16CollationIterator::~UTF16CollationIterator() {} 39 40 UBool 41 UTF16CollationIterator::operator==(const CollationIterator &other) const { 42 if(!CollationIterator::operator==(other)) { return FALSE; } 43 const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other); 44 // Compare the iterator state but not the text: Assume that the caller does that. 45 return (pos - start) == (o.pos - o.start); 46 } 47 48 void 49 UTF16CollationIterator::resetToOffset(int32_t newOffset) { 50 reset(); 51 pos = start + newOffset; 52 } 53 54 int32_t 55 UTF16CollationIterator::getOffset() const { 56 return (int32_t)(pos - start); 57 } 58 59 uint32_t 60 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 61 if(pos == limit) { 62 c = U_SENTINEL; 63 return Collation::FALLBACK_CE32; 64 } 65 c = *pos++; 66 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 67 } 68 69 UChar 70 UTF16CollationIterator::handleGetTrailSurrogate() { 71 if(pos == limit) { return 0; } 72 UChar trail; 73 if(U16_IS_TRAIL(trail = *pos)) { ++pos; } 74 return trail; 75 } 76 77 UBool 78 UTF16CollationIterator::foundNULTerminator() { 79 if(limit == NULL) { 80 limit = --pos; 81 return TRUE; 82 } else { 83 return FALSE; 84 } 85 } 86 87 UChar32 88 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 89 if(pos == limit) { 90 return U_SENTINEL; 91 } 92 UChar32 c = *pos; 93 if(c == 0 && limit == NULL) { 94 limit = pos; 95 return U_SENTINEL; 96 } 97 ++pos; 98 UChar trail; 99 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { 100 ++pos; 101 return U16_GET_SUPPLEMENTARY(c, trail); 102 } else { 103 return c; 104 } 105 } 106 107 UChar32 108 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 109 if(pos == start) { 110 return U_SENTINEL; 111 } 112 UChar32 c = *--pos; 113 UChar lead; 114 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { 115 --pos; 116 return U16_GET_SUPPLEMENTARY(lead, c); 117 } else { 118 return c; 119 } 120 } 121 122 void 123 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 124 while(num > 0 && pos != limit) { 125 UChar32 c = *pos; 126 if(c == 0 && limit == NULL) { 127 limit = pos; 128 break; 129 } 130 ++pos; 131 --num; 132 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) { 133 ++pos; 134 } 135 } 136 } 137 138 void 139 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 140 while(num > 0 && pos != start) { 141 UChar32 c = *--pos; 142 --num; 143 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) { 144 --pos; 145 } 146 } 147 } 148 149 // FCDUTF16CollationIterator ----------------------------------------------- *** 150 151 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other, 152 const UChar *newText) 153 : UTF16CollationIterator(other), 154 rawStart(newText), 155 segmentStart(newText + (other.segmentStart - other.rawStart)), 156 segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)), 157 rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)), 158 nfcImpl(other.nfcImpl), 159 normalized(other.normalized), 160 checkDir(other.checkDir) { 161 if(checkDir != 0 || other.start == other.segmentStart) { 162 start = newText + (other.start - other.rawStart); 163 pos = newText + (other.pos - other.rawStart); 164 limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart); 165 } else { 166 start = normalized.getBuffer(); 167 pos = start + (other.pos - other.start); 168 limit = start + normalized.length(); 169 } 170 } 171 172 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {} 173 174 UBool 175 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const { 176 // Skip the UTF16CollationIterator and call its parent. 177 if(!CollationIterator::operator==(other)) { return FALSE; } 178 const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other); 179 // Compare the iterator state but not the text: Assume that the caller does that. 180 if(checkDir != o.checkDir) { return FALSE; } 181 if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; } 182 if(checkDir != 0 || start == segmentStart) { 183 return (pos - rawStart) == (o.pos - o.rawStart); 184 } else { 185 return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) && 186 (pos - start) == (o.pos - o.start); 187 } 188 } 189 190 void 191 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) { 192 reset(); 193 start = segmentStart = pos = rawStart + newOffset; 194 limit = rawLimit; 195 checkDir = 1; 196 } 197 198 int32_t 199 FCDUTF16CollationIterator::getOffset() const { 200 if(checkDir != 0 || start == segmentStart) { 201 return (int32_t)(pos - rawStart); 202 } else if(pos == start) { 203 return (int32_t)(segmentStart - rawStart); 204 } else { 205 return (int32_t)(segmentLimit - rawStart); 206 } 207 } 208 209 uint32_t 210 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 211 for(;;) { 212 if(checkDir > 0) { 213 if(pos == limit) { 214 c = U_SENTINEL; 215 return Collation::FALLBACK_CE32; 216 } 217 c = *pos++; 218 if(CollationFCD::hasTccc(c)) { 219 if(CollationFCD::maybeTibetanCompositeVowel(c) || 220 (pos != limit && CollationFCD::hasLccc(*pos))) { 221 --pos; 222 if(!nextSegment(errorCode)) { 223 c = U_SENTINEL; 224 return Collation::FALLBACK_CE32; 225 } 226 c = *pos++; 227 } 228 } 229 break; 230 } else if(checkDir == 0 && pos != limit) { 231 c = *pos++; 232 break; 233 } else { 234 switchToForward(); 235 } 236 } 237 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 238 } 239 240 UBool 241 FCDUTF16CollationIterator::foundNULTerminator() { 242 if(limit == NULL) { 243 limit = rawLimit = --pos; 244 return TRUE; 245 } else { 246 return FALSE; 247 } 248 } 249 250 UChar32 251 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) { 252 UChar32 c; 253 for(;;) { 254 if(checkDir > 0) { 255 if(pos == limit) { 256 return U_SENTINEL; 257 } 258 c = *pos++; 259 if(CollationFCD::hasTccc(c)) { 260 if(CollationFCD::maybeTibetanCompositeVowel(c) || 261 (pos != limit && CollationFCD::hasLccc(*pos))) { 262 --pos; 263 if(!nextSegment(errorCode)) { 264 return U_SENTINEL; 265 } 266 c = *pos++; 267 } 268 } else if(c == 0 && limit == NULL) { 269 limit = rawLimit = --pos; 270 return U_SENTINEL; 271 } 272 break; 273 } else if(checkDir == 0 && pos != limit) { 274 c = *pos++; 275 break; 276 } else { 277 switchToForward(); 278 } 279 } 280 UChar trail; 281 if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) { 282 ++pos; 283 return U16_GET_SUPPLEMENTARY(c, trail); 284 } else { 285 return c; 286 } 287 } 288 289 UChar32 290 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) { 291 UChar32 c; 292 for(;;) { 293 if(checkDir < 0) { 294 if(pos == start) { 295 return U_SENTINEL; 296 } 297 c = *--pos; 298 if(CollationFCD::hasLccc(c)) { 299 if(CollationFCD::maybeTibetanCompositeVowel(c) || 300 (pos != start && CollationFCD::hasTccc(*(pos - 1)))) { 301 ++pos; 302 if(!previousSegment(errorCode)) { 303 return U_SENTINEL; 304 } 305 c = *--pos; 306 } 307 } 308 break; 309 } else if(checkDir == 0 && pos != start) { 310 c = *--pos; 311 break; 312 } else { 313 switchToBackward(); 314 } 315 } 316 UChar lead; 317 if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) { 318 --pos; 319 return U16_GET_SUPPLEMENTARY(lead, c); 320 } else { 321 return c; 322 } 323 } 324 325 void 326 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 327 // Specify the class to avoid a virtual-function indirection. 328 // In Java, we would declare this class final. 329 while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) { 330 --num; 331 } 332 } 333 334 void 335 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 336 // Specify the class to avoid a virtual-function indirection. 337 // In Java, we would declare this class final. 338 while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) { 339 --num; 340 } 341 } 342 343 void 344 FCDUTF16CollationIterator::switchToForward() { 345 U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit)); 346 if(checkDir < 0) { 347 // Turn around from backward checking. 348 start = segmentStart = pos; 349 if(pos == segmentLimit) { 350 limit = rawLimit; 351 checkDir = 1; // Check forward. 352 } else { // pos < segmentLimit 353 checkDir = 0; // Stay in FCD segment. 354 } 355 } else { 356 // Reached the end of the FCD segment. 357 if(start == segmentStart) { 358 // The input text segment is FCD, extend it forward. 359 } else { 360 // The input text segment needed to be normalized. 361 // Switch to checking forward from it. 362 pos = start = segmentStart = segmentLimit; 363 // Note: If this segment is at the end of the input text, 364 // then it might help to return FALSE to indicate that, so that 365 // we do not have to re-check and normalize when we turn around and go backwards. 366 // However, that would complicate the call sites for an optimization of an unusual case. 367 } 368 limit = rawLimit; 369 checkDir = 1; 370 } 371 } 372 373 UBool 374 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) { 375 if(U_FAILURE(errorCode)) { return FALSE; } 376 U_ASSERT(checkDir > 0 && pos != limit); 377 // The input text [segmentStart..pos[ passes the FCD check. 378 const UChar *p = pos; 379 uint8_t prevCC = 0; 380 for(;;) { 381 // Fetch the next character's fcd16 value. 382 const UChar *q = p; 383 uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit); 384 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 385 if(leadCC == 0 && q != pos) { 386 // FCD boundary before the [q, p[ character. 387 limit = segmentLimit = q; 388 break; 389 } 390 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 391 // Fails FCD check. Find the next FCD boundary and normalize. 392 do { 393 q = p; 394 } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff); 395 if(!normalize(pos, q, errorCode)) { return FALSE; } 396 pos = start; 397 break; 398 } 399 prevCC = (uint8_t)fcd16; 400 if(p == rawLimit || prevCC == 0) { 401 // FCD boundary after the last character. 402 limit = segmentLimit = p; 403 break; 404 } 405 } 406 U_ASSERT(pos != limit); 407 checkDir = 0; 408 return TRUE; 409 } 410 411 void 412 FCDUTF16CollationIterator::switchToBackward() { 413 U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start)); 414 if(checkDir > 0) { 415 // Turn around from forward checking. 416 limit = segmentLimit = pos; 417 if(pos == segmentStart) { 418 start = rawStart; 419 checkDir = -1; // Check backward. 420 } else { // pos > segmentStart 421 checkDir = 0; // Stay in FCD segment. 422 } 423 } else { 424 // Reached the start of the FCD segment. 425 if(start == segmentStart) { 426 // The input text segment is FCD, extend it backward. 427 } else { 428 // The input text segment needed to be normalized. 429 // Switch to checking backward from it. 430 pos = limit = segmentLimit = segmentStart; 431 } 432 start = rawStart; 433 checkDir = -1; 434 } 435 } 436 437 UBool 438 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) { 439 if(U_FAILURE(errorCode)) { return FALSE; } 440 U_ASSERT(checkDir < 0 && pos != start); 441 // The input text [pos..segmentLimit[ passes the FCD check. 442 const UChar *p = pos; 443 uint8_t nextCC = 0; 444 for(;;) { 445 // Fetch the previous character's fcd16 value. 446 const UChar *q = p; 447 uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p); 448 uint8_t trailCC = (uint8_t)fcd16; 449 if(trailCC == 0 && q != pos) { 450 // FCD boundary after the [p, q[ character. 451 start = segmentStart = q; 452 break; 453 } 454 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 455 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 456 // Fails FCD check. Find the previous FCD boundary and normalize. 457 do { 458 q = p; 459 } while(fcd16 > 0xff && p != rawStart && 460 (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0); 461 if(!normalize(q, pos, errorCode)) { return FALSE; } 462 pos = limit; 463 break; 464 } 465 nextCC = (uint8_t)(fcd16 >> 8); 466 if(p == rawStart || nextCC == 0) { 467 // FCD boundary before the following character. 468 start = segmentStart = p; 469 break; 470 } 471 } 472 U_ASSERT(pos != start); 473 checkDir = 0; 474 return TRUE; 475 } 476 477 UBool 478 FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) { 479 // NFD without argument checking. 480 U_ASSERT(U_SUCCESS(errorCode)); 481 nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode); 482 if(U_FAILURE(errorCode)) { return FALSE; } 483 // Switch collation processing into the FCD buffer 484 // with the result of normalizing [segmentStart, segmentLimit[. 485 segmentStart = from; 486 segmentLimit = to; 487 start = normalized.getBuffer(); 488 limit = start + normalized.length(); 489 return TRUE; 490 } 491 492 U_NAMESPACE_END 493 494 #endif // !UCONFIG_NO_COLLATION 495