1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * utf8collationiterator.cpp 9 * 10 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp) 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/utf8.h" 19 #include "charstr.h" 20 #include "cmemory.h" 21 #include "collation.h" 22 #include "collationdata.h" 23 #include "collationfcd.h" 24 #include "collationiterator.h" 25 #include "normalizer2impl.h" 26 #include "uassert.h" 27 #include "utf8collationiterator.h" 28 29 U_NAMESPACE_BEGIN 30 31 UTF8CollationIterator::~UTF8CollationIterator() {} 32 33 void 34 UTF8CollationIterator::resetToOffset(int32_t newOffset) { 35 reset(); 36 pos = newOffset; 37 } 38 39 int32_t 40 UTF8CollationIterator::getOffset() const { 41 return pos; 42 } 43 44 uint32_t 45 UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 46 if(pos == length) { 47 c = U_SENTINEL; 48 return Collation::FALLBACK_CE32; 49 } 50 // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). 51 c = u8[pos++]; 52 if(U8_IS_SINGLE(c)) { 53 // ASCII 00..7F 54 return trie->data32[c]; 55 } 56 uint8_t t1, t2; 57 if(0xe0 <= c && c < 0xf0 && 58 ((pos + 1) < length || length < 0) && 59 U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && 60 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { 61 // U+0800..U+FFFF except surrogates 62 c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); 63 pos += 2; 64 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 65 } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 66 // U+0080..U+07FF 67 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 68 c = ((c & 0x1f) << 6) | t1; 69 ++pos; 70 return ce32; 71 } else { 72 // Function call for supplementary code points and error cases. 73 // Illegal byte sequences yield U+FFFD. 74 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 75 return data->getCE32(c); 76 } 77 } 78 79 UBool 80 UTF8CollationIterator::foundNULTerminator() { 81 if(length < 0) { 82 length = --pos; 83 return TRUE; 84 } else { 85 return FALSE; 86 } 87 } 88 89 UBool 90 UTF8CollationIterator::forbidSurrogateCodePoints() const { 91 return TRUE; 92 } 93 94 UChar32 95 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 96 if(pos == length) { 97 return U_SENTINEL; 98 } 99 if(u8[pos] == 0 && length < 0) { 100 length = pos; 101 return U_SENTINEL; 102 } 103 UChar32 c; 104 U8_NEXT_OR_FFFD(u8, pos, length, c); 105 return c; 106 } 107 108 UChar32 109 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 110 if(pos == 0) { 111 return U_SENTINEL; 112 } 113 UChar32 c; 114 U8_PREV_OR_FFFD(u8, 0, pos, c); 115 return c; 116 } 117 118 void 119 UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 120 U8_FWD_N(u8, pos, length, num); 121 } 122 123 void 124 UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 125 U8_BACK_N(u8, 0, pos, num); 126 } 127 128 // FCDUTF8CollationIterator ------------------------------------------------ *** 129 130 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} 131 132 void 133 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { 134 reset(); 135 start = pos = newOffset; 136 state = CHECK_FWD; 137 } 138 139 int32_t 140 FCDUTF8CollationIterator::getOffset() const { 141 if(state != IN_NORMALIZED) { 142 return pos; 143 } else if(pos == 0) { 144 return start; 145 } else { 146 return limit; 147 } 148 } 149 150 uint32_t 151 FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 152 for(;;) { 153 if(state == CHECK_FWD) { 154 // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. 155 if(pos == length) { 156 c = U_SENTINEL; 157 return Collation::FALLBACK_CE32; 158 } 159 c = u8[pos++]; 160 if(U8_IS_SINGLE(c)) { 161 // ASCII 00..7F 162 return trie->data32[c]; 163 } 164 uint8_t t1, t2; 165 if(0xe0 <= c && c < 0xf0 && 166 ((pos + 1) < length || length < 0) && 167 U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && 168 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { 169 // U+0800..U+FFFF except surrogates 170 c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); 171 pos += 2; 172 if(CollationFCD::hasTccc(c) && 173 (CollationFCD::maybeTibetanCompositeVowel(c) || 174 (pos != length && nextHasLccc()))) { 175 pos -= 3; 176 } else { 177 break; // return CE32(BMP) 178 } 179 } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 180 // U+0080..U+07FF 181 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 182 c = ((c & 0x1f) << 6) | t1; 183 ++pos; 184 if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { 185 pos -= 2; 186 } else { 187 return ce32; 188 } 189 } else { 190 // Function call for supplementary code points and error cases. 191 // Illegal byte sequences yield U+FFFD. 192 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 193 if(c == 0xfffd) { 194 return Collation::FFFD_CE32; 195 } else { 196 U_ASSERT(c > 0xffff); 197 if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { 198 pos -= 4; 199 } else { 200 return data->getCE32FromSupplementary(c); 201 } 202 } 203 } 204 if(!nextSegment(errorCode)) { 205 c = U_SENTINEL; 206 return Collation::FALLBACK_CE32; 207 } 208 continue; 209 } else if(state == IN_FCD_SEGMENT && pos != limit) { 210 return UTF8CollationIterator::handleNextCE32(c, errorCode); 211 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 212 c = normalized[pos++]; 213 break; 214 } else { 215 switchToForward(); 216 } 217 } 218 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 219 } 220 221 UBool 222 FCDUTF8CollationIterator::nextHasLccc() const { 223 U_ASSERT(state == CHECK_FWD && pos != length); 224 // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8. 225 // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.) 226 UChar32 c = u8[pos]; 227 if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; } 228 int32_t i = pos; 229 U8_NEXT_OR_FFFD(u8, i, length, c); 230 if(c > 0xffff) { c = U16_LEAD(c); } 231 return CollationFCD::hasLccc(c); 232 } 233 234 UBool 235 FCDUTF8CollationIterator::previousHasTccc() const { 236 U_ASSERT(state == CHECK_BWD && pos != 0); 237 UChar32 c = u8[pos - 1]; 238 if(U8_IS_SINGLE(c)) { return FALSE; } 239 int32_t i = pos; 240 U8_PREV_OR_FFFD(u8, 0, i, c); 241 if(c > 0xffff) { c = U16_LEAD(c); } 242 return CollationFCD::hasTccc(c); 243 } 244 245 UChar 246 FCDUTF8CollationIterator::handleGetTrailSurrogate() { 247 if(state != IN_NORMALIZED) { return 0; } 248 U_ASSERT(pos < normalized.length()); 249 UChar trail; 250 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } 251 return trail; 252 } 253 254 UBool 255 FCDUTF8CollationIterator::foundNULTerminator() { 256 if(state == CHECK_FWD && length < 0) { 257 length = --pos; 258 return TRUE; 259 } else { 260 return FALSE; 261 } 262 } 263 264 UChar32 265 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { 266 UChar32 c; 267 for(;;) { 268 if(state == CHECK_FWD) { 269 if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { 270 return U_SENTINEL; 271 } 272 if(U8_IS_SINGLE(c)) { 273 ++pos; 274 return c; 275 } 276 U8_NEXT_OR_FFFD(u8, pos, length, c); 277 if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) && 278 (CollationFCD::maybeTibetanCompositeVowel(c) || 279 (pos != length && nextHasLccc()))) { 280 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 281 // and we can use U8_LENGTH() rather than a previous-position variable. 282 pos -= U8_LENGTH(c); 283 if(!nextSegment(errorCode)) { 284 return U_SENTINEL; 285 } 286 continue; 287 } 288 return c; 289 } else if(state == IN_FCD_SEGMENT && pos != limit) { 290 U8_NEXT_OR_FFFD(u8, pos, length, c); 291 return c; 292 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 293 c = normalized.char32At(pos); 294 pos += U16_LENGTH(c); 295 return c; 296 } else { 297 switchToForward(); 298 } 299 } 300 } 301 302 UChar32 303 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { 304 UChar32 c; 305 for(;;) { 306 if(state == CHECK_BWD) { 307 if(pos == 0) { 308 return U_SENTINEL; 309 } 310 if(U8_IS_SINGLE(c = u8[pos - 1])) { 311 --pos; 312 return c; 313 } 314 U8_PREV_OR_FFFD(u8, 0, pos, c); 315 if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) && 316 (CollationFCD::maybeTibetanCompositeVowel(c) || 317 (pos != 0 && previousHasTccc()))) { 318 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 319 // and we can use U8_LENGTH() rather than a previous-position variable. 320 pos += U8_LENGTH(c); 321 if(!previousSegment(errorCode)) { 322 return U_SENTINEL; 323 } 324 continue; 325 } 326 return c; 327 } else if(state == IN_FCD_SEGMENT && pos != start) { 328 U8_PREV_OR_FFFD(u8, 0, pos, c); 329 return c; 330 } else if(state >= IN_NORMALIZED && pos != 0) { 331 c = normalized.char32At(pos - 1); 332 pos -= U16_LENGTH(c); 333 return c; 334 } else { 335 switchToBackward(); 336 } 337 } 338 } 339 340 void 341 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 342 // Specify the class to avoid a virtual-function indirection. 343 // In Java, we would declare this class final. 344 while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { 345 --num; 346 } 347 } 348 349 void 350 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 351 // Specify the class to avoid a virtual-function indirection. 352 // In Java, we would declare this class final. 353 while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { 354 --num; 355 } 356 } 357 358 void 359 FCDUTF8CollationIterator::switchToForward() { 360 U_ASSERT(state == CHECK_BWD || 361 (state == IN_FCD_SEGMENT && pos == limit) || 362 (state == IN_NORMALIZED && pos == normalized.length())); 363 if(state == CHECK_BWD) { 364 // Turn around from backward checking. 365 start = pos; 366 if(pos == limit) { 367 state = CHECK_FWD; // Check forward. 368 } else { // pos < limit 369 state = IN_FCD_SEGMENT; // Stay in FCD segment. 370 } 371 } else { 372 // Reached the end of the FCD segment. 373 if(state == IN_FCD_SEGMENT) { 374 // The input text segment is FCD, extend it forward. 375 } else { 376 // The input text segment needed to be normalized. 377 // Switch to checking forward from it. 378 start = pos = limit; 379 } 380 state = CHECK_FWD; 381 } 382 } 383 384 UBool 385 FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { 386 if(U_FAILURE(errorCode)) { return FALSE; } 387 U_ASSERT(state == CHECK_FWD && pos != length); 388 // The input text [start..pos[ passes the FCD check. 389 int32_t segmentStart = pos; 390 // Collect the characters being checked, in case they need to be normalized. 391 UnicodeString s; 392 uint8_t prevCC = 0; 393 for(;;) { 394 // Fetch the next character and its fcd16 value. 395 int32_t cpStart = pos; 396 UChar32 c; 397 U8_NEXT_OR_FFFD(u8, pos, length, c); 398 uint16_t fcd16 = nfcImpl.getFCD16(c); 399 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 400 if(leadCC == 0 && cpStart != segmentStart) { 401 // FCD boundary before this character. 402 pos = cpStart; 403 break; 404 } 405 s.append(c); 406 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 407 // Fails FCD check. Find the next FCD boundary and normalize. 408 while(pos != length) { 409 cpStart = pos; 410 U8_NEXT_OR_FFFD(u8, pos, length, c); 411 if(nfcImpl.getFCD16(c) <= 0xff) { 412 pos = cpStart; 413 break; 414 } 415 s.append(c); 416 } 417 if(!normalize(s, errorCode)) { return FALSE; } 418 start = segmentStart; 419 limit = pos; 420 state = IN_NORMALIZED; 421 pos = 0; 422 return TRUE; 423 } 424 prevCC = (uint8_t)fcd16; 425 if(pos == length || prevCC == 0) { 426 // FCD boundary after the last character. 427 break; 428 } 429 } 430 limit = pos; 431 pos = segmentStart; 432 U_ASSERT(pos != limit); 433 state = IN_FCD_SEGMENT; 434 return TRUE; 435 } 436 437 void 438 FCDUTF8CollationIterator::switchToBackward() { 439 U_ASSERT(state == CHECK_FWD || 440 (state == IN_FCD_SEGMENT && pos == start) || 441 (state >= IN_NORMALIZED && pos == 0)); 442 if(state == CHECK_FWD) { 443 // Turn around from forward checking. 444 limit = pos; 445 if(pos == start) { 446 state = CHECK_BWD; // Check backward. 447 } else { // pos > start 448 state = IN_FCD_SEGMENT; // Stay in FCD segment. 449 } 450 } else { 451 // Reached the start of the FCD segment. 452 if(state == IN_FCD_SEGMENT) { 453 // The input text segment is FCD, extend it backward. 454 } else { 455 // The input text segment needed to be normalized. 456 // Switch to checking backward from it. 457 limit = pos = start; 458 } 459 state = CHECK_BWD; 460 } 461 } 462 463 UBool 464 FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { 465 if(U_FAILURE(errorCode)) { return FALSE; } 466 U_ASSERT(state == CHECK_BWD && pos != 0); 467 // The input text [pos..limit[ passes the FCD check. 468 int32_t segmentLimit = pos; 469 // Collect the characters being checked, in case they need to be normalized. 470 UnicodeString s; 471 uint8_t nextCC = 0; 472 for(;;) { 473 // Fetch the previous character and its fcd16 value. 474 int32_t cpLimit = pos; 475 UChar32 c; 476 U8_PREV_OR_FFFD(u8, 0, pos, c); 477 uint16_t fcd16 = nfcImpl.getFCD16(c); 478 uint8_t trailCC = (uint8_t)fcd16; 479 if(trailCC == 0 && cpLimit != segmentLimit) { 480 // FCD boundary after this character. 481 pos = cpLimit; 482 break; 483 } 484 s.append(c); 485 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 486 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 487 // Fails FCD check. Find the previous FCD boundary and normalize. 488 while(fcd16 > 0xff && pos != 0) { 489 cpLimit = pos; 490 U8_PREV_OR_FFFD(u8, 0, pos, c); 491 fcd16 = nfcImpl.getFCD16(c); 492 if(fcd16 == 0) { 493 pos = cpLimit; 494 break; 495 } 496 s.append(c); 497 } 498 s.reverse(); 499 if(!normalize(s, errorCode)) { return FALSE; } 500 limit = segmentLimit; 501 start = pos; 502 state = IN_NORMALIZED; 503 pos = normalized.length(); 504 return TRUE; 505 } 506 nextCC = (uint8_t)(fcd16 >> 8); 507 if(pos == 0 || nextCC == 0) { 508 // FCD boundary before the following character. 509 break; 510 } 511 } 512 start = pos; 513 pos = segmentLimit; 514 U_ASSERT(pos != start); 515 state = IN_FCD_SEGMENT; 516 return TRUE; 517 } 518 519 UBool 520 FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { 521 // NFD without argument checking. 522 U_ASSERT(U_SUCCESS(errorCode)); 523 nfcImpl.decompose(s, normalized, errorCode); 524 return U_SUCCESS(errorCode); 525 } 526 527 U_NAMESPACE_END 528 529 #endif // !UCONFIG_NO_COLLATION 530