1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * utf8collationiterator.cpp 9 * 10 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp) 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/utf8.h" 19 #include "charstr.h" 20 #include "cmemory.h" 21 #include "collation.h" 22 #include "collationdata.h" 23 #include "collationfcd.h" 24 #include "collationiterator.h" 25 #include "normalizer2impl.h" 26 #include "uassert.h" 27 #include "utf8collationiterator.h" 28 29 U_NAMESPACE_BEGIN 30 31 UTF8CollationIterator::~UTF8CollationIterator() {} 32 33 void 34 UTF8CollationIterator::resetToOffset(int32_t newOffset) { 35 reset(); 36 pos = newOffset; 37 } 38 39 int32_t 40 UTF8CollationIterator::getOffset() const { 41 return pos; 42 } 43 44 uint32_t 45 UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 46 if(pos == length) { 47 c = U_SENTINEL; 48 return Collation::FALLBACK_CE32; 49 } 50 // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). 51 c = u8[pos++]; 52 if(c < 0xc0) { 53 // ASCII 00..7F; trail bytes 80..BF map to error values. 54 return trie->data32[c]; 55 } 56 uint8_t t1, t2; 57 if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 58 // U+0080..U+07FF; 00..7F map to error values. 59 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 60 c = ((c & 0x1f) << 6) | t1; 61 ++pos; 62 return ce32; 63 } else if(c <= 0xef && 64 ((pos + 1) < length || length < 0) && 65 (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && 66 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f 67 ) { 68 // U+0800..U+FFFF; caller maps surrogates to error values. 69 c = (UChar)((c << 12) | (t1 << 6) | t2); 70 pos += 2; 71 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 72 } else { 73 // Function call for supplementary code points and error cases. 74 // Illegal byte sequences yield U+FFFD. 75 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 76 return data->getCE32(c); 77 } 78 } 79 80 UBool 81 UTF8CollationIterator::foundNULTerminator() { 82 if(length < 0) { 83 length = --pos; 84 return TRUE; 85 } else { 86 return FALSE; 87 } 88 } 89 90 UBool 91 UTF8CollationIterator::forbidSurrogateCodePoints() const { 92 return TRUE; 93 } 94 95 UChar32 96 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 97 if(pos == length) { 98 return U_SENTINEL; 99 } 100 if(u8[pos] == 0 && length < 0) { 101 length = pos; 102 return U_SENTINEL; 103 } 104 UChar32 c; 105 U8_NEXT_OR_FFFD(u8, pos, length, c); 106 return c; 107 } 108 109 UChar32 110 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 111 if(pos == 0) { 112 return U_SENTINEL; 113 } 114 UChar32 c; 115 U8_PREV_OR_FFFD(u8, 0, pos, c); 116 return c; 117 } 118 119 void 120 UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 121 U8_FWD_N(u8, pos, length, num); 122 } 123 124 void 125 UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 126 U8_BACK_N(u8, 0, pos, num); 127 } 128 129 // FCDUTF8CollationIterator ------------------------------------------------ *** 130 131 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} 132 133 void 134 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { 135 reset(); 136 start = pos = newOffset; 137 state = CHECK_FWD; 138 } 139 140 int32_t 141 FCDUTF8CollationIterator::getOffset() const { 142 if(state != IN_NORMALIZED) { 143 return pos; 144 } else if(pos == 0) { 145 return start; 146 } else { 147 return limit; 148 } 149 } 150 151 uint32_t 152 FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 153 for(;;) { 154 if(state == CHECK_FWD) { 155 // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. 156 if(pos == length) { 157 c = U_SENTINEL; 158 return Collation::FALLBACK_CE32; 159 } 160 c = u8[pos++]; 161 if(c < 0xc0) { 162 // ASCII 00..7F; trail bytes 80..BF map to error values. 163 return trie->data32[c]; 164 } 165 uint8_t t1, t2; 166 if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 167 // U+0080..U+07FF; 00..7F map to error values. 168 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 169 c = ((c & 0x1f) << 6) | t1; 170 ++pos; 171 if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { 172 pos -= 2; 173 } else { 174 return ce32; 175 } 176 } else if(c <= 0xef && 177 ((pos + 1) < length || length < 0) && 178 (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && 179 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f 180 ) { 181 // U+0800..U+FFFF; caller maps surrogates to error values. 182 c = (UChar)((c << 12) | (t1 << 6) | t2); 183 pos += 2; 184 if(CollationFCD::hasTccc(c) && 185 (CollationFCD::maybeTibetanCompositeVowel(c) || 186 (pos != length && nextHasLccc()))) { 187 pos -= 3; 188 } else { 189 break; // return CE32(BMP) 190 } 191 } else { 192 // Function call for supplementary code points and error cases. 193 // Illegal byte sequences yield U+FFFD. 194 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 195 if(c == 0xfffd) { 196 return Collation::FFFD_CE32; 197 } else { 198 U_ASSERT(c > 0xffff); 199 if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { 200 pos -= 4; 201 } else { 202 return data->getCE32FromSupplementary(c); 203 } 204 } 205 } 206 if(!nextSegment(errorCode)) { 207 c = U_SENTINEL; 208 return Collation::FALLBACK_CE32; 209 } 210 continue; 211 } else if(state == IN_FCD_SEGMENT && pos != limit) { 212 return UTF8CollationIterator::handleNextCE32(c, errorCode); 213 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 214 c = normalized[pos++]; 215 break; 216 } else { 217 switchToForward(); 218 } 219 } 220 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 221 } 222 223 UBool 224 FCDUTF8CollationIterator::nextHasLccc() const { 225 U_ASSERT(state == CHECK_FWD && pos != length); 226 // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8. 227 // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.) 228 UChar32 c = u8[pos]; 229 if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; } 230 int32_t i = pos; 231 U8_NEXT_OR_FFFD(u8, i, length, c); 232 if(c > 0xffff) { c = U16_LEAD(c); } 233 return CollationFCD::hasLccc(c); 234 } 235 236 UBool 237 FCDUTF8CollationIterator::previousHasTccc() const { 238 U_ASSERT(state == CHECK_BWD && pos != 0); 239 UChar32 c = u8[pos - 1]; 240 if(c < 0x80) { return FALSE; } 241 int32_t i = pos; 242 U8_PREV_OR_FFFD(u8, 0, i, c); 243 if(c > 0xffff) { c = U16_LEAD(c); } 244 return CollationFCD::hasTccc(c); 245 } 246 247 UChar 248 FCDUTF8CollationIterator::handleGetTrailSurrogate() { 249 if(state != IN_NORMALIZED) { return 0; } 250 U_ASSERT(pos < normalized.length()); 251 UChar trail; 252 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } 253 return trail; 254 } 255 256 UBool 257 FCDUTF8CollationIterator::foundNULTerminator() { 258 if(state == CHECK_FWD && length < 0) { 259 length = --pos; 260 return TRUE; 261 } else { 262 return FALSE; 263 } 264 } 265 266 UChar32 267 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { 268 UChar32 c; 269 for(;;) { 270 if(state == CHECK_FWD) { 271 if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { 272 return U_SENTINEL; 273 } 274 if(c < 0x80) { 275 ++pos; 276 return c; 277 } 278 U8_NEXT_OR_FFFD(u8, pos, length, c); 279 if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) && 280 (CollationFCD::maybeTibetanCompositeVowel(c) || 281 (pos != length && nextHasLccc()))) { 282 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 283 // and we can use U8_LENGTH() rather than a previous-position variable. 284 pos -= U8_LENGTH(c); 285 if(!nextSegment(errorCode)) { 286 return U_SENTINEL; 287 } 288 continue; 289 } 290 return c; 291 } else if(state == IN_FCD_SEGMENT && pos != limit) { 292 U8_NEXT_OR_FFFD(u8, pos, length, c); 293 return c; 294 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 295 c = normalized.char32At(pos); 296 pos += U16_LENGTH(c); 297 return c; 298 } else { 299 switchToForward(); 300 } 301 } 302 } 303 304 UChar32 305 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { 306 UChar32 c; 307 for(;;) { 308 if(state == CHECK_BWD) { 309 if(pos == 0) { 310 return U_SENTINEL; 311 } 312 if((c = u8[pos - 1]) < 0x80) { 313 --pos; 314 return c; 315 } 316 U8_PREV_OR_FFFD(u8, 0, pos, c); 317 if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) && 318 (CollationFCD::maybeTibetanCompositeVowel(c) || 319 (pos != 0 && previousHasTccc()))) { 320 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 321 // and we can use U8_LENGTH() rather than a previous-position variable. 322 pos += U8_LENGTH(c); 323 if(!previousSegment(errorCode)) { 324 return U_SENTINEL; 325 } 326 continue; 327 } 328 return c; 329 } else if(state == IN_FCD_SEGMENT && pos != start) { 330 U8_PREV_OR_FFFD(u8, 0, pos, c); 331 return c; 332 } else if(state >= IN_NORMALIZED && pos != 0) { 333 c = normalized.char32At(pos - 1); 334 pos -= U16_LENGTH(c); 335 return c; 336 } else { 337 switchToBackward(); 338 } 339 } 340 } 341 342 void 343 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 344 // Specify the class to avoid a virtual-function indirection. 345 // In Java, we would declare this class final. 346 while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { 347 --num; 348 } 349 } 350 351 void 352 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 353 // Specify the class to avoid a virtual-function indirection. 354 // In Java, we would declare this class final. 355 while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { 356 --num; 357 } 358 } 359 360 void 361 FCDUTF8CollationIterator::switchToForward() { 362 U_ASSERT(state == CHECK_BWD || 363 (state == IN_FCD_SEGMENT && pos == limit) || 364 (state == IN_NORMALIZED && pos == normalized.length())); 365 if(state == CHECK_BWD) { 366 // Turn around from backward checking. 367 start = pos; 368 if(pos == limit) { 369 state = CHECK_FWD; // Check forward. 370 } else { // pos < limit 371 state = IN_FCD_SEGMENT; // Stay in FCD segment. 372 } 373 } else { 374 // Reached the end of the FCD segment. 375 if(state == IN_FCD_SEGMENT) { 376 // The input text segment is FCD, extend it forward. 377 } else { 378 // The input text segment needed to be normalized. 379 // Switch to checking forward from it. 380 start = pos = limit; 381 } 382 state = CHECK_FWD; 383 } 384 } 385 386 UBool 387 FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { 388 if(U_FAILURE(errorCode)) { return FALSE; } 389 U_ASSERT(state == CHECK_FWD && pos != length); 390 // The input text [start..pos[ passes the FCD check. 391 int32_t segmentStart = pos; 392 // Collect the characters being checked, in case they need to be normalized. 393 UnicodeString s; 394 uint8_t prevCC = 0; 395 for(;;) { 396 // Fetch the next character and its fcd16 value. 397 int32_t cpStart = pos; 398 UChar32 c; 399 U8_NEXT_OR_FFFD(u8, pos, length, c); 400 uint16_t fcd16 = nfcImpl.getFCD16(c); 401 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 402 if(leadCC == 0 && cpStart != segmentStart) { 403 // FCD boundary before this character. 404 pos = cpStart; 405 break; 406 } 407 s.append(c); 408 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 409 // Fails FCD check. Find the next FCD boundary and normalize. 410 while(pos != length) { 411 cpStart = pos; 412 U8_NEXT_OR_FFFD(u8, pos, length, c); 413 if(nfcImpl.getFCD16(c) <= 0xff) { 414 pos = cpStart; 415 break; 416 } 417 s.append(c); 418 } 419 if(!normalize(s, errorCode)) { return FALSE; } 420 start = segmentStart; 421 limit = pos; 422 state = IN_NORMALIZED; 423 pos = 0; 424 return TRUE; 425 } 426 prevCC = (uint8_t)fcd16; 427 if(pos == length || prevCC == 0) { 428 // FCD boundary after the last character. 429 break; 430 } 431 } 432 limit = pos; 433 pos = segmentStart; 434 U_ASSERT(pos != limit); 435 state = IN_FCD_SEGMENT; 436 return TRUE; 437 } 438 439 void 440 FCDUTF8CollationIterator::switchToBackward() { 441 U_ASSERT(state == CHECK_FWD || 442 (state == IN_FCD_SEGMENT && pos == start) || 443 (state >= IN_NORMALIZED && pos == 0)); 444 if(state == CHECK_FWD) { 445 // Turn around from forward checking. 446 limit = pos; 447 if(pos == start) { 448 state = CHECK_BWD; // Check backward. 449 } else { // pos > start 450 state = IN_FCD_SEGMENT; // Stay in FCD segment. 451 } 452 } else { 453 // Reached the start of the FCD segment. 454 if(state == IN_FCD_SEGMENT) { 455 // The input text segment is FCD, extend it backward. 456 } else { 457 // The input text segment needed to be normalized. 458 // Switch to checking backward from it. 459 limit = pos = start; 460 } 461 state = CHECK_BWD; 462 } 463 } 464 465 UBool 466 FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { 467 if(U_FAILURE(errorCode)) { return FALSE; } 468 U_ASSERT(state == CHECK_BWD && pos != 0); 469 // The input text [pos..limit[ passes the FCD check. 470 int32_t segmentLimit = pos; 471 // Collect the characters being checked, in case they need to be normalized. 472 UnicodeString s; 473 uint8_t nextCC = 0; 474 for(;;) { 475 // Fetch the previous character and its fcd16 value. 476 int32_t cpLimit = pos; 477 UChar32 c; 478 U8_PREV_OR_FFFD(u8, 0, pos, c); 479 uint16_t fcd16 = nfcImpl.getFCD16(c); 480 uint8_t trailCC = (uint8_t)fcd16; 481 if(trailCC == 0 && cpLimit != segmentLimit) { 482 // FCD boundary after this character. 483 pos = cpLimit; 484 break; 485 } 486 s.append(c); 487 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 488 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 489 // Fails FCD check. Find the previous FCD boundary and normalize. 490 while(fcd16 > 0xff && pos != 0) { 491 cpLimit = pos; 492 U8_PREV_OR_FFFD(u8, 0, pos, c); 493 fcd16 = nfcImpl.getFCD16(c); 494 if(fcd16 == 0) { 495 pos = cpLimit; 496 break; 497 } 498 s.append(c); 499 } 500 s.reverse(); 501 if(!normalize(s, errorCode)) { return FALSE; } 502 limit = segmentLimit; 503 start = pos; 504 state = IN_NORMALIZED; 505 pos = normalized.length(); 506 return TRUE; 507 } 508 nextCC = (uint8_t)(fcd16 >> 8); 509 if(pos == 0 || nextCC == 0) { 510 // FCD boundary before the following character. 511 break; 512 } 513 } 514 start = pos; 515 pos = segmentLimit; 516 U_ASSERT(pos != start); 517 state = IN_FCD_SEGMENT; 518 return TRUE; 519 } 520 521 UBool 522 FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { 523 // NFD without argument checking. 524 U_ASSERT(U_SUCCESS(errorCode)); 525 nfcImpl.decompose(s, normalized, errorCode); 526 return U_SUCCESS(errorCode); 527 } 528 529 U_NAMESPACE_END 530 531 #endif // !UCONFIG_NO_COLLATION 532