1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * utf8collationiterator.cpp 7 * 8 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp) 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/utf8.h" 17 #include "charstr.h" 18 #include "cmemory.h" 19 #include "collation.h" 20 #include "collationdata.h" 21 #include "collationfcd.h" 22 #include "collationiterator.h" 23 #include "normalizer2impl.h" 24 #include "uassert.h" 25 #include "utf8collationiterator.h" 26 27 U_NAMESPACE_BEGIN 28 29 UTF8CollationIterator::~UTF8CollationIterator() {} 30 31 void 32 UTF8CollationIterator::resetToOffset(int32_t newOffset) { 33 reset(); 34 pos = newOffset; 35 } 36 37 int32_t 38 UTF8CollationIterator::getOffset() const { 39 return pos; 40 } 41 42 uint32_t 43 UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 44 if(pos == length) { 45 c = U_SENTINEL; 46 return Collation::FALLBACK_CE32; 47 } 48 // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). 49 c = u8[pos++]; 50 if(c < 0xc0) { 51 // ASCII 00..7F; trail bytes 80..BF map to error values. 52 return trie->data32[c]; 53 } 54 uint8_t t1, t2; 55 if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 56 // U+0080..U+07FF; 00..7F map to error values. 57 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 58 c = ((c & 0x1f) << 6) | t1; 59 ++pos; 60 return ce32; 61 } else if(c <= 0xef && 62 ((pos + 1) < length || length < 0) && 63 (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && 64 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f 65 ) { 66 // U+0800..U+FFFF; caller maps surrogates to error values. 67 c = (UChar)((c << 12) | (t1 << 6) | t2); 68 pos += 2; 69 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 70 } else { 71 // Function call for supplementary code points and error cases. 72 // Illegal byte sequences yield U+FFFD. 73 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 74 return data->getCE32(c); 75 } 76 } 77 78 UBool 79 UTF8CollationIterator::foundNULTerminator() { 80 if(length < 0) { 81 length = --pos; 82 return TRUE; 83 } else { 84 return FALSE; 85 } 86 } 87 88 UBool 89 UTF8CollationIterator::forbidSurrogateCodePoints() const { 90 return TRUE; 91 } 92 93 UChar32 94 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 95 if(pos == length) { 96 return U_SENTINEL; 97 } 98 if(u8[pos] == 0 && length < 0) { 99 length = pos; 100 return U_SENTINEL; 101 } 102 UChar32 c; 103 U8_NEXT_OR_FFFD(u8, pos, length, c); 104 return c; 105 } 106 107 UChar32 108 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 109 if(pos == 0) { 110 return U_SENTINEL; 111 } 112 UChar32 c; 113 U8_PREV_OR_FFFD(u8, 0, pos, c); 114 return c; 115 } 116 117 void 118 UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 119 U8_FWD_N(u8, pos, length, num); 120 } 121 122 void 123 UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 124 U8_BACK_N(u8, 0, pos, num); 125 } 126 127 // FCDUTF8CollationIterator ------------------------------------------------ *** 128 129 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} 130 131 void 132 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { 133 reset(); 134 start = pos = newOffset; 135 state = CHECK_FWD; 136 } 137 138 int32_t 139 FCDUTF8CollationIterator::getOffset() const { 140 if(state != IN_NORMALIZED) { 141 return pos; 142 } else if(pos == 0) { 143 return start; 144 } else { 145 return limit; 146 } 147 } 148 149 uint32_t 150 FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 151 for(;;) { 152 if(state == CHECK_FWD) { 153 // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. 154 if(pos == length) { 155 c = U_SENTINEL; 156 return Collation::FALLBACK_CE32; 157 } 158 c = u8[pos++]; 159 if(c < 0xc0) { 160 // ASCII 00..7F; trail bytes 80..BF map to error values. 161 return trie->data32[c]; 162 } 163 uint8_t t1, t2; 164 if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { 165 // U+0080..U+07FF; 00..7F map to error values. 166 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; 167 c = ((c & 0x1f) << 6) | t1; 168 ++pos; 169 if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { 170 pos -= 2; 171 } else { 172 return ce32; 173 } 174 } else if(c <= 0xef && 175 ((pos + 1) < length || length < 0) && 176 (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && 177 (t2 = (u8[pos + 1] - 0x80)) <= 0x3f 178 ) { 179 // U+0800..U+FFFF; caller maps surrogates to error values. 180 c = (UChar)((c << 12) | (t1 << 6) | t2); 181 pos += 2; 182 if(CollationFCD::hasTccc(c) && 183 (CollationFCD::maybeTibetanCompositeVowel(c) || 184 (pos != length && nextHasLccc()))) { 185 pos -= 3; 186 } else { 187 break; // return CE32(BMP) 188 } 189 } else { 190 // Function call for supplementary code points and error cases. 191 // Illegal byte sequences yield U+FFFD. 192 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); 193 if(c == 0xfffd) { 194 return Collation::FFFD_CE32; 195 } else { 196 U_ASSERT(c > 0xffff); 197 if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { 198 pos -= 4; 199 } else { 200 return data->getCE32FromSupplementary(c); 201 } 202 } 203 } 204 if(!nextSegment(errorCode)) { 205 c = U_SENTINEL; 206 return Collation::FALLBACK_CE32; 207 } 208 continue; 209 } else if(state == IN_FCD_SEGMENT && pos != limit) { 210 return UTF8CollationIterator::handleNextCE32(c, errorCode); 211 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 212 c = normalized[pos++]; 213 break; 214 } else { 215 switchToForward(); 216 } 217 } 218 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 219 } 220 221 UBool 222 FCDUTF8CollationIterator::nextHasLccc() const { 223 U_ASSERT(state == CHECK_FWD && pos != length); 224 // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8. 225 // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.) 226 UChar32 c = u8[pos]; 227 if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; } 228 int32_t i = pos; 229 U8_NEXT_OR_FFFD(u8, i, length, c); 230 if(c > 0xffff) { c = U16_LEAD(c); } 231 return CollationFCD::hasLccc(c); 232 } 233 234 UBool 235 FCDUTF8CollationIterator::previousHasTccc() const { 236 U_ASSERT(state == CHECK_BWD && pos != 0); 237 UChar32 c = u8[pos - 1]; 238 if(c < 0x80) { return FALSE; } 239 int32_t i = pos; 240 U8_PREV_OR_FFFD(u8, 0, i, c); 241 if(c > 0xffff) { c = U16_LEAD(c); } 242 return CollationFCD::hasTccc(c); 243 } 244 245 UChar 246 FCDUTF8CollationIterator::handleGetTrailSurrogate() { 247 if(state != IN_NORMALIZED) { return 0; } 248 U_ASSERT(pos < normalized.length()); 249 UChar trail; 250 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } 251 return trail; 252 } 253 254 UBool 255 FCDUTF8CollationIterator::foundNULTerminator() { 256 if(state == CHECK_FWD && length < 0) { 257 length = --pos; 258 return TRUE; 259 } else { 260 return FALSE; 261 } 262 } 263 264 UChar32 265 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { 266 UChar32 c; 267 for(;;) { 268 if(state == CHECK_FWD) { 269 if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { 270 return U_SENTINEL; 271 } 272 if(c < 0x80) { 273 ++pos; 274 return c; 275 } 276 U8_NEXT_OR_FFFD(u8, pos, length, c); 277 if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) && 278 (CollationFCD::maybeTibetanCompositeVowel(c) || 279 (pos != length && nextHasLccc()))) { 280 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 281 // and we can use U8_LENGTH() rather than a previous-position variable. 282 pos -= U8_LENGTH(c); 283 if(!nextSegment(errorCode)) { 284 return U_SENTINEL; 285 } 286 continue; 287 } 288 return c; 289 } else if(state == IN_FCD_SEGMENT && pos != limit) { 290 U8_NEXT_OR_FFFD(u8, pos, length, c); 291 return c; 292 } else if(state == IN_NORMALIZED && pos != normalized.length()) { 293 c = normalized.char32At(pos); 294 pos += U16_LENGTH(c); 295 return c; 296 } else { 297 switchToForward(); 298 } 299 } 300 } 301 302 UChar32 303 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { 304 UChar32 c; 305 for(;;) { 306 if(state == CHECK_BWD) { 307 if(pos == 0) { 308 return U_SENTINEL; 309 } 310 if((c = u8[pos - 1]) < 0x80) { 311 --pos; 312 return c; 313 } 314 U8_PREV_OR_FFFD(u8, 0, pos, c); 315 if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) && 316 (CollationFCD::maybeTibetanCompositeVowel(c) || 317 (pos != 0 && previousHasTccc()))) { 318 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence 319 // and we can use U8_LENGTH() rather than a previous-position variable. 320 pos += U8_LENGTH(c); 321 if(!previousSegment(errorCode)) { 322 return U_SENTINEL; 323 } 324 continue; 325 } 326 return c; 327 } else if(state == IN_FCD_SEGMENT && pos != start) { 328 U8_PREV_OR_FFFD(u8, 0, pos, c); 329 return c; 330 } else if(state >= IN_NORMALIZED && pos != 0) { 331 c = normalized.char32At(pos - 1); 332 pos -= U16_LENGTH(c); 333 return c; 334 } else { 335 switchToBackward(); 336 } 337 } 338 } 339 340 void 341 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 342 // Specify the class to avoid a virtual-function indirection. 343 // In Java, we would declare this class final. 344 while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { 345 --num; 346 } 347 } 348 349 void 350 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 351 // Specify the class to avoid a virtual-function indirection. 352 // In Java, we would declare this class final. 353 while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { 354 --num; 355 } 356 } 357 358 void 359 FCDUTF8CollationIterator::switchToForward() { 360 U_ASSERT(state == CHECK_BWD || 361 (state == IN_FCD_SEGMENT && pos == limit) || 362 (state == IN_NORMALIZED && pos == normalized.length())); 363 if(state == CHECK_BWD) { 364 // Turn around from backward checking. 365 start = pos; 366 if(pos == limit) { 367 state = CHECK_FWD; // Check forward. 368 } else { // pos < limit 369 state = IN_FCD_SEGMENT; // Stay in FCD segment. 370 } 371 } else { 372 // Reached the end of the FCD segment. 373 if(state == IN_FCD_SEGMENT) { 374 // The input text segment is FCD, extend it forward. 375 } else { 376 // The input text segment needed to be normalized. 377 // Switch to checking forward from it. 378 start = pos = limit; 379 } 380 state = CHECK_FWD; 381 } 382 } 383 384 UBool 385 FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { 386 if(U_FAILURE(errorCode)) { return FALSE; } 387 U_ASSERT(state == CHECK_FWD && pos != length); 388 // The input text [start..pos[ passes the FCD check. 389 int32_t segmentStart = pos; 390 // Collect the characters being checked, in case they need to be normalized. 391 UnicodeString s; 392 uint8_t prevCC = 0; 393 for(;;) { 394 // Fetch the next character and its fcd16 value. 395 int32_t cpStart = pos; 396 UChar32 c; 397 U8_NEXT_OR_FFFD(u8, pos, length, c); 398 uint16_t fcd16 = nfcImpl.getFCD16(c); 399 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 400 if(leadCC == 0 && cpStart != segmentStart) { 401 // FCD boundary before this character. 402 pos = cpStart; 403 break; 404 } 405 s.append(c); 406 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 407 // Fails FCD check. Find the next FCD boundary and normalize. 408 while(pos != length) { 409 cpStart = pos; 410 U8_NEXT_OR_FFFD(u8, pos, length, c); 411 if(nfcImpl.getFCD16(c) <= 0xff) { 412 pos = cpStart; 413 break; 414 } 415 s.append(c); 416 } 417 if(!normalize(s, errorCode)) { return FALSE; } 418 start = segmentStart; 419 limit = pos; 420 state = IN_NORMALIZED; 421 pos = 0; 422 return TRUE; 423 } 424 prevCC = (uint8_t)fcd16; 425 if(pos == length || prevCC == 0) { 426 // FCD boundary after the last character. 427 break; 428 } 429 } 430 limit = pos; 431 pos = segmentStart; 432 U_ASSERT(pos != limit); 433 state = IN_FCD_SEGMENT; 434 return TRUE; 435 } 436 437 void 438 FCDUTF8CollationIterator::switchToBackward() { 439 U_ASSERT(state == CHECK_FWD || 440 (state == IN_FCD_SEGMENT && pos == start) || 441 (state >= IN_NORMALIZED && pos == 0)); 442 if(state == CHECK_FWD) { 443 // Turn around from forward checking. 444 limit = pos; 445 if(pos == start) { 446 state = CHECK_BWD; // Check backward. 447 } else { // pos > start 448 state = IN_FCD_SEGMENT; // Stay in FCD segment. 449 } 450 } else { 451 // Reached the start of the FCD segment. 452 if(state == IN_FCD_SEGMENT) { 453 // The input text segment is FCD, extend it backward. 454 } else { 455 // The input text segment needed to be normalized. 456 // Switch to checking backward from it. 457 limit = pos = start; 458 } 459 state = CHECK_BWD; 460 } 461 } 462 463 UBool 464 FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { 465 if(U_FAILURE(errorCode)) { return FALSE; } 466 U_ASSERT(state == CHECK_BWD && pos != 0); 467 // The input text [pos..limit[ passes the FCD check. 468 int32_t segmentLimit = pos; 469 // Collect the characters being checked, in case they need to be normalized. 470 UnicodeString s; 471 uint8_t nextCC = 0; 472 for(;;) { 473 // Fetch the previous character and its fcd16 value. 474 int32_t cpLimit = pos; 475 UChar32 c; 476 U8_PREV_OR_FFFD(u8, 0, pos, c); 477 uint16_t fcd16 = nfcImpl.getFCD16(c); 478 uint8_t trailCC = (uint8_t)fcd16; 479 if(trailCC == 0 && cpLimit != segmentLimit) { 480 // FCD boundary after this character. 481 pos = cpLimit; 482 break; 483 } 484 s.append(c); 485 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 486 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 487 // Fails FCD check. Find the previous FCD boundary and normalize. 488 while(fcd16 > 0xff && pos != 0) { 489 cpLimit = pos; 490 U8_PREV_OR_FFFD(u8, 0, pos, c); 491 fcd16 = nfcImpl.getFCD16(c); 492 if(fcd16 == 0) { 493 pos = cpLimit; 494 break; 495 } 496 s.append(c); 497 } 498 s.reverse(); 499 if(!normalize(s, errorCode)) { return FALSE; } 500 limit = segmentLimit; 501 start = pos; 502 state = IN_NORMALIZED; 503 pos = normalized.length(); 504 return TRUE; 505 } 506 nextCC = (uint8_t)(fcd16 >> 8); 507 if(pos == 0 || nextCC == 0) { 508 // FCD boundary before the following character. 509 break; 510 } 511 } 512 start = pos; 513 pos = segmentLimit; 514 U_ASSERT(pos != start); 515 state = IN_FCD_SEGMENT; 516 return TRUE; 517 } 518 519 UBool 520 FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { 521 // NFD without argument checking. 522 U_ASSERT(U_SUCCESS(errorCode)); 523 nfcImpl.decompose(s, normalized, errorCode); 524 return U_SUCCESS(errorCode); 525 } 526 527 U_NAMESPACE_END 528 529 #endif // !UCONFIG_NO_COLLATION 530