1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * uitercollationiterator.cpp 9 * 10 * created on: 2012sep23 (from utf16collationiterator.cpp) 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/uiter.h" 19 #include "charstr.h" 20 #include "cmemory.h" 21 #include "collation.h" 22 #include "collationdata.h" 23 #include "collationfcd.h" 24 #include "collationiterator.h" 25 #include "normalizer2impl.h" 26 #include "uassert.h" 27 #include "uitercollationiterator.h" 28 29 U_NAMESPACE_BEGIN 30 31 UIterCollationIterator::~UIterCollationIterator() {} 32 33 void 34 UIterCollationIterator::resetToOffset(int32_t newOffset) { 35 reset(); 36 iter.move(&iter, newOffset, UITER_START); 37 } 38 39 int32_t 40 UIterCollationIterator::getOffset() const { 41 return iter.getIndex(&iter, UITER_CURRENT); 42 } 43 44 uint32_t 45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { 46 c = iter.next(&iter); 47 if(c < 0) { 48 return Collation::FALLBACK_CE32; 49 } 50 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 51 } 52 53 UChar 54 UIterCollationIterator::handleGetTrailSurrogate() { 55 UChar32 trail = iter.next(&iter); 56 if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); } 57 return (UChar)trail; 58 } 59 60 UChar32 61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 62 return uiter_next32(&iter); 63 } 64 65 UChar32 66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 67 return uiter_previous32(&iter); 68 } 69 70 void 71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 72 while(num > 0 && (uiter_next32(&iter)) >= 0) { 73 --num; 74 } 75 } 76 77 void 78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 79 while(num > 0 && (uiter_previous32(&iter)) >= 0) { 80 --num; 81 } 82 } 83 84 // FCDUIterCollationIterator ----------------------------------------------- *** 85 86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {} 87 88 void 89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) { 90 UIterCollationIterator::resetToOffset(newOffset); 91 start = newOffset; 92 state = ITER_CHECK_FWD; 93 } 94 95 int32_t 96 FCDUIterCollationIterator::getOffset() const { 97 if(state <= ITER_CHECK_BWD) { 98 return iter.getIndex(&iter, UITER_CURRENT); 99 } else if(state == ITER_IN_FCD_SEGMENT) { 100 return pos; 101 } else if(pos == 0) { 102 return start; 103 } else { 104 return limit; 105 } 106 } 107 108 uint32_t 109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { 110 for(;;) { 111 if(state == ITER_CHECK_FWD) { 112 c = iter.next(&iter); 113 if(c < 0) { 114 return Collation::FALLBACK_CE32; 115 } 116 if(CollationFCD::hasTccc(c)) { 117 if(CollationFCD::maybeTibetanCompositeVowel(c) || 118 CollationFCD::hasLccc(iter.current(&iter))) { 119 iter.previous(&iter); 120 if(!nextSegment(errorCode)) { 121 c = U_SENTINEL; 122 return Collation::FALLBACK_CE32; 123 } 124 continue; 125 } 126 } 127 break; 128 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { 129 c = iter.next(&iter); 130 ++pos; 131 U_ASSERT(c >= 0); 132 break; 133 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) { 134 c = normalized[pos++]; 135 break; 136 } else { 137 switchToForward(); 138 } 139 } 140 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); 141 } 142 143 UChar 144 FCDUIterCollationIterator::handleGetTrailSurrogate() { 145 if(state <= ITER_IN_FCD_SEGMENT) { 146 UChar32 trail = iter.next(&iter); 147 if(U16_IS_TRAIL(trail)) { 148 if(state == ITER_IN_FCD_SEGMENT) { ++pos; } 149 } else if(trail >= 0) { 150 iter.previous(&iter); 151 } 152 return (UChar)trail; 153 } else { 154 U_ASSERT(pos < normalized.length()); 155 UChar trail; 156 if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } 157 return trail; 158 } 159 } 160 161 UChar32 162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) { 163 UChar32 c; 164 for(;;) { 165 if(state == ITER_CHECK_FWD) { 166 c = iter.next(&iter); 167 if(c < 0) { 168 return c; 169 } 170 if(CollationFCD::hasTccc(c)) { 171 if(CollationFCD::maybeTibetanCompositeVowel(c) || 172 CollationFCD::hasLccc(iter.current(&iter))) { 173 iter.previous(&iter); 174 if(!nextSegment(errorCode)) { 175 return U_SENTINEL; 176 } 177 continue; 178 } 179 } 180 if(U16_IS_LEAD(c)) { 181 UChar32 trail = iter.next(&iter); 182 if(U16_IS_TRAIL(trail)) { 183 return U16_GET_SUPPLEMENTARY(c, trail); 184 } else if(trail >= 0) { 185 iter.previous(&iter); 186 } 187 } 188 return c; 189 } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { 190 c = uiter_next32(&iter); 191 pos += U16_LENGTH(c); 192 U_ASSERT(c >= 0); 193 return c; 194 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) { 195 c = normalized.char32At(pos); 196 pos += U16_LENGTH(c); 197 return c; 198 } else { 199 switchToForward(); 200 } 201 } 202 } 203 204 UChar32 205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) { 206 UChar32 c; 207 for(;;) { 208 if(state == ITER_CHECK_BWD) { 209 c = iter.previous(&iter); 210 if(c < 0) { 211 start = pos = 0; 212 state = ITER_IN_FCD_SEGMENT; 213 return U_SENTINEL; 214 } 215 if(CollationFCD::hasLccc(c)) { 216 UChar32 prev = U_SENTINEL; 217 if(CollationFCD::maybeTibetanCompositeVowel(c) || 218 CollationFCD::hasTccc(prev = iter.previous(&iter))) { 219 iter.next(&iter); 220 if(prev >= 0) { 221 iter.next(&iter); 222 } 223 if(!previousSegment(errorCode)) { 224 return U_SENTINEL; 225 } 226 continue; 227 } 228 // hasLccc(trail)=true for all trail surrogates 229 if(U16_IS_TRAIL(c)) { 230 if(prev < 0) { 231 prev = iter.previous(&iter); 232 } 233 if(U16_IS_LEAD(prev)) { 234 return U16_GET_SUPPLEMENTARY(prev, c); 235 } 236 } 237 if(prev >= 0) { 238 iter.next(&iter); 239 } 240 } 241 return c; 242 } else if(state == ITER_IN_FCD_SEGMENT && pos != start) { 243 c = uiter_previous32(&iter); 244 pos -= U16_LENGTH(c); 245 U_ASSERT(c >= 0); 246 return c; 247 } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) { 248 c = normalized.char32At(pos - 1); 249 pos -= U16_LENGTH(c); 250 return c; 251 } else { 252 switchToBackward(); 253 } 254 } 255 } 256 257 void 258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 259 // Specify the class to avoid a virtual-function indirection. 260 // In Java, we would declare this class final. 261 while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) { 262 --num; 263 } 264 } 265 266 void 267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { 268 // Specify the class to avoid a virtual-function indirection. 269 // In Java, we would declare this class final. 270 while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) { 271 --num; 272 } 273 } 274 275 void 276 FCDUIterCollationIterator::switchToForward() { 277 U_ASSERT(state == ITER_CHECK_BWD || 278 (state == ITER_IN_FCD_SEGMENT && pos == limit) || 279 (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length())); 280 if(state == ITER_CHECK_BWD) { 281 // Turn around from backward checking. 282 start = pos = iter.getIndex(&iter, UITER_CURRENT); 283 if(pos == limit) { 284 state = ITER_CHECK_FWD; // Check forward. 285 } else { // pos < limit 286 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 287 } 288 } else { 289 // Reached the end of the FCD segment. 290 if(state == ITER_IN_FCD_SEGMENT) { 291 // The input text segment is FCD, extend it forward. 292 } else { 293 // The input text segment needed to be normalized. 294 // Switch to checking forward from it. 295 if(state == IN_NORM_ITER_AT_START) { 296 iter.move(&iter, limit - start, UITER_CURRENT); 297 } 298 start = limit; 299 } 300 state = ITER_CHECK_FWD; 301 } 302 } 303 304 UBool 305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) { 306 if(U_FAILURE(errorCode)) { return FALSE; } 307 U_ASSERT(state == ITER_CHECK_FWD); 308 // The input text [start..(iter index)[ passes the FCD check. 309 pos = iter.getIndex(&iter, UITER_CURRENT); 310 // Collect the characters being checked, in case they need to be normalized. 311 UnicodeString s; 312 uint8_t prevCC = 0; 313 for(;;) { 314 // Fetch the next character and its fcd16 value. 315 UChar32 c = uiter_next32(&iter); 316 if(c < 0) { break; } 317 uint16_t fcd16 = nfcImpl.getFCD16(c); 318 uint8_t leadCC = (uint8_t)(fcd16 >> 8); 319 if(leadCC == 0 && !s.isEmpty()) { 320 // FCD boundary before this character. 321 uiter_previous32(&iter); 322 break; 323 } 324 s.append(c); 325 if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 326 // Fails FCD check. Find the next FCD boundary and normalize. 327 for(;;) { 328 c = uiter_next32(&iter); 329 if(c < 0) { break; } 330 if(nfcImpl.getFCD16(c) <= 0xff) { 331 uiter_previous32(&iter); 332 break; 333 } 334 s.append(c); 335 } 336 if(!normalize(s, errorCode)) { return FALSE; } 337 start = pos; 338 limit = pos + s.length(); 339 state = IN_NORM_ITER_AT_LIMIT; 340 pos = 0; 341 return TRUE; 342 } 343 prevCC = (uint8_t)fcd16; 344 if(prevCC == 0) { 345 // FCD boundary after the last character. 346 break; 347 } 348 } 349 limit = pos + s.length(); 350 U_ASSERT(pos != limit); 351 iter.move(&iter, -s.length(), UITER_CURRENT); 352 state = ITER_IN_FCD_SEGMENT; 353 return TRUE; 354 } 355 356 void 357 FCDUIterCollationIterator::switchToBackward() { 358 U_ASSERT(state == ITER_CHECK_FWD || 359 (state == ITER_IN_FCD_SEGMENT && pos == start) || 360 (state >= IN_NORM_ITER_AT_LIMIT && pos == 0)); 361 if(state == ITER_CHECK_FWD) { 362 // Turn around from forward checking. 363 limit = pos = iter.getIndex(&iter, UITER_CURRENT); 364 if(pos == start) { 365 state = ITER_CHECK_BWD; // Check backward. 366 } else { // pos > start 367 state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 368 } 369 } else { 370 // Reached the start of the FCD segment. 371 if(state == ITER_IN_FCD_SEGMENT) { 372 // The input text segment is FCD, extend it backward. 373 } else { 374 // The input text segment needed to be normalized. 375 // Switch to checking backward from it. 376 if(state == IN_NORM_ITER_AT_LIMIT) { 377 iter.move(&iter, start - limit, UITER_CURRENT); 378 } 379 limit = start; 380 } 381 state = ITER_CHECK_BWD; 382 } 383 } 384 385 UBool 386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) { 387 if(U_FAILURE(errorCode)) { return FALSE; } 388 U_ASSERT(state == ITER_CHECK_BWD); 389 // The input text [(iter index)..limit[ passes the FCD check. 390 pos = iter.getIndex(&iter, UITER_CURRENT); 391 // Collect the characters being checked, in case they need to be normalized. 392 UnicodeString s; 393 uint8_t nextCC = 0; 394 for(;;) { 395 // Fetch the previous character and its fcd16 value. 396 UChar32 c = uiter_previous32(&iter); 397 if(c < 0) { break; } 398 uint16_t fcd16 = nfcImpl.getFCD16(c); 399 uint8_t trailCC = (uint8_t)fcd16; 400 if(trailCC == 0 && !s.isEmpty()) { 401 // FCD boundary after this character. 402 uiter_next32(&iter); 403 break; 404 } 405 s.append(c); 406 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 407 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { 408 // Fails FCD check. Find the previous FCD boundary and normalize. 409 while(fcd16 > 0xff) { 410 c = uiter_previous32(&iter); 411 if(c < 0) { break; } 412 fcd16 = nfcImpl.getFCD16(c); 413 if(fcd16 == 0) { 414 (void)uiter_next32(&iter); 415 break; 416 } 417 s.append(c); 418 } 419 s.reverse(); 420 if(!normalize(s, errorCode)) { return FALSE; } 421 limit = pos; 422 start = pos - s.length(); 423 state = IN_NORM_ITER_AT_START; 424 pos = normalized.length(); 425 return TRUE; 426 } 427 nextCC = (uint8_t)(fcd16 >> 8); 428 if(nextCC == 0) { 429 // FCD boundary before the following character. 430 break; 431 } 432 } 433 start = pos - s.length(); 434 U_ASSERT(pos != start); 435 iter.move(&iter, s.length(), UITER_CURRENT); 436 state = ITER_IN_FCD_SEGMENT; 437 return TRUE; 438 } 439 440 UBool 441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { 442 // NFD without argument checking. 443 U_ASSERT(U_SUCCESS(errorCode)); 444 nfcImpl.decompose(s, normalized, errorCode); 445 return U_SUCCESS(errorCode); 446 } 447 448 U_NAMESPACE_END 449 450 #endif // !UCONFIG_NO_COLLATION 451