/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* uitercollationiterator.cpp
*
* created on: 2012sep23 (from utf16collationiterator.cpp)
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/uiter.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfcd.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "uitercollationiterator.h"

U_NAMESPACE_BEGIN

// UIterCollationIterator -------------------------------------------------- ***
//
// Collation iterator that reads its text through the UCharIterator member
// `iter`. This variant performs no FCD checking or normalization; see
// FCDUIterCollationIterator below for the checking variant.

// Nothing to release here: the destructor body is empty.
UIterCollationIterator::~UIterCollationIterator() {}

// Resets this iterator via reset() and then positions `iter` at newOffset,
// counted from the start of the text (UITER_START).
void
UIterCollationIterator::resetToOffset(int32_t newOffset) {
    reset();
    iter.move(&iter, newOffset, UITER_START);
}

// Returns the current index reported by the underlying UCharIterator.
int32_t
UIterCollationIterator::getOffset() const {
    return iter.getIndex(&iter, UITER_CURRENT);
}

// Reads the next code unit from `iter` into c and returns its 32-bit trie value.
// If iter.next() returns a negative value (no more input), c keeps that
// negative value and Collation::FALLBACK_CE32 is returned without a trie lookup.
// The error code parameter is unused in this implementation.
uint32_t
UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    c = iter.next(&iter);
    if(c < 0) {
        return Collation::FALLBACK_CE32;
    }
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

// Reads the next code unit and consumes it only if it is a trail surrogate.
// A non-negative unit that is not a trail surrogate is pushed back with
// iter.previous(); a negative result is not pushed back.
// The value read is returned (narrowed to UChar) in every case.
// NOTE(review): presumably the caller tests the result with U16_IS_TRAIL — confirm.
UChar
UIterCollationIterator::handleGetTrailSurrogate() {
    UChar32 trail = iter.next(&iter);
    if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
    return (UChar)trail;
}

// Returns the next code point via uiter_next32(); the error code is unused.
UChar32
UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    return uiter_next32(&iter);
}

// Returns the previous code point via uiter_previous32(); the error code is unused.
UChar32
UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    return uiter_previous32(&iter);
}

// Skips forward over up to `num` code points, stopping early as soon as
// uiter_next32() returns a negative value.
void
UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && (uiter_next32(&iter)) >= 0) {
        --num;
    }
}

// Skips backward over up to `num` code points, stopping early as soon as
// uiter_previous32() returns a negative value.
void
UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && (uiter_previous32(&iter)) >= 0) {
        --num;
    }
}

// FCDUIterCollationIterator ----------------------------------------------- ***
//
// Variant that checks the input incrementally for FCD and normalizes the
// segments that fail the check (see nextSegment() and previousSegment()).
//
// State as used in this file:
// - ITER_CHECK_FWD / ITER_CHECK_BWD: reading from `iter` and checking each unit
//   on the fly while moving forward / backward.
// - ITER_IN_FCD_SEGMENT: reading from `iter` inside input offsets [start, limit[
//   which already passed the check; `pos` is the current input offset.
// - IN_NORM_ITER_AT_LIMIT / IN_NORM_ITER_AT_START: reading from `normalized`,
//   which replaces input offsets [start, limit[; `pos` is an index into
//   `normalized`, and `iter` is parked at the limit / start of that input segment.
// NOTE(review): the ordered comparisons on `state` below rely on the enum order
// declared in the header (check states first, then ITER_IN_FCD_SEGMENT, then the
// IN_NORM_* states) — confirm against uitercollationiterator.h.

// Nothing to release here: the destructor body is empty.
FCDUIterCollationIterator::~FCDUIterCollationIterator() {}

// Resets the base iterator to newOffset, records it as the segment start,
// and restarts in forward-checking mode.
void
FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
    UIterCollationIterator::resetToOffset(newOffset);
    start = newOffset;
    state = ITER_CHECK_FWD;
}

// Returns the current offset in terms of the input text.
int32_t
FCDUIterCollationIterator::getOffset() const {
    if(state <= ITER_CHECK_BWD) {
        // Checking on the fly: the UCharIterator index is the offset.
        return iter.getIndex(&iter, UITER_CURRENT);
    } else if(state == ITER_IN_FCD_SEGMENT) {
        // pos tracks the input offset inside the FCD segment.
        return pos;
    } else if(pos == 0) {
        // In the normalized text, pos is not an input offset:
        // report the segment start when at the beginning of `normalized` ...
        return start;
    } else {
        // ... and the segment limit anywhere else in `normalized`.
        return limit;
    }
}

// Reads the next code unit into c and returns its 32-bit trie value.
// Returns Collation::FALLBACK_CE32 with a negative c at the end of the input,
// or with c == U_SENTINEL when nextSegment() fails.
uint32_t
FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    for(;;) {
        if(state == ITER_CHECK_FWD) {
            c = iter.next(&iter);
            if(c < 0) {
                return Collation::FALLBACK_CE32;
            }
            if(CollationFCD::hasTccc(c)) {
                // iter.current() peeks at the following unit without consuming it.
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        CollationFCD::hasLccc(iter.current(&iter))) {
                    // Un-read c so that nextSegment() starts collecting with it.
                    iter.previous(&iter);
                    if(!nextSegment(errorCode)) {
                        c = U_SENTINEL;
                        return Collation::FALLBACK_CE32;
                    }
                    // nextSegment() changed the state; re-dispatch on it.
                    continue;
                }
            }
            break;
        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            c = iter.next(&iter);
            ++pos;
            U_ASSERT(c >= 0);
            break;
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            c = normalized[pos++];
            break;
        } else {
            // At the end of the current segment, or in ITER_CHECK_BWD.
            switchToForward();
        }
    }
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

// Reads the code unit following a lead surrogate and consumes it only if it is
// a trail surrogate. The unit read is returned in either case.
UChar
FCDUIterCollationIterator::handleGetTrailSurrogate() {
    if(state <= ITER_IN_FCD_SEGMENT) {
        // Reading from the input iterator.
        UChar32 trail = iter.next(&iter);
        if(U16_IS_TRAIL(trail)) {
            // Inside an FCD segment, pos has to follow the iterator.
            if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
        } else if(trail >= 0) {
            // Not a trail surrogate: push it back. A negative result is not pushed back.
            iter.previous(&iter);
        }
        return (UChar)trail;
    } else {
        // Reading from the normalized text.
        U_ASSERT(pos < normalized.length());
        UChar trail;
        if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
        return trail;
    }
}

// Returns the next code point, checking and normalizing as needed.
// Returns a negative value at the end of the input, and U_SENTINEL when
// nextSegment() fails.
UChar32
FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == ITER_CHECK_FWD) {
            c = iter.next(&iter);
            if(c < 0) {
                return c;
            }
            if(CollationFCD::hasTccc(c)) {
                // iter.current() peeks at the following unit without consuming it.
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        CollationFCD::hasLccc(iter.current(&iter))) {
                    // Un-read c so that nextSegment() starts collecting with it.
                    iter.previous(&iter);
                    if(!nextSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    // nextSegment() changed the state; re-dispatch on it.
                    continue;
                }
            }
            if(U16_IS_LEAD(c)) {
                // Try to complete a surrogate pair; push back a non-trail unit.
                UChar32 trail = iter.next(&iter);
                if(U16_IS_TRAIL(trail)) {
                    return U16_GET_SUPPLEMENTARY(c, trail);
                } else if(trail >= 0) {
                    iter.previous(&iter);
                }
            }
            return c;
        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            c = uiter_next32(&iter);
            // Advance pos by the number of UTF-16 units of c.
            pos += U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        } else {
            // At the end of the current segment, or in ITER_CHECK_BWD.
            switchToForward();
        }
    }
}

// Returns the previous code point, checking and normalizing as needed.
// Returns U_SENTINEL at the start of the input or when previousSegment() fails.
UChar32
FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == ITER_CHECK_BWD) {
            c = iter.previous(&iter);
            if(c < 0) {
                // iter.previous() reported no more input:
                // record an empty FCD segment at offset 0.
                start = pos = 0;
                state = ITER_IN_FCD_SEGMENT;
                return U_SENTINEL;
            }
            if(CollationFCD::hasLccc(c)) {
                // prev stays U_SENTINEL unless a second unit is read backward below.
                UChar32 prev = U_SENTINEL;
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        CollationFCD::hasTccc(prev = iter.previous(&iter))) {
                    // Un-read c, and also prev if one was actually read, so that
                    // previousSegment() starts collecting after c.
                    iter.next(&iter);
                    if(prev >= 0) {
                        iter.next(&iter);
                    }
                    if(!previousSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    // previousSegment() changed the state; re-dispatch on it.
                    continue;
                }
                // hasLccc(trail)=true for all trail surrogates
                if(U16_IS_TRAIL(c)) {
                    // prev is negative here if it was not read above
                    // (or if the read hit the start of the input).
                    if(prev < 0) {
                        prev = iter.previous(&iter);
                    }
                    if(U16_IS_LEAD(prev)) {
                        return U16_GET_SUPPLEMENTARY(prev, c);
                    }
                }
                // prev is not part of the result: un-read it.
                if(prev >= 0) {
                    iter.next(&iter);
                }
            }
            return c;
        } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
            c = uiter_previous32(&iter);
            // Move pos back by the number of UTF-16 units of c.
            pos -= U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
            c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        } else {
            // At the start of the current segment, or in ITER_CHECK_FWD.
            switchToBackward();
        }
    }
}

// Skips forward over up to `num` code points, stopping early when
// nextCodePoint() returns a negative value.
void
FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
        --num;
    }
}

// Skips backward over up to `num` code points, stopping early when
// previousCodePoint() returns a negative value.
void
FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
        --num;
    }
}

// Prepares for forward iteration. Called when the forward-reading functions
// cannot continue in the current state: either the state is ITER_CHECK_BWD,
// or the end of the current segment has been reached (see the assertion).
void
FCDUIterCollationIterator::switchToForward() {
    U_ASSERT(state == ITER_CHECK_BWD ||
             (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
             (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
    if(state == ITER_CHECK_BWD) {
        // Turn around from backward checking.
        start = pos = iter.getIndex(&iter, UITER_CURRENT);
        if(pos == limit) {
            state = ITER_CHECK_FWD;  // Check forward.
        } else {  // pos < limit
            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
        }
    } else {
        // Reached the end of the FCD segment.
        if(state == ITER_IN_FCD_SEGMENT) {
            // The input text segment is FCD, extend it forward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking forward from it.
            if(state == IN_NORM_ITER_AT_START) {
                // previousSegment() left iter at the segment start;
                // move it to the segment limit.
                iter.move(&iter, limit - start, UITER_CURRENT);
            }
            start = limit;
        }
        state = ITER_CHECK_FWD;
    }
}

// Collects the input from the current iterator position up to the next FCD
// boundary and checks it.
// - If the text passes, rewinds iter to the segment start and sets
//   ITER_IN_FCD_SEGMENT with [pos, limit[ covering the segment.
// - If it fails, extends the text to the next boundary, normalizes it into
//   `normalized`, and sets IN_NORM_ITER_AT_LIMIT with pos = 0.
// Returns FALSE if errorCode already indicates a failure or normalize() fails.
UBool
FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(state == ITER_CHECK_FWD);
    // The input text [start..(iter index)[ passes the FCD check.
    pos = iter.getIndex(&iter, UITER_CURRENT);
    // Collect the characters being checked, in case they need to be normalized.
    UnicodeString s;
    // Low byte of the previous character's fcd16 value.
    uint8_t prevCC = 0;
    for(;;) {
        // Fetch the next character and its fcd16 value.
        UChar32 c = uiter_next32(&iter);
        if(c < 0) { break; }
        uint16_t fcd16 = nfcImpl.getFCD16(c);
        // High byte of fcd16.
        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
        if(leadCC == 0 && !s.isEmpty()) {
            // FCD boundary before this character.
            uiter_previous32(&iter);
            break;
        }
        s.append(c);
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            for(;;) {
                c = uiter_next32(&iter);
                if(c < 0) { break; }
                // fcd16 <= 0xff means that its high byte is 0.
                if(nfcImpl.getFCD16(c) <= 0xff) {
                    uiter_previous32(&iter);
                    break;
                }
                s.append(c);
            }
            if(!normalize(s, errorCode)) { return FALSE; }
            // iter stays at the limit of the input segment;
            // pos now indexes into `normalized`.
            start = pos;
            limit = pos + s.length();
            state = IN_NORM_ITER_AT_LIMIT;
            pos = 0;
            return TRUE;
        }
        prevCC = (uint8_t)fcd16;
        if(prevCC == 0) {
            // FCD boundary after the last character.
            break;
        }
    }
    limit = pos + s.length();
    U_ASSERT(pos != limit);
    // Move iter back by the number of UTF-16 units collected,
    // to the start of the segment.
    iter.move(&iter, -s.length(), UITER_CURRENT);
    state = ITER_IN_FCD_SEGMENT;
    return TRUE;
}

// Prepares for backward iteration. Called when previousCodePoint() cannot
// continue in the current state: either the state is ITER_CHECK_FWD,
// or the start of the current segment has been reached (see the assertion).
void
FCDUIterCollationIterator::switchToBackward() {
    U_ASSERT(state == ITER_CHECK_FWD ||
             (state == ITER_IN_FCD_SEGMENT && pos == start) ||
             (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
    if(state == ITER_CHECK_FWD) {
        // Turn around from forward checking.
        limit = pos = iter.getIndex(&iter, UITER_CURRENT);
        if(pos == start) {
            state = ITER_CHECK_BWD;  // Check backward.
        } else {  // pos > start
            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
        }
    } else {
        // Reached the start of the FCD segment.
        if(state == ITER_IN_FCD_SEGMENT) {
            // The input text segment is FCD, extend it backward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking backward from it.
            if(state == IN_NORM_ITER_AT_LIMIT) {
                // nextSegment() left iter at the segment limit;
                // move it to the segment start.
                iter.move(&iter, start - limit, UITER_CURRENT);
            }
            limit = start;
        }
        state = ITER_CHECK_BWD;
    }
}

// Mirror image of nextSegment(): collects the input from the current iterator
// position backward to the previous FCD boundary and checks it.
// - If the text passes, moves iter forward again to the segment limit and sets
//   ITER_IN_FCD_SEGMENT with [start, pos[ covering the segment.
// - If it fails, extends the text to the previous boundary, normalizes it into
//   `normalized`, and sets IN_NORM_ITER_AT_START with pos at the end of `normalized`.
// Returns FALSE if errorCode already indicates a failure or normalize() fails.
UBool
FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(state == ITER_CHECK_BWD);
    // The input text [(iter index)..limit[ passes the FCD check.
    pos = iter.getIndex(&iter, UITER_CURRENT);
    // Collect the characters being checked, in case they need to be normalized.
    // Characters are appended in the order read, that is, in reverse text order.
    UnicodeString s;
    // High byte of the fcd16 value of the following character (the one read before).
    uint8_t nextCC = 0;
    for(;;) {
        // Fetch the previous character and its fcd16 value.
        UChar32 c = uiter_previous32(&iter);
        if(c < 0) { break; }
        uint16_t fcd16 = nfcImpl.getFCD16(c);
        // Low byte of fcd16.
        uint8_t trailCC = (uint8_t)fcd16;
        if(trailCC == 0 && !s.isEmpty()) {
            // FCD boundary after this character.
            uiter_next32(&iter);
            break;
        }
        s.append(c);
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            // Keep going while the high byte of the last fcd16 read is non-zero.
            while(fcd16 > 0xff) {
                c = uiter_previous32(&iter);
                if(c < 0) { break; }
                fcd16 = nfcImpl.getFCD16(c);
                if(fcd16 == 0) {
                    (void)uiter_next32(&iter);
                    break;
                }
                s.append(c);
            }
            // Restore text order before normalizing.
            s.reverse();
            if(!normalize(s, errorCode)) { return FALSE; }
            // iter stays at the start of the input segment;
            // pos now indexes into `normalized`.
            limit = pos;
            start = pos - s.length();
            state = IN_NORM_ITER_AT_START;
            pos = normalized.length();
            return TRUE;
        }
        nextCC = (uint8_t)(fcd16 >> 8);
        if(nextCC == 0) {
            // FCD boundary before the following character.
            break;
        }
    }
    start = pos - s.length();
    U_ASSERT(pos != start);
    // Move iter forward by the number of UTF-16 units collected,
    // back to the limit of the segment.
    iter.move(&iter, s.length(), UITER_CURRENT);
    state = ITER_IN_FCD_SEGMENT;
    return TRUE;
}

// Decomposes s into the member `normalized` via nfcImpl.decompose().
// Expects errorCode to indicate success on entry (asserted);
// returns whether it still does afterwards.
UBool
FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    // NFD without argument checking.
    U_ASSERT(U_SUCCESS(errorCode));
    nfcImpl.decompose(s, normalized, errorCode);
    return U_SUCCESS(errorCode);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION