1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uiter.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jan18 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/ustring.h" 19 #include "unicode/chariter.h" 20 #include "unicode/rep.h" 21 #include "unicode/uiter.h" 22 #include "unicode/utf.h" 23 #include "unicode/utf8.h" 24 #include "unicode/utf16.h" 25 #include "cstring.h" 26 27 U_NAMESPACE_USE 28 29 #define IS_EVEN(n) (((n)&1)==0) 30 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p) 31 32 U_CDECL_BEGIN 33 34 /* No-Op UCharIterator implementation for illegal input --------------------- */ 35 36 static int32_t U_CALLCONV 37 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { 38 return 0; 39 } 40 41 static int32_t U_CALLCONV 42 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { 43 return 0; 44 } 45 46 static UBool U_CALLCONV 47 noopHasNext(UCharIterator * /*iter*/) { 48 return FALSE; 49 } 50 51 static UChar32 U_CALLCONV 52 noopCurrent(UCharIterator * /*iter*/) { 53 return U_SENTINEL; 54 } 55 56 static uint32_t U_CALLCONV 57 noopGetState(const UCharIterator * /*iter*/) { 58 return UITER_NO_STATE; 59 } 60 61 static void U_CALLCONV 62 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { 63 *pErrorCode=U_UNSUPPORTED_ERROR; 64 } 65 66 static const UCharIterator noopIterator={ 67 0, 0, 0, 0, 0, 0, 68 noopGetIndex, 69 noopMove, 70 noopHasNext, 71 noopHasNext, 72 noopCurrent, 73 noopCurrent, 74 noopCurrent, 75 NULL, 76 noopGetState, 77 noopSetState 78 }; 79 80 /* UCharIterator implementation for simple strings -------------------------- */ 81 82 /* 83 * This is an implementation of a code unit (UChar) iterator 84 * for UChar * strings. 85 * 86 * The UCharIterator.context field holds a pointer to the string. 87 */ 88 89 static int32_t U_CALLCONV 90 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 91 switch(origin) { 92 case UITER_ZERO: 93 return 0; 94 case UITER_START: 95 return iter->start; 96 case UITER_CURRENT: 97 return iter->index; 98 case UITER_LIMIT: 99 return iter->limit; 100 case UITER_LENGTH: 101 return iter->length; 102 default: 103 /* not a valid origin */ 104 /* Should never get here! */ 105 return -1; 106 } 107 } 108 109 static int32_t U_CALLCONV 110 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 111 int32_t pos; 112 113 switch(origin) { 114 case UITER_ZERO: 115 pos=delta; 116 break; 117 case UITER_START: 118 pos=iter->start+delta; 119 break; 120 case UITER_CURRENT: 121 pos=iter->index+delta; 122 break; 123 case UITER_LIMIT: 124 pos=iter->limit+delta; 125 break; 126 case UITER_LENGTH: 127 pos=iter->length+delta; 128 break; 129 default: 130 return -1; /* Error */ 131 } 132 133 if(pos<iter->start) { 134 pos=iter->start; 135 } else if(pos>iter->limit) { 136 pos=iter->limit; 137 } 138 139 return iter->index=pos; 140 } 141 142 static UBool U_CALLCONV 143 stringIteratorHasNext(UCharIterator *iter) { 144 return iter->index<iter->limit; 145 } 146 147 static UBool U_CALLCONV 148 stringIteratorHasPrevious(UCharIterator *iter) { 149 return iter->index>iter->start; 150 } 151 152 static UChar32 U_CALLCONV 153 stringIteratorCurrent(UCharIterator *iter) { 154 if(iter->index<iter->limit) { 155 return ((const UChar *)(iter->context))[iter->index]; 156 } else { 157 return U_SENTINEL; 158 } 159 } 160 161 static UChar32 U_CALLCONV 162 stringIteratorNext(UCharIterator *iter) { 163 if(iter->index<iter->limit) { 164 return ((const UChar *)(iter->context))[iter->index++]; 165 } else { 166 return U_SENTINEL; 167 } 168 } 169 170 static UChar32 U_CALLCONV 171 stringIteratorPrevious(UCharIterator *iter) { 172 if(iter->index>iter->start) { 173 return ((const UChar *)(iter->context))[--iter->index]; 174 } else { 175 return U_SENTINEL; 176 } 177 } 178 179 static uint32_t U_CALLCONV 180 stringIteratorGetState(const UCharIterator *iter) { 181 return (uint32_t)iter->index; 182 } 183 184 static void U_CALLCONV 185 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 186 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 187 /* do nothing */ 188 } else if(iter==NULL) { 189 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 190 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { 191 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 192 } else { 193 iter->index=(int32_t)state; 194 } 195 } 196 197 static const UCharIterator stringIterator={ 198 0, 0, 0, 0, 0, 0, 199 stringIteratorGetIndex, 200 stringIteratorMove, 201 stringIteratorHasNext, 202 stringIteratorHasPrevious, 203 stringIteratorCurrent, 204 stringIteratorNext, 205 stringIteratorPrevious, 206 NULL, 207 stringIteratorGetState, 208 stringIteratorSetState 209 }; 210 211 U_CAPI void U_EXPORT2 212 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { 213 if(iter!=0) { 214 if(s!=0 && length>=-1) { 215 *iter=stringIterator; 216 iter->context=s; 217 if(length>=0) { 218 iter->length=length; 219 } else { 220 iter->length=u_strlen(s); 221 } 222 iter->limit=iter->length; 223 } else { 224 *iter=noopIterator; 225 } 226 } 227 } 228 229 /* UCharIterator implementation for UTF-16BE strings ------------------------ */ 230 231 /* 232 * This is an implementation of a code unit (UChar) iterator 233 * for UTF-16BE strings, i.e., strings in byte-vectors where 234 * each UChar is stored as a big-endian pair of bytes. 235 * 236 * The UCharIterator.context field holds a pointer to the string. 237 * Everything works just like with a normal UChar iterator (uiter_setString), 238 * except that UChars are assembled from byte pairs. 239 */ 240 241 /* internal helper function */ 242 static inline UChar32 243 utf16BEIteratorGet(UCharIterator *iter, int32_t index) { 244 const uint8_t *p=(const uint8_t *)iter->context; 245 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; 246 } 247 248 static UChar32 U_CALLCONV 249 utf16BEIteratorCurrent(UCharIterator *iter) { 250 int32_t index; 251 252 if((index=iter->index)<iter->limit) { 253 return utf16BEIteratorGet(iter, index); 254 } else { 255 return U_SENTINEL; 256 } 257 } 258 259 static UChar32 U_CALLCONV 260 utf16BEIteratorNext(UCharIterator *iter) { 261 int32_t index; 262 263 if((index=iter->index)<iter->limit) { 264 iter->index=index+1; 265 return utf16BEIteratorGet(iter, index); 266 } else { 267 return U_SENTINEL; 268 } 269 } 270 271 static UChar32 U_CALLCONV 272 utf16BEIteratorPrevious(UCharIterator *iter) { 273 int32_t index; 274 275 if((index=iter->index)>iter->start) { 276 iter->index=--index; 277 return utf16BEIteratorGet(iter, index); 278 } else { 279 return U_SENTINEL; 280 } 281 } 282 283 static const UCharIterator utf16BEIterator={ 284 0, 0, 0, 0, 0, 0, 285 stringIteratorGetIndex, 286 stringIteratorMove, 287 stringIteratorHasNext, 288 stringIteratorHasPrevious, 289 utf16BEIteratorCurrent, 290 utf16BEIteratorNext, 291 utf16BEIteratorPrevious, 292 NULL, 293 stringIteratorGetState, 294 stringIteratorSetState 295 }; 296 297 /* 298 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL, 299 * i.e., before a pair of 0 bytes where the first 0 byte is at an even 300 * offset from s. 301 */ 302 static int32_t 303 utf16BE_strlen(const char *s) { 304 if(IS_POINTER_EVEN(s)) { 305 /* 306 * even-aligned, call u_strlen(s) 307 * we are probably on a little-endian machine, but searching for UChar NUL 308 * does not care about endianness 309 */ 310 return u_strlen((const UChar *)s); 311 } else { 312 /* odd-aligned, search for pair of 0 bytes */ 313 const char *p=s; 314 315 while(!(*p==0 && p[1]==0)) { 316 p+=2; 317 } 318 return (int32_t)((p-s)/2); 319 } 320 } 321 322 U_CAPI void U_EXPORT2 323 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { 324 if(iter!=NULL) { 325 /* allow only even-length strings (the input length counts bytes) */ 326 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { 327 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ 328 length>>=1; 329 330 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { 331 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */ 332 uiter_setString(iter, (const UChar *)s, length); 333 return; 334 } 335 336 *iter=utf16BEIterator; 337 iter->context=s; 338 if(length>=0) { 339 iter->length=length; 340 } else { 341 iter->length=utf16BE_strlen(s); 342 } 343 iter->limit=iter->length; 344 } else { 345 *iter=noopIterator; 346 } 347 } 348 } 349 350 /* UCharIterator wrapper around CharacterIterator --------------------------- */ 351 352 /* 353 * This is wrapper code around a C++ CharacterIterator to 354 * look like a C UCharIterator. 355 * 356 * The UCharIterator.context field holds a pointer to the CharacterIterator. 357 */ 358 359 static int32_t U_CALLCONV 360 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 361 switch(origin) { 362 case UITER_ZERO: 363 return 0; 364 case UITER_START: 365 return ((CharacterIterator *)(iter->context))->startIndex(); 366 case UITER_CURRENT: 367 return ((CharacterIterator *)(iter->context))->getIndex(); 368 case UITER_LIMIT: 369 return ((CharacterIterator *)(iter->context))->endIndex(); 370 case UITER_LENGTH: 371 return ((CharacterIterator *)(iter->context))->getLength(); 372 default: 373 /* not a valid origin */ 374 /* Should never get here! */ 375 return -1; 376 } 377 } 378 379 static int32_t U_CALLCONV 380 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 381 switch(origin) { 382 case UITER_ZERO: 383 ((CharacterIterator *)(iter->context))->setIndex(delta); 384 return ((CharacterIterator *)(iter->context))->getIndex(); 385 case UITER_START: 386 case UITER_CURRENT: 387 case UITER_LIMIT: 388 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); 389 case UITER_LENGTH: 390 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); 391 return ((CharacterIterator *)(iter->context))->getIndex(); 392 default: 393 /* not a valid origin */ 394 /* Should never get here! */ 395 return -1; 396 } 397 } 398 399 static UBool U_CALLCONV 400 characterIteratorHasNext(UCharIterator *iter) { 401 return ((CharacterIterator *)(iter->context))->hasNext(); 402 } 403 404 static UBool U_CALLCONV 405 characterIteratorHasPrevious(UCharIterator *iter) { 406 return ((CharacterIterator *)(iter->context))->hasPrevious(); 407 } 408 409 static UChar32 U_CALLCONV 410 characterIteratorCurrent(UCharIterator *iter) { 411 UChar32 c; 412 413 c=((CharacterIterator *)(iter->context))->current(); 414 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { 415 return c; 416 } else { 417 return U_SENTINEL; 418 } 419 } 420 421 static UChar32 U_CALLCONV 422 characterIteratorNext(UCharIterator *iter) { 423 if(((CharacterIterator *)(iter->context))->hasNext()) { 424 return ((CharacterIterator *)(iter->context))->nextPostInc(); 425 } else { 426 return U_SENTINEL; 427 } 428 } 429 430 static UChar32 U_CALLCONV 431 characterIteratorPrevious(UCharIterator *iter) { 432 if(((CharacterIterator *)(iter->context))->hasPrevious()) { 433 return ((CharacterIterator *)(iter->context))->previous(); 434 } else { 435 return U_SENTINEL; 436 } 437 } 438 439 static uint32_t U_CALLCONV 440 characterIteratorGetState(const UCharIterator *iter) { 441 return ((CharacterIterator *)(iter->context))->getIndex(); 442 } 443 444 static void U_CALLCONV 445 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 446 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 447 /* do nothing */ 448 } else if(iter==NULL || iter->context==NULL) { 449 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 450 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { 451 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 452 } else { 453 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); 454 } 455 } 456 457 static const UCharIterator characterIteratorWrapper={ 458 0, 0, 0, 0, 0, 0, 459 characterIteratorGetIndex, 460 characterIteratorMove, 461 characterIteratorHasNext, 462 characterIteratorHasPrevious, 463 characterIteratorCurrent, 464 characterIteratorNext, 465 characterIteratorPrevious, 466 NULL, 467 characterIteratorGetState, 468 characterIteratorSetState 469 }; 470 471 U_CAPI void U_EXPORT2 472 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { 473 if(iter!=0) { 474 if(charIter!=0) { 475 *iter=characterIteratorWrapper; 476 iter->context=charIter; 477 } else { 478 *iter=noopIterator; 479 } 480 } 481 } 482 483 /* UCharIterator wrapper around Replaceable --------------------------------- */ 484 485 /* 486 * This is an implementation of a code unit (UChar) iterator 487 * based on a Replaceable object. 488 * 489 * The UCharIterator.context field holds a pointer to the Replaceable. 490 * UCharIterator.length and UCharIterator.index hold Replaceable.length() 491 * and the iteration index. 492 */ 493 494 static UChar32 U_CALLCONV 495 replaceableIteratorCurrent(UCharIterator *iter) { 496 if(iter->index<iter->limit) { 497 return ((Replaceable *)(iter->context))->charAt(iter->index); 498 } else { 499 return U_SENTINEL; 500 } 501 } 502 503 static UChar32 U_CALLCONV 504 replaceableIteratorNext(UCharIterator *iter) { 505 if(iter->index<iter->limit) { 506 return ((Replaceable *)(iter->context))->charAt(iter->index++); 507 } else { 508 return U_SENTINEL; 509 } 510 } 511 512 static UChar32 U_CALLCONV 513 replaceableIteratorPrevious(UCharIterator *iter) { 514 if(iter->index>iter->start) { 515 return ((Replaceable *)(iter->context))->charAt(--iter->index); 516 } else { 517 return U_SENTINEL; 518 } 519 } 520 521 static const UCharIterator replaceableIterator={ 522 0, 0, 0, 0, 0, 0, 523 stringIteratorGetIndex, 524 stringIteratorMove, 525 stringIteratorHasNext, 526 stringIteratorHasPrevious, 527 replaceableIteratorCurrent, 528 replaceableIteratorNext, 529 replaceableIteratorPrevious, 530 NULL, 531 stringIteratorGetState, 532 stringIteratorSetState 533 }; 534 535 U_CAPI void U_EXPORT2 536 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { 537 if(iter!=0) { 538 if(rep!=0) { 539 *iter=replaceableIterator; 540 iter->context=rep; 541 iter->limit=iter->length=rep->length(); 542 } else { 543 *iter=noopIterator; 544 } 545 } 546 } 547 548 /* UCharIterator implementation for UTF-8 strings --------------------------- */ 549 550 /* 551 * Possible, probably necessary only for an implementation for arbitrary 552 * converters: 553 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. 554 * This would require to turn reservedFn into a close function and 555 * to introduce a uiter_close(iter). 556 */ 557 558 #define UITER_CNV_CAPACITY 16 559 560 /* 561 * Minimal implementation: 562 * Maintain a single-UChar buffer for an additional surrogate. 563 * The caller must not modify start and limit because they are used internally. 564 * 565 * Use UCharIterator fields as follows: 566 * context pointer to UTF-8 string 567 * length UTF-16 length of the string; -1 until lazy evaluation 568 * start current UTF-8 index 569 * index current UTF-16 index; may be -1="unknown" after setState() 570 * limit UTF-8 length of the string 571 * reservedField supplementary code point 572 * 573 * Since UCharIterator delivers 16-bit code units, the iteration can be 574 * currently in the middle of the byte sequence for a supplementary code point. 575 * In this case, reservedField will contain that code point and start will 576 * point to after the corresponding byte sequence. The UTF-16 index will be 577 * one less than what it would otherwise be corresponding to the UTF-8 index. 578 * Otherwise, reservedField will be 0. 579 */ 580 581 /* 582 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 583 * Add implementations that do not call strlen() for iteration but check for NUL. 584 */ 585 586 static int32_t U_CALLCONV 587 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 588 switch(origin) { 589 case UITER_ZERO: 590 case UITER_START: 591 return 0; 592 case UITER_CURRENT: 593 if(iter->index<0) { 594 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 595 const uint8_t *s; 596 UChar32 c; 597 int32_t i, limit, index; 598 599 s=(const uint8_t *)iter->context; 600 i=index=0; 601 limit=iter->start; /* count up to the UTF-8 index */ 602 while(i<limit) { 603 U8_NEXT_OR_FFFD(s, i, limit, c); 604 index+=U16_LENGTH(c); 605 } 606 607 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 608 if(i==iter->limit) { 609 iter->length=index; /* in case it was <0 or wrong */ 610 } 611 if(iter->reservedField!=0) { 612 --index; /* we are in the middle of a supplementary code point */ 613 } 614 iter->index=index; 615 } 616 return iter->index; 617 case UITER_LIMIT: 618 case UITER_LENGTH: 619 if(iter->length<0) { 620 const uint8_t *s; 621 UChar32 c; 622 int32_t i, limit, length; 623 624 s=(const uint8_t *)iter->context; 625 if(iter->index<0) { 626 /* 627 * the current UTF-16 index is unknown after setState(), 628 * we must first count from the beginning to here 629 */ 630 i=length=0; 631 limit=iter->start; 632 633 /* count from the beginning to the current index */ 634 while(i<limit) { 635 U8_NEXT_OR_FFFD(s, i, limit, c); 636 length+=U16_LENGTH(c); 637 } 638 639 /* assume i==limit==iter->start, set the UTF-16 index */ 640 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 641 iter->index= iter->reservedField!=0 ? length-1 : length; 642 } else { 643 i=iter->start; 644 length=iter->index; 645 if(iter->reservedField!=0) { 646 ++length; 647 } 648 } 649 650 /* count from the current index to the end */ 651 limit=iter->limit; 652 while(i<limit) { 653 U8_NEXT_OR_FFFD(s, i, limit, c); 654 length+=U16_LENGTH(c); 655 } 656 iter->length=length; 657 } 658 return iter->length; 659 default: 660 /* not a valid origin */ 661 /* Should never get here! */ 662 return -1; 663 } 664 } 665 666 static int32_t U_CALLCONV 667 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 668 const uint8_t *s; 669 UChar32 c; 670 int32_t pos; /* requested UTF-16 index */ 671 int32_t i; /* UTF-8 index */ 672 UBool havePos; 673 674 /* calculate the requested UTF-16 index */ 675 switch(origin) { 676 case UITER_ZERO: 677 case UITER_START: 678 pos=delta; 679 havePos=TRUE; 680 /* iter->index<0 (unknown) is possible */ 681 break; 682 case UITER_CURRENT: 683 if(iter->index>=0) { 684 pos=iter->index+delta; 685 havePos=TRUE; 686 } else { 687 /* the current UTF-16 index is unknown after setState(), use only delta */ 688 pos=0; 689 havePos=FALSE; 690 } 691 break; 692 case UITER_LIMIT: 693 case UITER_LENGTH: 694 if(iter->length>=0) { 695 pos=iter->length+delta; 696 havePos=TRUE; 697 } else { 698 /* pin to the end, avoid counting the length */ 699 iter->index=-1; 700 iter->start=iter->limit; 701 iter->reservedField=0; 702 if(delta>=0) { 703 return UITER_UNKNOWN_INDEX; 704 } else { 705 /* the current UTF-16 index is unknown, use only delta */ 706 pos=0; 707 havePos=FALSE; 708 } 709 } 710 break; 711 default: 712 return -1; /* Error */ 713 } 714 715 if(havePos) { 716 /* shortcuts: pinning to the edges of the string */ 717 if(pos<=0) { 718 iter->index=iter->start=iter->reservedField=0; 719 return 0; 720 } else if(iter->length>=0 && pos>=iter->length) { 721 iter->index=iter->length; 722 iter->start=iter->limit; 723 iter->reservedField=0; 724 return iter->index; 725 } 726 727 /* minimize the number of U8_NEXT/PREV operations */ 728 if(iter->index<0 || pos<iter->index/2) { 729 /* go forward from the start instead of backward from the current index */ 730 iter->index=iter->start=iter->reservedField=0; 731 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 732 /* 733 * if we have the UTF-16 index and length and the new position is 734 * closer to the end than the current index, 735 * then go backward from the end instead of forward from the current index 736 */ 737 iter->index=iter->length; 738 iter->start=iter->limit; 739 iter->reservedField=0; 740 } 741 742 delta=pos-iter->index; 743 if(delta==0) { 744 return iter->index; /* nothing to do */ 745 } 746 } else { 747 /* move relative to unknown UTF-16 index */ 748 if(delta==0) { 749 return UITER_UNKNOWN_INDEX; /* nothing to do */ 750 } else if(-delta>=iter->start) { 751 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 752 iter->index=iter->start=iter->reservedField=0; 753 return 0; 754 } else if(delta>=(iter->limit-iter->start)) { 755 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 756 iter->index=iter->length; /* may or may not be <0 (unknown) */ 757 iter->start=iter->limit; 758 iter->reservedField=0; 759 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; 760 } 761 } 762 763 /* delta!=0 */ 764 765 /* move towards the requested position, pin to the edges of the string */ 766 s=(const uint8_t *)iter->context; 767 pos=iter->index; /* could be <0 (unknown) */ 768 i=iter->start; 769 if(delta>0) { 770 /* go forward */ 771 int32_t limit=iter->limit; 772 if(iter->reservedField!=0) { 773 iter->reservedField=0; 774 ++pos; 775 --delta; 776 } 777 while(delta>0 && i<limit) { 778 U8_NEXT_OR_FFFD(s, i, limit, c); 779 if(c<=0xffff) { 780 ++pos; 781 --delta; 782 } else if(delta>=2) { 783 pos+=2; 784 delta-=2; 785 } else /* delta==1 */ { 786 /* stop in the middle of a supplementary code point */ 787 iter->reservedField=c; 788 ++pos; 789 break; /* delta=0; */ 790 } 791 } 792 if(i==limit) { 793 if(iter->length<0 && iter->index>=0) { 794 iter->length= iter->reservedField==0 ? pos : pos+1; 795 } else if(iter->index<0 && iter->length>=0) { 796 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 797 } 798 } 799 } else /* delta<0 */ { 800 /* go backward */ 801 if(iter->reservedField!=0) { 802 iter->reservedField=0; 803 i-=4; /* we stayed behind the supplementary code point; go before it now */ 804 --pos; 805 ++delta; 806 } 807 while(delta<0 && i>0) { 808 U8_PREV_OR_FFFD(s, 0, i, c); 809 if(c<=0xffff) { 810 --pos; 811 ++delta; 812 } else if(delta<=-2) { 813 pos-=2; 814 delta+=2; 815 } else /* delta==-1 */ { 816 /* stop in the middle of a supplementary code point */ 817 i+=4; /* back to behind this supplementary code point for consistent state */ 818 iter->reservedField=c; 819 --pos; 820 break; /* delta=0; */ 821 } 822 } 823 } 824 825 iter->start=i; 826 if(iter->index>=0) { 827 return iter->index=pos; 828 } else { 829 /* we started with index<0 (unknown) so pos is bogus */ 830 if(i<=1) { 831 return iter->index=i; /* reached the beginning */ 832 } else { 833 /* we still don't know the UTF-16 index */ 834 return UITER_UNKNOWN_INDEX; 835 } 836 } 837 } 838 839 static UBool U_CALLCONV 840 utf8IteratorHasNext(UCharIterator *iter) { 841 return iter->start<iter->limit || iter->reservedField!=0; 842 } 843 844 static UBool U_CALLCONV 845 utf8IteratorHasPrevious(UCharIterator *iter) { 846 return iter->start>0; 847 } 848 849 static UChar32 U_CALLCONV 850 utf8IteratorCurrent(UCharIterator *iter) { 851 if(iter->reservedField!=0) { 852 return U16_TRAIL(iter->reservedField); 853 } else if(iter->start<iter->limit) { 854 const uint8_t *s=(const uint8_t *)iter->context; 855 UChar32 c; 856 int32_t i=iter->start; 857 858 U8_NEXT_OR_FFFD(s, i, iter->limit, c); 859 if(c<=0xffff) { 860 return c; 861 } else { 862 return U16_LEAD(c); 863 } 864 } else { 865 return U_SENTINEL; 866 } 867 } 868 869 static UChar32 U_CALLCONV 870 utf8IteratorNext(UCharIterator *iter) { 871 int32_t index; 872 873 if(iter->reservedField!=0) { 874 UChar trail=U16_TRAIL(iter->reservedField); 875 iter->reservedField=0; 876 if((index=iter->index)>=0) { 877 iter->index=index+1; 878 } 879 return trail; 880 } else if(iter->start<iter->limit) { 881 const uint8_t *s=(const uint8_t *)iter->context; 882 UChar32 c; 883 884 U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c); 885 if((index=iter->index)>=0) { 886 iter->index=++index; 887 if(iter->length<0 && iter->start==iter->limit) { 888 iter->length= c<=0xffff ? index : index+1; 889 } 890 } else if(iter->start==iter->limit && iter->length>=0) { 891 iter->index= c<=0xffff ? iter->length : iter->length-1; 892 } 893 if(c<=0xffff) { 894 return c; 895 } else { 896 iter->reservedField=c; 897 return U16_LEAD(c); 898 } 899 } else { 900 return U_SENTINEL; 901 } 902 } 903 904 static UChar32 U_CALLCONV 905 utf8IteratorPrevious(UCharIterator *iter) { 906 int32_t index; 907 908 if(iter->reservedField!=0) { 909 UChar lead=U16_LEAD(iter->reservedField); 910 iter->reservedField=0; 911 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 912 if((index=iter->index)>0) { 913 iter->index=index-1; 914 } 915 return lead; 916 } else if(iter->start>0) { 917 const uint8_t *s=(const uint8_t *)iter->context; 918 UChar32 c; 919 920 U8_PREV_OR_FFFD(s, 0, iter->start, c); 921 if((index=iter->index)>0) { 922 iter->index=index-1; 923 } else if(iter->start<=1) { 924 iter->index= c<=0xffff ? iter->start : iter->start+1; 925 } 926 if(c<=0xffff) { 927 return c; 928 } else { 929 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 930 iter->reservedField=c; 931 return U16_TRAIL(c); 932 } 933 } else { 934 return U_SENTINEL; 935 } 936 } 937 938 static uint32_t U_CALLCONV 939 utf8IteratorGetState(const UCharIterator *iter) { 940 uint32_t state=(uint32_t)(iter->start<<1); 941 if(iter->reservedField!=0) { 942 state|=1; 943 } 944 return state; 945 } 946 947 static void U_CALLCONV 948 utf8IteratorSetState(UCharIterator *iter, 949 uint32_t state, 950 UErrorCode *pErrorCode) 951 { 952 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 953 /* do nothing */ 954 } else if(iter==NULL) { 955 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 956 } else if(state==utf8IteratorGetState(iter)) { 957 /* setting to the current state: no-op */ 958 } else { 959 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 960 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 961 962 if((state==0 ? index<0 : index<4) || iter->limit<index) { 963 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 964 } else { 965 iter->start=index; /* restore UTF-8 byte index */ 966 if(index<=1) { 967 iter->index=index; 968 } else { 969 iter->index=-1; /* unknown UTF-16 index */ 970 } 971 if(state==0) { 972 iter->reservedField=0; 973 } else { 974 /* verified index>=4 above */ 975 UChar32 c; 976 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c); 977 if(c<=0xffff) { 978 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 979 } else { 980 iter->reservedField=c; 981 } 982 } 983 } 984 } 985 } 986 987 static const UCharIterator utf8Iterator={ 988 0, 0, 0, 0, 0, 0, 989 utf8IteratorGetIndex, 990 utf8IteratorMove, 991 utf8IteratorHasNext, 992 utf8IteratorHasPrevious, 993 utf8IteratorCurrent, 994 utf8IteratorNext, 995 utf8IteratorPrevious, 996 NULL, 997 utf8IteratorGetState, 998 utf8IteratorSetState 999 }; 1000 1001 U_CAPI void U_EXPORT2 1002 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { 1003 if(iter!=0) { 1004 if(s!=0 && length>=-1) { 1005 *iter=utf8Iterator; 1006 iter->context=s; 1007 if(length>=0) { 1008 iter->limit=length; 1009 } else { 1010 iter->limit=(int32_t)uprv_strlen(s); 1011 } 1012 iter->length= iter->limit<=1 ? iter->limit : -1; 1013 } else { 1014 *iter=noopIterator; 1015 } 1016 } 1017 } 1018 1019 /* Helper functions --------------------------------------------------------- */ 1020 1021 U_CAPI UChar32 U_EXPORT2 1022 uiter_current32(UCharIterator *iter) { 1023 UChar32 c, c2; 1024 1025 c=iter->current(iter); 1026 if(U16_IS_SURROGATE(c)) { 1027 if(U16_IS_SURROGATE_LEAD(c)) { 1028 /* 1029 * go to the next code unit 1030 * we know that we are not at the limit because c!=U_SENTINEL 1031 */ 1032 iter->move(iter, 1, UITER_CURRENT); 1033 if(U16_IS_TRAIL(c2=iter->current(iter))) { 1034 c=U16_GET_SUPPLEMENTARY(c, c2); 1035 } 1036 1037 /* undo index movement */ 1038 iter->move(iter, -1, UITER_CURRENT); 1039 } else { 1040 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1041 c=U16_GET_SUPPLEMENTARY(c2, c); 1042 } 1043 if(c2>=0) { 1044 /* undo index movement */ 1045 iter->move(iter, 1, UITER_CURRENT); 1046 } 1047 } 1048 } 1049 return c; 1050 } 1051 1052 U_CAPI UChar32 U_EXPORT2 1053 uiter_next32(UCharIterator *iter) { 1054 UChar32 c, c2; 1055 1056 c=iter->next(iter); 1057 if(U16_IS_LEAD(c)) { 1058 if(U16_IS_TRAIL(c2=iter->next(iter))) { 1059 c=U16_GET_SUPPLEMENTARY(c, c2); 1060 } else if(c2>=0) { 1061 /* unmatched first surrogate, undo index movement */ 1062 iter->move(iter, -1, UITER_CURRENT); 1063 } 1064 } 1065 return c; 1066 } 1067 1068 U_CAPI UChar32 U_EXPORT2 1069 uiter_previous32(UCharIterator *iter) { 1070 UChar32 c, c2; 1071 1072 c=iter->previous(iter); 1073 if(U16_IS_TRAIL(c)) { 1074 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1075 c=U16_GET_SUPPLEMENTARY(c2, c); 1076 } else if(c2>=0) { 1077 /* unmatched second surrogate, undo index movement */ 1078 iter->move(iter, 1, UITER_CURRENT); 1079 } 1080 } 1081 return c; 1082 } 1083 1084 U_CAPI uint32_t U_EXPORT2 1085 uiter_getState(const UCharIterator *iter) { 1086 if(iter==NULL || iter->getState==NULL) { 1087 return UITER_NO_STATE; 1088 } else { 1089 return iter->getState(iter); 1090 } 1091 } 1092 1093 U_CAPI void U_EXPORT2 1094 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1095 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1096 /* do nothing */ 1097 } else if(iter==NULL) { 1098 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1099 } else if(iter->setState==NULL) { 1100 *pErrorCode=U_UNSUPPORTED_ERROR; 1101 } else { 1102 iter->setState(iter, state, pErrorCode); 1103 } 1104 } 1105 1106 U_CDECL_END 1107