1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2006, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uiter.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jan18 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/ustring.h" 19 #include "unicode/chariter.h" 20 #include "unicode/rep.h" 21 #include "unicode/uiter.h" 22 #include "cstring.h" 23 24 U_NAMESPACE_USE 25 26 #define IS_EVEN(n) (((n)&1)==0) 27 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p) 28 29 U_CDECL_BEGIN 30 31 /* No-Op UCharIterator implementation for illegal input --------------------- */ 32 33 static int32_t U_CALLCONV 34 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { 35 return 0; 36 } 37 38 static int32_t U_CALLCONV 39 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { 40 return 0; 41 } 42 43 static UBool U_CALLCONV 44 noopHasNext(UCharIterator * /*iter*/) { 45 return FALSE; 46 } 47 48 static UChar32 U_CALLCONV 49 noopCurrent(UCharIterator * /*iter*/) { 50 return U_SENTINEL; 51 } 52 53 static uint32_t U_CALLCONV 54 noopGetState(const UCharIterator * /*iter*/) { 55 return UITER_NO_STATE; 56 } 57 58 static void U_CALLCONV 59 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { 60 *pErrorCode=U_UNSUPPORTED_ERROR; 61 } 62 63 static const UCharIterator noopIterator={ 64 0, 0, 0, 0, 0, 0, 65 noopGetIndex, 66 noopMove, 67 noopHasNext, 68 noopHasNext, 69 noopCurrent, 70 noopCurrent, 71 noopCurrent, 72 NULL, 73 noopGetState, 74 noopSetState 75 }; 76 77 /* UCharIterator implementation for simple strings -------------------------- */ 78 79 /* 80 * This is an implementation of a code unit (UChar) iterator 81 * for UChar * strings. 82 * 83 * The UCharIterator.context field holds a pointer to the string. 84 */ 85 86 static int32_t U_CALLCONV 87 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 88 switch(origin) { 89 case UITER_ZERO: 90 return 0; 91 case UITER_START: 92 return iter->start; 93 case UITER_CURRENT: 94 return iter->index; 95 case UITER_LIMIT: 96 return iter->limit; 97 case UITER_LENGTH: 98 return iter->length; 99 default: 100 /* not a valid origin */ 101 /* Should never get here! */ 102 return -1; 103 } 104 } 105 106 static int32_t U_CALLCONV 107 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 108 int32_t pos; 109 110 switch(origin) { 111 case UITER_ZERO: 112 pos=delta; 113 break; 114 case UITER_START: 115 pos=iter->start+delta; 116 break; 117 case UITER_CURRENT: 118 pos=iter->index+delta; 119 break; 120 case UITER_LIMIT: 121 pos=iter->limit+delta; 122 break; 123 case UITER_LENGTH: 124 pos=iter->length+delta; 125 break; 126 default: 127 return -1; /* Error */ 128 } 129 130 if(pos<iter->start) { 131 pos=iter->start; 132 } else if(pos>iter->limit) { 133 pos=iter->limit; 134 } 135 136 return iter->index=pos; 137 } 138 139 static UBool U_CALLCONV 140 stringIteratorHasNext(UCharIterator *iter) { 141 return iter->index<iter->limit; 142 } 143 144 static UBool U_CALLCONV 145 stringIteratorHasPrevious(UCharIterator *iter) { 146 return iter->index>iter->start; 147 } 148 149 static UChar32 U_CALLCONV 150 stringIteratorCurrent(UCharIterator *iter) { 151 if(iter->index<iter->limit) { 152 return ((const UChar *)(iter->context))[iter->index]; 153 } else { 154 return U_SENTINEL; 155 } 156 } 157 158 static UChar32 U_CALLCONV 159 stringIteratorNext(UCharIterator *iter) { 160 if(iter->index<iter->limit) { 161 return ((const UChar *)(iter->context))[iter->index++]; 162 } else { 163 return U_SENTINEL; 164 } 165 } 166 167 static UChar32 U_CALLCONV 168 stringIteratorPrevious(UCharIterator *iter) { 169 if(iter->index>iter->start) { 170 return ((const UChar *)(iter->context))[--iter->index]; 171 } else { 172 return U_SENTINEL; 173 } 174 } 175 176 static uint32_t U_CALLCONV 177 stringIteratorGetState(const UCharIterator *iter) { 178 return (uint32_t)iter->index; 179 } 180 181 static void U_CALLCONV 182 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 183 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 184 /* do nothing */ 185 } else if(iter==NULL) { 186 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 187 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { 188 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 189 } else { 190 iter->index=(int32_t)state; 191 } 192 } 193 194 static const UCharIterator stringIterator={ 195 0, 0, 0, 0, 0, 0, 196 stringIteratorGetIndex, 197 stringIteratorMove, 198 stringIteratorHasNext, 199 stringIteratorHasPrevious, 200 stringIteratorCurrent, 201 stringIteratorNext, 202 stringIteratorPrevious, 203 NULL, 204 stringIteratorGetState, 205 stringIteratorSetState 206 }; 207 208 U_CAPI void U_EXPORT2 209 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { 210 if(iter!=0) { 211 if(s!=0 && length>=-1) { 212 *iter=stringIterator; 213 iter->context=s; 214 if(length>=0) { 215 iter->length=length; 216 } else { 217 iter->length=u_strlen(s); 218 } 219 iter->limit=iter->length; 220 } else { 221 *iter=noopIterator; 222 } 223 } 224 } 225 226 /* UCharIterator implementation for UTF-16BE strings ------------------------ */ 227 228 /* 229 * This is an implementation of a code unit (UChar) iterator 230 * for UTF-16BE strings, i.e., strings in byte-vectors where 231 * each UChar is stored as a big-endian pair of bytes. 232 * 233 * The UCharIterator.context field holds a pointer to the string. 234 * Everything works just like with a normal UChar iterator (uiter_setString), 235 * except that UChars are assembled from byte pairs. 236 */ 237 238 /* internal helper function */ 239 static inline UChar32 240 utf16BEIteratorGet(UCharIterator *iter, int32_t index) { 241 const uint8_t *p=(const uint8_t *)iter->context; 242 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; 243 } 244 245 static UChar32 U_CALLCONV 246 utf16BEIteratorCurrent(UCharIterator *iter) { 247 int32_t index; 248 249 if((index=iter->index)<iter->limit) { 250 return utf16BEIteratorGet(iter, index); 251 } else { 252 return U_SENTINEL; 253 } 254 } 255 256 static UChar32 U_CALLCONV 257 utf16BEIteratorNext(UCharIterator *iter) { 258 int32_t index; 259 260 if((index=iter->index)<iter->limit) { 261 iter->index=index+1; 262 return utf16BEIteratorGet(iter, index); 263 } else { 264 return U_SENTINEL; 265 } 266 } 267 268 static UChar32 U_CALLCONV 269 utf16BEIteratorPrevious(UCharIterator *iter) { 270 int32_t index; 271 272 if((index=iter->index)>iter->start) { 273 iter->index=--index; 274 return utf16BEIteratorGet(iter, index); 275 } else { 276 return U_SENTINEL; 277 } 278 } 279 280 static const UCharIterator utf16BEIterator={ 281 0, 0, 0, 0, 0, 0, 282 stringIteratorGetIndex, 283 stringIteratorMove, 284 stringIteratorHasNext, 285 stringIteratorHasPrevious, 286 utf16BEIteratorCurrent, 287 utf16BEIteratorNext, 288 utf16BEIteratorPrevious, 289 NULL, 290 stringIteratorGetState, 291 stringIteratorSetState 292 }; 293 294 /* 295 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL, 296 * i.e., before a pair of 0 bytes where the first 0 byte is at an even 297 * offset from s. 298 */ 299 static int32_t 300 utf16BE_strlen(const char *s) { 301 if(IS_POINTER_EVEN(s)) { 302 /* 303 * even-aligned, call u_strlen(s) 304 * we are probably on a little-endian machine, but searching for UChar NUL 305 * does not care about endianness 306 */ 307 return u_strlen((const UChar *)s); 308 } else { 309 /* odd-aligned, search for pair of 0 bytes */ 310 const char *p=s; 311 312 while(!(*p==0 && p[1]==0)) { 313 p+=2; 314 } 315 return (int32_t)((p-s)/2); 316 } 317 } 318 319 U_CAPI void U_EXPORT2 320 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { 321 if(iter!=NULL) { 322 /* allow only even-length strings (the input length counts bytes) */ 323 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { 324 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ 325 length>>=1; 326 327 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { 328 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */ 329 uiter_setString(iter, (const UChar *)s, length); 330 return; 331 } 332 333 *iter=utf16BEIterator; 334 iter->context=s; 335 if(length>=0) { 336 iter->length=length; 337 } else { 338 iter->length=utf16BE_strlen(s); 339 } 340 iter->limit=iter->length; 341 } else { 342 *iter=noopIterator; 343 } 344 } 345 } 346 347 /* UCharIterator wrapper around CharacterIterator --------------------------- */ 348 349 /* 350 * This is wrapper code around a C++ CharacterIterator to 351 * look like a C UCharIterator. 352 * 353 * The UCharIterator.context field holds a pointer to the CharacterIterator. 354 */ 355 356 static int32_t U_CALLCONV 357 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 358 switch(origin) { 359 case UITER_ZERO: 360 return 0; 361 case UITER_START: 362 return ((CharacterIterator *)(iter->context))->startIndex(); 363 case UITER_CURRENT: 364 return ((CharacterIterator *)(iter->context))->getIndex(); 365 case UITER_LIMIT: 366 return ((CharacterIterator *)(iter->context))->endIndex(); 367 case UITER_LENGTH: 368 return ((CharacterIterator *)(iter->context))->getLength(); 369 default: 370 /* not a valid origin */ 371 /* Should never get here! */ 372 return -1; 373 } 374 } 375 376 static int32_t U_CALLCONV 377 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 378 switch(origin) { 379 case UITER_ZERO: 380 ((CharacterIterator *)(iter->context))->setIndex(delta); 381 return ((CharacterIterator *)(iter->context))->getIndex(); 382 case UITER_START: 383 case UITER_CURRENT: 384 case UITER_LIMIT: 385 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); 386 case UITER_LENGTH: 387 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); 388 return ((CharacterIterator *)(iter->context))->getIndex(); 389 default: 390 /* not a valid origin */ 391 /* Should never get here! */ 392 return -1; 393 } 394 } 395 396 static UBool U_CALLCONV 397 characterIteratorHasNext(UCharIterator *iter) { 398 return ((CharacterIterator *)(iter->context))->hasNext(); 399 } 400 401 static UBool U_CALLCONV 402 characterIteratorHasPrevious(UCharIterator *iter) { 403 return ((CharacterIterator *)(iter->context))->hasPrevious(); 404 } 405 406 static UChar32 U_CALLCONV 407 characterIteratorCurrent(UCharIterator *iter) { 408 UChar32 c; 409 410 c=((CharacterIterator *)(iter->context))->current(); 411 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { 412 return c; 413 } else { 414 return U_SENTINEL; 415 } 416 } 417 418 static UChar32 U_CALLCONV 419 characterIteratorNext(UCharIterator *iter) { 420 if(((CharacterIterator *)(iter->context))->hasNext()) { 421 return ((CharacterIterator *)(iter->context))->nextPostInc(); 422 } else { 423 return U_SENTINEL; 424 } 425 } 426 427 static UChar32 U_CALLCONV 428 characterIteratorPrevious(UCharIterator *iter) { 429 if(((CharacterIterator *)(iter->context))->hasPrevious()) { 430 return ((CharacterIterator *)(iter->context))->previous(); 431 } else { 432 return U_SENTINEL; 433 } 434 } 435 436 static uint32_t U_CALLCONV 437 characterIteratorGetState(const UCharIterator *iter) { 438 return ((CharacterIterator *)(iter->context))->getIndex(); 439 } 440 441 static void U_CALLCONV 442 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 443 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 444 /* do nothing */ 445 } else if(iter==NULL || iter->context==NULL) { 446 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 447 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { 448 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 449 } else { 450 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); 451 } 452 } 453 454 static const UCharIterator characterIteratorWrapper={ 455 0, 0, 0, 0, 0, 0, 456 characterIteratorGetIndex, 457 characterIteratorMove, 458 characterIteratorHasNext, 459 characterIteratorHasPrevious, 460 characterIteratorCurrent, 461 characterIteratorNext, 462 characterIteratorPrevious, 463 NULL, 464 characterIteratorGetState, 465 characterIteratorSetState 466 }; 467 468 U_CAPI void U_EXPORT2 469 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { 470 if(iter!=0) { 471 if(charIter!=0) { 472 *iter=characterIteratorWrapper; 473 iter->context=charIter; 474 } else { 475 *iter=noopIterator; 476 } 477 } 478 } 479 480 /* UCharIterator wrapper around Replaceable --------------------------------- */ 481 482 /* 483 * This is an implementation of a code unit (UChar) iterator 484 * based on a Replaceable object. 485 * 486 * The UCharIterator.context field holds a pointer to the Replaceable. 487 * UCharIterator.length and UCharIterator.index hold Replaceable.length() 488 * and the iteration index. 489 */ 490 491 static UChar32 U_CALLCONV 492 replaceableIteratorCurrent(UCharIterator *iter) { 493 if(iter->index<iter->limit) { 494 return ((Replaceable *)(iter->context))->charAt(iter->index); 495 } else { 496 return U_SENTINEL; 497 } 498 } 499 500 static UChar32 U_CALLCONV 501 replaceableIteratorNext(UCharIterator *iter) { 502 if(iter->index<iter->limit) { 503 return ((Replaceable *)(iter->context))->charAt(iter->index++); 504 } else { 505 return U_SENTINEL; 506 } 507 } 508 509 static UChar32 U_CALLCONV 510 replaceableIteratorPrevious(UCharIterator *iter) { 511 if(iter->index>iter->start) { 512 return ((Replaceable *)(iter->context))->charAt(--iter->index); 513 } else { 514 return U_SENTINEL; 515 } 516 } 517 518 static const UCharIterator replaceableIterator={ 519 0, 0, 0, 0, 0, 0, 520 stringIteratorGetIndex, 521 stringIteratorMove, 522 stringIteratorHasNext, 523 stringIteratorHasPrevious, 524 replaceableIteratorCurrent, 525 replaceableIteratorNext, 526 replaceableIteratorPrevious, 527 NULL, 528 stringIteratorGetState, 529 stringIteratorSetState 530 }; 531 532 U_CAPI void U_EXPORT2 533 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { 534 if(iter!=0) { 535 if(rep!=0) { 536 *iter=replaceableIterator; 537 iter->context=rep; 538 iter->limit=iter->length=rep->length(); 539 } else { 540 *iter=noopIterator; 541 } 542 } 543 } 544 545 /* UCharIterator implementation for UTF-8 strings --------------------------- */ 546 547 /* 548 * Possible, probably necessary only for an implementation for arbitrary 549 * converters: 550 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. 551 * This would require to turn reservedFn into a close function and 552 * to introduce a uiter_close(iter). 553 */ 554 555 #define UITER_CNV_CAPACITY 16 556 557 /* 558 * Minimal implementation: 559 * Maintain a single-UChar buffer for an additional surrogate. 560 * The caller must not modify start and limit because they are used internally. 561 * 562 * Use UCharIterator fields as follows: 563 * context pointer to UTF-8 string 564 * length UTF-16 length of the string; -1 until lazy evaluation 565 * start current UTF-8 index 566 * index current UTF-16 index; may be -1="unknown" after setState() 567 * limit UTF-8 length of the string 568 * reservedField supplementary code point 569 * 570 * Since UCharIterator delivers 16-bit code units, the iteration can be 571 * currently in the middle of the byte sequence for a supplementary code point. 572 * In this case, reservedField will contain that code point and start will 573 * point to after the corresponding byte sequence. The UTF-16 index will be 574 * one less than what it would otherwise be corresponding to the UTF-8 index. 575 * Otherwise, reservedField will be 0. 576 */ 577 578 /* 579 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 580 * Add implementations that do not call strlen() for iteration but check for NUL. 581 */ 582 583 static int32_t U_CALLCONV 584 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 585 switch(origin) { 586 case UITER_ZERO: 587 case UITER_START: 588 return 0; 589 case UITER_CURRENT: 590 if(iter->index<0) { 591 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 592 const uint8_t *s; 593 UChar32 c; 594 int32_t i, limit, index; 595 596 s=(const uint8_t *)iter->context; 597 i=index=0; 598 limit=iter->start; /* count up to the UTF-8 index */ 599 while(i<limit) { 600 U8_NEXT(s, i, limit, c); 601 if(c<=0xffff) { 602 ++index; 603 } else { 604 index+=2; 605 } 606 } 607 608 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 609 if(i==iter->limit) { 610 iter->length=index; /* in case it was <0 or wrong */ 611 } 612 if(iter->reservedField!=0) { 613 --index; /* we are in the middle of a supplementary code point */ 614 } 615 iter->index=index; 616 } 617 return iter->index; 618 case UITER_LIMIT: 619 case UITER_LENGTH: 620 if(iter->length<0) { 621 const uint8_t *s; 622 UChar32 c; 623 int32_t i, limit, length; 624 625 s=(const uint8_t *)iter->context; 626 if(iter->index<0) { 627 /* 628 * the current UTF-16 index is unknown after setState(), 629 * we must first count from the beginning to here 630 */ 631 i=length=0; 632 limit=iter->start; 633 634 /* count from the beginning to the current index */ 635 while(i<limit) { 636 U8_NEXT(s, i, limit, c); 637 if(c<=0xffff) { 638 ++length; 639 } else { 640 length+=2; 641 } 642 } 643 644 /* assume i==limit==iter->start, set the UTF-16 index */ 645 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 646 iter->index= iter->reservedField!=0 ? length-1 : length; 647 } else { 648 i=iter->start; 649 length=iter->index; 650 if(iter->reservedField!=0) { 651 ++length; 652 } 653 } 654 655 /* count from the current index to the end */ 656 limit=iter->limit; 657 while(i<limit) { 658 U8_NEXT(s, i, limit, c); 659 if(c<=0xffff) { 660 ++length; 661 } else { 662 length+=2; 663 } 664 } 665 iter->length=length; 666 } 667 return iter->length; 668 default: 669 /* not a valid origin */ 670 /* Should never get here! */ 671 return -1; 672 } 673 } 674 675 static int32_t U_CALLCONV 676 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 677 const uint8_t *s; 678 UChar32 c; 679 int32_t pos; /* requested UTF-16 index */ 680 int32_t i; /* UTF-8 index */ 681 UBool havePos; 682 683 /* calculate the requested UTF-16 index */ 684 switch(origin) { 685 case UITER_ZERO: 686 case UITER_START: 687 pos=delta; 688 havePos=TRUE; 689 /* iter->index<0 (unknown) is possible */ 690 break; 691 case UITER_CURRENT: 692 if(iter->index>=0) { 693 pos=iter->index+delta; 694 havePos=TRUE; 695 } else { 696 /* the current UTF-16 index is unknown after setState(), use only delta */ 697 pos=0; 698 havePos=FALSE; 699 } 700 break; 701 case UITER_LIMIT: 702 case UITER_LENGTH: 703 if(iter->length>=0) { 704 pos=iter->length+delta; 705 havePos=TRUE; 706 } else { 707 /* pin to the end, avoid counting the length */ 708 iter->index=-1; 709 iter->start=iter->limit; 710 iter->reservedField=0; 711 if(delta>=0) { 712 return UITER_UNKNOWN_INDEX; 713 } else { 714 /* the current UTF-16 index is unknown, use only delta */ 715 pos=0; 716 havePos=FALSE; 717 } 718 } 719 break; 720 default: 721 return -1; /* Error */ 722 } 723 724 if(havePos) { 725 /* shortcuts: pinning to the edges of the string */ 726 if(pos<=0) { 727 iter->index=iter->start=iter->reservedField=0; 728 return 0; 729 } else if(iter->length>=0 && pos>=iter->length) { 730 iter->index=iter->length; 731 iter->start=iter->limit; 732 iter->reservedField=0; 733 return iter->index; 734 } 735 736 /* minimize the number of U8_NEXT/PREV operations */ 737 if(iter->index<0 || pos<iter->index/2) { 738 /* go forward from the start instead of backward from the current index */ 739 iter->index=iter->start=iter->reservedField=0; 740 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 741 /* 742 * if we have the UTF-16 index and length and the new position is 743 * closer to the end than the current index, 744 * then go backward from the end instead of forward from the current index 745 */ 746 iter->index=iter->length; 747 iter->start=iter->limit; 748 iter->reservedField=0; 749 } 750 751 delta=pos-iter->index; 752 if(delta==0) { 753 return iter->index; /* nothing to do */ 754 } 755 } else { 756 /* move relative to unknown UTF-16 index */ 757 if(delta==0) { 758 return UITER_UNKNOWN_INDEX; /* nothing to do */ 759 } else if(-delta>=iter->start) { 760 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 761 iter->index=iter->start=iter->reservedField=0; 762 return 0; 763 } else if(delta>=(iter->limit-iter->start)) { 764 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 765 iter->index=iter->length; /* may or may not be <0 (unknown) */ 766 iter->start=iter->limit; 767 iter->reservedField=0; 768 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; 769 } 770 } 771 772 /* delta!=0 */ 773 774 /* move towards the requested position, pin to the edges of the string */ 775 s=(const uint8_t *)iter->context; 776 pos=iter->index; /* could be <0 (unknown) */ 777 i=iter->start; 778 if(delta>0) { 779 /* go forward */ 780 int32_t limit=iter->limit; 781 if(iter->reservedField!=0) { 782 iter->reservedField=0; 783 ++pos; 784 --delta; 785 } 786 while(delta>0 && i<limit) { 787 U8_NEXT(s, i, limit, c); 788 if(c<0xffff) { 789 ++pos; 790 --delta; 791 } else if(delta>=2) { 792 pos+=2; 793 delta-=2; 794 } else /* delta==1 */ { 795 /* stop in the middle of a supplementary code point */ 796 iter->reservedField=c; 797 ++pos; 798 break; /* delta=0; */ 799 } 800 } 801 if(i==limit) { 802 if(iter->length<0 && iter->index>=0) { 803 iter->length= iter->reservedField==0 ? pos : pos+1; 804 } else if(iter->index<0 && iter->length>=0) { 805 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 806 } 807 } 808 } else /* delta<0 */ { 809 /* go backward */ 810 if(iter->reservedField!=0) { 811 iter->reservedField=0; 812 i-=4; /* we stayed behind the supplementary code point; go before it now */ 813 --pos; 814 ++delta; 815 } 816 while(delta<0 && i>0) { 817 U8_PREV(s, 0, i, c); 818 if(c<0xffff) { 819 --pos; 820 ++delta; 821 } else if(delta<=-2) { 822 pos-=2; 823 delta+=2; 824 } else /* delta==-1 */ { 825 /* stop in the middle of a supplementary code point */ 826 i+=4; /* back to behind this supplementary code point for consistent state */ 827 iter->reservedField=c; 828 --pos; 829 break; /* delta=0; */ 830 } 831 } 832 } 833 834 iter->start=i; 835 if(iter->index>=0) { 836 return iter->index=pos; 837 } else { 838 /* we started with index<0 (unknown) so pos is bogus */ 839 if(i<=1) { 840 return iter->index=i; /* reached the beginning */ 841 } else { 842 /* we still don't know the UTF-16 index */ 843 return UITER_UNKNOWN_INDEX; 844 } 845 } 846 } 847 848 static UBool U_CALLCONV 849 utf8IteratorHasNext(UCharIterator *iter) { 850 return iter->start<iter->limit || iter->reservedField!=0; 851 } 852 853 static UBool U_CALLCONV 854 utf8IteratorHasPrevious(UCharIterator *iter) { 855 return iter->start>0; 856 } 857 858 static UChar32 U_CALLCONV 859 utf8IteratorCurrent(UCharIterator *iter) { 860 if(iter->reservedField!=0) { 861 return U16_TRAIL(iter->reservedField); 862 } else if(iter->start<iter->limit) { 863 const uint8_t *s=(const uint8_t *)iter->context; 864 UChar32 c; 865 int32_t i=iter->start; 866 867 U8_NEXT(s, i, iter->limit, c); 868 if(c<0) { 869 return 0xfffd; 870 } else if(c<=0xffff) { 871 return c; 872 } else { 873 return U16_LEAD(c); 874 } 875 } else { 876 return U_SENTINEL; 877 } 878 } 879 880 static UChar32 U_CALLCONV 881 utf8IteratorNext(UCharIterator *iter) { 882 int32_t index; 883 884 if(iter->reservedField!=0) { 885 UChar trail=U16_TRAIL(iter->reservedField); 886 iter->reservedField=0; 887 if((index=iter->index)>=0) { 888 iter->index=index+1; 889 } 890 return trail; 891 } else if(iter->start<iter->limit) { 892 const uint8_t *s=(const uint8_t *)iter->context; 893 UChar32 c; 894 895 U8_NEXT(s, iter->start, iter->limit, c); 896 if((index=iter->index)>=0) { 897 iter->index=++index; 898 if(iter->length<0 && iter->start==iter->limit) { 899 iter->length= c<=0xffff ? index : index+1; 900 } 901 } else if(iter->start==iter->limit && iter->length>=0) { 902 iter->index= c<=0xffff ? iter->length : iter->length-1; 903 } 904 if(c<0) { 905 return 0xfffd; 906 } else if(c<=0xffff) { 907 return c; 908 } else { 909 iter->reservedField=c; 910 return U16_LEAD(c); 911 } 912 } else { 913 return U_SENTINEL; 914 } 915 } 916 917 static UChar32 U_CALLCONV 918 utf8IteratorPrevious(UCharIterator *iter) { 919 int32_t index; 920 921 if(iter->reservedField!=0) { 922 UChar lead=U16_LEAD(iter->reservedField); 923 iter->reservedField=0; 924 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 925 if((index=iter->index)>0) { 926 iter->index=index-1; 927 } 928 return lead; 929 } else if(iter->start>0) { 930 const uint8_t *s=(const uint8_t *)iter->context; 931 UChar32 c; 932 933 U8_PREV(s, 0, iter->start, c); 934 if((index=iter->index)>0) { 935 iter->index=index-1; 936 } else if(iter->start<=1) { 937 iter->index= c<=0xffff ? iter->start : iter->start+1; 938 } 939 if(c<0) { 940 return 0xfffd; 941 } else if(c<=0xffff) { 942 return c; 943 } else { 944 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 945 iter->reservedField=c; 946 return U16_TRAIL(c); 947 } 948 } else { 949 return U_SENTINEL; 950 } 951 } 952 953 static uint32_t U_CALLCONV 954 utf8IteratorGetState(const UCharIterator *iter) { 955 uint32_t state=(uint32_t)(iter->start<<1); 956 if(iter->reservedField!=0) { 957 state|=1; 958 } 959 return state; 960 } 961 962 static void U_CALLCONV 963 utf8IteratorSetState(UCharIterator *iter, 964 uint32_t state, 965 UErrorCode *pErrorCode) 966 { 967 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 968 /* do nothing */ 969 } else if(iter==NULL) { 970 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 971 } else if(state==utf8IteratorGetState(iter)) { 972 /* setting to the current state: no-op */ 973 } else { 974 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 975 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 976 977 if((state==0 ? index<0 : index<4) || iter->limit<index) { 978 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 979 } else { 980 iter->start=index; /* restore UTF-8 byte index */ 981 if(index<=1) { 982 iter->index=index; 983 } else { 984 iter->index=-1; /* unknown UTF-16 index */ 985 } 986 if(state==0) { 987 iter->reservedField=0; 988 } else { 989 /* verified index>=4 above */ 990 UChar32 c; 991 U8_PREV((const uint8_t *)iter->context, 0, index, c); 992 if(c<=0xffff) { 993 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 994 } else { 995 iter->reservedField=c; 996 } 997 } 998 } 999 } 1000 } 1001 1002 static const UCharIterator utf8Iterator={ 1003 0, 0, 0, 0, 0, 0, 1004 utf8IteratorGetIndex, 1005 utf8IteratorMove, 1006 utf8IteratorHasNext, 1007 utf8IteratorHasPrevious, 1008 utf8IteratorCurrent, 1009 utf8IteratorNext, 1010 utf8IteratorPrevious, 1011 NULL, 1012 utf8IteratorGetState, 1013 utf8IteratorSetState 1014 }; 1015 1016 U_CAPI void U_EXPORT2 1017 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { 1018 if(iter!=0) { 1019 if(s!=0 && length>=-1) { 1020 *iter=utf8Iterator; 1021 iter->context=s; 1022 if(length>=0) { 1023 iter->limit=length; 1024 } else { 1025 iter->limit=(int32_t)uprv_strlen(s); 1026 } 1027 iter->length= iter->limit<=1 ? iter->limit : -1; 1028 } else { 1029 *iter=noopIterator; 1030 } 1031 } 1032 } 1033 1034 /* Helper functions --------------------------------------------------------- */ 1035 1036 U_CAPI UChar32 U_EXPORT2 1037 uiter_current32(UCharIterator *iter) { 1038 UChar32 c, c2; 1039 1040 c=iter->current(iter); 1041 if(UTF_IS_SURROGATE(c)) { 1042 if(UTF_IS_SURROGATE_FIRST(c)) { 1043 /* 1044 * go to the next code unit 1045 * we know that we are not at the limit because c!=U_SENTINEL 1046 */ 1047 iter->move(iter, 1, UITER_CURRENT); 1048 if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) { 1049 c=UTF16_GET_PAIR_VALUE(c, c2); 1050 } 1051 1052 /* undo index movement */ 1053 iter->move(iter, -1, UITER_CURRENT); 1054 } else { 1055 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) { 1056 c=UTF16_GET_PAIR_VALUE(c2, c); 1057 } 1058 if(c2>=0) { 1059 /* undo index movement */ 1060 iter->move(iter, 1, UITER_CURRENT); 1061 } 1062 } 1063 } 1064 return c; 1065 } 1066 1067 U_CAPI UChar32 U_EXPORT2 1068 uiter_next32(UCharIterator *iter) { 1069 UChar32 c, c2; 1070 1071 c=iter->next(iter); 1072 if(UTF_IS_FIRST_SURROGATE(c)) { 1073 if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) { 1074 c=UTF16_GET_PAIR_VALUE(c, c2); 1075 } else if(c2>=0) { 1076 /* unmatched first surrogate, undo index movement */ 1077 iter->move(iter, -1, UITER_CURRENT); 1078 } 1079 } 1080 return c; 1081 } 1082 1083 U_CAPI UChar32 U_EXPORT2 1084 uiter_previous32(UCharIterator *iter) { 1085 UChar32 c, c2; 1086 1087 c=iter->previous(iter); 1088 if(UTF_IS_SECOND_SURROGATE(c)) { 1089 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) { 1090 c=UTF16_GET_PAIR_VALUE(c2, c); 1091 } else if(c2>=0) { 1092 /* unmatched second surrogate, undo index movement */ 1093 iter->move(iter, 1, UITER_CURRENT); 1094 } 1095 } 1096 return c; 1097 } 1098 1099 U_CAPI uint32_t U_EXPORT2 1100 uiter_getState(const UCharIterator *iter) { 1101 if(iter==NULL || iter->getState==NULL) { 1102 return UITER_NO_STATE; 1103 } else { 1104 return iter->getState(iter); 1105 } 1106 } 1107 1108 U_CAPI void U_EXPORT2 1109 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1110 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1111 /* do nothing */ 1112 } else if(iter==NULL) { 1113 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1114 } else if(iter->setState==NULL) { 1115 *pErrorCode=U_UNSUPPORTED_ERROR; 1116 } else { 1117 iter->setState(iter, state, pErrorCode); 1118 } 1119 } 1120 1121 U_CDECL_END 1122