Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2006, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uiter.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jan18
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/ustring.h"
     19 #include "unicode/chariter.h"
     20 #include "unicode/rep.h"
     21 #include "unicode/uiter.h"
     22 #include "cstring.h"
     23 
     24 U_NAMESPACE_USE
     25 
     26 #define IS_EVEN(n) (((n)&1)==0)
     27 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
     28 
     29 U_CDECL_BEGIN
     30 
     31 /* No-Op UCharIterator implementation for illegal input --------------------- */
     32 
     33 static int32_t U_CALLCONV
     34 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
     35     return 0;
     36 }
     37 
     38 static int32_t U_CALLCONV
     39 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
     40     return 0;
     41 }
     42 
     43 static UBool U_CALLCONV
     44 noopHasNext(UCharIterator * /*iter*/) {
     45     return FALSE;
     46 }
     47 
     48 static UChar32 U_CALLCONV
     49 noopCurrent(UCharIterator * /*iter*/) {
     50     return U_SENTINEL;
     51 }
     52 
     53 static uint32_t U_CALLCONV
     54 noopGetState(const UCharIterator * /*iter*/) {
     55     return UITER_NO_STATE;
     56 }
     57 
     58 static void U_CALLCONV
     59 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
     60     *pErrorCode=U_UNSUPPORTED_ERROR;
     61 }
     62 
     63 static const UCharIterator noopIterator={
     64     0, 0, 0, 0, 0, 0,
     65     noopGetIndex,
     66     noopMove,
     67     noopHasNext,
     68     noopHasNext,
     69     noopCurrent,
     70     noopCurrent,
     71     noopCurrent,
     72     NULL,
     73     noopGetState,
     74     noopSetState
     75 };
     76 
     77 /* UCharIterator implementation for simple strings -------------------------- */
     78 
     79 /*
     80  * This is an implementation of a code unit (UChar) iterator
     81  * for UChar * strings.
     82  *
     83  * The UCharIterator.context field holds a pointer to the string.
     84  */
     85 
     86 static int32_t U_CALLCONV
     87 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
     88     switch(origin) {
     89     case UITER_ZERO:
     90         return 0;
     91     case UITER_START:
     92         return iter->start;
     93     case UITER_CURRENT:
     94         return iter->index;
     95     case UITER_LIMIT:
     96         return iter->limit;
     97     case UITER_LENGTH:
     98         return iter->length;
     99     default:
    100         /* not a valid origin */
    101         /* Should never get here! */
    102         return -1;
    103     }
    104 }
    105 
    106 static int32_t U_CALLCONV
    107 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    108     int32_t pos;
    109 
    110     switch(origin) {
    111     case UITER_ZERO:
    112         pos=delta;
    113         break;
    114     case UITER_START:
    115         pos=iter->start+delta;
    116         break;
    117     case UITER_CURRENT:
    118         pos=iter->index+delta;
    119         break;
    120     case UITER_LIMIT:
    121         pos=iter->limit+delta;
    122         break;
    123     case UITER_LENGTH:
    124         pos=iter->length+delta;
    125         break;
    126     default:
    127         return -1;  /* Error */
    128     }
    129 
    130     if(pos<iter->start) {
    131         pos=iter->start;
    132     } else if(pos>iter->limit) {
    133         pos=iter->limit;
    134     }
    135 
    136     return iter->index=pos;
    137 }
    138 
    139 static UBool U_CALLCONV
    140 stringIteratorHasNext(UCharIterator *iter) {
    141     return iter->index<iter->limit;
    142 }
    143 
    144 static UBool U_CALLCONV
    145 stringIteratorHasPrevious(UCharIterator *iter) {
    146     return iter->index>iter->start;
    147 }
    148 
    149 static UChar32 U_CALLCONV
    150 stringIteratorCurrent(UCharIterator *iter) {
    151     if(iter->index<iter->limit) {
    152         return ((const UChar *)(iter->context))[iter->index];
    153     } else {
    154         return U_SENTINEL;
    155     }
    156 }
    157 
    158 static UChar32 U_CALLCONV
    159 stringIteratorNext(UCharIterator *iter) {
    160     if(iter->index<iter->limit) {
    161         return ((const UChar *)(iter->context))[iter->index++];
    162     } else {
    163         return U_SENTINEL;
    164     }
    165 }
    166 
    167 static UChar32 U_CALLCONV
    168 stringIteratorPrevious(UCharIterator *iter) {
    169     if(iter->index>iter->start) {
    170         return ((const UChar *)(iter->context))[--iter->index];
    171     } else {
    172         return U_SENTINEL;
    173     }
    174 }
    175 
    176 static uint32_t U_CALLCONV
    177 stringIteratorGetState(const UCharIterator *iter) {
    178     return (uint32_t)iter->index;
    179 }
    180 
    181 static void U_CALLCONV
    182 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    183     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    184         /* do nothing */
    185     } else if(iter==NULL) {
    186         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    187     } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
    188         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    189     } else {
    190         iter->index=(int32_t)state;
    191     }
    192 }
    193 
    194 static const UCharIterator stringIterator={
    195     0, 0, 0, 0, 0, 0,
    196     stringIteratorGetIndex,
    197     stringIteratorMove,
    198     stringIteratorHasNext,
    199     stringIteratorHasPrevious,
    200     stringIteratorCurrent,
    201     stringIteratorNext,
    202     stringIteratorPrevious,
    203     NULL,
    204     stringIteratorGetState,
    205     stringIteratorSetState
    206 };
    207 
    208 U_CAPI void U_EXPORT2
    209 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
    210     if(iter!=0) {
    211         if(s!=0 && length>=-1) {
    212             *iter=stringIterator;
    213             iter->context=s;
    214             if(length>=0) {
    215                 iter->length=length;
    216             } else {
    217                 iter->length=u_strlen(s);
    218             }
    219             iter->limit=iter->length;
    220         } else {
    221             *iter=noopIterator;
    222         }
    223     }
    224 }
    225 
    226 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
    227 
    228 /*
    229  * This is an implementation of a code unit (UChar) iterator
    230  * for UTF-16BE strings, i.e., strings in byte-vectors where
    231  * each UChar is stored as a big-endian pair of bytes.
    232  *
    233  * The UCharIterator.context field holds a pointer to the string.
    234  * Everything works just like with a normal UChar iterator (uiter_setString),
    235  * except that UChars are assembled from byte pairs.
    236  */
    237 
    238 /* internal helper function */
    239 static inline UChar32
    240 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
    241     const uint8_t *p=(const uint8_t *)iter->context;
    242     return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
    243 }
    244 
    245 static UChar32 U_CALLCONV
    246 utf16BEIteratorCurrent(UCharIterator *iter) {
    247     int32_t index;
    248 
    249     if((index=iter->index)<iter->limit) {
    250         return utf16BEIteratorGet(iter, index);
    251     } else {
    252         return U_SENTINEL;
    253     }
    254 }
    255 
    256 static UChar32 U_CALLCONV
    257 utf16BEIteratorNext(UCharIterator *iter) {
    258     int32_t index;
    259 
    260     if((index=iter->index)<iter->limit) {
    261         iter->index=index+1;
    262         return utf16BEIteratorGet(iter, index);
    263     } else {
    264         return U_SENTINEL;
    265     }
    266 }
    267 
    268 static UChar32 U_CALLCONV
    269 utf16BEIteratorPrevious(UCharIterator *iter) {
    270     int32_t index;
    271 
    272     if((index=iter->index)>iter->start) {
    273         iter->index=--index;
    274         return utf16BEIteratorGet(iter, index);
    275     } else {
    276         return U_SENTINEL;
    277     }
    278 }
    279 
    280 static const UCharIterator utf16BEIterator={
    281     0, 0, 0, 0, 0, 0,
    282     stringIteratorGetIndex,
    283     stringIteratorMove,
    284     stringIteratorHasNext,
    285     stringIteratorHasPrevious,
    286     utf16BEIteratorCurrent,
    287     utf16BEIteratorNext,
    288     utf16BEIteratorPrevious,
    289     NULL,
    290     stringIteratorGetState,
    291     stringIteratorSetState
    292 };
    293 
    294 /*
    295  * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
    296  * i.e., before a pair of 0 bytes where the first 0 byte is at an even
    297  * offset from s.
    298  */
    299 static int32_t
    300 utf16BE_strlen(const char *s) {
    301     if(IS_POINTER_EVEN(s)) {
    302         /*
    303          * even-aligned, call u_strlen(s)
    304          * we are probably on a little-endian machine, but searching for UChar NUL
    305          * does not care about endianness
    306          */
    307         return u_strlen((const UChar *)s);
    308     } else {
    309         /* odd-aligned, search for pair of 0 bytes */
    310         const char *p=s;
    311 
    312         while(!(*p==0 && p[1]==0)) {
    313             p+=2;
    314         }
    315         return (int32_t)((p-s)/2);
    316     }
    317 }
    318 
    319 U_CAPI void U_EXPORT2
    320 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
    321     if(iter!=NULL) {
    322         /* allow only even-length strings (the input length counts bytes) */
    323         if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
    324             /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
    325             length>>=1;
    326 
    327             if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
    328                 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
    329                 uiter_setString(iter, (const UChar *)s, length);
    330                 return;
    331             }
    332 
    333             *iter=utf16BEIterator;
    334             iter->context=s;
    335             if(length>=0) {
    336                 iter->length=length;
    337             } else {
    338                 iter->length=utf16BE_strlen(s);
    339             }
    340             iter->limit=iter->length;
    341         } else {
    342             *iter=noopIterator;
    343         }
    344     }
    345 }
    346 
    347 /* UCharIterator wrapper around CharacterIterator --------------------------- */
    348 
    349 /*
    350  * This is wrapper code around a C++ CharacterIterator to
    351  * look like a C UCharIterator.
    352  *
    353  * The UCharIterator.context field holds a pointer to the CharacterIterator.
    354  */
    355 
    356 static int32_t U_CALLCONV
    357 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    358     switch(origin) {
    359     case UITER_ZERO:
    360         return 0;
    361     case UITER_START:
    362         return ((CharacterIterator *)(iter->context))->startIndex();
    363     case UITER_CURRENT:
    364         return ((CharacterIterator *)(iter->context))->getIndex();
    365     case UITER_LIMIT:
    366         return ((CharacterIterator *)(iter->context))->endIndex();
    367     case UITER_LENGTH:
    368         return ((CharacterIterator *)(iter->context))->getLength();
    369     default:
    370         /* not a valid origin */
    371         /* Should never get here! */
    372         return -1;
    373     }
    374 }
    375 
    376 static int32_t U_CALLCONV
    377 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    378     switch(origin) {
    379     case UITER_ZERO:
    380         ((CharacterIterator *)(iter->context))->setIndex(delta);
    381         return ((CharacterIterator *)(iter->context))->getIndex();
    382     case UITER_START:
    383     case UITER_CURRENT:
    384     case UITER_LIMIT:
    385         return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
    386     case UITER_LENGTH:
    387         ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
    388         return ((CharacterIterator *)(iter->context))->getIndex();
    389     default:
    390         /* not a valid origin */
    391         /* Should never get here! */
    392         return -1;
    393     }
    394 }
    395 
    396 static UBool U_CALLCONV
    397 characterIteratorHasNext(UCharIterator *iter) {
    398     return ((CharacterIterator *)(iter->context))->hasNext();
    399 }
    400 
    401 static UBool U_CALLCONV
    402 characterIteratorHasPrevious(UCharIterator *iter) {
    403     return ((CharacterIterator *)(iter->context))->hasPrevious();
    404 }
    405 
    406 static UChar32 U_CALLCONV
    407 characterIteratorCurrent(UCharIterator *iter) {
    408     UChar32 c;
    409 
    410     c=((CharacterIterator *)(iter->context))->current();
    411     if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
    412         return c;
    413     } else {
    414         return U_SENTINEL;
    415     }
    416 }
    417 
    418 static UChar32 U_CALLCONV
    419 characterIteratorNext(UCharIterator *iter) {
    420     if(((CharacterIterator *)(iter->context))->hasNext()) {
    421         return ((CharacterIterator *)(iter->context))->nextPostInc();
    422     } else {
    423         return U_SENTINEL;
    424     }
    425 }
    426 
    427 static UChar32 U_CALLCONV
    428 characterIteratorPrevious(UCharIterator *iter) {
    429     if(((CharacterIterator *)(iter->context))->hasPrevious()) {
    430         return ((CharacterIterator *)(iter->context))->previous();
    431     } else {
    432         return U_SENTINEL;
    433     }
    434 }
    435 
    436 static uint32_t U_CALLCONV
    437 characterIteratorGetState(const UCharIterator *iter) {
    438     return ((CharacterIterator *)(iter->context))->getIndex();
    439 }
    440 
    441 static void U_CALLCONV
    442 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    443     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    444         /* do nothing */
    445     } else if(iter==NULL || iter->context==NULL) {
    446         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    447     } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
    448         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    449     } else {
    450         ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
    451     }
    452 }
    453 
    454 static const UCharIterator characterIteratorWrapper={
    455     0, 0, 0, 0, 0, 0,
    456     characterIteratorGetIndex,
    457     characterIteratorMove,
    458     characterIteratorHasNext,
    459     characterIteratorHasPrevious,
    460     characterIteratorCurrent,
    461     characterIteratorNext,
    462     characterIteratorPrevious,
    463     NULL,
    464     characterIteratorGetState,
    465     characterIteratorSetState
    466 };
    467 
    468 U_CAPI void U_EXPORT2
    469 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
    470     if(iter!=0) {
    471         if(charIter!=0) {
    472             *iter=characterIteratorWrapper;
    473             iter->context=charIter;
    474         } else {
    475             *iter=noopIterator;
    476         }
    477     }
    478 }
    479 
    480 /* UCharIterator wrapper around Replaceable --------------------------------- */
    481 
    482 /*
    483  * This is an implementation of a code unit (UChar) iterator
    484  * based on a Replaceable object.
    485  *
    486  * The UCharIterator.context field holds a pointer to the Replaceable.
    487  * UCharIterator.length and UCharIterator.index hold Replaceable.length()
    488  * and the iteration index.
    489  */
    490 
    491 static UChar32 U_CALLCONV
    492 replaceableIteratorCurrent(UCharIterator *iter) {
    493     if(iter->index<iter->limit) {
    494         return ((Replaceable *)(iter->context))->charAt(iter->index);
    495     } else {
    496         return U_SENTINEL;
    497     }
    498 }
    499 
    500 static UChar32 U_CALLCONV
    501 replaceableIteratorNext(UCharIterator *iter) {
    502     if(iter->index<iter->limit) {
    503         return ((Replaceable *)(iter->context))->charAt(iter->index++);
    504     } else {
    505         return U_SENTINEL;
    506     }
    507 }
    508 
    509 static UChar32 U_CALLCONV
    510 replaceableIteratorPrevious(UCharIterator *iter) {
    511     if(iter->index>iter->start) {
    512         return ((Replaceable *)(iter->context))->charAt(--iter->index);
    513     } else {
    514         return U_SENTINEL;
    515     }
    516 }
    517 
    518 static const UCharIterator replaceableIterator={
    519     0, 0, 0, 0, 0, 0,
    520     stringIteratorGetIndex,
    521     stringIteratorMove,
    522     stringIteratorHasNext,
    523     stringIteratorHasPrevious,
    524     replaceableIteratorCurrent,
    525     replaceableIteratorNext,
    526     replaceableIteratorPrevious,
    527     NULL,
    528     stringIteratorGetState,
    529     stringIteratorSetState
    530 };
    531 
    532 U_CAPI void U_EXPORT2
    533 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
    534     if(iter!=0) {
    535         if(rep!=0) {
    536             *iter=replaceableIterator;
    537             iter->context=rep;
    538             iter->limit=iter->length=rep->length();
    539         } else {
    540             *iter=noopIterator;
    541         }
    542     }
    543 }
    544 
    545 /* UCharIterator implementation for UTF-8 strings --------------------------- */
    546 
    547 /*
    548  * Possible, probably necessary only for an implementation for arbitrary
    549  * converters:
    550  * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
    551  * This would require to turn reservedFn into a close function and
    552  * to introduce a uiter_close(iter).
    553  */
    554 
/*
 * NOTE(review): not referenced anywhere in the visible part of this file;
 * presumably the buffer capacity for the converter-based implementation
 * sketched in the comment above - confirm before removing.
 */
#define UITER_CNV_CAPACITY 16
    556 
    557 /*
    558  * Minimal implementation:
    559  * Maintain a single-UChar buffer for an additional surrogate.
    560  * The caller must not modify start and limit because they are used internally.
    561  *
    562  * Use UCharIterator fields as follows:
    563  *   context        pointer to UTF-8 string
    564  *   length         UTF-16 length of the string; -1 until lazy evaluation
    565  *   start          current UTF-8 index
    566  *   index          current UTF-16 index; may be -1="unknown" after setState()
    567  *   limit          UTF-8 length of the string
    568  *   reservedField  supplementary code point
    569  *
    570  * Since UCharIterator delivers 16-bit code units, the iteration can be
    571  * currently in the middle of the byte sequence for a supplementary code point.
    572  * In this case, reservedField will contain that code point and start will
    573  * point to after the corresponding byte sequence. The UTF-16 index will be
    574  * one less than what it would otherwise be corresponding to the UTF-8 index.
    575  * Otherwise, reservedField will be 0.
    576  */
    577 
    578 /*
    579  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
    580  * Add implementations that do not call strlen() for iteration but check for NUL.
    581  */
    582 
    583 static int32_t U_CALLCONV
    584 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    585     switch(origin) {
    586     case UITER_ZERO:
    587     case UITER_START:
    588         return 0;
    589     case UITER_CURRENT:
    590         if(iter->index<0) {
    591             /* the current UTF-16 index is unknown after setState(), count from the beginning */
    592             const uint8_t *s;
    593             UChar32 c;
    594             int32_t i, limit, index;
    595 
    596             s=(const uint8_t *)iter->context;
    597             i=index=0;
    598             limit=iter->start; /* count up to the UTF-8 index */
    599             while(i<limit) {
    600                 U8_NEXT(s, i, limit, c);
    601                 if(c<=0xffff) {
    602                     ++index;
    603                 } else {
    604                     index+=2;
    605                 }
    606             }
    607 
    608             iter->start=i; /* just in case setState() did not get us to a code point boundary */
    609             if(i==iter->limit) {
    610                 iter->length=index; /* in case it was <0 or wrong */
    611             }
    612             if(iter->reservedField!=0) {
    613                 --index; /* we are in the middle of a supplementary code point */
    614             }
    615             iter->index=index;
    616         }
    617         return iter->index;
    618     case UITER_LIMIT:
    619     case UITER_LENGTH:
    620         if(iter->length<0) {
    621             const uint8_t *s;
    622             UChar32 c;
    623             int32_t i, limit, length;
    624 
    625             s=(const uint8_t *)iter->context;
    626             if(iter->index<0) {
    627                 /*
    628                  * the current UTF-16 index is unknown after setState(),
    629                  * we must first count from the beginning to here
    630                  */
    631                 i=length=0;
    632                 limit=iter->start;
    633 
    634                 /* count from the beginning to the current index */
    635                 while(i<limit) {
    636                     U8_NEXT(s, i, limit, c);
    637                     if(c<=0xffff) {
    638                         ++length;
    639                     } else {
    640                         length+=2;
    641                     }
    642                 }
    643 
    644                 /* assume i==limit==iter->start, set the UTF-16 index */
    645                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
    646                 iter->index= iter->reservedField!=0 ? length-1 : length;
    647             } else {
    648                 i=iter->start;
    649                 length=iter->index;
    650                 if(iter->reservedField!=0) {
    651                     ++length;
    652                 }
    653             }
    654 
    655             /* count from the current index to the end */
    656             limit=iter->limit;
    657             while(i<limit) {
    658                 U8_NEXT(s, i, limit, c);
    659                 if(c<=0xffff) {
    660                     ++length;
    661                 } else {
    662                     length+=2;
    663                 }
    664             }
    665             iter->length=length;
    666         }
    667         return iter->length;
    668     default:
    669         /* not a valid origin */
    670         /* Should never get here! */
    671         return -1;
    672     }
    673 }
    674 
    675 static int32_t U_CALLCONV
    676 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    677     const uint8_t *s;
    678     UChar32 c;
    679     int32_t pos; /* requested UTF-16 index */
    680     int32_t i; /* UTF-8 index */
    681     UBool havePos;
    682 
    683     /* calculate the requested UTF-16 index */
    684     switch(origin) {
    685     case UITER_ZERO:
    686     case UITER_START:
    687         pos=delta;
    688         havePos=TRUE;
    689         /* iter->index<0 (unknown) is possible */
    690         break;
    691     case UITER_CURRENT:
    692         if(iter->index>=0) {
    693             pos=iter->index+delta;
    694             havePos=TRUE;
    695         } else {
    696             /* the current UTF-16 index is unknown after setState(), use only delta */
    697             pos=0;
    698             havePos=FALSE;
    699         }
    700         break;
    701     case UITER_LIMIT:
    702     case UITER_LENGTH:
    703         if(iter->length>=0) {
    704             pos=iter->length+delta;
    705             havePos=TRUE;
    706         } else {
    707             /* pin to the end, avoid counting the length */
    708             iter->index=-1;
    709             iter->start=iter->limit;
    710             iter->reservedField=0;
    711             if(delta>=0) {
    712                 return UITER_UNKNOWN_INDEX;
    713             } else {
    714                 /* the current UTF-16 index is unknown, use only delta */
    715                 pos=0;
    716                 havePos=FALSE;
    717             }
    718         }
    719         break;
    720     default:
    721         return -1;  /* Error */
    722     }
    723 
    724     if(havePos) {
    725         /* shortcuts: pinning to the edges of the string */
    726         if(pos<=0) {
    727             iter->index=iter->start=iter->reservedField=0;
    728             return 0;
    729         } else if(iter->length>=0 && pos>=iter->length) {
    730             iter->index=iter->length;
    731             iter->start=iter->limit;
    732             iter->reservedField=0;
    733             return iter->index;
    734         }
    735 
    736         /* minimize the number of U8_NEXT/PREV operations */
    737         if(iter->index<0 || pos<iter->index/2) {
    738             /* go forward from the start instead of backward from the current index */
    739             iter->index=iter->start=iter->reservedField=0;
    740         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
    741             /*
    742              * if we have the UTF-16 index and length and the new position is
    743              * closer to the end than the current index,
    744              * then go backward from the end instead of forward from the current index
    745              */
    746             iter->index=iter->length;
    747             iter->start=iter->limit;
    748             iter->reservedField=0;
    749         }
    750 
    751         delta=pos-iter->index;
    752         if(delta==0) {
    753             return iter->index; /* nothing to do */
    754         }
    755     } else {
    756         /* move relative to unknown UTF-16 index */
    757         if(delta==0) {
    758             return UITER_UNKNOWN_INDEX; /* nothing to do */
    759         } else if(-delta>=iter->start) {
    760             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
    761             iter->index=iter->start=iter->reservedField=0;
    762             return 0;
    763         } else if(delta>=(iter->limit-iter->start)) {
    764             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
    765             iter->index=iter->length; /* may or may not be <0 (unknown) */
    766             iter->start=iter->limit;
    767             iter->reservedField=0;
    768             return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
    769         }
    770     }
    771 
    772     /* delta!=0 */
    773 
    774     /* move towards the requested position, pin to the edges of the string */
    775     s=(const uint8_t *)iter->context;
    776     pos=iter->index; /* could be <0 (unknown) */
    777     i=iter->start;
    778     if(delta>0) {
    779         /* go forward */
    780         int32_t limit=iter->limit;
    781         if(iter->reservedField!=0) {
    782             iter->reservedField=0;
    783             ++pos;
    784             --delta;
    785         }
    786         while(delta>0 && i<limit) {
    787             U8_NEXT(s, i, limit, c);
    788             if(c<0xffff) {
    789                 ++pos;
    790                 --delta;
    791             } else if(delta>=2) {
    792                 pos+=2;
    793                 delta-=2;
    794             } else /* delta==1 */ {
    795                 /* stop in the middle of a supplementary code point */
    796                 iter->reservedField=c;
    797                 ++pos;
    798                 break; /* delta=0; */
    799             }
    800         }
    801         if(i==limit) {
    802             if(iter->length<0 && iter->index>=0) {
    803                 iter->length= iter->reservedField==0 ? pos : pos+1;
    804             } else if(iter->index<0 && iter->length>=0) {
    805                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
    806             }
    807         }
    808     } else /* delta<0 */ {
    809         /* go backward */
    810         if(iter->reservedField!=0) {
    811             iter->reservedField=0;
    812             i-=4; /* we stayed behind the supplementary code point; go before it now */
    813             --pos;
    814             ++delta;
    815         }
    816         while(delta<0 && i>0) {
    817             U8_PREV(s, 0, i, c);
    818             if(c<0xffff) {
    819                 --pos;
    820                 ++delta;
    821             } else if(delta<=-2) {
    822                 pos-=2;
    823                 delta+=2;
    824             } else /* delta==-1 */ {
    825                 /* stop in the middle of a supplementary code point */
    826                 i+=4; /* back to behind this supplementary code point for consistent state */
    827                 iter->reservedField=c;
    828                 --pos;
    829                 break; /* delta=0; */
    830             }
    831         }
    832     }
    833 
    834     iter->start=i;
    835     if(iter->index>=0) {
    836         return iter->index=pos;
    837     } else {
    838         /* we started with index<0 (unknown) so pos is bogus */
    839         if(i<=1) {
    840             return iter->index=i; /* reached the beginning */
    841         } else {
    842             /* we still don't know the UTF-16 index */
    843             return UITER_UNKNOWN_INDEX;
    844         }
    845     }
    846 }
    847 
    848 static UBool U_CALLCONV
    849 utf8IteratorHasNext(UCharIterator *iter) {
    850     return iter->start<iter->limit || iter->reservedField!=0;
    851 }
    852 
    853 static UBool U_CALLCONV
    854 utf8IteratorHasPrevious(UCharIterator *iter) {
    855     return iter->start>0;
    856 }
    857 
    858 static UChar32 U_CALLCONV
    859 utf8IteratorCurrent(UCharIterator *iter) {
    860     if(iter->reservedField!=0) {
    861         return U16_TRAIL(iter->reservedField);
    862     } else if(iter->start<iter->limit) {
    863         const uint8_t *s=(const uint8_t *)iter->context;
    864         UChar32 c;
    865         int32_t i=iter->start;
    866 
    867         U8_NEXT(s, i, iter->limit, c);
    868         if(c<0) {
    869             return 0xfffd;
    870         } else if(c<=0xffff) {
    871             return c;
    872         } else {
    873             return U16_LEAD(c);
    874         }
    875     } else {
    876         return U_SENTINEL;
    877     }
    878 }
    879 
    880 static UChar32 U_CALLCONV
    881 utf8IteratorNext(UCharIterator *iter) {
    882     int32_t index;
    883 
    884     if(iter->reservedField!=0) {
    885         UChar trail=U16_TRAIL(iter->reservedField);
    886         iter->reservedField=0;
    887         if((index=iter->index)>=0) {
    888             iter->index=index+1;
    889         }
    890         return trail;
    891     } else if(iter->start<iter->limit) {
    892         const uint8_t *s=(const uint8_t *)iter->context;
    893         UChar32 c;
    894 
    895         U8_NEXT(s, iter->start, iter->limit, c);
    896         if((index=iter->index)>=0) {
    897             iter->index=++index;
    898             if(iter->length<0 && iter->start==iter->limit) {
    899                 iter->length= c<=0xffff ? index : index+1;
    900             }
    901         } else if(iter->start==iter->limit && iter->length>=0) {
    902             iter->index= c<=0xffff ? iter->length : iter->length-1;
    903         }
    904         if(c<0) {
    905             return 0xfffd;
    906         } else if(c<=0xffff) {
    907             return c;
    908         } else {
    909             iter->reservedField=c;
    910             return U16_LEAD(c);
    911         }
    912     } else {
    913         return U_SENTINEL;
    914     }
    915 }
    916 
    917 static UChar32 U_CALLCONV
    918 utf8IteratorPrevious(UCharIterator *iter) {
    919     int32_t index;
    920 
    921     if(iter->reservedField!=0) {
    922         UChar lead=U16_LEAD(iter->reservedField);
    923         iter->reservedField=0;
    924         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
    925         if((index=iter->index)>0) {
    926             iter->index=index-1;
    927         }
    928         return lead;
    929     } else if(iter->start>0) {
    930         const uint8_t *s=(const uint8_t *)iter->context;
    931         UChar32 c;
    932 
    933         U8_PREV(s, 0, iter->start, c);
    934         if((index=iter->index)>0) {
    935             iter->index=index-1;
    936         } else if(iter->start<=1) {
    937             iter->index= c<=0xffff ? iter->start : iter->start+1;
    938         }
    939         if(c<0) {
    940             return 0xfffd;
    941         } else if(c<=0xffff) {
    942             return c;
    943         } else {
    944             iter->start+=4; /* back to behind this supplementary code point for consistent state */
    945             iter->reservedField=c;
    946             return U16_TRAIL(c);
    947         }
    948     } else {
    949         return U_SENTINEL;
    950     }
    951 }
    952 
    953 static uint32_t U_CALLCONV
    954 utf8IteratorGetState(const UCharIterator *iter) {
    955     uint32_t state=(uint32_t)(iter->start<<1);
    956     if(iter->reservedField!=0) {
    957         state|=1;
    958     }
    959     return state;
    960 }
    961 
    962 static void U_CALLCONV
    963 utf8IteratorSetState(UCharIterator *iter,
    964                      uint32_t state,
    965                      UErrorCode *pErrorCode)
    966 {
    967     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    968         /* do nothing */
    969     } else if(iter==NULL) {
    970         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    971     } else if(state==utf8IteratorGetState(iter)) {
    972         /* setting to the current state: no-op */
    973     } else {
    974         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
    975         state&=1; /* 1 if in surrogate pair, must be index>=4 */
    976 
    977         if((state==0 ? index<0 : index<4) || iter->limit<index) {
    978             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    979         } else {
    980             iter->start=index; /* restore UTF-8 byte index */
    981             if(index<=1) {
    982                 iter->index=index;
    983             } else {
    984                 iter->index=-1; /* unknown UTF-16 index */
    985             }
    986             if(state==0) {
    987                 iter->reservedField=0;
    988             } else {
    989                 /* verified index>=4 above */
    990                 UChar32 c;
    991                 U8_PREV((const uint8_t *)iter->context, 0, index, c);
    992                 if(c<=0xffff) {
    993                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    994                 } else {
    995                     iter->reservedField=c;
    996                 }
    997             }
    998         }
    999     }
   1000 }
   1001 
   1002 static const UCharIterator utf8Iterator={
   1003     0, 0, 0, 0, 0, 0,
   1004     utf8IteratorGetIndex,
   1005     utf8IteratorMove,
   1006     utf8IteratorHasNext,
   1007     utf8IteratorHasPrevious,
   1008     utf8IteratorCurrent,
   1009     utf8IteratorNext,
   1010     utf8IteratorPrevious,
   1011     NULL,
   1012     utf8IteratorGetState,
   1013     utf8IteratorSetState
   1014 };
   1015 
   1016 U_CAPI void U_EXPORT2
   1017 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
   1018     if(iter!=0) {
   1019         if(s!=0 && length>=-1) {
   1020             *iter=utf8Iterator;
   1021             iter->context=s;
   1022             if(length>=0) {
   1023                 iter->limit=length;
   1024             } else {
   1025                 iter->limit=(int32_t)uprv_strlen(s);
   1026             }
   1027             iter->length= iter->limit<=1 ? iter->limit : -1;
   1028         } else {
   1029             *iter=noopIterator;
   1030         }
   1031     }
   1032 }
   1033 
   1034 /* Helper functions --------------------------------------------------------- */
   1035 
   1036 U_CAPI UChar32 U_EXPORT2
   1037 uiter_current32(UCharIterator *iter) {
   1038     UChar32 c, c2;
   1039 
   1040     c=iter->current(iter);
   1041     if(UTF_IS_SURROGATE(c)) {
   1042         if(UTF_IS_SURROGATE_FIRST(c)) {
   1043             /*
   1044              * go to the next code unit
   1045              * we know that we are not at the limit because c!=U_SENTINEL
   1046              */
   1047             iter->move(iter, 1, UITER_CURRENT);
   1048             if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) {
   1049                 c=UTF16_GET_PAIR_VALUE(c, c2);
   1050             }
   1051 
   1052             /* undo index movement */
   1053             iter->move(iter, -1, UITER_CURRENT);
   1054         } else {
   1055             if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
   1056                 c=UTF16_GET_PAIR_VALUE(c2, c);
   1057             }
   1058             if(c2>=0) {
   1059                 /* undo index movement */
   1060                 iter->move(iter, 1, UITER_CURRENT);
   1061             }
   1062         }
   1063     }
   1064     return c;
   1065 }
   1066 
   1067 U_CAPI UChar32 U_EXPORT2
   1068 uiter_next32(UCharIterator *iter) {
   1069     UChar32 c, c2;
   1070 
   1071     c=iter->next(iter);
   1072     if(UTF_IS_FIRST_SURROGATE(c)) {
   1073         if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) {
   1074             c=UTF16_GET_PAIR_VALUE(c, c2);
   1075         } else if(c2>=0) {
   1076             /* unmatched first surrogate, undo index movement */
   1077             iter->move(iter, -1, UITER_CURRENT);
   1078         }
   1079     }
   1080     return c;
   1081 }
   1082 
   1083 U_CAPI UChar32 U_EXPORT2
   1084 uiter_previous32(UCharIterator *iter) {
   1085     UChar32 c, c2;
   1086 
   1087     c=iter->previous(iter);
   1088     if(UTF_IS_SECOND_SURROGATE(c)) {
   1089         if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
   1090             c=UTF16_GET_PAIR_VALUE(c2, c);
   1091         } else if(c2>=0) {
   1092             /* unmatched second surrogate, undo index movement */
   1093             iter->move(iter, 1, UITER_CURRENT);
   1094         }
   1095     }
   1096     return c;
   1097 }
   1098 
   1099 U_CAPI uint32_t U_EXPORT2
   1100 uiter_getState(const UCharIterator *iter) {
   1101     if(iter==NULL || iter->getState==NULL) {
   1102         return UITER_NO_STATE;
   1103     } else {
   1104         return iter->getState(iter);
   1105     }
   1106 }
   1107 
   1108 U_CAPI void U_EXPORT2
   1109 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
   1110     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1111         /* do nothing */
   1112     } else if(iter==NULL) {
   1113         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1114     } else if(iter->setState==NULL) {
   1115         *pErrorCode=U_UNSUPPORTED_ERROR;
   1116     } else {
   1117         iter->setState(iter, state, pErrorCode);
   1118     }
   1119 }
   1120 
   1121 U_CDECL_END
   1122