Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uiter.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jan18
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/ustring.h"
     19 #include "unicode/chariter.h"
     20 #include "unicode/rep.h"
     21 #include "unicode/uiter.h"
     22 #include "unicode/utf.h"
     23 #include "unicode/utf8.h"
     24 #include "unicode/utf16.h"
     25 #include "cstring.h"
     26 
     27 U_NAMESPACE_USE
     28 
     29 #define IS_EVEN(n) (((n)&1)==0)
     30 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
     31 
     32 U_CDECL_BEGIN
     33 
     34 /* No-Op UCharIterator implementation for illegal input --------------------- */
     35 
     36 static int32_t U_CALLCONV
     37 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
     38     return 0;
     39 }
     40 
     41 static int32_t U_CALLCONV
     42 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
     43     return 0;
     44 }
     45 
     46 static UBool U_CALLCONV
     47 noopHasNext(UCharIterator * /*iter*/) {
     48     return FALSE;
     49 }
     50 
     51 static UChar32 U_CALLCONV
     52 noopCurrent(UCharIterator * /*iter*/) {
     53     return U_SENTINEL;
     54 }
     55 
     56 static uint32_t U_CALLCONV
     57 noopGetState(const UCharIterator * /*iter*/) {
     58     return UITER_NO_STATE;
     59 }
     60 
     61 static void U_CALLCONV
     62 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
     63     *pErrorCode=U_UNSUPPORTED_ERROR;
     64 }
     65 
     66 static const UCharIterator noopIterator={
     67     0, 0, 0, 0, 0, 0,
     68     noopGetIndex,
     69     noopMove,
     70     noopHasNext,
     71     noopHasNext,
     72     noopCurrent,
     73     noopCurrent,
     74     noopCurrent,
     75     NULL,
     76     noopGetState,
     77     noopSetState
     78 };
     79 
     80 /* UCharIterator implementation for simple strings -------------------------- */
     81 
     82 /*
     83  * This is an implementation of a code unit (UChar) iterator
     84  * for UChar * strings.
     85  *
     86  * The UCharIterator.context field holds a pointer to the string.
     87  */
     88 
     89 static int32_t U_CALLCONV
     90 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
     91     switch(origin) {
     92     case UITER_ZERO:
     93         return 0;
     94     case UITER_START:
     95         return iter->start;
     96     case UITER_CURRENT:
     97         return iter->index;
     98     case UITER_LIMIT:
     99         return iter->limit;
    100     case UITER_LENGTH:
    101         return iter->length;
    102     default:
    103         /* not a valid origin */
    104         /* Should never get here! */
    105         return -1;
    106     }
    107 }
    108 
    109 static int32_t U_CALLCONV
    110 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    111     int32_t pos;
    112 
    113     switch(origin) {
    114     case UITER_ZERO:
    115         pos=delta;
    116         break;
    117     case UITER_START:
    118         pos=iter->start+delta;
    119         break;
    120     case UITER_CURRENT:
    121         pos=iter->index+delta;
    122         break;
    123     case UITER_LIMIT:
    124         pos=iter->limit+delta;
    125         break;
    126     case UITER_LENGTH:
    127         pos=iter->length+delta;
    128         break;
    129     default:
    130         return -1;  /* Error */
    131     }
    132 
    133     if(pos<iter->start) {
    134         pos=iter->start;
    135     } else if(pos>iter->limit) {
    136         pos=iter->limit;
    137     }
    138 
    139     return iter->index=pos;
    140 }
    141 
    142 static UBool U_CALLCONV
    143 stringIteratorHasNext(UCharIterator *iter) {
    144     return iter->index<iter->limit;
    145 }
    146 
    147 static UBool U_CALLCONV
    148 stringIteratorHasPrevious(UCharIterator *iter) {
    149     return iter->index>iter->start;
    150 }
    151 
    152 static UChar32 U_CALLCONV
    153 stringIteratorCurrent(UCharIterator *iter) {
    154     if(iter->index<iter->limit) {
    155         return ((const UChar *)(iter->context))[iter->index];
    156     } else {
    157         return U_SENTINEL;
    158     }
    159 }
    160 
    161 static UChar32 U_CALLCONV
    162 stringIteratorNext(UCharIterator *iter) {
    163     if(iter->index<iter->limit) {
    164         return ((const UChar *)(iter->context))[iter->index++];
    165     } else {
    166         return U_SENTINEL;
    167     }
    168 }
    169 
    170 static UChar32 U_CALLCONV
    171 stringIteratorPrevious(UCharIterator *iter) {
    172     if(iter->index>iter->start) {
    173         return ((const UChar *)(iter->context))[--iter->index];
    174     } else {
    175         return U_SENTINEL;
    176     }
    177 }
    178 
    179 static uint32_t U_CALLCONV
    180 stringIteratorGetState(const UCharIterator *iter) {
    181     return (uint32_t)iter->index;
    182 }
    183 
    184 static void U_CALLCONV
    185 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    186     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    187         /* do nothing */
    188     } else if(iter==NULL) {
    189         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    190     } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
    191         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    192     } else {
    193         iter->index=(int32_t)state;
    194     }
    195 }
    196 
    197 static const UCharIterator stringIterator={
    198     0, 0, 0, 0, 0, 0,
    199     stringIteratorGetIndex,
    200     stringIteratorMove,
    201     stringIteratorHasNext,
    202     stringIteratorHasPrevious,
    203     stringIteratorCurrent,
    204     stringIteratorNext,
    205     stringIteratorPrevious,
    206     NULL,
    207     stringIteratorGetState,
    208     stringIteratorSetState
    209 };
    210 
    211 U_CAPI void U_EXPORT2
    212 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
    213     if(iter!=0) {
    214         if(s!=0 && length>=-1) {
    215             *iter=stringIterator;
    216             iter->context=s;
    217             if(length>=0) {
    218                 iter->length=length;
    219             } else {
    220                 iter->length=u_strlen(s);
    221             }
    222             iter->limit=iter->length;
    223         } else {
    224             *iter=noopIterator;
    225         }
    226     }
    227 }
    228 
    229 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
    230 
    231 /*
    232  * This is an implementation of a code unit (UChar) iterator
    233  * for UTF-16BE strings, i.e., strings in byte-vectors where
    234  * each UChar is stored as a big-endian pair of bytes.
    235  *
    236  * The UCharIterator.context field holds a pointer to the string.
    237  * Everything works just like with a normal UChar iterator (uiter_setString),
    238  * except that UChars are assembled from byte pairs.
    239  */
    240 
    241 /* internal helper function */
    242 static inline UChar32
    243 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
    244     const uint8_t *p=(const uint8_t *)iter->context;
    245     return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
    246 }
    247 
    248 static UChar32 U_CALLCONV
    249 utf16BEIteratorCurrent(UCharIterator *iter) {
    250     int32_t index;
    251 
    252     if((index=iter->index)<iter->limit) {
    253         return utf16BEIteratorGet(iter, index);
    254     } else {
    255         return U_SENTINEL;
    256     }
    257 }
    258 
    259 static UChar32 U_CALLCONV
    260 utf16BEIteratorNext(UCharIterator *iter) {
    261     int32_t index;
    262 
    263     if((index=iter->index)<iter->limit) {
    264         iter->index=index+1;
    265         return utf16BEIteratorGet(iter, index);
    266     } else {
    267         return U_SENTINEL;
    268     }
    269 }
    270 
    271 static UChar32 U_CALLCONV
    272 utf16BEIteratorPrevious(UCharIterator *iter) {
    273     int32_t index;
    274 
    275     if((index=iter->index)>iter->start) {
    276         iter->index=--index;
    277         return utf16BEIteratorGet(iter, index);
    278     } else {
    279         return U_SENTINEL;
    280     }
    281 }
    282 
    283 static const UCharIterator utf16BEIterator={
    284     0, 0, 0, 0, 0, 0,
    285     stringIteratorGetIndex,
    286     stringIteratorMove,
    287     stringIteratorHasNext,
    288     stringIteratorHasPrevious,
    289     utf16BEIteratorCurrent,
    290     utf16BEIteratorNext,
    291     utf16BEIteratorPrevious,
    292     NULL,
    293     stringIteratorGetState,
    294     stringIteratorSetState
    295 };
    296 
    297 /*
    298  * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
    299  * i.e., before a pair of 0 bytes where the first 0 byte is at an even
    300  * offset from s.
    301  */
    302 static int32_t
    303 utf16BE_strlen(const char *s) {
    304     if(IS_POINTER_EVEN(s)) {
    305         /*
    306          * even-aligned, call u_strlen(s)
    307          * we are probably on a little-endian machine, but searching for UChar NUL
    308          * does not care about endianness
    309          */
    310         return u_strlen((const UChar *)s);
    311     } else {
    312         /* odd-aligned, search for pair of 0 bytes */
    313         const char *p=s;
    314 
    315         while(!(*p==0 && p[1]==0)) {
    316             p+=2;
    317         }
    318         return (int32_t)((p-s)/2);
    319     }
    320 }
    321 
    322 U_CAPI void U_EXPORT2
    323 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
    324     if(iter!=NULL) {
    325         /* allow only even-length strings (the input length counts bytes) */
    326         if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
    327             /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
    328             length>>=1;
    329 
    330             if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
    331                 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
    332                 uiter_setString(iter, (const UChar *)s, length);
    333                 return;
    334             }
    335 
    336             *iter=utf16BEIterator;
    337             iter->context=s;
    338             if(length>=0) {
    339                 iter->length=length;
    340             } else {
    341                 iter->length=utf16BE_strlen(s);
    342             }
    343             iter->limit=iter->length;
    344         } else {
    345             *iter=noopIterator;
    346         }
    347     }
    348 }
    349 
    350 /* UCharIterator wrapper around CharacterIterator --------------------------- */
    351 
    352 /*
    353  * This is wrapper code around a C++ CharacterIterator to
    354  * look like a C UCharIterator.
    355  *
    356  * The UCharIterator.context field holds a pointer to the CharacterIterator.
    357  */
    358 
    359 static int32_t U_CALLCONV
    360 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    361     switch(origin) {
    362     case UITER_ZERO:
    363         return 0;
    364     case UITER_START:
    365         return ((CharacterIterator *)(iter->context))->startIndex();
    366     case UITER_CURRENT:
    367         return ((CharacterIterator *)(iter->context))->getIndex();
    368     case UITER_LIMIT:
    369         return ((CharacterIterator *)(iter->context))->endIndex();
    370     case UITER_LENGTH:
    371         return ((CharacterIterator *)(iter->context))->getLength();
    372     default:
    373         /* not a valid origin */
    374         /* Should never get here! */
    375         return -1;
    376     }
    377 }
    378 
    379 static int32_t U_CALLCONV
    380 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    381     switch(origin) {
    382     case UITER_ZERO:
    383         ((CharacterIterator *)(iter->context))->setIndex(delta);
    384         return ((CharacterIterator *)(iter->context))->getIndex();
    385     case UITER_START:
    386     case UITER_CURRENT:
    387     case UITER_LIMIT:
    388         return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
    389     case UITER_LENGTH:
    390         ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
    391         return ((CharacterIterator *)(iter->context))->getIndex();
    392     default:
    393         /* not a valid origin */
    394         /* Should never get here! */
    395         return -1;
    396     }
    397 }
    398 
    399 static UBool U_CALLCONV
    400 characterIteratorHasNext(UCharIterator *iter) {
    401     return ((CharacterIterator *)(iter->context))->hasNext();
    402 }
    403 
    404 static UBool U_CALLCONV
    405 characterIteratorHasPrevious(UCharIterator *iter) {
    406     return ((CharacterIterator *)(iter->context))->hasPrevious();
    407 }
    408 
    409 static UChar32 U_CALLCONV
    410 characterIteratorCurrent(UCharIterator *iter) {
    411     UChar32 c;
    412 
    413     c=((CharacterIterator *)(iter->context))->current();
    414     if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
    415         return c;
    416     } else {
    417         return U_SENTINEL;
    418     }
    419 }
    420 
    421 static UChar32 U_CALLCONV
    422 characterIteratorNext(UCharIterator *iter) {
    423     if(((CharacterIterator *)(iter->context))->hasNext()) {
    424         return ((CharacterIterator *)(iter->context))->nextPostInc();
    425     } else {
    426         return U_SENTINEL;
    427     }
    428 }
    429 
    430 static UChar32 U_CALLCONV
    431 characterIteratorPrevious(UCharIterator *iter) {
    432     if(((CharacterIterator *)(iter->context))->hasPrevious()) {
    433         return ((CharacterIterator *)(iter->context))->previous();
    434     } else {
    435         return U_SENTINEL;
    436     }
    437 }
    438 
    439 static uint32_t U_CALLCONV
    440 characterIteratorGetState(const UCharIterator *iter) {
    441     return ((CharacterIterator *)(iter->context))->getIndex();
    442 }
    443 
    444 static void U_CALLCONV
    445 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    446     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    447         /* do nothing */
    448     } else if(iter==NULL || iter->context==NULL) {
    449         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    450     } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
    451         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    452     } else {
    453         ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
    454     }
    455 }
    456 
    457 static const UCharIterator characterIteratorWrapper={
    458     0, 0, 0, 0, 0, 0,
    459     characterIteratorGetIndex,
    460     characterIteratorMove,
    461     characterIteratorHasNext,
    462     characterIteratorHasPrevious,
    463     characterIteratorCurrent,
    464     characterIteratorNext,
    465     characterIteratorPrevious,
    466     NULL,
    467     characterIteratorGetState,
    468     characterIteratorSetState
    469 };
    470 
    471 U_CAPI void U_EXPORT2
    472 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
    473     if(iter!=0) {
    474         if(charIter!=0) {
    475             *iter=characterIteratorWrapper;
    476             iter->context=charIter;
    477         } else {
    478             *iter=noopIterator;
    479         }
    480     }
    481 }
    482 
    483 /* UCharIterator wrapper around Replaceable --------------------------------- */
    484 
    485 /*
    486  * This is an implementation of a code unit (UChar) iterator
    487  * based on a Replaceable object.
    488  *
    489  * The UCharIterator.context field holds a pointer to the Replaceable.
    490  * UCharIterator.length and UCharIterator.index hold Replaceable.length()
    491  * and the iteration index.
    492  */
    493 
    494 static UChar32 U_CALLCONV
    495 replaceableIteratorCurrent(UCharIterator *iter) {
    496     if(iter->index<iter->limit) {
    497         return ((Replaceable *)(iter->context))->charAt(iter->index);
    498     } else {
    499         return U_SENTINEL;
    500     }
    501 }
    502 
    503 static UChar32 U_CALLCONV
    504 replaceableIteratorNext(UCharIterator *iter) {
    505     if(iter->index<iter->limit) {
    506         return ((Replaceable *)(iter->context))->charAt(iter->index++);
    507     } else {
    508         return U_SENTINEL;
    509     }
    510 }
    511 
    512 static UChar32 U_CALLCONV
    513 replaceableIteratorPrevious(UCharIterator *iter) {
    514     if(iter->index>iter->start) {
    515         return ((Replaceable *)(iter->context))->charAt(--iter->index);
    516     } else {
    517         return U_SENTINEL;
    518     }
    519 }
    520 
    521 static const UCharIterator replaceableIterator={
    522     0, 0, 0, 0, 0, 0,
    523     stringIteratorGetIndex,
    524     stringIteratorMove,
    525     stringIteratorHasNext,
    526     stringIteratorHasPrevious,
    527     replaceableIteratorCurrent,
    528     replaceableIteratorNext,
    529     replaceableIteratorPrevious,
    530     NULL,
    531     stringIteratorGetState,
    532     stringIteratorSetState
    533 };
    534 
    535 U_CAPI void U_EXPORT2
    536 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
    537     if(iter!=0) {
    538         if(rep!=0) {
    539             *iter=replaceableIterator;
    540             iter->context=rep;
    541             iter->limit=iter->length=rep->length();
    542         } else {
    543             *iter=noopIterator;
    544         }
    545     }
    546 }
    547 
    548 /* UCharIterator implementation for UTF-8 strings --------------------------- */
    549 
/*
 * Possible, probably necessary only for an implementation for arbitrary
 * converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
    557 
    558 #define UITER_CNV_CAPACITY 16
    559 
    560 /*
    561  * Minimal implementation:
    562  * Maintain a single-UChar buffer for an additional surrogate.
    563  * The caller must not modify start and limit because they are used internally.
    564  *
    565  * Use UCharIterator fields as follows:
    566  *   context        pointer to UTF-8 string
    567  *   length         UTF-16 length of the string; -1 until lazy evaluation
    568  *   start          current UTF-8 index
    569  *   index          current UTF-16 index; may be -1="unknown" after setState()
    570  *   limit          UTF-8 length of the string
    571  *   reservedField  supplementary code point
    572  *
    573  * Since UCharIterator delivers 16-bit code units, the iteration can be
    574  * currently in the middle of the byte sequence for a supplementary code point.
    575  * In this case, reservedField will contain that code point and start will
    576  * point to after the corresponding byte sequence. The UTF-16 index will be
    577  * one less than what it would otherwise be corresponding to the UTF-8 index.
    578  * Otherwise, reservedField will be 0.
    579  */
    580 
    581 /*
    582  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
    583  * Add implementations that do not call strlen() for iteration but check for NUL.
    584  */
    585 
    586 static int32_t U_CALLCONV
    587 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    588     switch(origin) {
    589     case UITER_ZERO:
    590     case UITER_START:
    591         return 0;
    592     case UITER_CURRENT:
    593         if(iter->index<0) {
    594             /* the current UTF-16 index is unknown after setState(), count from the beginning */
    595             const uint8_t *s;
    596             UChar32 c;
    597             int32_t i, limit, index;
    598 
    599             s=(const uint8_t *)iter->context;
    600             i=index=0;
    601             limit=iter->start; /* count up to the UTF-8 index */
    602             while(i<limit) {
    603                 U8_NEXT_OR_FFFD(s, i, limit, c);
    604                 index+=U16_LENGTH(c);
    605             }
    606 
    607             iter->start=i; /* just in case setState() did not get us to a code point boundary */
    608             if(i==iter->limit) {
    609                 iter->length=index; /* in case it was <0 or wrong */
    610             }
    611             if(iter->reservedField!=0) {
    612                 --index; /* we are in the middle of a supplementary code point */
    613             }
    614             iter->index=index;
    615         }
    616         return iter->index;
    617     case UITER_LIMIT:
    618     case UITER_LENGTH:
    619         if(iter->length<0) {
    620             const uint8_t *s;
    621             UChar32 c;
    622             int32_t i, limit, length;
    623 
    624             s=(const uint8_t *)iter->context;
    625             if(iter->index<0) {
    626                 /*
    627                  * the current UTF-16 index is unknown after setState(),
    628                  * we must first count from the beginning to here
    629                  */
    630                 i=length=0;
    631                 limit=iter->start;
    632 
    633                 /* count from the beginning to the current index */
    634                 while(i<limit) {
    635                     U8_NEXT_OR_FFFD(s, i, limit, c);
    636                     length+=U16_LENGTH(c);
    637                 }
    638 
    639                 /* assume i==limit==iter->start, set the UTF-16 index */
    640                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
    641                 iter->index= iter->reservedField!=0 ? length-1 : length;
    642             } else {
    643                 i=iter->start;
    644                 length=iter->index;
    645                 if(iter->reservedField!=0) {
    646                     ++length;
    647                 }
    648             }
    649 
    650             /* count from the current index to the end */
    651             limit=iter->limit;
    652             while(i<limit) {
    653                 U8_NEXT_OR_FFFD(s, i, limit, c);
    654                 length+=U16_LENGTH(c);
    655             }
    656             iter->length=length;
    657         }
    658         return iter->length;
    659     default:
    660         /* not a valid origin */
    661         /* Should never get here! */
    662         return -1;
    663     }
    664 }
    665 
    666 static int32_t U_CALLCONV
    667 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    668     const uint8_t *s;
    669     UChar32 c;
    670     int32_t pos; /* requested UTF-16 index */
    671     int32_t i; /* UTF-8 index */
    672     UBool havePos;
    673 
    674     /* calculate the requested UTF-16 index */
    675     switch(origin) {
    676     case UITER_ZERO:
    677     case UITER_START:
    678         pos=delta;
    679         havePos=TRUE;
    680         /* iter->index<0 (unknown) is possible */
    681         break;
    682     case UITER_CURRENT:
    683         if(iter->index>=0) {
    684             pos=iter->index+delta;
    685             havePos=TRUE;
    686         } else {
    687             /* the current UTF-16 index is unknown after setState(), use only delta */
    688             pos=0;
    689             havePos=FALSE;
    690         }
    691         break;
    692     case UITER_LIMIT:
    693     case UITER_LENGTH:
    694         if(iter->length>=0) {
    695             pos=iter->length+delta;
    696             havePos=TRUE;
    697         } else {
    698             /* pin to the end, avoid counting the length */
    699             iter->index=-1;
    700             iter->start=iter->limit;
    701             iter->reservedField=0;
    702             if(delta>=0) {
    703                 return UITER_UNKNOWN_INDEX;
    704             } else {
    705                 /* the current UTF-16 index is unknown, use only delta */
    706                 pos=0;
    707                 havePos=FALSE;
    708             }
    709         }
    710         break;
    711     default:
    712         return -1;  /* Error */
    713     }
    714 
    715     if(havePos) {
    716         /* shortcuts: pinning to the edges of the string */
    717         if(pos<=0) {
    718             iter->index=iter->start=iter->reservedField=0;
    719             return 0;
    720         } else if(iter->length>=0 && pos>=iter->length) {
    721             iter->index=iter->length;
    722             iter->start=iter->limit;
    723             iter->reservedField=0;
    724             return iter->index;
    725         }
    726 
    727         /* minimize the number of U8_NEXT/PREV operations */
    728         if(iter->index<0 || pos<iter->index/2) {
    729             /* go forward from the start instead of backward from the current index */
    730             iter->index=iter->start=iter->reservedField=0;
    731         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
    732             /*
    733              * if we have the UTF-16 index and length and the new position is
    734              * closer to the end than the current index,
    735              * then go backward from the end instead of forward from the current index
    736              */
    737             iter->index=iter->length;
    738             iter->start=iter->limit;
    739             iter->reservedField=0;
    740         }
    741 
    742         delta=pos-iter->index;
    743         if(delta==0) {
    744             return iter->index; /* nothing to do */
    745         }
    746     } else {
    747         /* move relative to unknown UTF-16 index */
    748         if(delta==0) {
    749             return UITER_UNKNOWN_INDEX; /* nothing to do */
    750         } else if(-delta>=iter->start) {
    751             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
    752             iter->index=iter->start=iter->reservedField=0;
    753             return 0;
    754         } else if(delta>=(iter->limit-iter->start)) {
    755             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
    756             iter->index=iter->length; /* may or may not be <0 (unknown) */
    757             iter->start=iter->limit;
    758             iter->reservedField=0;
    759             return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
    760         }
    761     }
    762 
    763     /* delta!=0 */
    764 
    765     /* move towards the requested position, pin to the edges of the string */
    766     s=(const uint8_t *)iter->context;
    767     pos=iter->index; /* could be <0 (unknown) */
    768     i=iter->start;
    769     if(delta>0) {
    770         /* go forward */
    771         int32_t limit=iter->limit;
    772         if(iter->reservedField!=0) {
    773             iter->reservedField=0;
    774             ++pos;
    775             --delta;
    776         }
    777         while(delta>0 && i<limit) {
    778             U8_NEXT_OR_FFFD(s, i, limit, c);
    779             if(c<=0xffff) {
    780                 ++pos;
    781                 --delta;
    782             } else if(delta>=2) {
    783                 pos+=2;
    784                 delta-=2;
    785             } else /* delta==1 */ {
    786                 /* stop in the middle of a supplementary code point */
    787                 iter->reservedField=c;
    788                 ++pos;
    789                 break; /* delta=0; */
    790             }
    791         }
    792         if(i==limit) {
    793             if(iter->length<0 && iter->index>=0) {
    794                 iter->length= iter->reservedField==0 ? pos : pos+1;
    795             } else if(iter->index<0 && iter->length>=0) {
    796                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
    797             }
    798         }
    799     } else /* delta<0 */ {
    800         /* go backward */
    801         if(iter->reservedField!=0) {
    802             iter->reservedField=0;
    803             i-=4; /* we stayed behind the supplementary code point; go before it now */
    804             --pos;
    805             ++delta;
    806         }
    807         while(delta<0 && i>0) {
    808             U8_PREV_OR_FFFD(s, 0, i, c);
    809             if(c<=0xffff) {
    810                 --pos;
    811                 ++delta;
    812             } else if(delta<=-2) {
    813                 pos-=2;
    814                 delta+=2;
    815             } else /* delta==-1 */ {
    816                 /* stop in the middle of a supplementary code point */
    817                 i+=4; /* back to behind this supplementary code point for consistent state */
    818                 iter->reservedField=c;
    819                 --pos;
    820                 break; /* delta=0; */
    821             }
    822         }
    823     }
    824 
    825     iter->start=i;
    826     if(iter->index>=0) {
    827         return iter->index=pos;
    828     } else {
    829         /* we started with index<0 (unknown) so pos is bogus */
    830         if(i<=1) {
    831             return iter->index=i; /* reached the beginning */
    832         } else {
    833             /* we still don't know the UTF-16 index */
    834             return UITER_UNKNOWN_INDEX;
    835         }
    836     }
    837 }
    838 
    839 static UBool U_CALLCONV
    840 utf8IteratorHasNext(UCharIterator *iter) {
    841     return iter->start<iter->limit || iter->reservedField!=0;
    842 }
    843 
    844 static UBool U_CALLCONV
    845 utf8IteratorHasPrevious(UCharIterator *iter) {
    846     return iter->start>0;
    847 }
    848 
    849 static UChar32 U_CALLCONV
    850 utf8IteratorCurrent(UCharIterator *iter) {
    851     if(iter->reservedField!=0) {
    852         return U16_TRAIL(iter->reservedField);
    853     } else if(iter->start<iter->limit) {
    854         const uint8_t *s=(const uint8_t *)iter->context;
    855         UChar32 c;
    856         int32_t i=iter->start;
    857 
    858         U8_NEXT_OR_FFFD(s, i, iter->limit, c);
    859         if(c<=0xffff) {
    860             return c;
    861         } else {
    862             return U16_LEAD(c);
    863         }
    864     } else {
    865         return U_SENTINEL;
    866     }
    867 }
    868 
    869 static UChar32 U_CALLCONV
    870 utf8IteratorNext(UCharIterator *iter) {
    871     int32_t index;
    872 
    873     if(iter->reservedField!=0) {
    874         UChar trail=U16_TRAIL(iter->reservedField);
    875         iter->reservedField=0;
    876         if((index=iter->index)>=0) {
    877             iter->index=index+1;
    878         }
    879         return trail;
    880     } else if(iter->start<iter->limit) {
    881         const uint8_t *s=(const uint8_t *)iter->context;
    882         UChar32 c;
    883 
    884         U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
    885         if((index=iter->index)>=0) {
    886             iter->index=++index;
    887             if(iter->length<0 && iter->start==iter->limit) {
    888                 iter->length= c<=0xffff ? index : index+1;
    889             }
    890         } else if(iter->start==iter->limit && iter->length>=0) {
    891             iter->index= c<=0xffff ? iter->length : iter->length-1;
    892         }
    893         if(c<=0xffff) {
    894             return c;
    895         } else {
    896             iter->reservedField=c;
    897             return U16_LEAD(c);
    898         }
    899     } else {
    900         return U_SENTINEL;
    901     }
    902 }
    903 
    904 static UChar32 U_CALLCONV
    905 utf8IteratorPrevious(UCharIterator *iter) {
    906     int32_t index;
    907 
    908     if(iter->reservedField!=0) {
    909         UChar lead=U16_LEAD(iter->reservedField);
    910         iter->reservedField=0;
    911         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
    912         if((index=iter->index)>0) {
    913             iter->index=index-1;
    914         }
    915         return lead;
    916     } else if(iter->start>0) {
    917         const uint8_t *s=(const uint8_t *)iter->context;
    918         UChar32 c;
    919 
    920         U8_PREV_OR_FFFD(s, 0, iter->start, c);
    921         if((index=iter->index)>0) {
    922             iter->index=index-1;
    923         } else if(iter->start<=1) {
    924             iter->index= c<=0xffff ? iter->start : iter->start+1;
    925         }
    926         if(c<=0xffff) {
    927             return c;
    928         } else {
    929             iter->start+=4; /* back to behind this supplementary code point for consistent state */
    930             iter->reservedField=c;
    931             return U16_TRAIL(c);
    932         }
    933     } else {
    934         return U_SENTINEL;
    935     }
    936 }
    937 
    938 static uint32_t U_CALLCONV
    939 utf8IteratorGetState(const UCharIterator *iter) {
    940     uint32_t state=(uint32_t)(iter->start<<1);
    941     if(iter->reservedField!=0) {
    942         state|=1;
    943     }
    944     return state;
    945 }
    946 
    947 static void U_CALLCONV
    948 utf8IteratorSetState(UCharIterator *iter,
    949                      uint32_t state,
    950                      UErrorCode *pErrorCode)
    951 {
    952     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    953         /* do nothing */
    954     } else if(iter==NULL) {
    955         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    956     } else if(state==utf8IteratorGetState(iter)) {
    957         /* setting to the current state: no-op */
    958     } else {
    959         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
    960         state&=1; /* 1 if in surrogate pair, must be index>=4 */
    961 
    962         if((state==0 ? index<0 : index<4) || iter->limit<index) {
    963             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    964         } else {
    965             iter->start=index; /* restore UTF-8 byte index */
    966             if(index<=1) {
    967                 iter->index=index;
    968             } else {
    969                 iter->index=-1; /* unknown UTF-16 index */
    970             }
    971             if(state==0) {
    972                 iter->reservedField=0;
    973             } else {
    974                 /* verified index>=4 above */
    975                 UChar32 c;
    976                 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
    977                 if(c<=0xffff) {
    978                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    979                 } else {
    980                     iter->reservedField=c;
    981                 }
    982             }
    983         }
    984     }
    985 }
    986 
    987 static const UCharIterator utf8Iterator={
    988     0, 0, 0, 0, 0, 0,
    989     utf8IteratorGetIndex,
    990     utf8IteratorMove,
    991     utf8IteratorHasNext,
    992     utf8IteratorHasPrevious,
    993     utf8IteratorCurrent,
    994     utf8IteratorNext,
    995     utf8IteratorPrevious,
    996     NULL,
    997     utf8IteratorGetState,
    998     utf8IteratorSetState
    999 };
   1000 
   1001 U_CAPI void U_EXPORT2
   1002 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
   1003     if(iter!=0) {
   1004         if(s!=0 && length>=-1) {
   1005             *iter=utf8Iterator;
   1006             iter->context=s;
   1007             if(length>=0) {
   1008                 iter->limit=length;
   1009             } else {
   1010                 iter->limit=(int32_t)uprv_strlen(s);
   1011             }
   1012             iter->length= iter->limit<=1 ? iter->limit : -1;
   1013         } else {
   1014             *iter=noopIterator;
   1015         }
   1016     }
   1017 }
   1018 
   1019 /* Helper functions --------------------------------------------------------- */
   1020 
   1021 U_CAPI UChar32 U_EXPORT2
   1022 uiter_current32(UCharIterator *iter) {
   1023     UChar32 c, c2;
   1024 
   1025     c=iter->current(iter);
   1026     if(U16_IS_SURROGATE(c)) {
   1027         if(U16_IS_SURROGATE_LEAD(c)) {
   1028             /*
   1029              * go to the next code unit
   1030              * we know that we are not at the limit because c!=U_SENTINEL
   1031              */
   1032             iter->move(iter, 1, UITER_CURRENT);
   1033             if(U16_IS_TRAIL(c2=iter->current(iter))) {
   1034                 c=U16_GET_SUPPLEMENTARY(c, c2);
   1035             }
   1036 
   1037             /* undo index movement */
   1038             iter->move(iter, -1, UITER_CURRENT);
   1039         } else {
   1040             if(U16_IS_LEAD(c2=iter->previous(iter))) {
   1041                 c=U16_GET_SUPPLEMENTARY(c2, c);
   1042             }
   1043             if(c2>=0) {
   1044                 /* undo index movement */
   1045                 iter->move(iter, 1, UITER_CURRENT);
   1046             }
   1047         }
   1048     }
   1049     return c;
   1050 }
   1051 
   1052 U_CAPI UChar32 U_EXPORT2
   1053 uiter_next32(UCharIterator *iter) {
   1054     UChar32 c, c2;
   1055 
   1056     c=iter->next(iter);
   1057     if(U16_IS_LEAD(c)) {
   1058         if(U16_IS_TRAIL(c2=iter->next(iter))) {
   1059             c=U16_GET_SUPPLEMENTARY(c, c2);
   1060         } else if(c2>=0) {
   1061             /* unmatched first surrogate, undo index movement */
   1062             iter->move(iter, -1, UITER_CURRENT);
   1063         }
   1064     }
   1065     return c;
   1066 }
   1067 
   1068 U_CAPI UChar32 U_EXPORT2
   1069 uiter_previous32(UCharIterator *iter) {
   1070     UChar32 c, c2;
   1071 
   1072     c=iter->previous(iter);
   1073     if(U16_IS_TRAIL(c)) {
   1074         if(U16_IS_LEAD(c2=iter->previous(iter))) {
   1075             c=U16_GET_SUPPLEMENTARY(c2, c);
   1076         } else if(c2>=0) {
   1077             /* unmatched second surrogate, undo index movement */
   1078             iter->move(iter, 1, UITER_CURRENT);
   1079         }
   1080     }
   1081     return c;
   1082 }
   1083 
   1084 U_CAPI uint32_t U_EXPORT2
   1085 uiter_getState(const UCharIterator *iter) {
   1086     if(iter==NULL || iter->getState==NULL) {
   1087         return UITER_NO_STATE;
   1088     } else {
   1089         return iter->getState(iter);
   1090     }
   1091 }
   1092 
   1093 U_CAPI void U_EXPORT2
   1094 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
   1095     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1096         /* do nothing */
   1097     } else if(iter==NULL) {
   1098         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1099     } else if(iter->setState==NULL) {
   1100         *pErrorCode=U_UNSUPPORTED_ERROR;
   1101     } else {
   1102         iter->setState(iter, state, pErrorCode);
   1103     }
   1104 }
   1105 
   1106 U_CDECL_END
   1107