Home | History | Annotate | Download | only in uciter8
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2003-2006, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uit_len8.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2003feb10
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file contains the implementation of the "lenient UTF-8" UCharIterator
     17 *   as used in the uciter8 sample code.
     18 *   UTF-8-style macros are defined as well as the UCharIterator.
     19 *   The macros are incomplete (do not assemble code points from pairs of
     20 *   surrogates, see comment below)
     21 *   but sufficient for the iterator.
     22 */
     23 
     24 #include <string.h>
     25 #include "unicode/utypes.h"
     26 #include "unicode/uiter.h"
     27 
     28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
     29 
     30 /*
     31  * This code leniently reads 8-bit Unicode strings,
     32  * which could contain a mix of UTF-8 and CESU-8.
     33  * More precisely:
     34  * - supplementary code points may be encoded with dedicated 4-byte sequences
     35  *   (UTF-8 style)
     36  * - supplementary code points may be encoded with
     37  *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
     38  *   (CESU-8 style)
     39  * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
     40  *
     41  * Limitation:
     42  * Right now, the macros do not attempt to assemble code points from pairs of
     43  * separately encoded surrogates.
     44  * This would not be sufficient for processing based on these macros,
     45  * but it is sufficient for a UCharIterator that returns only UChars anyway.
     46  *
     47  * The code is copied and modified from utf_impl.c and utf8.h.
     48  *
     49  * Change 2006feb08: Much of the implementation code is replaced by calling
     50  * the utf_impl.c functions which accept a new "strict" parameter value
     51  * of -2 implementing exactly this leniency.
     52  */
     53 
     54 #define L8_NEXT(s, i, length, c) { \
     55     (c)=(uint8_t)(s)[(i)++]; \
     56     if((c)>=0x80) { \
     57         if(U8_IS_LEAD(c)) { \
     58             (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
     59         } else { \
     60             (c)=U_SENTINEL; \
     61         } \
     62     } \
     63 }
     64 
     65 #define L8_PREV(s, start, i, c) { \
     66     (c)=(uint8_t)(s)[--(i)]; \
     67     if((c)>=0x80) { \
     68         if((c)<=0xbf) { \
     69             (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
     70         } else { \
     71             (c)=U_SENTINEL; \
     72         } \
     73     } \
     74 }
     75 
     76 /* lenient-8 UCharIterator -------------------------------------------------- */
     77 
     78 /*
     79  * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
     80  * except that it uses the lenient-8-bit-Unicode macros above.
     81  */
     82 
     83 /*
     84  * Minimal implementation:
     85  * Maintain a single-UChar buffer for an additional surrogate.
     86  * The caller must not modify start and limit because they are used internally.
     87  *
     88  * Use UCharIterator fields as follows:
     89  *   context        pointer to UTF-8 string
     90  *   length         UTF-16 length of the string; -1 until lazy evaluation
     91  *   start          current UTF-8 index
     92  *   index          current UTF-16 index; may be -1="unknown" after setState()
     93  *   limit          UTF-8 length of the string
     94  *   reservedField  supplementary code point
     95  *
     96  * Since UCharIterator delivers 16-bit code units, the iteration can be
     97  * currently in the middle of the byte sequence for a supplementary code point.
     98  * In this case, reservedField will contain that code point and start will
     99  * point to after the corresponding byte sequence. The UTF-16 index will be
    100  * one less than what it would otherwise be corresponding to the UTF-8 index.
    101  * Otherwise, reservedField will be 0.
    102  */
    103 
    104 /*
    105  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
    106  * Add implementations that do not call strlen() for iteration but check for NUL.
    107  */
    108 
    109 static int32_t U_CALLCONV
    110 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    111     switch(origin) {
    112     case UITER_ZERO:
    113     case UITER_START:
    114         return 0;
    115     case UITER_CURRENT:
    116         if(iter->index<0) {
    117             /* the current UTF-16 index is unknown after setState(), count from the beginning */
    118             const uint8_t *s;
    119             UChar32 c;
    120             int32_t i, limit, index;
    121 
    122             s=(const uint8_t *)iter->context;
    123             i=index=0;
    124             limit=iter->start; /* count up to the UTF-8 index */
    125             while(i<limit) {
    126                 L8_NEXT(s, i, limit, c);
    127                 if(c<=0xffff) {
    128                     ++index;
    129                 } else {
    130                     index+=2;
    131                 }
    132             }
    133 
    134             iter->start=i; /* just in case setState() did not get us to a code point boundary */
    135             if(i==iter->limit) {
    136                 iter->length=index; /* in case it was <0 or wrong */
    137             }
    138             if(iter->reservedField!=0) {
    139                 --index; /* we are in the middle of a supplementary code point */
    140             }
    141             iter->index=index;
    142         }
    143         return iter->index;
    144     case UITER_LIMIT:
    145     case UITER_LENGTH:
    146         if(iter->length<0) {
    147             const uint8_t *s;
    148             UChar32 c;
    149             int32_t i, limit, length;
    150 
    151             s=(const uint8_t *)iter->context;
    152             if(iter->index<0) {
    153                 /*
    154                  * the current UTF-16 index is unknown after setState(),
    155                  * we must first count from the beginning to here
    156                  */
    157                 i=length=0;
    158                 limit=iter->start;
    159 
    160                 /* count from the beginning to the current index */
    161                 while(i<limit) {
    162                     L8_NEXT(s, i, limit, c);
    163                     if(c<=0xffff) {
    164                         ++length;
    165                     } else {
    166                         length+=2;
    167                     }
    168                 }
    169 
    170                 /* assume i==limit==iter->start, set the UTF-16 index */
    171                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
    172                 iter->index= iter->reservedField!=0 ? length-1 : length;
    173             } else {
    174                 i=iter->start;
    175                 length=iter->index;
    176                 if(iter->reservedField!=0) {
    177                     ++length;
    178                 }
    179             }
    180 
    181             /* count from the current index to the end */
    182             limit=iter->limit;
    183             while(i<limit) {
    184                 L8_NEXT(s, i, limit, c);
    185                 if(c<=0xffff) {
    186                     ++length;
    187                 } else {
    188                     length+=2;
    189                 }
    190             }
    191             iter->length=length;
    192         }
    193         return iter->length;
    194     default:
    195         /* not a valid origin */
    196         /* Should never get here! */
    197         return -1;
    198     }
    199 }
    200 
    201 static int32_t U_CALLCONV
    202 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    203     const uint8_t *s;
    204     UChar32 c;
    205     int32_t pos; /* requested UTF-16 index */
    206     int32_t i; /* UTF-8 index */
    207     UBool havePos;
    208 
    209     /* calculate the requested UTF-16 index */
    210     switch(origin) {
    211     case UITER_ZERO:
    212     case UITER_START:
    213         pos=delta;
    214         havePos=TRUE;
    215         /* iter->index<0 (unknown) is possible */
    216         break;
    217     case UITER_CURRENT:
    218         if(iter->index>=0) {
    219             pos=iter->index+delta;
    220             havePos=TRUE;
    221         } else {
    222             /* the current UTF-16 index is unknown after setState(), use only delta */
    223             pos=0;
    224             havePos=FALSE;
    225         }
    226         break;
    227     case UITER_LIMIT:
    228     case UITER_LENGTH:
    229         if(iter->length>=0) {
    230             pos=iter->length+delta;
    231             havePos=TRUE;
    232         } else {
    233             /* pin to the end, avoid counting the length */
    234             iter->index=-1;
    235             iter->start=iter->limit;
    236             iter->reservedField=0;
    237             if(delta>=0) {
    238                 return UITER_UNKNOWN_INDEX;
    239             } else {
    240                 /* the current UTF-16 index is unknown, use only delta */
    241                 pos=0;
    242                 havePos=FALSE;
    243             }
    244         }
    245         break;
    246     default:
    247         return -1;  /* Error */
    248     }
    249 
    250     if(havePos) {
    251         /* shortcuts: pinning to the edges of the string */
    252         if(pos<=0) {
    253             iter->index=iter->start=iter->reservedField=0;
    254             return 0;
    255         } else if(iter->length>=0 && pos>=iter->length) {
    256             iter->index=iter->length;
    257             iter->start=iter->limit;
    258             iter->reservedField=0;
    259             return iter->index;
    260         }
    261 
    262         /* minimize the number of L8_NEXT/PREV operations */
    263         if(iter->index<0 || pos<iter->index/2) {
    264             /* go forward from the start instead of backward from the current index */
    265             iter->index=iter->start=iter->reservedField=0;
    266         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
    267             /*
    268              * if we have the UTF-16 index and length and the new position is
    269              * closer to the end than the current index,
    270              * then go backward from the end instead of forward from the current index
    271              */
    272             iter->index=iter->length;
    273             iter->start=iter->limit;
    274             iter->reservedField=0;
    275         }
    276 
    277         delta=pos-iter->index;
    278         if(delta==0) {
    279             return iter->index; /* nothing to do */
    280         }
    281     } else {
    282         /* move relative to unknown UTF-16 index */
    283         if(delta==0) {
    284             return UITER_UNKNOWN_INDEX; /* nothing to do */
    285         } else if(-delta>=iter->start) {
    286             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
    287             iter->index=iter->start=iter->reservedField=0;
    288             return 0;
    289         } else if(delta>=(iter->limit-iter->start)) {
    290             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
    291             iter->index=iter->length; /* may or may not be <0 (unknown) */
    292             iter->start=iter->limit;
    293             iter->reservedField=0;
    294             return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
    295         }
    296     }
    297 
    298     /* delta!=0 */
    299 
    300     /* move towards the requested position, pin to the edges of the string */
    301     s=(const uint8_t *)iter->context;
    302     pos=iter->index; /* could be <0 (unknown) */
    303     i=iter->start;
    304     if(delta>0) {
    305         /* go forward */
    306         int32_t limit=iter->limit;
    307         if(iter->reservedField!=0) {
    308             iter->reservedField=0;
    309             ++pos;
    310             --delta;
    311         }
    312         while(delta>0 && i<limit) {
    313             L8_NEXT(s, i, limit, c);
    314             if(c<0xffff) {
    315                 ++pos;
    316                 --delta;
    317             } else if(delta>=2) {
    318                 pos+=2;
    319                 delta-=2;
    320             } else /* delta==1 */ {
    321                 /* stop in the middle of a supplementary code point */
    322                 iter->reservedField=c;
    323                 ++pos;
    324                 break; /* delta=0; */
    325             }
    326         }
    327         if(i==limit) {
    328             if(iter->length<0 && iter->index>=0) {
    329                 iter->length= iter->reservedField==0 ? pos : pos+1;
    330             } else if(iter->index<0 && iter->length>=0) {
    331                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
    332             }
    333         }
    334     } else /* delta<0 */ {
    335         /* go backward */
    336         if(iter->reservedField!=0) {
    337             iter->reservedField=0;
    338             i-=4; /* we stayed behind the supplementary code point; go before it now */
    339             --pos;
    340             ++delta;
    341         }
    342         while(delta<0 && i>0) {
    343             L8_PREV(s, 0, i, c);
    344             if(c<0xffff) {
    345                 --pos;
    346                 ++delta;
    347             } else if(delta<=-2) {
    348                 pos-=2;
    349                 delta+=2;
    350             } else /* delta==-1 */ {
    351                 /* stop in the middle of a supplementary code point */
    352                 i+=4; /* back to behind this supplementary code point for consistent state */
    353                 iter->reservedField=c;
    354                 --pos;
    355                 break; /* delta=0; */
    356             }
    357         }
    358     }
    359 
    360     iter->start=i;
    361     if(iter->index>=0) {
    362         return iter->index=pos;
    363     } else {
    364         /* we started with index<0 (unknown) so pos is bogus */
    365         if(i<=1) {
    366             return iter->index=i; /* reached the beginning */
    367         } else {
    368             /* we still don't know the UTF-16 index */
    369             return UITER_UNKNOWN_INDEX;
    370         }
    371     }
    372 }
    373 
    374 static UBool U_CALLCONV
    375 lenient8IteratorHasNext(UCharIterator *iter) {
    376     return iter->reservedField!=0 || iter->start<iter->limit;
    377 }
    378 
    379 static UBool U_CALLCONV
    380 lenient8IteratorHasPrevious(UCharIterator *iter) {
    381     return iter->start>0;
    382 }
    383 
    384 static UChar32 U_CALLCONV
    385 lenient8IteratorCurrent(UCharIterator *iter) {
    386     if(iter->reservedField!=0) {
    387         return U16_TRAIL(iter->reservedField);
    388     } else if(iter->start<iter->limit) {
    389         const uint8_t *s=(const uint8_t *)iter->context;
    390         UChar32 c;
    391         int32_t i=iter->start;
    392 
    393         L8_NEXT(s, i, iter->limit, c);
    394         if(c<0) {
    395             return 0xfffd;
    396         } else if(c<=0xffff) {
    397             return c;
    398         } else {
    399             return U16_LEAD(c);
    400         }
    401     } else {
    402         return U_SENTINEL;
    403     }
    404 }
    405 
    406 static UChar32 U_CALLCONV
    407 lenient8IteratorNext(UCharIterator *iter) {
    408     int32_t index;
    409 
    410     if(iter->reservedField!=0) {
    411         UChar trail=U16_TRAIL(iter->reservedField);
    412         iter->reservedField=0;
    413         if((index=iter->index)>=0) {
    414             iter->index=index+1;
    415         }
    416         return trail;
    417     } else if(iter->start<iter->limit) {
    418         const uint8_t *s=(const uint8_t *)iter->context;
    419         UChar32 c;
    420 
    421         L8_NEXT(s, iter->start, iter->limit, c);
    422         if((index=iter->index)>=0) {
    423             iter->index=++index;
    424             if(iter->length<0 && iter->start==iter->limit) {
    425                 iter->length= c<=0xffff ? index : index+1;
    426             }
    427         } else if(iter->start==iter->limit && iter->length>=0) {
    428             iter->index= c<=0xffff ? iter->length : iter->length-1;
    429         }
    430         if(c<0) {
    431             return 0xfffd;
    432         } else if(c<=0xffff) {
    433             return c;
    434         } else {
    435             iter->reservedField=c;
    436             return U16_LEAD(c);
    437         }
    438     } else {
    439         return U_SENTINEL;
    440     }
    441 }
    442 
    443 static UChar32 U_CALLCONV
    444 lenient8IteratorPrevious(UCharIterator *iter) {
    445     int32_t index;
    446 
    447     if(iter->reservedField!=0) {
    448         UChar lead=U16_LEAD(iter->reservedField);
    449         iter->reservedField=0;
    450         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
    451         if((index=iter->index)>0) {
    452             iter->index=index-1;
    453         }
    454         return lead;
    455     } else if(iter->start>0) {
    456         const uint8_t *s=(const uint8_t *)iter->context;
    457         UChar32 c;
    458 
    459         L8_PREV(s, 0, iter->start, c);
    460         if((index=iter->index)>0) {
    461             iter->index=index-1;
    462         } else if(iter->start<=1) {
    463             iter->index= c<=0xffff ? iter->start : iter->start+1;
    464         }
    465         if(c<0) {
    466             return 0xfffd;
    467         } else if(c<=0xffff) {
    468             return c;
    469         } else {
    470             iter->start+=4; /* back to behind this supplementary code point for consistent state */
    471             iter->reservedField=c;
    472             return U16_TRAIL(c);
    473         }
    474     } else {
    475         return U_SENTINEL;
    476     }
    477 }
    478 
    479 static uint32_t U_CALLCONV
    480 lenient8IteratorGetState(const UCharIterator *iter) {
    481     uint32_t state=(uint32_t)(iter->start<<1);
    482     if(iter->reservedField!=0) {
    483         state|=1;
    484     }
    485     return state;
    486 }
    487 
    488 static void U_CALLCONV
    489 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    490     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    491         /* do nothing */
    492     } else if(iter==NULL) {
    493         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    494     } else if(state==lenient8IteratorGetState(iter)) {
    495         /* setting to the current state: no-op */
    496     } else {
    497         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
    498         state&=1; /* 1 if in surrogate pair, must be index>=4 */
    499 
    500         if((state==0 ? index<0 : index<4) || iter->limit<index) {
    501             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    502         } else {
    503             iter->start=index; /* restore UTF-8 byte index */
    504             if(index<=1) {
    505                 iter->index=index;
    506             } else {
    507                 iter->index=-1; /* unknown UTF-16 index */
    508             }
    509             if(state==0) {
    510                 iter->reservedField=0;
    511             } else {
    512                 /* verified index>=4 above */
    513                 UChar32 c;
    514                 L8_PREV((const uint8_t *)iter->context, 0, index, c);
    515                 if(c<=0xffff) {
    516                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    517                 } else {
    518                     iter->reservedField=c;
    519                 }
    520             }
    521         }
    522     }
    523 }
    524 
/*
 * Template for a lenient-8 UCharIterator: uiter_setLenient8() copies this
 * struct into the caller's iterator and then fills in the per-string fields
 * (context, limit, length).
 * The initializer is positional, so the order of the entries must match the
 * declaration of UCharIterator.
 */
static const UCharIterator lenient8Iterator={
    0, 0, 0, 0, 0, 0, /* six zeroed leading fields; NOTE(review): presumably the data fields listed in the field-usage comment above - confirm order in unicode/uiter.h */
    lenient8IteratorGetIndex,
    lenient8IteratorMove,
    lenient8IteratorHasNext,
    lenient8IteratorHasPrevious,
    lenient8IteratorCurrent,
    lenient8IteratorNext,
    lenient8IteratorPrevious,
    NULL, /* no function here; NOTE(review): presumably UCharIterator's reserved function slot - confirm in unicode/uiter.h */
    lenient8IteratorGetState,
    lenient8IteratorSetState
};
    538 
    539 U_CAPI void U_EXPORT2
    540 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
    541     if(iter!=0) {
    542         if(s!=0 && length>=-1) {
    543             *iter=lenient8Iterator;
    544             iter->context=s;
    545             if(length>=0) {
    546                 iter->limit=length;
    547             } else {
    548                 iter->limit=strlen(s);
    549             }
    550             iter->length= iter->limit<=1 ? iter->limit : -1;
    551         } else {
    552             /* set no-op iterator */
    553             uiter_setString(iter, NULL, 0);
    554         }
    555     }
    556 }
    557