Home | History | Annotate | Download | only in uciter8
      1 /*
      2 *******************************************************************************
      3 *
      4 *   © 2016 and later: Unicode, Inc. and others.
      5 *   License & terms of use: http://www.unicode.org/copyright.html#License
      6 *
      7 *******************************************************************************
      8 *******************************************************************************
      9 *
     10 *   Copyright (C) 2003-2006, International Business Machines
     11 *   Corporation and others.  All Rights Reserved.
     12 *
     13 *******************************************************************************
     14 *   file name:  uit_len8.c
     15 *   encoding:   UTF-8
     16 *   tab size:   8 (not used)
     17 *   indentation:4
     18 *
     19 *   created on: 2003feb10
     20 *   created by: Markus W. Scherer
     21 *
     22 *   This file contains the implementation of the "lenient UTF-8" UCharIterator
     23 *   as used in the uciter8 sample code.
     24 *   UTF-8-style macros are defined as well as the UCharIterator.
     25 *   The macros are incomplete (do not assemble code points from pairs of
     26 *   surrogates, see comment below)
     27 *   but sufficient for the iterator.
     28 */
     29 
     30 #include <string.h>
     31 #include "unicode/utypes.h"
     32 #include "unicode/uiter.h"
     33 
     34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
     35 
     36 /*
     37  * This code leniently reads 8-bit Unicode strings,
     38  * which could contain a mix of UTF-8 and CESU-8.
     39  * More precisely:
     40  * - supplementary code points may be encoded with dedicated 4-byte sequences
     41  *   (UTF-8 style)
     42  * - supplementary code points may be encoded with
     43  *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
     44  *   (CESU-8 style)
     45  * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
     46  *
     47  * Limitation:
     48  * Right now, the macros do not attempt to assemble code points from pairs of
     49  * separately encoded surrogates.
     50  * This would not be sufficient for processing based on these macros,
     51  * but it is sufficient for a UCharIterator that returns only UChars anyway.
     52  *
     53  * The code is copied and modified from utf_impl.c and utf8.h.
     54  *
     55  * Change 2006feb08: Much of the implementation code is replaced by calling
     56  * the utf_impl.c functions which accept a new "strict" parameter value
     57  * of -2 implementing exactly this leniency.
     58  */
     59 
     60 #define L8_NEXT(s, i, length, c) { \
     61     (c)=(uint8_t)(s)[(i)++]; \
     62     if((c)>=0x80) { \
     63         if(U8_IS_LEAD(c)) { \
     64             (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
     65         } else { \
     66             (c)=U_SENTINEL; \
     67         } \
     68     } \
     69 }
     70 
     71 #define L8_PREV(s, start, i, c) { \
     72     (c)=(uint8_t)(s)[--(i)]; \
     73     if((c)>=0x80) { \
     74         if((c)<=0xbf) { \
     75             (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
     76         } else { \
     77             (c)=U_SENTINEL; \
     78         } \
     79     } \
     80 }
     81 
     82 /* lenient-8 UCharIterator -------------------------------------------------- */
     83 
     84 /*
     85  * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
     86  * except that it uses the lenient-8-bit-Unicode macros above.
     87  */
     88 
     89 /*
     90  * Minimal implementation:
     91  * Maintain a single-UChar buffer for an additional surrogate.
     92  * The caller must not modify start and limit because they are used internally.
     93  *
     94  * Use UCharIterator fields as follows:
     95  *   context        pointer to UTF-8 string
     96  *   length         UTF-16 length of the string; -1 until lazy evaluation
     97  *   start          current UTF-8 index
     98  *   index          current UTF-16 index; may be -1="unknown" after setState()
     99  *   limit          UTF-8 length of the string
    100  *   reservedField  supplementary code point
    101  *
    102  * Since UCharIterator delivers 16-bit code units, the iteration can
    103  * currently be in the middle of the byte sequence for a supplementary code point.
    104  * In this case, reservedField will contain that code point and start will
    105  * point to after the corresponding byte sequence. The UTF-16 index will be
    106  * one less than what it would otherwise be corresponding to the UTF-8 index.
    107  * Otherwise, reservedField will be 0.
    108  */
    109 
    110 /*
    111  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
    112  * Add implementations that do not call strlen() for iteration but check for NUL.
    113  */
    114 
    115 static int32_t U_CALLCONV
    116 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    117     switch(origin) {
    118     case UITER_ZERO:
    119     case UITER_START:
    120         return 0;
    121     case UITER_CURRENT:
    122         if(iter->index<0) {
    123             /* the current UTF-16 index is unknown after setState(), count from the beginning */
    124             const uint8_t *s;
    125             UChar32 c;
    126             int32_t i, limit, index;
    127 
    128             s=(const uint8_t *)iter->context;
    129             i=index=0;
    130             limit=iter->start; /* count up to the UTF-8 index */
    131             while(i<limit) {
    132                 L8_NEXT(s, i, limit, c);
    133                 if(c<=0xffff) {
    134                     ++index;
    135                 } else {
    136                     index+=2;
    137                 }
    138             }
    139 
    140             iter->start=i; /* just in case setState() did not get us to a code point boundary */
    141             if(i==iter->limit) {
    142                 iter->length=index; /* in case it was <0 or wrong */
    143             }
    144             if(iter->reservedField!=0) {
    145                 --index; /* we are in the middle of a supplementary code point */
    146             }
    147             iter->index=index;
    148         }
    149         return iter->index;
    150     case UITER_LIMIT:
    151     case UITER_LENGTH:
    152         if(iter->length<0) {
    153             const uint8_t *s;
    154             UChar32 c;
    155             int32_t i, limit, length;
    156 
    157             s=(const uint8_t *)iter->context;
    158             if(iter->index<0) {
    159                 /*
    160                  * the current UTF-16 index is unknown after setState(),
    161                  * we must first count from the beginning to here
    162                  */
    163                 i=length=0;
    164                 limit=iter->start;
    165 
    166                 /* count from the beginning to the current index */
    167                 while(i<limit) {
    168                     L8_NEXT(s, i, limit, c);
    169                     if(c<=0xffff) {
    170                         ++length;
    171                     } else {
    172                         length+=2;
    173                     }
    174                 }
    175 
    176                 /* assume i==limit==iter->start, set the UTF-16 index */
    177                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
    178                 iter->index= iter->reservedField!=0 ? length-1 : length;
    179             } else {
    180                 i=iter->start;
    181                 length=iter->index;
    182                 if(iter->reservedField!=0) {
    183                     ++length;
    184                 }
    185             }
    186 
    187             /* count from the current index to the end */
    188             limit=iter->limit;
    189             while(i<limit) {
    190                 L8_NEXT(s, i, limit, c);
    191                 if(c<=0xffff) {
    192                     ++length;
    193                 } else {
    194                     length+=2;
    195                 }
    196             }
    197             iter->length=length;
    198         }
    199         return iter->length;
    200     default:
    201         /* not a valid origin */
    202         /* Should never get here! */
    203         return -1;
    204     }
    205 }
    206 
    207 static int32_t U_CALLCONV
    208 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    209     const uint8_t *s;
    210     UChar32 c;
    211     int32_t pos; /* requested UTF-16 index */
    212     int32_t i; /* UTF-8 index */
    213     UBool havePos;
    214 
    215     /* calculate the requested UTF-16 index */
    216     switch(origin) {
    217     case UITER_ZERO:
    218     case UITER_START:
    219         pos=delta;
    220         havePos=TRUE;
    221         /* iter->index<0 (unknown) is possible */
    222         break;
    223     case UITER_CURRENT:
    224         if(iter->index>=0) {
    225             pos=iter->index+delta;
    226             havePos=TRUE;
    227         } else {
    228             /* the current UTF-16 index is unknown after setState(), use only delta */
    229             pos=0;
    230             havePos=FALSE;
    231         }
    232         break;
    233     case UITER_LIMIT:
    234     case UITER_LENGTH:
    235         if(iter->length>=0) {
    236             pos=iter->length+delta;
    237             havePos=TRUE;
    238         } else {
    239             /* pin to the end, avoid counting the length */
    240             iter->index=-1;
    241             iter->start=iter->limit;
    242             iter->reservedField=0;
    243             if(delta>=0) {
    244                 return UITER_UNKNOWN_INDEX;
    245             } else {
    246                 /* the current UTF-16 index is unknown, use only delta */
    247                 pos=0;
    248                 havePos=FALSE;
    249             }
    250         }
    251         break;
    252     default:
    253         return -1;  /* Error */
    254     }
    255 
    256     if(havePos) {
    257         /* shortcuts: pinning to the edges of the string */
    258         if(pos<=0) {
    259             iter->index=iter->start=iter->reservedField=0;
    260             return 0;
    261         } else if(iter->length>=0 && pos>=iter->length) {
    262             iter->index=iter->length;
    263             iter->start=iter->limit;
    264             iter->reservedField=0;
    265             return iter->index;
    266         }
    267 
    268         /* minimize the number of L8_NEXT/PREV operations */
    269         if(iter->index<0 || pos<iter->index/2) {
    270             /* go forward from the start instead of backward from the current index */
    271             iter->index=iter->start=iter->reservedField=0;
    272         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
    273             /*
    274              * if we have the UTF-16 index and length and the new position is
    275              * closer to the end than the current index,
    276              * then go backward from the end instead of forward from the current index
    277              */
    278             iter->index=iter->length;
    279             iter->start=iter->limit;
    280             iter->reservedField=0;
    281         }
    282 
    283         delta=pos-iter->index;
    284         if(delta==0) {
    285             return iter->index; /* nothing to do */
    286         }
    287     } else {
    288         /* move relative to unknown UTF-16 index */
    289         if(delta==0) {
    290             return UITER_UNKNOWN_INDEX; /* nothing to do */
    291         } else if(-delta>=iter->start) {
    292             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
    293             iter->index=iter->start=iter->reservedField=0;
    294             return 0;
    295         } else if(delta>=(iter->limit-iter->start)) {
    296             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
    297             iter->index=iter->length; /* may or may not be <0 (unknown) */
    298             iter->start=iter->limit;
    299             iter->reservedField=0;
    300             return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
    301         }
    302     }
    303 
    304     /* delta!=0 */
    305 
    306     /* move towards the requested position, pin to the edges of the string */
    307     s=(const uint8_t *)iter->context;
    308     pos=iter->index; /* could be <0 (unknown) */
    309     i=iter->start;
    310     if(delta>0) {
    311         /* go forward */
    312         int32_t limit=iter->limit;
    313         if(iter->reservedField!=0) {
    314             iter->reservedField=0;
    315             ++pos;
    316             --delta;
    317         }
    318         while(delta>0 && i<limit) {
    319             L8_NEXT(s, i, limit, c);
    320             if(c<0xffff) {
    321                 ++pos;
    322                 --delta;
    323             } else if(delta>=2) {
    324                 pos+=2;
    325                 delta-=2;
    326             } else /* delta==1 */ {
    327                 /* stop in the middle of a supplementary code point */
    328                 iter->reservedField=c;
    329                 ++pos;
    330                 break; /* delta=0; */
    331             }
    332         }
    333         if(i==limit) {
    334             if(iter->length<0 && iter->index>=0) {
    335                 iter->length= iter->reservedField==0 ? pos : pos+1;
    336             } else if(iter->index<0 && iter->length>=0) {
    337                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
    338             }
    339         }
    340     } else /* delta<0 */ {
    341         /* go backward */
    342         if(iter->reservedField!=0) {
    343             iter->reservedField=0;
    344             i-=4; /* we stayed behind the supplementary code point; go before it now */
    345             --pos;
    346             ++delta;
    347         }
    348         while(delta<0 && i>0) {
    349             L8_PREV(s, 0, i, c);
    350             if(c<0xffff) {
    351                 --pos;
    352                 ++delta;
    353             } else if(delta<=-2) {
    354                 pos-=2;
    355                 delta+=2;
    356             } else /* delta==-1 */ {
    357                 /* stop in the middle of a supplementary code point */
    358                 i+=4; /* back to behind this supplementary code point for consistent state */
    359                 iter->reservedField=c;
    360                 --pos;
    361                 break; /* delta=0; */
    362             }
    363         }
    364     }
    365 
    366     iter->start=i;
    367     if(iter->index>=0) {
    368         return iter->index=pos;
    369     } else {
    370         /* we started with index<0 (unknown) so pos is bogus */
    371         if(i<=1) {
    372             return iter->index=i; /* reached the beginning */
    373         } else {
    374             /* we still don't know the UTF-16 index */
    375             return UITER_UNKNOWN_INDEX;
    376         }
    377     }
    378 }
    379 
    380 static UBool U_CALLCONV
    381 lenient8IteratorHasNext(UCharIterator *iter) {
    382     return iter->reservedField!=0 || iter->start<iter->limit;
    383 }
    384 
    385 static UBool U_CALLCONV
    386 lenient8IteratorHasPrevious(UCharIterator *iter) {
    387     return iter->start>0;
    388 }
    389 
    390 static UChar32 U_CALLCONV
    391 lenient8IteratorCurrent(UCharIterator *iter) {
    392     if(iter->reservedField!=0) {
    393         return U16_TRAIL(iter->reservedField);
    394     } else if(iter->start<iter->limit) {
    395         const uint8_t *s=(const uint8_t *)iter->context;
    396         UChar32 c;
    397         int32_t i=iter->start;
    398 
    399         L8_NEXT(s, i, iter->limit, c);
    400         if(c<0) {
    401             return 0xfffd;
    402         } else if(c<=0xffff) {
    403             return c;
    404         } else {
    405             return U16_LEAD(c);
    406         }
    407     } else {
    408         return U_SENTINEL;
    409     }
    410 }
    411 
    412 static UChar32 U_CALLCONV
    413 lenient8IteratorNext(UCharIterator *iter) {
    414     int32_t index;
    415 
    416     if(iter->reservedField!=0) {
    417         UChar trail=U16_TRAIL(iter->reservedField);
    418         iter->reservedField=0;
    419         if((index=iter->index)>=0) {
    420             iter->index=index+1;
    421         }
    422         return trail;
    423     } else if(iter->start<iter->limit) {
    424         const uint8_t *s=(const uint8_t *)iter->context;
    425         UChar32 c;
    426 
    427         L8_NEXT(s, iter->start, iter->limit, c);
    428         if((index=iter->index)>=0) {
    429             iter->index=++index;
    430             if(iter->length<0 && iter->start==iter->limit) {
    431                 iter->length= c<=0xffff ? index : index+1;
    432             }
    433         } else if(iter->start==iter->limit && iter->length>=0) {
    434             iter->index= c<=0xffff ? iter->length : iter->length-1;
    435         }
    436         if(c<0) {
    437             return 0xfffd;
    438         } else if(c<=0xffff) {
    439             return c;
    440         } else {
    441             iter->reservedField=c;
    442             return U16_LEAD(c);
    443         }
    444     } else {
    445         return U_SENTINEL;
    446     }
    447 }
    448 
    449 static UChar32 U_CALLCONV
    450 lenient8IteratorPrevious(UCharIterator *iter) {
    451     int32_t index;
    452 
    453     if(iter->reservedField!=0) {
    454         UChar lead=U16_LEAD(iter->reservedField);
    455         iter->reservedField=0;
    456         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
    457         if((index=iter->index)>0) {
    458             iter->index=index-1;
    459         }
    460         return lead;
    461     } else if(iter->start>0) {
    462         const uint8_t *s=(const uint8_t *)iter->context;
    463         UChar32 c;
    464 
    465         L8_PREV(s, 0, iter->start, c);
    466         if((index=iter->index)>0) {
    467             iter->index=index-1;
    468         } else if(iter->start<=1) {
    469             iter->index= c<=0xffff ? iter->start : iter->start+1;
    470         }
    471         if(c<0) {
    472             return 0xfffd;
    473         } else if(c<=0xffff) {
    474             return c;
    475         } else {
    476             iter->start+=4; /* back to behind this supplementary code point for consistent state */
    477             iter->reservedField=c;
    478             return U16_TRAIL(c);
    479         }
    480     } else {
    481         return U_SENTINEL;
    482     }
    483 }
    484 
    485 static uint32_t U_CALLCONV
    486 lenient8IteratorGetState(const UCharIterator *iter) {
    487     uint32_t state=(uint32_t)(iter->start<<1);
    488     if(iter->reservedField!=0) {
    489         state|=1;
    490     }
    491     return state;
    492 }
    493 
    494 static void U_CALLCONV
    495 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    496     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    497         /* do nothing */
    498     } else if(iter==NULL) {
    499         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    500     } else if(state==lenient8IteratorGetState(iter)) {
    501         /* setting to the current state: no-op */
    502     } else {
    503         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
    504         state&=1; /* 1 if in surrogate pair, must be index>=4 */
    505 
    506         if((state==0 ? index<0 : index<4) || iter->limit<index) {
    507             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    508         } else {
    509             iter->start=index; /* restore UTF-8 byte index */
    510             if(index<=1) {
    511                 iter->index=index;
    512             } else {
    513                 iter->index=-1; /* unknown UTF-16 index */
    514             }
    515             if(state==0) {
    516                 iter->reservedField=0;
    517             } else {
    518                 /* verified index>=4 above */
    519                 UChar32 c;
    520                 L8_PREV((const uint8_t *)iter->context, 0, index, c);
    521                 if(c<=0xffff) {
    522                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    523                 } else {
    524                     iter->reservedField=c;
    525                 }
    526             }
    527         }
    528     }
    529 }
    530 
    531 static const UCharIterator lenient8Iterator={
    532     0, 0, 0, 0, 0, 0,
    533     lenient8IteratorGetIndex,
    534     lenient8IteratorMove,
    535     lenient8IteratorHasNext,
    536     lenient8IteratorHasPrevious,
    537     lenient8IteratorCurrent,
    538     lenient8IteratorNext,
    539     lenient8IteratorPrevious,
    540     NULL,
    541     lenient8IteratorGetState,
    542     lenient8IteratorSetState
    543 };
    544 
    545 U_CAPI void U_EXPORT2
    546 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
    547     if(iter!=0) {
    548         if(s!=0 && length>=-1) {
    549             *iter=lenient8Iterator;
    550             iter->context=s;
    551             if(length>=0) {
    552                 iter->limit=length;
    553             } else {
    554                 iter->limit=strlen(s);
    555             }
    556             iter->length= iter->limit<=1 ? iter->limit : -1;
    557         } else {
    558             /* set no-op iterator */
    559             uiter_setString(iter, NULL, 0);
    560         }
    561     }
    562 }
    563