Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2006, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  utf_impl.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep13
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file provides implementation functions for macros in the utfXX.h
     17 *   that would otherwise be too long as macros.
     18 */
     19 
     20 /* set import/export definitions */
     21 #ifndef U_UTF8_IMPL
     22 #   define U_UTF8_IMPL
     23 #endif
     24 
     25 #include "unicode/utypes.h"
     26 
     27 /*
     28  * This table could be replaced on many machines by
     29  * a few lines of assembler code using an
     30  * "index of first 0-bit from msb" instruction and
     31  * one or two more integer instructions.
     32  *
     33  * For example, on an i386, do something like
     34  * - MOV AL, leadByte
     35  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
     36  * - MOV AH, 0
     37  * - BSR BX, AX     (16-bit)
     38  * - MOV AX, 6      (result)
     39  * - JZ finish      (ZF==1 if leadByte==0xff)
     40  * - SUB AX, BX (result)
     41  * -finish:
     42  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
     43  *
     44  * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
     45  * lead bytes above 0xf4 are illegal.
     46  * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
     47  */
     48 U_EXPORT const uint8_t
     49 utf8_countTrailBytes[256]={
     50     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     51     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     52     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     53     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     54 
     55     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     56     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     57     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     58     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     59 
     60     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     61     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     62     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     63     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     64 
     65     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     66     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     67 
     68     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     69     3, 3, 3, 3, 3,
     70     3, 3, 3,    /* illegal in Unicode */
     71     4, 4, 4, 4, /* illegal in Unicode */
     72     5, 5,       /* illegal in Unicode */
     73     0, 0        /* illegal bytes 0xfe and 0xff */
     74 };
     75 
     76 static const UChar32
     77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
     78 
     79 static const UChar32
     80 utf8_errorValue[6]={
     81     UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
     82     0x3ffffff, 0x7fffffff
     83 };
     84 
     85 /*
     86  * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
     87  * UTF8_NEXT_CHAR_SAFE().
     88  *
     89  * The "strict" parameter controls the error behavior:
     90  * <0  "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
     91  *     code point result.
     92  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
     93  *     All illegal byte sequences yield a positive code point such that this
     94  *     result code point would be encoded with the same number of bytes as
     95  *     the illegal sequence.
     96  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
     97  *     Same as the obsolete "safe" behavior, but non-characters are also treated
     98  *     like illegal sequences.
     99  *
    100  * The special negative (<0) value -2 is used for lenient treatment of surrogate
    101  * code points as legal. Some implementations use this for roundtripping of
    102  * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
    103  * contain unpaired surrogates.
    104  *
    105  * Note that a UBool is the same as an int8_t.
    106  */
    107 U_CAPI UChar32 U_EXPORT2
    108 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
    109     int32_t i=*pi;
    110     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
    111     if((i)+count<=(length)) {
    112         uint8_t trail, illegal=0;
    113 
    114         UTF8_MASK_LEAD_BYTE((c), count);
    115         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    116         switch(count) {
    117         /* each branch falls through to the next one */
    118         case 5:
    119         case 4:
    120             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    121             illegal=1;
    122             break;
    123         case 3:
    124             trail=s[(i)++];
    125             (c)=((c)<<6)|(trail&0x3f);
    126             if(c<0x110) {
    127                 illegal|=(trail&0xc0)^0x80;
    128             } else {
    129                 /* code point>0x10ffff, outside Unicode */
    130                 illegal=1;
    131                 break;
    132             }
    133         case 2:
    134             trail=s[(i)++];
    135             (c)=((c)<<6)|(trail&0x3f);
    136             illegal|=(trail&0xc0)^0x80;
    137         case 1:
    138             trail=s[(i)++];
    139             (c)=((c)<<6)|(trail&0x3f);
    140             illegal|=(trail&0xc0)^0x80;
    141             break;
    142         case 0:
    143             if(strict>=0) {
    144                 return UTF8_ERROR_VALUE_1;
    145             } else {
    146                 return U_SENTINEL;
    147             }
    148         /* no default branch to optimize switch()  - all values are covered */
    149         }
    150 
    151         /*
    152          * All the error handling should return a value
    153          * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
    154          *
    155          * Starting with Unicode 3.0.1, non-shortest forms are illegal.
    156          * Starting with Unicode 3.2, surrogate code points must not be
    157          * encoded in UTF-8, and there are no irregular sequences any more.
    158          *
    159          * U8_ macros (new in ICU 2.4) return negative values for error conditions.
    160          */
    161 
    162         /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    163         /* illegal is also set if count>=4 */
    164         if(illegal || (c)<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2)) {
    165             /* error handling */
    166             uint8_t errorCount=count;
    167             /* don't go beyond this sequence */
    168             i=*pi;
    169             while(count>0 && UTF8_IS_TRAIL(s[i])) {
    170                 ++(i);
    171                 --count;
    172             }
    173             if(strict>=0) {
    174                 c=utf8_errorValue[errorCount-count];
    175             } else {
    176                 c=U_SENTINEL;
    177             }
    178         } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
    179             /* strict: forbid non-characters like U+fffe */
    180             c=utf8_errorValue[count];
    181         }
    182     } else /* too few bytes left */ {
    183         /* error handling */
    184         int32_t i0=i;
    185         /* don't just set (i)=(length) in case there is an illegal sequence */
    186         while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
    187             ++(i);
    188         }
    189         if(strict>=0) {
    190             c=utf8_errorValue[i-i0];
    191         } else {
    192             c=U_SENTINEL;
    193         }
    194     }
    195     *pi=i;
    196     return c;
    197 }
    198 
    199 U_CAPI int32_t U_EXPORT2
    200 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
    201     if((uint32_t)(c)<=0x7ff) {
    202         if((i)+1<(length)) {
    203             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
    204             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    205             return i;
    206         }
    207     } else if((uint32_t)(c)<=0xffff) {
    208         /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
    209         if((i)+2<(length) && !U_IS_SURROGATE(c)) {
    210             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
    211             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
    212             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    213             return i;
    214         }
    215     } else if((uint32_t)(c)<=0x10ffff) {
    216         if((i)+3<(length)) {
    217             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
    218             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
    219             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
    220             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    221             return i;
    222         }
    223     }
    224     /* c>0x10ffff or not enough space, write an error value */
    225     if(pIsError!=NULL) {
    226         *pIsError=TRUE;
    227     } else {
    228         length-=i;
    229         if(length>0) {
    230             int32_t offset;
    231             if(length>3) {
    232                 length=3;
    233             }
    234             s+=i;
    235             offset=0;
    236             c=utf8_errorValue[length-1];
    237             UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
    238             i=i+offset;
    239         }
    240     }
    241     return i;
    242 }
    243 
    244 U_CAPI UChar32 U_EXPORT2
    245 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
    246     int32_t i=*pi;
    247     uint8_t b, count=1, shift=6;
    248 
    249     /* extract value bits from the last trail byte */
    250     c&=0x3f;
    251 
    252     for(;;) {
    253         if(i<=start) {
    254             /* no lead byte at all */
    255             if(strict>=0) {
    256                 return UTF8_ERROR_VALUE_1;
    257             } else {
    258                 return U_SENTINEL;
    259             }
    260             /*break;*/
    261         }
    262 
    263         /* read another previous byte */
    264         b=s[--i];
    265         if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
    266             if(b&0x40) {
    267                 /* lead byte, this will always end the loop */
    268                 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
    269 
    270                 if(count==shouldCount) {
    271                     /* set the new position */
    272                     *pi=i;
    273                     UTF8_MASK_LEAD_BYTE(b, count);
    274                     c|=(UChar32)b<<shift;
    275                     if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) {
    276                         /* illegal sequence or (strict and non-character) */
    277                         if(count>=4) {
    278                             count=3;
    279                         }
    280                         if(strict>=0) {
    281                             c=utf8_errorValue[count];
    282                         } else {
    283                             c=U_SENTINEL;
    284                         }
    285                     } else {
    286                         /* exit with correct c */
    287                     }
    288                 } else {
    289                     /* the lead byte does not match the number of trail bytes */
    290                     /* only set the position to the lead byte if it would
    291                        include the trail byte that we started with */
    292                     if(count<shouldCount) {
    293                         *pi=i;
    294                         if(strict>=0) {
    295                             c=utf8_errorValue[count];
    296                         } else {
    297                             c=U_SENTINEL;
    298                         }
    299                     } else {
    300                         if(strict>=0) {
    301                             c=UTF8_ERROR_VALUE_1;
    302                         } else {
    303                             c=U_SENTINEL;
    304                         }
    305                     }
    306                 }
    307                 break;
    308             } else if(count<5) {
    309                 /* trail byte */
    310                 c|=(UChar32)(b&0x3f)<<shift;
    311                 ++count;
    312                 shift+=6;
    313             } else {
    314                 /* more than 5 trail bytes is illegal */
    315                 if(strict>=0) {
    316                     c=UTF8_ERROR_VALUE_1;
    317                 } else {
    318                     c=U_SENTINEL;
    319                 }
    320                 break;
    321             }
    322         } else {
    323             /* single-byte character precedes trailing bytes */
    324             if(strict>=0) {
    325                 c=UTF8_ERROR_VALUE_1;
    326             } else {
    327                 c=U_SENTINEL;
    328             }
    329             break;
    330         }
    331     }
    332     return c;
    333 }
    334 
    335 U_CAPI int32_t U_EXPORT2
    336 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
    337     /* i had been decremented once before the function call */
    338     int32_t I=i, Z;
    339     uint8_t b;
    340 
    341     /* read at most the 6 bytes s[Z] to s[i], inclusively */
    342     if(I-5>start) {
    343         Z=I-5;
    344     } else {
    345         Z=start;
    346     }
    347 
    348     /* return I if the sequence starting there is long enough to include i */
    349     do {
    350         b=s[I];
    351         if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
    352             break;
    353         } else if(b>=0xc0) {
    354             if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
    355                 return I;
    356             } else {
    357                 break;
    358             }
    359         }
    360     } while(Z<=--I);
    361 
    362     /* return i itself to be consistent with the FWD_1 macro */
    363     return i;
    364 }
    365