1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 1999-2006, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: utf_impl.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999sep13 14 * created by: Markus W. Scherer 15 * 16 * This file provides implementation functions for macros in the utfXX.h 17 * that would otherwise be too long as macros. 18 */ 19 20 /* set import/export definitions */ 21 #ifndef U_UTF8_IMPL 22 # define U_UTF8_IMPL 23 #endif 24 25 #include "unicode/utypes.h" 26 27 /* 28 * This table could be replaced on many machines by 29 * a few lines of assembler code using an 30 * "index of first 0-bit from msb" instruction and 31 * one or two more integer instructions. 32 * 33 * For example, on an i386, do something like 34 * - MOV AL, leadByte 35 * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0) 36 * - MOV AH, 0 37 * - BSR BX, AX (16-bit) 38 * - MOV AX, 6 (result) 39 * - JZ finish (ZF==1 if leadByte==0xff) 40 * - SUB AX, BX (result) 41 * -finish: 42 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) 43 * 44 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; 45 * lead bytes above 0xf4 are illegal. 46 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. 47 */ 48 U_EXPORT const uint8_t 49 utf8_countTrailBytes[256]={ 50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54 55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59 60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64 65 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 67 68 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 69 3, 3, 3, 3, 3, 70 3, 3, 3, /* illegal in Unicode */ 71 4, 4, 4, 4, /* illegal in Unicode */ 72 5, 5, /* illegal in Unicode */ 73 0, 0 /* illegal bytes 0xfe and 0xff */ 74 }; 75 76 static const UChar32 77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 78 79 static const UChar32 80 utf8_errorValue[6]={ 81 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff, 82 0x3ffffff, 0x7fffffff 83 }; 84 85 /* 86 * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling 87 * UTF8_NEXT_CHAR_SAFE(). 88 * 89 * The "strict" parameter controls the error behavior: 90 * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative 91 * code point result. 92 * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): 93 * All illegal byte sequences yield a positive code point such that this 94 * result code point would be encoded with the same number of bytes as 95 * the illegal sequence. 96 * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE): 97 * Same as the obsolete "safe" behavior, but non-characters are also treated 98 * like illegal sequences. 99 * 100 * The special negative (<0) value -2 is used for lenient treatment of surrogate 101 * code points as legal. Some implementations use this for roundtripping of 102 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they 103 * contain unpaired surrogates. 104 * 105 * Note that a UBool is the same as an int8_t. 106 */ 107 U_CAPI UChar32 U_EXPORT2 108 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { 109 int32_t i=*pi; 110 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); 111 if((i)+count<=(length)) { 112 uint8_t trail, illegal=0; 113 114 UTF8_MASK_LEAD_BYTE((c), count); 115 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 116 switch(count) { 117 /* each branch falls through to the next one */ 118 case 5: 119 case 4: 120 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 121 illegal=1; 122 break; 123 case 3: 124 trail=s[(i)++]; 125 (c)=((c)<<6)|(trail&0x3f); 126 if(c<0x110) { 127 illegal|=(trail&0xc0)^0x80; 128 } else { 129 /* code point>0x10ffff, outside Unicode */ 130 illegal=1; 131 break; 132 } 133 case 2: 134 trail=s[(i)++]; 135 (c)=((c)<<6)|(trail&0x3f); 136 illegal|=(trail&0xc0)^0x80; 137 case 1: 138 trail=s[(i)++]; 139 (c)=((c)<<6)|(trail&0x3f); 140 illegal|=(trail&0xc0)^0x80; 141 break; 142 case 0: 143 if(strict>=0) { 144 return UTF8_ERROR_VALUE_1; 145 } else { 146 return U_SENTINEL; 147 } 148 /* no default branch to optimize switch() - all values are covered */ 149 } 150 151 /* 152 * All the error handling should return a value 153 * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right. 154 * 155 * Starting with Unicode 3.0.1, non-shortest forms are illegal. 156 * Starting with Unicode 3.2, surrogate code points must not be 157 * encoded in UTF-8, and there are no irregular sequences any more. 158 * 159 * U8_ macros (new in ICU 2.4) return negative values for error conditions. 160 */ 161 162 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 163 /* illegal is also set if count>=4 */ 164 if(illegal || (c)<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2)) { 165 /* error handling */ 166 uint8_t errorCount=count; 167 /* don't go beyond this sequence */ 168 i=*pi; 169 while(count>0 && UTF8_IS_TRAIL(s[i])) { 170 ++(i); 171 --count; 172 } 173 if(strict>=0) { 174 c=utf8_errorValue[errorCount-count]; 175 } else { 176 c=U_SENTINEL; 177 } 178 } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) { 179 /* strict: forbid non-characters like U+fffe */ 180 c=utf8_errorValue[count]; 181 } 182 } else /* too few bytes left */ { 183 /* error handling */ 184 int32_t i0=i; 185 /* don't just set (i)=(length) in case there is an illegal sequence */ 186 while((i)<(length) && UTF8_IS_TRAIL(s[i])) { 187 ++(i); 188 } 189 if(strict>=0) { 190 c=utf8_errorValue[i-i0]; 191 } else { 192 c=U_SENTINEL; 193 } 194 } 195 *pi=i; 196 return c; 197 } 198 199 U_CAPI int32_t U_EXPORT2 200 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) { 201 if((uint32_t)(c)<=0x7ff) { 202 if((i)+1<(length)) { 203 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); 204 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 205 return i; 206 } 207 } else if((uint32_t)(c)<=0xffff) { 208 /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */ 209 if((i)+2<(length) && !U_IS_SURROGATE(c)) { 210 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); 211 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); 212 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 213 return i; 214 } 215 } else if((uint32_t)(c)<=0x10ffff) { 216 if((i)+3<(length)) { 217 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); 218 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); 219 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); 220 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 221 return i; 222 } 223 } 224 /* c>0x10ffff or not enough space, write an error value */ 225 if(pIsError!=NULL) { 226 *pIsError=TRUE; 227 } else { 228 length-=i; 229 if(length>0) { 230 int32_t offset; 231 if(length>3) { 232 length=3; 233 } 234 s+=i; 235 offset=0; 236 c=utf8_errorValue[length-1]; 237 UTF8_APPEND_CHAR_UNSAFE(s, offset, c); 238 i=i+offset; 239 } 240 } 241 return i; 242 } 243 244 U_CAPI UChar32 U_EXPORT2 245 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { 246 int32_t i=*pi; 247 uint8_t b, count=1, shift=6; 248 249 /* extract value bits from the last trail byte */ 250 c&=0x3f; 251 252 for(;;) { 253 if(i<=start) { 254 /* no lead byte at all */ 255 if(strict>=0) { 256 return UTF8_ERROR_VALUE_1; 257 } else { 258 return U_SENTINEL; 259 } 260 /*break;*/ 261 } 262 263 /* read another previous byte */ 264 b=s[--i]; 265 if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ 266 if(b&0x40) { 267 /* lead byte, this will always end the loop */ 268 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b); 269 270 if(count==shouldCount) { 271 /* set the new position */ 272 *pi=i; 273 UTF8_MASK_LEAD_BYTE(b, count); 274 c|=(UChar32)b<<shift; 275 if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict!=-2) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) { 276 /* illegal sequence or (strict and non-character) */ 277 if(count>=4) { 278 count=3; 279 } 280 if(strict>=0) { 281 c=utf8_errorValue[count]; 282 } else { 283 c=U_SENTINEL; 284 } 285 } else { 286 /* exit with correct c */ 287 } 288 } else { 289 /* the lead byte does not match the number of trail bytes */ 290 /* only set the position to the lead byte if it would 291 include the trail byte that we started with */ 292 if(count<shouldCount) { 293 *pi=i; 294 if(strict>=0) { 295 c=utf8_errorValue[count]; 296 } else { 297 c=U_SENTINEL; 298 } 299 } else { 300 if(strict>=0) { 301 c=UTF8_ERROR_VALUE_1; 302 } else { 303 c=U_SENTINEL; 304 } 305 } 306 } 307 break; 308 } else if(count<5) { 309 /* trail byte */ 310 c|=(UChar32)(b&0x3f)<<shift; 311 ++count; 312 shift+=6; 313 } else { 314 /* more than 5 trail bytes is illegal */ 315 if(strict>=0) { 316 c=UTF8_ERROR_VALUE_1; 317 } else { 318 c=U_SENTINEL; 319 } 320 break; 321 } 322 } else { 323 /* single-byte character precedes trailing bytes */ 324 if(strict>=0) { 325 c=UTF8_ERROR_VALUE_1; 326 } else { 327 c=U_SENTINEL; 328 } 329 break; 330 } 331 } 332 return c; 333 } 334 335 U_CAPI int32_t U_EXPORT2 336 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { 337 /* i had been decremented once before the function call */ 338 int32_t I=i, Z; 339 uint8_t b; 340 341 /* read at most the 6 bytes s[Z] to s[i], inclusively */ 342 if(I-5>start) { 343 Z=I-5; 344 } else { 345 Z=start; 346 } 347 348 /* return I if the sequence starting there is long enough to include i */ 349 do { 350 b=s[I]; 351 if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ 352 break; 353 } else if(b>=0xc0) { 354 if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) { 355 return I; 356 } else { 357 break; 358 } 359 } 360 } while(Z<=--I); 361 362 /* return i itself to be consistent with the FWD_1 macro */ 363 return i; 364 } 365