1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2006, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uit_len8.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003feb10 14 * created by: Markus W. Scherer 15 * 16 * This file contains the implementation of the "lenient UTF-8" UCharIterator 17 * as used in the uciter8 sample code. 18 * UTF-8-style macros are defined as well as the UCharIterator. 19 * The macros are incomplete (do not assemble code points from pairs of 20 * surrogates, see comment below) 21 * but sufficient for the iterator. 22 */ 23 24 #include <string.h> 25 #include "unicode/utypes.h" 26 #include "unicode/uiter.h" 27 28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */ 29 30 /* 31 * This code leniently reads 8-bit Unicode strings, 32 * which could contain a mix of UTF-8 and CESU-8. 33 * More precisely: 34 * - supplementary code points may be encoded with dedicated 4-byte sequences 35 * (UTF-8 style) 36 * - supplementary code points may be encoded with 37 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form 38 * (CESU-8 style) 39 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences 40 * 41 * Limitation: 42 * Right now, the macros do not attempt to assemble code points from pairs of 43 * separately encoded surrogates. 44 * This would not be sufficient for processing based on these macros, 45 * but it is sufficient for a UCharIterator that returns only UChars anyway. 46 * 47 * The code is copied and modified from utf_impl.c and utf8.h. 48 * 49 * Change 2006feb08: Much of the implementation code is replaced by calling 50 * the utf_impl.c functions which accept a new "strict" parameter value 51 * of -2 implementing exactly this leniency. 52 */ 53 54 #define L8_NEXT(s, i, length, c) { \ 55 (c)=(uint8_t)(s)[(i)++]; \ 56 if((c)>=0x80) { \ 57 if(U8_IS_LEAD(c)) { \ 58 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \ 59 } else { \ 60 (c)=U_SENTINEL; \ 61 } \ 62 } \ 63 } 64 65 #define L8_PREV(s, start, i, c) { \ 66 (c)=(uint8_t)(s)[--(i)]; \ 67 if((c)>=0x80) { \ 68 if((c)<=0xbf) { \ 69 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \ 70 } else { \ 71 (c)=U_SENTINEL; \ 72 } \ 73 } \ 74 } 75 76 /* lenient-8 UCharIterator -------------------------------------------------- */ 77 78 /* 79 * This is a copy of the UTF-8 UCharIterator in uiter.cpp, 80 * except that it uses the lenient-8-bit-Unicode macros above. 81 */ 82 83 /* 84 * Minimal implementation: 85 * Maintain a single-UChar buffer for an additional surrogate. 86 * The caller must not modify start and limit because they are used internally. 87 * 88 * Use UCharIterator fields as follows: 89 * context pointer to UTF-8 string 90 * length UTF-16 length of the string; -1 until lazy evaluation 91 * start current UTF-8 index 92 * index current UTF-16 index; may be -1="unknown" after setState() 93 * limit UTF-8 length of the string 94 * reservedField supplementary code point 95 * 96 * Since UCharIterator delivers 16-bit code units, the iteration can be 97 * currently in the middle of the byte sequence for a supplementary code point. 98 * In this case, reservedField will contain that code point and start will 99 * point to after the corresponding byte sequence. The UTF-16 index will be 100 * one less than what it would otherwise be corresponding to the UTF-8 index. 101 * Otherwise, reservedField will be 0. 102 */ 103 104 /* 105 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 106 * Add implementations that do not call strlen() for iteration but check for NUL. 107 */ 108 109 static int32_t U_CALLCONV 110 lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 111 switch(origin) { 112 case UITER_ZERO: 113 case UITER_START: 114 return 0; 115 case UITER_CURRENT: 116 if(iter->index<0) { 117 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 118 const uint8_t *s; 119 UChar32 c; 120 int32_t i, limit, index; 121 122 s=(const uint8_t *)iter->context; 123 i=index=0; 124 limit=iter->start; /* count up to the UTF-8 index */ 125 while(i<limit) { 126 L8_NEXT(s, i, limit, c); 127 if(c<=0xffff) { 128 ++index; 129 } else { 130 index+=2; 131 } 132 } 133 134 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 135 if(i==iter->limit) { 136 iter->length=index; /* in case it was <0 or wrong */ 137 } 138 if(iter->reservedField!=0) { 139 --index; /* we are in the middle of a supplementary code point */ 140 } 141 iter->index=index; 142 } 143 return iter->index; 144 case UITER_LIMIT: 145 case UITER_LENGTH: 146 if(iter->length<0) { 147 const uint8_t *s; 148 UChar32 c; 149 int32_t i, limit, length; 150 151 s=(const uint8_t *)iter->context; 152 if(iter->index<0) { 153 /* 154 * the current UTF-16 index is unknown after setState(), 155 * we must first count from the beginning to here 156 */ 157 i=length=0; 158 limit=iter->start; 159 160 /* count from the beginning to the current index */ 161 while(i<limit) { 162 L8_NEXT(s, i, limit, c); 163 if(c<=0xffff) { 164 ++length; 165 } else { 166 length+=2; 167 } 168 } 169 170 /* assume i==limit==iter->start, set the UTF-16 index */ 171 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 172 iter->index= iter->reservedField!=0 ? length-1 : length; 173 } else { 174 i=iter->start; 175 length=iter->index; 176 if(iter->reservedField!=0) { 177 ++length; 178 } 179 } 180 181 /* count from the current index to the end */ 182 limit=iter->limit; 183 while(i<limit) { 184 L8_NEXT(s, i, limit, c); 185 if(c<=0xffff) { 186 ++length; 187 } else { 188 length+=2; 189 } 190 } 191 iter->length=length; 192 } 193 return iter->length; 194 default: 195 /* not a valid origin */ 196 /* Should never get here! */ 197 return -1; 198 } 199 } 200 201 static int32_t U_CALLCONV 202 lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 203 const uint8_t *s; 204 UChar32 c; 205 int32_t pos; /* requested UTF-16 index */ 206 int32_t i; /* UTF-8 index */ 207 UBool havePos; 208 209 /* calculate the requested UTF-16 index */ 210 switch(origin) { 211 case UITER_ZERO: 212 case UITER_START: 213 pos=delta; 214 havePos=TRUE; 215 /* iter->index<0 (unknown) is possible */ 216 break; 217 case UITER_CURRENT: 218 if(iter->index>=0) { 219 pos=iter->index+delta; 220 havePos=TRUE; 221 } else { 222 /* the current UTF-16 index is unknown after setState(), use only delta */ 223 pos=0; 224 havePos=FALSE; 225 } 226 break; 227 case UITER_LIMIT: 228 case UITER_LENGTH: 229 if(iter->length>=0) { 230 pos=iter->length+delta; 231 havePos=TRUE; 232 } else { 233 /* pin to the end, avoid counting the length */ 234 iter->index=-1; 235 iter->start=iter->limit; 236 iter->reservedField=0; 237 if(delta>=0) { 238 return UITER_UNKNOWN_INDEX; 239 } else { 240 /* the current UTF-16 index is unknown, use only delta */ 241 pos=0; 242 havePos=FALSE; 243 } 244 } 245 break; 246 default: 247 return -1; /* Error */ 248 } 249 250 if(havePos) { 251 /* shortcuts: pinning to the edges of the string */ 252 if(pos<=0) { 253 iter->index=iter->start=iter->reservedField=0; 254 return 0; 255 } else if(iter->length>=0 && pos>=iter->length) { 256 iter->index=iter->length; 257 iter->start=iter->limit; 258 iter->reservedField=0; 259 return iter->index; 260 } 261 262 /* minimize the number of L8_NEXT/PREV operations */ 263 if(iter->index<0 || pos<iter->index/2) { 264 /* go forward from the start instead of backward from the current index */ 265 iter->index=iter->start=iter->reservedField=0; 266 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 267 /* 268 * if we have the UTF-16 index and length and the new position is 269 * closer to the end than the current index, 270 * then go backward from the end instead of forward from the current index 271 */ 272 iter->index=iter->length; 273 iter->start=iter->limit; 274 iter->reservedField=0; 275 } 276 277 delta=pos-iter->index; 278 if(delta==0) { 279 return iter->index; /* nothing to do */ 280 } 281 } else { 282 /* move relative to unknown UTF-16 index */ 283 if(delta==0) { 284 return UITER_UNKNOWN_INDEX; /* nothing to do */ 285 } else if(-delta>=iter->start) { 286 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 287 iter->index=iter->start=iter->reservedField=0; 288 return 0; 289 } else if(delta>=(iter->limit-iter->start)) { 290 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 291 iter->index=iter->length; /* may or may not be <0 (unknown) */ 292 iter->start=iter->limit; 293 iter->reservedField=0; 294 return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX; 295 } 296 } 297 298 /* delta!=0 */ 299 300 /* move towards the requested position, pin to the edges of the string */ 301 s=(const uint8_t *)iter->context; 302 pos=iter->index; /* could be <0 (unknown) */ 303 i=iter->start; 304 if(delta>0) { 305 /* go forward */ 306 int32_t limit=iter->limit; 307 if(iter->reservedField!=0) { 308 iter->reservedField=0; 309 ++pos; 310 --delta; 311 } 312 while(delta>0 && i<limit) { 313 L8_NEXT(s, i, limit, c); 314 if(c<0xffff) { 315 ++pos; 316 --delta; 317 } else if(delta>=2) { 318 pos+=2; 319 delta-=2; 320 } else /* delta==1 */ { 321 /* stop in the middle of a supplementary code point */ 322 iter->reservedField=c; 323 ++pos; 324 break; /* delta=0; */ 325 } 326 } 327 if(i==limit) { 328 if(iter->length<0 && iter->index>=0) { 329 iter->length= iter->reservedField==0 ? pos : pos+1; 330 } else if(iter->index<0 && iter->length>=0) { 331 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 332 } 333 } 334 } else /* delta<0 */ { 335 /* go backward */ 336 if(iter->reservedField!=0) { 337 iter->reservedField=0; 338 i-=4; /* we stayed behind the supplementary code point; go before it now */ 339 --pos; 340 ++delta; 341 } 342 while(delta<0 && i>0) { 343 L8_PREV(s, 0, i, c); 344 if(c<0xffff) { 345 --pos; 346 ++delta; 347 } else if(delta<=-2) { 348 pos-=2; 349 delta+=2; 350 } else /* delta==-1 */ { 351 /* stop in the middle of a supplementary code point */ 352 i+=4; /* back to behind this supplementary code point for consistent state */ 353 iter->reservedField=c; 354 --pos; 355 break; /* delta=0; */ 356 } 357 } 358 } 359 360 iter->start=i; 361 if(iter->index>=0) { 362 return iter->index=pos; 363 } else { 364 /* we started with index<0 (unknown) so pos is bogus */ 365 if(i<=1) { 366 return iter->index=i; /* reached the beginning */ 367 } else { 368 /* we still don't know the UTF-16 index */ 369 return UITER_UNKNOWN_INDEX; 370 } 371 } 372 } 373 374 static UBool U_CALLCONV 375 lenient8IteratorHasNext(UCharIterator *iter) { 376 return iter->reservedField!=0 || iter->start<iter->limit; 377 } 378 379 static UBool U_CALLCONV 380 lenient8IteratorHasPrevious(UCharIterator *iter) { 381 return iter->start>0; 382 } 383 384 static UChar32 U_CALLCONV 385 lenient8IteratorCurrent(UCharIterator *iter) { 386 if(iter->reservedField!=0) { 387 return U16_TRAIL(iter->reservedField); 388 } else if(iter->start<iter->limit) { 389 const uint8_t *s=(const uint8_t *)iter->context; 390 UChar32 c; 391 int32_t i=iter->start; 392 393 L8_NEXT(s, i, iter->limit, c); 394 if(c<0) { 395 return 0xfffd; 396 } else if(c<=0xffff) { 397 return c; 398 } else { 399 return U16_LEAD(c); 400 } 401 } else { 402 return U_SENTINEL; 403 } 404 } 405 406 static UChar32 U_CALLCONV 407 lenient8IteratorNext(UCharIterator *iter) { 408 int32_t index; 409 410 if(iter->reservedField!=0) { 411 UChar trail=U16_TRAIL(iter->reservedField); 412 iter->reservedField=0; 413 if((index=iter->index)>=0) { 414 iter->index=index+1; 415 } 416 return trail; 417 } else if(iter->start<iter->limit) { 418 const uint8_t *s=(const uint8_t *)iter->context; 419 UChar32 c; 420 421 L8_NEXT(s, iter->start, iter->limit, c); 422 if((index=iter->index)>=0) { 423 iter->index=++index; 424 if(iter->length<0 && iter->start==iter->limit) { 425 iter->length= c<=0xffff ? index : index+1; 426 } 427 } else if(iter->start==iter->limit && iter->length>=0) { 428 iter->index= c<=0xffff ? iter->length : iter->length-1; 429 } 430 if(c<0) { 431 return 0xfffd; 432 } else if(c<=0xffff) { 433 return c; 434 } else { 435 iter->reservedField=c; 436 return U16_LEAD(c); 437 } 438 } else { 439 return U_SENTINEL; 440 } 441 } 442 443 static UChar32 U_CALLCONV 444 lenient8IteratorPrevious(UCharIterator *iter) { 445 int32_t index; 446 447 if(iter->reservedField!=0) { 448 UChar lead=U16_LEAD(iter->reservedField); 449 iter->reservedField=0; 450 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 451 if((index=iter->index)>0) { 452 iter->index=index-1; 453 } 454 return lead; 455 } else if(iter->start>0) { 456 const uint8_t *s=(const uint8_t *)iter->context; 457 UChar32 c; 458 459 L8_PREV(s, 0, iter->start, c); 460 if((index=iter->index)>0) { 461 iter->index=index-1; 462 } else if(iter->start<=1) { 463 iter->index= c<=0xffff ? iter->start : iter->start+1; 464 } 465 if(c<0) { 466 return 0xfffd; 467 } else if(c<=0xffff) { 468 return c; 469 } else { 470 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 471 iter->reservedField=c; 472 return U16_TRAIL(c); 473 } 474 } else { 475 return U_SENTINEL; 476 } 477 } 478 479 static uint32_t U_CALLCONV 480 lenient8IteratorGetState(const UCharIterator *iter) { 481 uint32_t state=(uint32_t)(iter->start<<1); 482 if(iter->reservedField!=0) { 483 state|=1; 484 } 485 return state; 486 } 487 488 static void U_CALLCONV 489 lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 490 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 491 /* do nothing */ 492 } else if(iter==NULL) { 493 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 494 } else if(state==lenient8IteratorGetState(iter)) { 495 /* setting to the current state: no-op */ 496 } else { 497 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 498 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 499 500 if((state==0 ? index<0 : index<4) || iter->limit<index) { 501 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 502 } else { 503 iter->start=index; /* restore UTF-8 byte index */ 504 if(index<=1) { 505 iter->index=index; 506 } else { 507 iter->index=-1; /* unknown UTF-16 index */ 508 } 509 if(state==0) { 510 iter->reservedField=0; 511 } else { 512 /* verified index>=4 above */ 513 UChar32 c; 514 L8_PREV((const uint8_t *)iter->context, 0, index, c); 515 if(c<=0xffff) { 516 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 517 } else { 518 iter->reservedField=c; 519 } 520 } 521 } 522 } 523 } 524 525 static const UCharIterator lenient8Iterator={ 526 0, 0, 0, 0, 0, 0, 527 lenient8IteratorGetIndex, 528 lenient8IteratorMove, 529 lenient8IteratorHasNext, 530 lenient8IteratorHasPrevious, 531 lenient8IteratorCurrent, 532 lenient8IteratorNext, 533 lenient8IteratorPrevious, 534 NULL, 535 lenient8IteratorGetState, 536 lenient8IteratorSetState 537 }; 538 539 U_CAPI void U_EXPORT2 540 uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) { 541 if(iter!=0) { 542 if(s!=0 && length>=-1) { 543 *iter=lenient8Iterator; 544 iter->context=s; 545 if(length>=0) { 546 iter->limit=length; 547 } else { 548 iter->limit=strlen(s); 549 } 550 iter->length= iter->limit<=1 ? iter->limit : -1; 551 } else { 552 /* set no-op iterator */ 553 uiter_setString(iter, NULL, 0); 554 } 555 } 556 } 557