1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2003-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnv_ext.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jun13 14 * created by: Markus W. Scherer 15 * 16 * Conversion extensions 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 22 23 #include "unicode/uset.h" 24 #include "ucnv_bld.h" 25 #include "ucnv_cnv.h" 26 #include "ucnv_ext.h" 27 #include "cmemory.h" 28 #include "uassert.h" 29 30 /* to Unicode --------------------------------------------------------------- */ 31 32 /* 33 * @return lookup value for the byte, if found; else 0 34 */ 35 static inline uint32_t 36 ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { 37 uint32_t word0, word; 38 int32_t i, start, limit; 39 40 /* check the input byte against the lowest and highest section bytes */ 41 start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); 42 limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); 43 if(byte<start || limit<byte) { 44 return 0; /* the byte is out of range */ 45 } 46 47 if(length==((limit-start)+1)) { 48 /* direct access on a linear array */ 49 return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ 50 } 51 52 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ 53 word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); 54 55 /* 56 * Shift byte once instead of each section word and add 0xffffff. 57 * We will compare the shifted/added byte (bbffffff) against 58 * section words which have byte values in the same bit position. 59 * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv 60 * for all v=0..f 61 * so we need not mask off the lower 24 bits of each section word. 62 */ 63 word=word0|UCNV_EXT_TO_U_VALUE_MASK; 64 65 /* binary search */ 66 start=0; 67 limit=length; 68 for(;;) { 69 i=limit-start; 70 if(i<=1) { 71 break; /* done */ 72 } 73 /* start<limit-1 */ 74 75 if(i<=4) { 76 /* linear search for the last part */ 77 if(word0<=toUSection[start]) { 78 break; 79 } 80 if(++start<limit && word0<=toUSection[start]) { 81 break; 82 } 83 if(++start<limit && word0<=toUSection[start]) { 84 break; 85 } 86 /* always break at start==limit-1 */ 87 ++start; 88 break; 89 } 90 91 i=(start+limit)/2; 92 if(word<toUSection[i]) { 93 limit=i; 94 } else { 95 start=i; 96 } 97 } 98 99 /* did we really find it? */ 100 if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { 101 return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ 102 } else { 103 return 0; /* not found */ 104 } 105 } 106 107 /* 108 * TRUE if not an SI/SO stateful converter, 109 * or if the match length fits with the current converter state 110 */ 111 #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \ 112 ((sisoState)<0 || ((sisoState)==0) == (match==1)) 113 114 /* 115 * this works like ucnv_extMatchFromU() except 116 * - the first character is in pre 117 * - no trie is used 118 * - the returned matchLength is not offset by 2 119 */ 120 static int32_t 121 ucnv_extMatchToU(const int32_t *cx, int8_t sisoState, 122 const char *pre, int32_t preLength, 123 const char *src, int32_t srcLength, 124 uint32_t *pMatchValue, 125 UBool /*useFallback*/, UBool flush) { 126 const uint32_t *toUTable, *toUSection; 127 128 uint32_t value, matchValue; 129 int32_t i, j, idx, length, matchLength; 130 uint8_t b; 131 132 if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) { 133 return 0; /* no extension data, no match */ 134 } 135 136 /* initialize */ 137 toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t); 138 idx=0; 139 140 matchValue=0; 141 i=j=matchLength=0; 142 143 if(sisoState==0) { 144 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ 145 if(preLength>1) { 146 return 0; /* no match of a DBCS sequence in SBCS mode */ 147 } else if(preLength==1) { 148 srcLength=0; 149 } else /* preLength==0 */ { 150 if(srcLength>1) { 151 srcLength=1; 152 } 153 } 154 flush=TRUE; 155 } 156 157 /* we must not remember fallback matches when not using fallbacks */ 158 159 /* match input units until there is a full match or the input is consumed */ 160 for(;;) { 161 /* go to the next section */ 162 toUSection=toUTable+idx; 163 164 /* read first pair of the section */ 165 value=*toUSection++; 166 length=UCNV_EXT_TO_U_GET_BYTE(value); 167 value=UCNV_EXT_TO_U_GET_VALUE(value); 168 if( value!=0 && 169 (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 170 TO_U_USE_FALLBACK(useFallback)) && 171 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 172 ) { 173 /* remember longest match so far */ 174 matchValue=value; 175 matchLength=i+j; 176 } 177 178 /* match pre[] then src[] */ 179 if(i<preLength) { 180 b=(uint8_t)pre[i++]; 181 } else if(j<srcLength) { 182 b=(uint8_t)src[j++]; 183 } else { 184 /* all input consumed, partial match */ 185 if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) { 186 /* 187 * end of the entire input stream, stop with the longest match so far 188 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES 189 * because it must fit into state buffers 190 */ 191 break; 192 } else { 193 /* continue with more input next time */ 194 return -length; 195 } 196 } 197 198 /* search for the current UChar */ 199 value=ucnv_extFindToU(toUSection, length, b); 200 if(value==0) { 201 /* no match here, stop with the longest match so far */ 202 break; 203 } else { 204 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 205 /* partial match, continue */ 206 idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); 207 } else { 208 if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 209 TO_U_USE_FALLBACK(useFallback)) && 210 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 211 ) { 212 /* full match, stop with result */ 213 matchValue=value; 214 matchLength=i+j; 215 } else { 216 /* full match on fallback not taken, stop with the longest match so far */ 217 } 218 break; 219 } 220 } 221 } 222 223 if(matchLength==0) { 224 /* no match at all */ 225 return 0; 226 } 227 228 /* return result */ 229 *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); 230 return matchLength; 231 } 232 233 static inline void 234 ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, 235 uint32_t value, 236 UChar **target, const UChar *targetLimit, 237 int32_t **offsets, int32_t srcIndex, 238 UErrorCode *pErrorCode) { 239 /* output the result */ 240 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 241 /* output a single code point */ 242 ucnv_toUWriteCodePoint( 243 cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), 244 target, targetLimit, 245 offsets, srcIndex, 246 pErrorCode); 247 } else { 248 /* output a string - with correct data we have resultLength>0 */ 249 ucnv_toUWriteUChars( 250 cnv, 251 UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+ 252 UCNV_EXT_TO_U_GET_INDEX(value), 253 UCNV_EXT_TO_U_GET_LENGTH(value), 254 target, targetLimit, 255 offsets, srcIndex, 256 pErrorCode); 257 } 258 } 259 260 /* 261 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), 262 * or 1 for DBCS-only, 263 * or -1 if the converter is not SI/SO stateful 264 * 265 * Note: For SI/SO stateful converters getting here, 266 * cnv->mode==0 is equivalent to firstLength==1. 267 */ 268 #define UCNV_SISO_STATE(cnv) \ 269 ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \ 270 (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1) 271 272 /* 273 * target<targetLimit; set error code for overflow 274 */ 275 U_CFUNC UBool 276 ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, 277 int32_t firstLength, 278 const char **src, const char *srcLimit, 279 UChar **target, const UChar *targetLimit, 280 int32_t **offsets, int32_t srcIndex, 281 UBool flush, 282 UErrorCode *pErrorCode) { 283 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 284 int32_t match; 285 286 /* try to match */ 287 match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), 288 (const char *)cnv->toUBytes, firstLength, 289 *src, (int32_t)(srcLimit-*src), 290 &value, 291 cnv->useFallback, flush); 292 if(match>0) { 293 /* advance src pointer for the consumed input */ 294 *src+=match-firstLength; 295 296 /* write result to target */ 297 ucnv_extWriteToU(cnv, cx, 298 value, 299 target, targetLimit, 300 offsets, srcIndex, 301 pErrorCode); 302 return TRUE; 303 } else if(match<0) { 304 /* save state for partial match */ 305 const char *s; 306 int32_t j; 307 308 /* copy the first code point */ 309 s=(const char *)cnv->toUBytes; 310 cnv->preToUFirstLength=(int8_t)firstLength; 311 for(j=0; j<firstLength; ++j) { 312 cnv->preToU[j]=*s++; 313 } 314 315 /* now copy the newly consumed input */ 316 s=*src; 317 match=-match; 318 for(; j<match; ++j) { 319 cnv->preToU[j]=*s++; 320 } 321 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 322 cnv->preToULength=(int8_t)match; 323 return TRUE; 324 } else /* match==0 no match */ { 325 return FALSE; 326 } 327 } 328 329 U_CFUNC UChar32 330 ucnv_extSimpleMatchToU(const int32_t *cx, 331 const char *source, int32_t length, 332 UBool useFallback) { 333 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 334 int32_t match; 335 336 if(length<=0) { 337 return 0xffff; 338 } 339 340 /* try to match */ 341 match=ucnv_extMatchToU(cx, -1, 342 source, length, 343 NULL, 0, 344 &value, 345 useFallback, TRUE); 346 if(match==length) { 347 /* write result for simple, single-character conversion */ 348 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 349 return UCNV_EXT_TO_U_GET_CODE_POINT(value); 350 } 351 } 352 353 /* 354 * return no match because 355 * - match>0 && value points to string: simple conversion cannot handle multiple code points 356 * - match>0 && match!=length: not all input consumed, forbidden for this function 357 * - match==0: no match found in the first place 358 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 359 */ 360 return 0xfffe; 361 } 362 363 /* 364 * continue partial match with new input 365 * never called for simple, single-character conversion 366 */ 367 U_CFUNC void 368 ucnv_extContinueMatchToU(UConverter *cnv, 369 UConverterToUnicodeArgs *pArgs, int32_t srcIndex, 370 UErrorCode *pErrorCode) { 371 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 372 int32_t match, length; 373 374 match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv), 375 cnv->preToU, cnv->preToULength, 376 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 377 &value, 378 cnv->useFallback, pArgs->flush); 379 if(match>0) { 380 if(match>=cnv->preToULength) { 381 /* advance src pointer for the consumed input */ 382 pArgs->source+=match-cnv->preToULength; 383 cnv->preToULength=0; 384 } else { 385 /* the match did not use all of preToU[] - keep the rest for replay */ 386 length=cnv->preToULength-match; 387 uprv_memmove(cnv->preToU, cnv->preToU+match, length); 388 cnv->preToULength=(int8_t)-length; 389 } 390 391 /* write result */ 392 ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes, 393 value, 394 &pArgs->target, pArgs->targetLimit, 395 &pArgs->offsets, srcIndex, 396 pErrorCode); 397 } else if(match<0) { 398 /* save state for partial match */ 399 const char *s; 400 int32_t j; 401 402 /* just _append_ the newly consumed input to preToU[] */ 403 s=pArgs->source; 404 match=-match; 405 for(j=cnv->preToULength; j<match; ++j) { 406 cnv->preToU[j]=*s++; 407 } 408 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 409 cnv->preToULength=(int8_t)match; 410 } else /* match==0 */ { 411 /* 412 * no match 413 * 414 * We need to split the previous input into two parts: 415 * 416 * 1. The first codepage character is unmappable - that's how we got into 417 * trying the extension data in the first place. 418 * We need to move it from the preToU buffer 419 * to the error buffer, set an error code, 420 * and prepare the rest of the previous input for 2. 421 * 422 * 2. The rest of the previous input must be converted once we 423 * come back from the callback for the first character. 424 * At that time, we have to try again from scratch to convert 425 * these input characters. 426 * The replay will be handled by the ucnv.c conversion code. 427 */ 428 429 /* move the first codepage character to the error field */ 430 uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); 431 cnv->toULength=cnv->preToUFirstLength; 432 433 /* move the rest up inside the buffer */ 434 length=cnv->preToULength-cnv->preToUFirstLength; 435 if(length>0) { 436 uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); 437 } 438 439 /* mark preToU for replay */ 440 cnv->preToULength=(int8_t)-length; 441 442 /* set the error code for unassigned */ 443 *pErrorCode=U_INVALID_CHAR_FOUND; 444 } 445 } 446 447 /* from Unicode ------------------------------------------------------------- */ 448 449 /* 450 * @return index of the UChar, if found; else <0 451 */ 452 static inline int32_t 453 ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { 454 int32_t i, start, limit; 455 456 /* binary search */ 457 start=0; 458 limit=length; 459 for(;;) { 460 i=limit-start; 461 if(i<=1) { 462 break; /* done */ 463 } 464 /* start<limit-1 */ 465 466 if(i<=4) { 467 /* linear search for the last part */ 468 if(u<=fromUSection[start]) { 469 break; 470 } 471 if(++start<limit && u<=fromUSection[start]) { 472 break; 473 } 474 if(++start<limit && u<=fromUSection[start]) { 475 break; 476 } 477 /* always break at start==limit-1 */ 478 ++start; 479 break; 480 } 481 482 i=(start+limit)/2; 483 if(u<fromUSection[i]) { 484 limit=i; 485 } else { 486 start=i; 487 } 488 } 489 490 /* did we really find it? */ 491 if(start<limit && u==fromUSection[start]) { 492 return start; 493 } else { 494 return -1; /* not found */ 495 } 496 } 497 498 /* 499 * @param cx pointer to extension data; if NULL, returns 0 500 * @param firstCP the first code point before all the other UChars 501 * @param pre UChars that must match; !initialMatch: partial match with them 502 * @param preLength length of pre, >=0 503 * @param src UChars that can be used to complete a match 504 * @param srcLength length of src, >=0 505 * @param pMatchValue [out] output result value for the match from the data structure 506 * @param useFallback "use fallback" flag, usually from cnv->useFallback 507 * @param flush TRUE if the end of the input stream is reached 508 * @return >1: matched, return value=total match length (number of input units matched) 509 * 1: matched, no mapping but request for <subchar1> 510 * (only for the first code point) 511 * 0: no match 512 * <0: partial match, return value=negative total match length 513 * (partial matches are never returned for flush==TRUE) 514 * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) 515 * the matchLength is 2 if only firstCP matched, and >2 if firstCP and 516 * further code units matched 517 */ 518 static int32_t 519 ucnv_extMatchFromU(const int32_t *cx, 520 UChar32 firstCP, 521 const UChar *pre, int32_t preLength, 522 const UChar *src, int32_t srcLength, 523 uint32_t *pMatchValue, 524 UBool useFallback, UBool flush) { 525 const uint16_t *stage12, *stage3; 526 const uint32_t *stage3b; 527 528 const UChar *fromUTableUChars, *fromUSectionUChars; 529 const uint32_t *fromUTableValues, *fromUSectionValues; 530 531 uint32_t value, matchValue; 532 int32_t i, j, idx, length, matchLength; 533 UChar c; 534 535 if(cx==NULL) { 536 return 0; /* no extension data, no match */ 537 } 538 539 /* trie lookup of firstCP */ 540 idx=firstCP>>10; /* stage 1 index */ 541 if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { 542 return 0; /* the first code point is outside the trie */ 543 } 544 545 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 546 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 547 idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP); 548 549 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 550 value=stage3b[idx]; 551 if(value==0) { 552 return 0; 553 } 554 555 /* 556 * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: 557 * Do not interpret values with reserved bits used, for forward compatibility, 558 * and do not even remember intermediate results with reserved bits used. 559 */ 560 561 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 562 /* partial match, enter the loop below */ 563 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 564 565 /* initialize */ 566 fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); 567 fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); 568 569 matchValue=0; 570 i=j=matchLength=0; 571 572 /* we must not remember fallback matches when not using fallbacks */ 573 574 /* match input units until there is a full match or the input is consumed */ 575 for(;;) { 576 /* go to the next section */ 577 fromUSectionUChars=fromUTableUChars+idx; 578 fromUSectionValues=fromUTableValues+idx; 579 580 /* read first pair of the section */ 581 length=*fromUSectionUChars++; 582 value=*fromUSectionValues++; 583 if( value!=0 && 584 (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || 585 FROM_U_USE_FALLBACK(useFallback, firstCP)) && 586 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 587 ) { 588 /* remember longest match so far */ 589 matchValue=value; 590 matchLength=2+i+j; 591 } 592 593 /* match pre[] then src[] */ 594 if(i<preLength) { 595 c=pre[i++]; 596 } else if(j<srcLength) { 597 c=src[j++]; 598 } else { 599 /* all input consumed, partial match */ 600 if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { 601 /* 602 * end of the entire input stream, stop with the longest match so far 603 * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS 604 * because it must fit into state buffers 605 */ 606 break; 607 } else { 608 /* continue with more input next time */ 609 return -(2+length); 610 } 611 } 612 613 /* search for the current UChar */ 614 idx=ucnv_extFindFromU(fromUSectionUChars, length, c); 615 if(idx<0) { 616 /* no match here, stop with the longest match so far */ 617 break; 618 } else { 619 value=fromUSectionValues[idx]; 620 if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 621 /* partial match, continue */ 622 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 623 } else { 624 if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || 625 FROM_U_USE_FALLBACK(useFallback, firstCP)) && 626 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 627 ) { 628 /* full match, stop with result */ 629 matchValue=value; 630 matchLength=2+i+j; 631 } else { 632 /* full match on fallback not taken, stop with the longest match so far */ 633 } 634 break; 635 } 636 } 637 } 638 639 if(matchLength==0) { 640 /* no match at all */ 641 return 0; 642 } 643 } else /* result from firstCP trie lookup */ { 644 if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || 645 FROM_U_USE_FALLBACK(useFallback, firstCP)) && 646 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 647 ) { 648 /* full match, stop with result */ 649 matchValue=value; 650 matchLength=2; 651 } else { 652 /* fallback not taken */ 653 return 0; 654 } 655 } 656 657 /* return result */ 658 if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { 659 return 1; /* assert matchLength==2 */ 660 } 661 662 *pMatchValue=matchValue; 663 return matchLength; 664 } 665 666 /* 667 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits 668 */ 669 static inline void 670 ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, 671 uint32_t value, 672 char **target, const char *targetLimit, 673 int32_t **offsets, int32_t srcIndex, 674 UErrorCode *pErrorCode) { 675 uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; 676 const uint8_t *result; 677 int32_t length, prevLength; 678 679 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 680 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 681 682 /* output the result */ 683 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 684 /* 685 * Generate a byte array and then write it below. 686 * This is not the fastest possible way, but it should be ok for 687 * extension mappings, and it is much simpler. 688 * Offset and overflow handling are only done once this way. 689 */ 690 uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ 691 switch(length) { 692 case 3: 693 *p++=(uint8_t)(value>>16); 694 case 2: /*fall through*/ 695 *p++=(uint8_t)(value>>8); 696 case 1: /*fall through*/ 697 *p++=(uint8_t)value; 698 default: 699 break; /* will never occur */ 700 } 701 result=buffer+1; 702 } else { 703 result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 704 } 705 706 /* with correct data we have length>0 */ 707 708 if((prevLength=cnv->fromUnicodeStatus)!=0) { 709 /* handle SI/SO stateful output */ 710 uint8_t shiftByte; 711 712 if(prevLength>1 && length==1) { 713 /* change from double-byte mode to single-byte */ 714 shiftByte=(uint8_t)UCNV_SI; 715 cnv->fromUnicodeStatus=1; 716 } else if(prevLength==1 && length>1) { 717 /* change from single-byte mode to double-byte */ 718 shiftByte=(uint8_t)UCNV_SO; 719 cnv->fromUnicodeStatus=2; 720 } else { 721 shiftByte=0; 722 } 723 724 if(shiftByte!=0) { 725 /* prepend the shift byte to the result bytes */ 726 buffer[0]=shiftByte; 727 if(result!=buffer+1) { 728 uprv_memcpy(buffer+1, result, length); 729 } 730 result=buffer; 731 ++length; 732 } 733 } 734 735 ucnv_fromUWriteBytes(cnv, (const char *)result, length, 736 target, targetLimit, 737 offsets, srcIndex, 738 pErrorCode); 739 } 740 741 /* 742 * target<targetLimit; set error code for overflow 743 */ 744 U_CFUNC UBool 745 ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, 746 UChar32 cp, 747 const UChar **src, const UChar *srcLimit, 748 char **target, const char *targetLimit, 749 int32_t **offsets, int32_t srcIndex, 750 UBool flush, 751 UErrorCode *pErrorCode) { 752 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 753 int32_t match; 754 755 /* try to match */ 756 match=ucnv_extMatchFromU(cx, cp, 757 NULL, 0, 758 *src, (int32_t)(srcLimit-*src), 759 &value, 760 cnv->useFallback, flush); 761 762 /* reject a match if the result is a single byte for DBCS-only */ 763 if( match>=2 && 764 !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && 765 cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) 766 ) { 767 /* advance src pointer for the consumed input */ 768 *src+=match-2; /* remove 2 for the initial code point */ 769 770 /* write result to target */ 771 ucnv_extWriteFromU(cnv, cx, 772 value, 773 target, targetLimit, 774 offsets, srcIndex, 775 pErrorCode); 776 return TRUE; 777 } else if(match<0) { 778 /* save state for partial match */ 779 const UChar *s; 780 int32_t j; 781 782 /* copy the first code point */ 783 cnv->preFromUFirstCP=cp; 784 785 /* now copy the newly consumed input */ 786 s=*src; 787 match=-match-2; /* remove 2 for the initial code point */ 788 for(j=0; j<match; ++j) { 789 cnv->preFromU[j]=*s++; 790 } 791 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 792 cnv->preFromULength=(int8_t)match; 793 return TRUE; 794 } else if(match==1) { 795 /* matched, no mapping but request for <subchar1> */ 796 cnv->useSubChar1=TRUE; 797 return FALSE; 798 } else /* match==0 no match */ { 799 return FALSE; 800 } 801 } 802 803 /* 804 * Used by ISO 2022 implementation. 805 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping 806 */ 807 U_CFUNC int32_t 808 ucnv_extSimpleMatchFromU(const int32_t *cx, 809 UChar32 cp, uint32_t *pValue, 810 UBool useFallback) { 811 uint32_t value; 812 int32_t match; 813 814 /* try to match */ 815 match=ucnv_extMatchFromU(cx, 816 cp, 817 NULL, 0, 818 NULL, 0, 819 &value, 820 useFallback, TRUE); 821 if(match>=2) { 822 /* write result for simple, single-character conversion */ 823 int32_t length; 824 int isRoundtrip; 825 826 isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); 827 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 828 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 829 830 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 831 *pValue=value; 832 return isRoundtrip ? length : -length; 833 #if 0 /* not currently used */ 834 } else if(length==4) { 835 /* de-serialize a 4-byte result */ 836 const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 837 *pValue= 838 ((uint32_t)result[0]<<24)| 839 ((uint32_t)result[1]<<16)| 840 ((uint32_t)result[2]<<8)| 841 result[3]; 842 return isRoundtrip ? 4 : -4; 843 #endif 844 } 845 } 846 847 /* 848 * return no match because 849 * - match>1 && resultLength>4: result too long for simple conversion 850 * - match==1: no match found, <subchar1> preferred 851 * - match==0: no match found in the first place 852 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 853 */ 854 return 0; 855 } 856 857 /* 858 * continue partial match with new input, requires cnv->preFromUFirstCP>=0 859 * never called for simple, single-character conversion 860 */ 861 U_CFUNC void 862 ucnv_extContinueMatchFromU(UConverter *cnv, 863 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, 864 UErrorCode *pErrorCode) { 865 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 866 int32_t match; 867 868 match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes, 869 cnv->preFromUFirstCP, 870 cnv->preFromU, cnv->preFromULength, 871 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 872 &value, 873 cnv->useFallback, pArgs->flush); 874 if(match>=2) { 875 match-=2; /* remove 2 for the initial code point */ 876 877 if(match>=cnv->preFromULength) { 878 /* advance src pointer for the consumed input */ 879 pArgs->source+=match-cnv->preFromULength; 880 cnv->preFromULength=0; 881 } else { 882 /* the match did not use all of preFromU[] - keep the rest for replay */ 883 int32_t length=cnv->preFromULength-match; 884 uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR); 885 cnv->preFromULength=(int8_t)-length; 886 } 887 888 /* finish the partial match */ 889 cnv->preFromUFirstCP=U_SENTINEL; 890 891 /* write result */ 892 ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes, 893 value, 894 &pArgs->target, pArgs->targetLimit, 895 &pArgs->offsets, srcIndex, 896 pErrorCode); 897 } else if(match<0) { 898 /* save state for partial match */ 899 const UChar *s; 900 int32_t j; 901 902 /* just _append_ the newly consumed input to preFromU[] */ 903 s=pArgs->source; 904 match=-match-2; /* remove 2 for the initial code point */ 905 for(j=cnv->preFromULength; j<match; ++j) { 906 U_ASSERT(j>=0); 907 cnv->preFromU[j]=*s++; 908 } 909 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 910 cnv->preFromULength=(int8_t)match; 911 } else /* match==0 or 1 */ { 912 /* 913 * no match 914 * 915 * We need to split the previous input into two parts: 916 * 917 * 1. The first code point is unmappable - that's how we got into 918 * trying the extension data in the first place. 919 * We need to move it from the preFromU buffer 920 * to the error buffer, set an error code, 921 * and prepare the rest of the previous input for 2. 922 * 923 * 2. The rest of the previous input must be converted once we 924 * come back from the callback for the first code point. 925 * At that time, we have to try again from scratch to convert 926 * these input characters. 927 * The replay will be handled by the ucnv.c conversion code. 928 */ 929 930 if(match==1) { 931 /* matched, no mapping but request for <subchar1> */ 932 cnv->useSubChar1=TRUE; 933 } 934 935 /* move the first code point to the error field */ 936 cnv->fromUChar32=cnv->preFromUFirstCP; 937 cnv->preFromUFirstCP=U_SENTINEL; 938 939 /* mark preFromU for replay */ 940 cnv->preFromULength=-cnv->preFromULength; 941 942 /* set the error code for unassigned */ 943 *pErrorCode=U_INVALID_CHAR_FOUND; 944 } 945 } 946 947 static void 948 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, 949 const int32_t *cx, 950 const USetAdder *sa, 951 UBool useFallback, 952 int32_t minLength, 953 UChar32 c, 954 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, 955 int32_t sectionIndex, 956 UErrorCode *pErrorCode) { 957 const UChar *fromUSectionUChars; 958 const uint32_t *fromUSectionValues; 959 960 uint32_t value; 961 int32_t i, count; 962 963 fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex; 964 fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; 965 966 /* read first pair of the section */ 967 count=*fromUSectionUChars++; 968 value=*fromUSectionValues++; 969 970 if( value!=0 && 971 (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && 972 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength 973 ) { 974 if(c>=0) { 975 /* add the initial code point */ 976 sa->add(sa->set, c); 977 } else { 978 /* add the string so far */ 979 sa->addString(sa->set, s, length); 980 } 981 } 982 983 for(i=0; i<count; ++i) { 984 /* append this code unit and recurse or add the string */ 985 s[length]=fromUSectionUChars[i]; 986 value=fromUSectionValues[i]; 987 988 if(value==0) { 989 /* no mapping, do nothing */ 990 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 991 ucnv_extGetUnicodeSetString( 992 sharedData, cx, sa, useFallback, minLength, 993 U_SENTINEL, s, length+1, 994 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 995 pErrorCode); 996 } else if((useFallback ? 997 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : 998 ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== 999 UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && 1000 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength 1001 ) { 1002 sa->addString(sa->set, s, length+1); 1003 } 1004 } 1005 } 1006 1007 U_CFUNC void 1008 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, 1009 const USetAdder *sa, 1010 UConverterUnicodeSet which, 1011 UConverterSetFilter filter, 1012 UErrorCode *pErrorCode) { 1013 const int32_t *cx; 1014 const uint16_t *stage12, *stage3, *ps2, *ps3; 1015 const uint32_t *stage3b; 1016 1017 uint32_t value; 1018 int32_t st1, stage1Length, st2, st3, minLength; 1019 UBool useFallback; 1020 1021 UChar s[UCNV_EXT_MAX_UCHARS]; 1022 UChar32 c; 1023 int32_t length; 1024 1025 cx=sharedData->mbcs.extIndexes; 1026 if(cx==NULL) { 1027 return; 1028 } 1029 1030 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 1031 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 1032 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 1033 1034 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; 1035 1036 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 1037 1038 /* enumerate the from-Unicode trie table */ 1039 c=0; /* keep track of the current code point while enumerating */ 1040 1041 if(filter==UCNV_SET_FILTER_2022_CN) { 1042 minLength=3; 1043 } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || 1044 filter!=UCNV_SET_FILTER_NONE 1045 ) { 1046 /* DBCS-only, ignore single-byte results */ 1047 minLength=2; 1048 } else { 1049 minLength=1; 1050 } 1051 1052 /* 1053 * the trie enumeration is almost the same as 1054 * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 1055 */ 1056 for(st1=0; st1<stage1Length; ++st1) { 1057 st2=stage12[st1]; 1058 if(st2>stage1Length) { 1059 ps2=stage12+st2; 1060 for(st2=0; st2<64; ++st2) { 1061 if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { 1062 /* read the stage 3 block */ 1063 ps3=stage3+st3; 1064 1065 /* 1066 * Add code points for which the roundtrip flag is set. 1067 * Do not add <subchar1> entries or other (future?) pseudo-entries 1068 * with an output length of 0, or entries with reserved bits set. 1069 * Recurse for partial results. 1070 */ 1071 do { 1072 value=stage3b[*ps3++]; 1073 if(value==0) { 1074 /* no mapping, do nothing */ 1075 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 1076 length=0; 1077 U16_APPEND_UNSAFE(s, length, c); 1078 ucnv_extGetUnicodeSetString( 1079 sharedData, cx, sa, useFallback, minLength, 1080 c, s, length, 1081 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 1082 pErrorCode); 1083 } else if((useFallback ? 1084 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : 1085 ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== 1086 UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && 1087 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength 1088 ) { 1089 switch(filter) { 1090 case UCNV_SET_FILTER_2022_CN: 1091 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { 1092 continue; 1093 } 1094 break; 1095 case UCNV_SET_FILTER_SJIS: 1096 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { 1097 continue; 1098 } 1099 break; 1100 case UCNV_SET_FILTER_GR94DBCS: 1101 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1102 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && 1103 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1104 continue; 1105 } 1106 break; 1107 case UCNV_SET_FILTER_HZ: 1108 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1109 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1110 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1111 continue; 1112 } 1113 break; 1114 default: 1115 /* 1116 * UCNV_SET_FILTER_NONE, 1117 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength 1118 */ 1119 break; 1120 } 1121 sa->add(sa->set, c); 1122 } 1123 } while((++c&0xf)!=0); 1124 } else { 1125 c+=16; /* empty stage 3 block */ 1126 } 1127 } 1128 } else { 1129 c+=1024; /* empty stage 2 block */ 1130 } 1131 } 1132 } 1133 1134 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1135