1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2003-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnv_ext.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003jun13 16 * created by: Markus W. Scherer 17 * 18 * Conversion extensions 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 24 25 #include "unicode/uset.h" 26 #include "unicode/ustring.h" 27 #include "ucnv_bld.h" 28 #include "ucnv_cnv.h" 29 #include "ucnv_ext.h" 30 #include "cmemory.h" 31 #include "uassert.h" 32 33 /* to Unicode --------------------------------------------------------------- */ 34 35 /* 36 * @return lookup value for the byte, if found; else 0 37 */ 38 static inline uint32_t 39 ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { 40 uint32_t word0, word; 41 int32_t i, start, limit; 42 43 /* check the input byte against the lowest and highest section bytes */ 44 start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); 45 limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); 46 if(byte<start || limit<byte) { 47 return 0; /* the byte is out of range */ 48 } 49 50 if(length==((limit-start)+1)) { 51 /* direct access on a linear array */ 52 return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ 53 } 54 55 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ 56 word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); 57 58 /* 59 * Shift byte once instead of each section word and add 0xffffff. 60 * We will compare the shifted/added byte (bbffffff) against 61 * section words which have byte values in the same bit position. 62 * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv 63 * for all v=0..f 64 * so we need not mask off the lower 24 bits of each section word. 65 */ 66 word=word0|UCNV_EXT_TO_U_VALUE_MASK; 67 68 /* binary search */ 69 start=0; 70 limit=length; 71 for(;;) { 72 i=limit-start; 73 if(i<=1) { 74 break; /* done */ 75 } 76 /* start<limit-1 */ 77 78 if(i<=4) { 79 /* linear search for the last part */ 80 if(word0<=toUSection[start]) { 81 break; 82 } 83 if(++start<limit && word0<=toUSection[start]) { 84 break; 85 } 86 if(++start<limit && word0<=toUSection[start]) { 87 break; 88 } 89 /* always break at start==limit-1 */ 90 ++start; 91 break; 92 } 93 94 i=(start+limit)/2; 95 if(word<toUSection[i]) { 96 limit=i; 97 } else { 98 start=i; 99 } 100 } 101 102 /* did we really find it? */ 103 if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { 104 return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ 105 } else { 106 return 0; /* not found */ 107 } 108 } 109 110 /* 111 * TRUE if not an SI/SO stateful converter, 112 * or if the match length fits with the current converter state 113 */ 114 #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \ 115 ((sisoState)<0 || ((sisoState)==0) == (match==1)) 116 117 /* 118 * this works like ucnv_extMatchFromU() except 119 * - the first character is in pre 120 * - no trie is used 121 * - the returned matchLength is not offset by 2 122 */ 123 static int32_t 124 ucnv_extMatchToU(const int32_t *cx, int8_t sisoState, 125 const char *pre, int32_t preLength, 126 const char *src, int32_t srcLength, 127 uint32_t *pMatchValue, 128 UBool /*useFallback*/, UBool flush) { 129 const uint32_t *toUTable, *toUSection; 130 131 uint32_t value, matchValue; 132 int32_t i, j, idx, length, matchLength; 133 uint8_t b; 134 135 if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) { 136 return 0; /* no extension data, no match */ 137 } 138 139 /* initialize */ 140 toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t); 141 idx=0; 142 143 matchValue=0; 144 i=j=matchLength=0; 145 146 if(sisoState==0) { 147 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ 148 if(preLength>1) { 149 return 0; /* no match of a DBCS sequence in SBCS mode */ 150 } else if(preLength==1) { 151 srcLength=0; 152 } else /* preLength==0 */ { 153 if(srcLength>1) { 154 srcLength=1; 155 } 156 } 157 flush=TRUE; 158 } 159 160 /* we must not remember fallback matches when not using fallbacks */ 161 162 /* match input units until there is a full match or the input is consumed */ 163 for(;;) { 164 /* go to the next section */ 165 toUSection=toUTable+idx; 166 167 /* read first pair of the section */ 168 value=*toUSection++; 169 length=UCNV_EXT_TO_U_GET_BYTE(value); 170 value=UCNV_EXT_TO_U_GET_VALUE(value); 171 if( value!=0 && 172 (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 173 TO_U_USE_FALLBACK(useFallback)) && 174 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 175 ) { 176 /* remember longest match so far */ 177 matchValue=value; 178 matchLength=i+j; 179 } 180 181 /* match pre[] then src[] */ 182 if(i<preLength) { 183 b=(uint8_t)pre[i++]; 184 } else if(j<srcLength) { 185 b=(uint8_t)src[j++]; 186 } else { 187 /* all input consumed, partial match */ 188 if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) { 189 /* 190 * end of the entire input stream, stop with the longest match so far 191 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES 192 * because it must fit into state buffers 193 */ 194 break; 195 } else { 196 /* continue with more input next time */ 197 return -length; 198 } 199 } 200 201 /* search for the current UChar */ 202 value=ucnv_extFindToU(toUSection, length, b); 203 if(value==0) { 204 /* no match here, stop with the longest match so far */ 205 break; 206 } else { 207 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 208 /* partial match, continue */ 209 idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); 210 } else { 211 if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 212 TO_U_USE_FALLBACK(useFallback)) && 213 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 214 ) { 215 /* full match, stop with result */ 216 matchValue=value; 217 matchLength=i+j; 218 } else { 219 /* full match on fallback not taken, stop with the longest match so far */ 220 } 221 break; 222 } 223 } 224 } 225 226 if(matchLength==0) { 227 /* no match at all */ 228 return 0; 229 } 230 231 /* return result */ 232 *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); 233 return matchLength; 234 } 235 236 static inline void 237 ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, 238 uint32_t value, 239 UChar **target, const UChar *targetLimit, 240 int32_t **offsets, int32_t srcIndex, 241 UErrorCode *pErrorCode) { 242 /* output the result */ 243 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 244 /* output a single code point */ 245 ucnv_toUWriteCodePoint( 246 cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), 247 target, targetLimit, 248 offsets, srcIndex, 249 pErrorCode); 250 } else { 251 /* output a string - with correct data we have resultLength>0 */ 252 ucnv_toUWriteUChars( 253 cnv, 254 UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+ 255 UCNV_EXT_TO_U_GET_INDEX(value), 256 UCNV_EXT_TO_U_GET_LENGTH(value), 257 target, targetLimit, 258 offsets, srcIndex, 259 pErrorCode); 260 } 261 } 262 263 /* 264 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), 265 * or 1 for DBCS-only, 266 * or -1 if the converter is not SI/SO stateful 267 * 268 * Note: For SI/SO stateful converters getting here, 269 * cnv->mode==0 is equivalent to firstLength==1. 270 */ 271 #define UCNV_SISO_STATE(cnv) \ 272 ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \ 273 (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1) 274 275 /* 276 * target<targetLimit; set error code for overflow 277 */ 278 U_CFUNC UBool 279 ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, 280 int32_t firstLength, 281 const char **src, const char *srcLimit, 282 UChar **target, const UChar *targetLimit, 283 int32_t **offsets, int32_t srcIndex, 284 UBool flush, 285 UErrorCode *pErrorCode) { 286 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 287 int32_t match; 288 289 /* try to match */ 290 match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), 291 (const char *)cnv->toUBytes, firstLength, 292 *src, (int32_t)(srcLimit-*src), 293 &value, 294 cnv->useFallback, flush); 295 if(match>0) { 296 /* advance src pointer for the consumed input */ 297 *src+=match-firstLength; 298 299 /* write result to target */ 300 ucnv_extWriteToU(cnv, cx, 301 value, 302 target, targetLimit, 303 offsets, srcIndex, 304 pErrorCode); 305 return TRUE; 306 } else if(match<0) { 307 /* save state for partial match */ 308 const char *s; 309 int32_t j; 310 311 /* copy the first code point */ 312 s=(const char *)cnv->toUBytes; 313 cnv->preToUFirstLength=(int8_t)firstLength; 314 for(j=0; j<firstLength; ++j) { 315 cnv->preToU[j]=*s++; 316 } 317 318 /* now copy the newly consumed input */ 319 s=*src; 320 match=-match; 321 for(; j<match; ++j) { 322 cnv->preToU[j]=*s++; 323 } 324 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 325 cnv->preToULength=(int8_t)match; 326 return TRUE; 327 } else /* match==0 no match */ { 328 return FALSE; 329 } 330 } 331 332 U_CFUNC UChar32 333 ucnv_extSimpleMatchToU(const int32_t *cx, 334 const char *source, int32_t length, 335 UBool useFallback) { 336 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 337 int32_t match; 338 339 if(length<=0) { 340 return 0xffff; 341 } 342 343 /* try to match */ 344 match=ucnv_extMatchToU(cx, -1, 345 source, length, 346 NULL, 0, 347 &value, 348 useFallback, TRUE); 349 if(match==length) { 350 /* write result for simple, single-character conversion */ 351 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 352 return UCNV_EXT_TO_U_GET_CODE_POINT(value); 353 } 354 } 355 356 /* 357 * return no match because 358 * - match>0 && value points to string: simple conversion cannot handle multiple code points 359 * - match>0 && match!=length: not all input consumed, forbidden for this function 360 * - match==0: no match found in the first place 361 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 362 */ 363 return 0xfffe; 364 } 365 366 /* 367 * continue partial match with new input 368 * never called for simple, single-character conversion 369 */ 370 U_CFUNC void 371 ucnv_extContinueMatchToU(UConverter *cnv, 372 UConverterToUnicodeArgs *pArgs, int32_t srcIndex, 373 UErrorCode *pErrorCode) { 374 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 375 int32_t match, length; 376 377 match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv), 378 cnv->preToU, cnv->preToULength, 379 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 380 &value, 381 cnv->useFallback, pArgs->flush); 382 if(match>0) { 383 if(match>=cnv->preToULength) { 384 /* advance src pointer for the consumed input */ 385 pArgs->source+=match-cnv->preToULength; 386 cnv->preToULength=0; 387 } else { 388 /* the match did not use all of preToU[] - keep the rest for replay */ 389 length=cnv->preToULength-match; 390 uprv_memmove(cnv->preToU, cnv->preToU+match, length); 391 cnv->preToULength=(int8_t)-length; 392 } 393 394 /* write result */ 395 ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes, 396 value, 397 &pArgs->target, pArgs->targetLimit, 398 &pArgs->offsets, srcIndex, 399 pErrorCode); 400 } else if(match<0) { 401 /* save state for partial match */ 402 const char *s; 403 int32_t j; 404 405 /* just _append_ the newly consumed input to preToU[] */ 406 s=pArgs->source; 407 match=-match; 408 for(j=cnv->preToULength; j<match; ++j) { 409 cnv->preToU[j]=*s++; 410 } 411 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 412 cnv->preToULength=(int8_t)match; 413 } else /* match==0 */ { 414 /* 415 * no match 416 * 417 * We need to split the previous input into two parts: 418 * 419 * 1. The first codepage character is unmappable - that's how we got into 420 * trying the extension data in the first place. 421 * We need to move it from the preToU buffer 422 * to the error buffer, set an error code, 423 * and prepare the rest of the previous input for 2. 424 * 425 * 2. The rest of the previous input must be converted once we 426 * come back from the callback for the first character. 427 * At that time, we have to try again from scratch to convert 428 * these input characters. 429 * The replay will be handled by the ucnv.c conversion code. 430 */ 431 432 /* move the first codepage character to the error field */ 433 uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); 434 cnv->toULength=cnv->preToUFirstLength; 435 436 /* move the rest up inside the buffer */ 437 length=cnv->preToULength-cnv->preToUFirstLength; 438 if(length>0) { 439 uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); 440 } 441 442 /* mark preToU for replay */ 443 cnv->preToULength=(int8_t)-length; 444 445 /* set the error code for unassigned */ 446 *pErrorCode=U_INVALID_CHAR_FOUND; 447 } 448 } 449 450 /* from Unicode ------------------------------------------------------------- */ 451 452 // Use roundtrips, "good one-way" mappings, and some normal fallbacks. 453 static inline UBool 454 extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) { 455 return 456 ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 || 457 FROM_U_USE_FALLBACK(useFallback, firstCP)) && 458 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0; 459 } 460 461 /* 462 * @return index of the UChar, if found; else <0 463 */ 464 static inline int32_t 465 ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { 466 int32_t i, start, limit; 467 468 /* binary search */ 469 start=0; 470 limit=length; 471 for(;;) { 472 i=limit-start; 473 if(i<=1) { 474 break; /* done */ 475 } 476 /* start<limit-1 */ 477 478 if(i<=4) { 479 /* linear search for the last part */ 480 if(u<=fromUSection[start]) { 481 break; 482 } 483 if(++start<limit && u<=fromUSection[start]) { 484 break; 485 } 486 if(++start<limit && u<=fromUSection[start]) { 487 break; 488 } 489 /* always break at start==limit-1 */ 490 ++start; 491 break; 492 } 493 494 i=(start+limit)/2; 495 if(u<fromUSection[i]) { 496 limit=i; 497 } else { 498 start=i; 499 } 500 } 501 502 /* did we really find it? */ 503 if(start<limit && u==fromUSection[start]) { 504 return start; 505 } else { 506 return -1; /* not found */ 507 } 508 } 509 510 /* 511 * @param cx pointer to extension data; if NULL, returns 0 512 * @param firstCP the first code point before all the other UChars 513 * @param pre UChars that must match; !initialMatch: partial match with them 514 * @param preLength length of pre, >=0 515 * @param src UChars that can be used to complete a match 516 * @param srcLength length of src, >=0 517 * @param pMatchValue [out] output result value for the match from the data structure 518 * @param useFallback "use fallback" flag, usually from cnv->useFallback 519 * @param flush TRUE if the end of the input stream is reached 520 * @return >1: matched, return value=total match length (number of input units matched) 521 * 1: matched, no mapping but request for <subchar1> 522 * (only for the first code point) 523 * 0: no match 524 * <0: partial match, return value=negative total match length 525 * (partial matches are never returned for flush==TRUE) 526 * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) 527 * the matchLength is 2 if only firstCP matched, and >2 if firstCP and 528 * further code units matched 529 */ 530 static int32_t 531 ucnv_extMatchFromU(const int32_t *cx, 532 UChar32 firstCP, 533 const UChar *pre, int32_t preLength, 534 const UChar *src, int32_t srcLength, 535 uint32_t *pMatchValue, 536 UBool useFallback, UBool flush) { 537 const uint16_t *stage12, *stage3; 538 const uint32_t *stage3b; 539 540 const UChar *fromUTableUChars, *fromUSectionUChars; 541 const uint32_t *fromUTableValues, *fromUSectionValues; 542 543 uint32_t value, matchValue; 544 int32_t i, j, idx, length, matchLength; 545 UChar c; 546 547 if(cx==NULL) { 548 return 0; /* no extension data, no match */ 549 } 550 551 /* trie lookup of firstCP */ 552 idx=firstCP>>10; /* stage 1 index */ 553 if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { 554 return 0; /* the first code point is outside the trie */ 555 } 556 557 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 558 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 559 idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP); 560 561 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 562 value=stage3b[idx]; 563 if(value==0) { 564 return 0; 565 } 566 567 /* 568 * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: 569 * Do not interpret values with reserved bits used, for forward compatibility, 570 * and do not even remember intermediate results with reserved bits used. 571 */ 572 573 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 574 /* partial match, enter the loop below */ 575 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 576 577 /* initialize */ 578 fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); 579 fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); 580 581 matchValue=0; 582 i=j=matchLength=0; 583 584 /* we must not remember fallback matches when not using fallbacks */ 585 586 /* match input units until there is a full match or the input is consumed */ 587 for(;;) { 588 /* go to the next section */ 589 fromUSectionUChars=fromUTableUChars+idx; 590 fromUSectionValues=fromUTableValues+idx; 591 592 /* read first pair of the section */ 593 length=*fromUSectionUChars++; 594 value=*fromUSectionValues++; 595 if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) { 596 /* remember longest match so far */ 597 matchValue=value; 598 matchLength=2+i+j; 599 } 600 601 /* match pre[] then src[] */ 602 if(i<preLength) { 603 c=pre[i++]; 604 } else if(j<srcLength) { 605 c=src[j++]; 606 } else { 607 /* all input consumed, partial match */ 608 if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { 609 /* 610 * end of the entire input stream, stop with the longest match so far 611 * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS 612 * because it must fit into state buffers 613 */ 614 break; 615 } else { 616 /* continue with more input next time */ 617 return -(2+length); 618 } 619 } 620 621 /* search for the current UChar */ 622 idx=ucnv_extFindFromU(fromUSectionUChars, length, c); 623 if(idx<0) { 624 /* no match here, stop with the longest match so far */ 625 break; 626 } else { 627 value=fromUSectionValues[idx]; 628 if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 629 /* partial match, continue */ 630 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 631 } else { 632 if(extFromUUseMapping(useFallback, value, firstCP)) { 633 /* full match, stop with result */ 634 matchValue=value; 635 matchLength=2+i+j; 636 } else { 637 /* full match on fallback not taken, stop with the longest match so far */ 638 } 639 break; 640 } 641 } 642 } 643 644 if(matchLength==0) { 645 /* no match at all */ 646 return 0; 647 } 648 } else /* result from firstCP trie lookup */ { 649 if(extFromUUseMapping(useFallback, value, firstCP)) { 650 /* full match, stop with result */ 651 matchValue=value; 652 matchLength=2; 653 } else { 654 /* fallback not taken */ 655 return 0; 656 } 657 } 658 659 /* return result */ 660 if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { 661 return 1; /* assert matchLength==2 */ 662 } 663 664 *pMatchValue=matchValue; 665 return matchLength; 666 } 667 668 /* 669 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits 670 */ 671 static inline void 672 ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, 673 uint32_t value, 674 char **target, const char *targetLimit, 675 int32_t **offsets, int32_t srcIndex, 676 UErrorCode *pErrorCode) { 677 uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; 678 const uint8_t *result; 679 int32_t length, prevLength; 680 681 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 682 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 683 684 /* output the result */ 685 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 686 /* 687 * Generate a byte array and then write it below. 688 * This is not the fastest possible way, but it should be ok for 689 * extension mappings, and it is much simpler. 690 * Offset and overflow handling are only done once this way. 691 */ 692 uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ 693 switch(length) { 694 case 3: 695 *p++=(uint8_t)(value>>16); 696 U_FALLTHROUGH; 697 case 2: 698 *p++=(uint8_t)(value>>8); 699 U_FALLTHROUGH; 700 case 1: 701 *p++=(uint8_t)value; 702 U_FALLTHROUGH; 703 default: 704 break; /* will never occur */ 705 } 706 result=buffer+1; 707 } else { 708 result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 709 } 710 711 /* with correct data we have length>0 */ 712 713 if((prevLength=cnv->fromUnicodeStatus)!=0) { 714 /* handle SI/SO stateful output */ 715 uint8_t shiftByte; 716 717 if(prevLength>1 && length==1) { 718 /* change from double-byte mode to single-byte */ 719 shiftByte=(uint8_t)UCNV_SI; 720 cnv->fromUnicodeStatus=1; 721 } else if(prevLength==1 && length>1) { 722 /* change from single-byte mode to double-byte */ 723 shiftByte=(uint8_t)UCNV_SO; 724 cnv->fromUnicodeStatus=2; 725 } else { 726 shiftByte=0; 727 } 728 729 if(shiftByte!=0) { 730 /* prepend the shift byte to the result bytes */ 731 buffer[0]=shiftByte; 732 if(result!=buffer+1) { 733 uprv_memcpy(buffer+1, result, length); 734 } 735 result=buffer; 736 ++length; 737 } 738 } 739 740 ucnv_fromUWriteBytes(cnv, (const char *)result, length, 741 target, targetLimit, 742 offsets, srcIndex, 743 pErrorCode); 744 } 745 746 /* 747 * target<targetLimit; set error code for overflow 748 */ 749 U_CFUNC UBool 750 ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, 751 UChar32 cp, 752 const UChar **src, const UChar *srcLimit, 753 char **target, const char *targetLimit, 754 int32_t **offsets, int32_t srcIndex, 755 UBool flush, 756 UErrorCode *pErrorCode) { 757 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 758 int32_t match; 759 760 /* try to match */ 761 match=ucnv_extMatchFromU(cx, cp, 762 NULL, 0, 763 *src, (int32_t)(srcLimit-*src), 764 &value, 765 cnv->useFallback, flush); 766 767 /* reject a match if the result is a single byte for DBCS-only */ 768 if( match>=2 && 769 !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && 770 cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) 771 ) { 772 /* advance src pointer for the consumed input */ 773 *src+=match-2; /* remove 2 for the initial code point */ 774 775 /* write result to target */ 776 ucnv_extWriteFromU(cnv, cx, 777 value, 778 target, targetLimit, 779 offsets, srcIndex, 780 pErrorCode); 781 return TRUE; 782 } else if(match<0) { 783 /* save state for partial match */ 784 const UChar *s; 785 int32_t j; 786 787 /* copy the first code point */ 788 cnv->preFromUFirstCP=cp; 789 790 /* now copy the newly consumed input */ 791 s=*src; 792 match=-match-2; /* remove 2 for the initial code point */ 793 for(j=0; j<match; ++j) { 794 cnv->preFromU[j]=*s++; 795 } 796 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 797 cnv->preFromULength=(int8_t)match; 798 return TRUE; 799 } else if(match==1) { 800 /* matched, no mapping but request for <subchar1> */ 801 cnv->useSubChar1=TRUE; 802 return FALSE; 803 } else /* match==0 no match */ { 804 return FALSE; 805 } 806 } 807 808 /* 809 * Used by ISO 2022 implementation. 810 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping 811 */ 812 U_CFUNC int32_t 813 ucnv_extSimpleMatchFromU(const int32_t *cx, 814 UChar32 cp, uint32_t *pValue, 815 UBool useFallback) { 816 uint32_t value; 817 int32_t match; 818 819 /* try to match */ 820 match=ucnv_extMatchFromU(cx, 821 cp, 822 NULL, 0, 823 NULL, 0, 824 &value, 825 useFallback, TRUE); 826 if(match>=2) { 827 /* write result for simple, single-character conversion */ 828 int32_t length; 829 int isRoundtrip; 830 831 isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); 832 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 833 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 834 835 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 836 *pValue=value; 837 return isRoundtrip ? length : -length; 838 #if 0 /* not currently used */ 839 } else if(length==4) { 840 /* de-serialize a 4-byte result */ 841 const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 842 *pValue= 843 ((uint32_t)result[0]<<24)| 844 ((uint32_t)result[1]<<16)| 845 ((uint32_t)result[2]<<8)| 846 result[3]; 847 return isRoundtrip ? 4 : -4; 848 #endif 849 } 850 } 851 852 /* 853 * return no match because 854 * - match>1 && resultLength>4: result too long for simple conversion 855 * - match==1: no match found, <subchar1> preferred 856 * - match==0: no match found in the first place 857 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 858 */ 859 return 0; 860 } 861 862 /* 863 * continue partial match with new input, requires cnv->preFromUFirstCP>=0 864 * never called for simple, single-character conversion 865 */ 866 U_CFUNC void 867 ucnv_extContinueMatchFromU(UConverter *cnv, 868 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, 869 UErrorCode *pErrorCode) { 870 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 871 int32_t match; 872 873 match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes, 874 cnv->preFromUFirstCP, 875 cnv->preFromU, cnv->preFromULength, 876 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 877 &value, 878 cnv->useFallback, pArgs->flush); 879 if(match>=2) { 880 match-=2; /* remove 2 for the initial code point */ 881 882 if(match>=cnv->preFromULength) { 883 /* advance src pointer for the consumed input */ 884 pArgs->source+=match-cnv->preFromULength; 885 cnv->preFromULength=0; 886 } else { 887 /* the match did not use all of preFromU[] - keep the rest for replay */ 888 int32_t length=cnv->preFromULength-match; 889 u_memmove(cnv->preFromU, cnv->preFromU+match, length); 890 cnv->preFromULength=(int8_t)-length; 891 } 892 893 /* finish the partial match */ 894 cnv->preFromUFirstCP=U_SENTINEL; 895 896 /* write result */ 897 ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes, 898 value, 899 &pArgs->target, pArgs->targetLimit, 900 &pArgs->offsets, srcIndex, 901 pErrorCode); 902 } else if(match<0) { 903 /* save state for partial match */ 904 const UChar *s; 905 int32_t j; 906 907 /* just _append_ the newly consumed input to preFromU[] */ 908 s=pArgs->source; 909 match=-match-2; /* remove 2 for the initial code point */ 910 for(j=cnv->preFromULength; j<match; ++j) { 911 U_ASSERT(j>=0); 912 cnv->preFromU[j]=*s++; 913 } 914 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 915 cnv->preFromULength=(int8_t)match; 916 } else /* match==0 or 1 */ { 917 /* 918 * no match 919 * 920 * We need to split the previous input into two parts: 921 * 922 * 1. The first code point is unmappable - that's how we got into 923 * trying the extension data in the first place. 924 * We need to move it from the preFromU buffer 925 * to the error buffer, set an error code, 926 * and prepare the rest of the previous input for 2. 927 * 928 * 2. The rest of the previous input must be converted once we 929 * come back from the callback for the first code point. 930 * At that time, we have to try again from scratch to convert 931 * these input characters. 932 * The replay will be handled by the ucnv.c conversion code. 933 */ 934 935 if(match==1) { 936 /* matched, no mapping but request for <subchar1> */ 937 cnv->useSubChar1=TRUE; 938 } 939 940 /* move the first code point to the error field */ 941 cnv->fromUChar32=cnv->preFromUFirstCP; 942 cnv->preFromUFirstCP=U_SENTINEL; 943 944 /* mark preFromU for replay */ 945 cnv->preFromULength=-cnv->preFromULength; 946 947 /* set the error code for unassigned */ 948 *pErrorCode=U_INVALID_CHAR_FOUND; 949 } 950 } 951 952 static UBool 953 extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { 954 if(which==UCNV_ROUNDTRIP_SET) { 955 // Add only code points for which the roundtrip flag is set. 956 // Do not add any fallbacks, even if ucnv_fromUnicode() would use them 957 // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet(). 958 // 959 // By analogy, also do not add "good one-way" mappings. 960 // 961 // Do not add entries with reserved bits set. 962 if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!= 963 UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) { 964 return FALSE; 965 } 966 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 967 // Do not add entries with reserved bits set. 968 if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) { 969 return FALSE; 970 } 971 } 972 // Do not add <subchar1> entries or other (future?) pseudo-entries 973 // with an output length of 0. 974 return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength; 975 } 976 977 static void 978 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, 979 const int32_t *cx, 980 const USetAdder *sa, 981 UConverterUnicodeSet which, 982 int32_t minLength, 983 UChar32 firstCP, 984 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, 985 int32_t sectionIndex, 986 UErrorCode *pErrorCode) { 987 const UChar *fromUSectionUChars; 988 const uint32_t *fromUSectionValues; 989 990 uint32_t value; 991 int32_t i, count; 992 993 fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex; 994 fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; 995 996 /* read first pair of the section */ 997 count=*fromUSectionUChars++; 998 value=*fromUSectionValues++; 999 1000 if(extSetUseMapping(which, minLength, value)) { 1001 if(length==U16_LENGTH(firstCP)) { 1002 /* add the initial code point */ 1003 sa->add(sa->set, firstCP); 1004 } else { 1005 /* add the string so far */ 1006 sa->addString(sa->set, s, length); 1007 } 1008 } 1009 1010 for(i=0; i<count; ++i) { 1011 /* append this code unit and recurse or add the string */ 1012 s[length]=fromUSectionUChars[i]; 1013 value=fromUSectionValues[i]; 1014 1015 if(value==0) { 1016 /* no mapping, do nothing */ 1017 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 1018 ucnv_extGetUnicodeSetString( 1019 sharedData, cx, sa, which, minLength, 1020 firstCP, s, length+1, 1021 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 1022 pErrorCode); 1023 } else if(extSetUseMapping(which, minLength, value)) { 1024 sa->addString(sa->set, s, length+1); 1025 } 1026 } 1027 } 1028 1029 U_CFUNC void 1030 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, 1031 const USetAdder *sa, 1032 UConverterUnicodeSet which, 1033 UConverterSetFilter filter, 1034 UErrorCode *pErrorCode) { 1035 const int32_t *cx; 1036 const uint16_t *stage12, *stage3, *ps2, *ps3; 1037 const uint32_t *stage3b; 1038 1039 uint32_t value; 1040 int32_t st1, stage1Length, st2, st3, minLength; 1041 1042 UChar s[UCNV_EXT_MAX_UCHARS]; 1043 UChar32 c; 1044 int32_t length; 1045 1046 cx=sharedData->mbcs.extIndexes; 1047 if(cx==NULL) { 1048 return; 1049 } 1050 1051 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 1052 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 1053 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 1054 1055 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; 1056 1057 /* enumerate the from-Unicode trie table */ 1058 c=0; /* keep track of the current code point while enumerating */ 1059 1060 if(filter==UCNV_SET_FILTER_2022_CN) { 1061 minLength=3; 1062 } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || 1063 filter!=UCNV_SET_FILTER_NONE 1064 ) { 1065 /* DBCS-only, ignore single-byte results */ 1066 minLength=2; 1067 } else { 1068 minLength=1; 1069 } 1070 1071 /* 1072 * the trie enumeration is almost the same as 1073 * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 1074 */ 1075 for(st1=0; st1<stage1Length; ++st1) { 1076 st2=stage12[st1]; 1077 if(st2>stage1Length) { 1078 ps2=stage12+st2; 1079 for(st2=0; st2<64; ++st2) { 1080 if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { 1081 /* read the stage 3 block */ 1082 ps3=stage3+st3; 1083 1084 do { 1085 value=stage3b[*ps3++]; 1086 if(value==0) { 1087 /* no mapping, do nothing */ 1088 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 1089 // Recurse for partial results. 1090 length=0; 1091 U16_APPEND_UNSAFE(s, length, c); 1092 ucnv_extGetUnicodeSetString( 1093 sharedData, cx, sa, which, minLength, 1094 c, s, length, 1095 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 1096 pErrorCode); 1097 } else if(extSetUseMapping(which, minLength, value)) { 1098 switch(filter) { 1099 case UCNV_SET_FILTER_2022_CN: 1100 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { 1101 continue; 1102 } 1103 break; 1104 case UCNV_SET_FILTER_SJIS: 1105 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { 1106 continue; 1107 } 1108 break; 1109 case UCNV_SET_FILTER_GR94DBCS: 1110 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1111 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && 1112 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1113 continue; 1114 } 1115 break; 1116 case UCNV_SET_FILTER_HZ: 1117 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1118 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1119 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1120 continue; 1121 } 1122 break; 1123 default: 1124 /* 1125 * UCNV_SET_FILTER_NONE, 1126 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength 1127 */ 1128 break; 1129 } 1130 sa->add(sa->set, c); 1131 } 1132 } while((++c&0xf)!=0); 1133 } else { 1134 c+=16; /* empty stage 3 block */ 1135 } 1136 } 1137 } else { 1138 c+=1024; /* empty stage 2 block */ 1139 } 1140 } 1141 } 1142 1143 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1144