1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2003-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnv_ext.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003jun13 16 * created by: Markus W. Scherer 17 * 18 * Conversion extensions 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 24 25 #include "unicode/uset.h" 26 #include "ucnv_bld.h" 27 #include "ucnv_cnv.h" 28 #include "ucnv_ext.h" 29 #include "cmemory.h" 30 #include "uassert.h" 31 32 /* to Unicode --------------------------------------------------------------- */ 33 34 /* 35 * @return lookup value for the byte, if found; else 0 36 */ 37 static inline uint32_t 38 ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { 39 uint32_t word0, word; 40 int32_t i, start, limit; 41 42 /* check the input byte against the lowest and highest section bytes */ 43 start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); 44 limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); 45 if(byte<start || limit<byte) { 46 return 0; /* the byte is out of range */ 47 } 48 49 if(length==((limit-start)+1)) { 50 /* direct access on a linear array */ 51 return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */ 52 } 53 54 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ 55 word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0); 56 57 /* 58 * Shift byte once instead of each section word and add 0xffffff. 59 * We will compare the shifted/added byte (bbffffff) against 60 * section words which have byte values in the same bit position. 61 * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv 62 * for all v=0..f 63 * so we need not mask off the lower 24 bits of each section word. 64 */ 65 word=word0|UCNV_EXT_TO_U_VALUE_MASK; 66 67 /* binary search */ 68 start=0; 69 limit=length; 70 for(;;) { 71 i=limit-start; 72 if(i<=1) { 73 break; /* done */ 74 } 75 /* start<limit-1 */ 76 77 if(i<=4) { 78 /* linear search for the last part */ 79 if(word0<=toUSection[start]) { 80 break; 81 } 82 if(++start<limit && word0<=toUSection[start]) { 83 break; 84 } 85 if(++start<limit && word0<=toUSection[start]) { 86 break; 87 } 88 /* always break at start==limit-1 */ 89 ++start; 90 break; 91 } 92 93 i=(start+limit)/2; 94 if(word<toUSection[i]) { 95 limit=i; 96 } else { 97 start=i; 98 } 99 } 100 101 /* did we really find it? */ 102 if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) { 103 return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */ 104 } else { 105 return 0; /* not found */ 106 } 107 } 108 109 /* 110 * TRUE if not an SI/SO stateful converter, 111 * or if the match length fits with the current converter state 112 */ 113 #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \ 114 ((sisoState)<0 || ((sisoState)==0) == (match==1)) 115 116 /* 117 * this works like ucnv_extMatchFromU() except 118 * - the first character is in pre 119 * - no trie is used 120 * - the returned matchLength is not offset by 2 121 */ 122 static int32_t 123 ucnv_extMatchToU(const int32_t *cx, int8_t sisoState, 124 const char *pre, int32_t preLength, 125 const char *src, int32_t srcLength, 126 uint32_t *pMatchValue, 127 UBool /*useFallback*/, UBool flush) { 128 const uint32_t *toUTable, *toUSection; 129 130 uint32_t value, matchValue; 131 int32_t i, j, idx, length, matchLength; 132 uint8_t b; 133 134 if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) { 135 return 0; /* no extension data, no match */ 136 } 137 138 /* initialize */ 139 toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t); 140 idx=0; 141 142 matchValue=0; 143 i=j=matchLength=0; 144 145 if(sisoState==0) { 146 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ 147 if(preLength>1) { 148 return 0; /* no match of a DBCS sequence in SBCS mode */ 149 } else if(preLength==1) { 150 srcLength=0; 151 } else /* preLength==0 */ { 152 if(srcLength>1) { 153 srcLength=1; 154 } 155 } 156 flush=TRUE; 157 } 158 159 /* we must not remember fallback matches when not using fallbacks */ 160 161 /* match input units until there is a full match or the input is consumed */ 162 for(;;) { 163 /* go to the next section */ 164 toUSection=toUTable+idx; 165 166 /* read first pair of the section */ 167 value=*toUSection++; 168 length=UCNV_EXT_TO_U_GET_BYTE(value); 169 value=UCNV_EXT_TO_U_GET_VALUE(value); 170 if( value!=0 && 171 (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 172 TO_U_USE_FALLBACK(useFallback)) && 173 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 174 ) { 175 /* remember longest match so far */ 176 matchValue=value; 177 matchLength=i+j; 178 } 179 180 /* match pre[] then src[] */ 181 if(i<preLength) { 182 b=(uint8_t)pre[i++]; 183 } else if(j<srcLength) { 184 b=(uint8_t)src[j++]; 185 } else { 186 /* all input consumed, partial match */ 187 if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) { 188 /* 189 * end of the entire input stream, stop with the longest match so far 190 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES 191 * because it must fit into state buffers 192 */ 193 break; 194 } else { 195 /* continue with more input next time */ 196 return -length; 197 } 198 } 199 200 /* search for the current UChar */ 201 value=ucnv_extFindToU(toUSection, length, b); 202 if(value==0) { 203 /* no match here, stop with the longest match so far */ 204 break; 205 } else { 206 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 207 /* partial match, continue */ 208 idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); 209 } else { 210 if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || 211 TO_U_USE_FALLBACK(useFallback)) && 212 UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j) 213 ) { 214 /* full match, stop with result */ 215 matchValue=value; 216 matchLength=i+j; 217 } else { 218 /* full match on fallback not taken, stop with the longest match so far */ 219 } 220 break; 221 } 222 } 223 } 224 225 if(matchLength==0) { 226 /* no match at all */ 227 return 0; 228 } 229 230 /* return result */ 231 *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); 232 return matchLength; 233 } 234 235 static inline void 236 ucnv_extWriteToU(UConverter *cnv, const int32_t *cx, 237 uint32_t value, 238 UChar **target, const UChar *targetLimit, 239 int32_t **offsets, int32_t srcIndex, 240 UErrorCode *pErrorCode) { 241 /* output the result */ 242 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 243 /* output a single code point */ 244 ucnv_toUWriteCodePoint( 245 cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value), 246 target, targetLimit, 247 offsets, srcIndex, 248 pErrorCode); 249 } else { 250 /* output a string - with correct data we have resultLength>0 */ 251 ucnv_toUWriteUChars( 252 cnv, 253 UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+ 254 UCNV_EXT_TO_U_GET_INDEX(value), 255 UCNV_EXT_TO_U_GET_LENGTH(value), 256 target, targetLimit, 257 offsets, srcIndex, 258 pErrorCode); 259 } 260 } 261 262 /* 263 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), 264 * or 1 for DBCS-only, 265 * or -1 if the converter is not SI/SO stateful 266 * 267 * Note: For SI/SO stateful converters getting here, 268 * cnv->mode==0 is equivalent to firstLength==1. 269 */ 270 #define UCNV_SISO_STATE(cnv) \ 271 ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \ 272 (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1) 273 274 /* 275 * target<targetLimit; set error code for overflow 276 */ 277 U_CFUNC UBool 278 ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, 279 int32_t firstLength, 280 const char **src, const char *srcLimit, 281 UChar **target, const UChar *targetLimit, 282 int32_t **offsets, int32_t srcIndex, 283 UBool flush, 284 UErrorCode *pErrorCode) { 285 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 286 int32_t match; 287 288 /* try to match */ 289 match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv), 290 (const char *)cnv->toUBytes, firstLength, 291 *src, (int32_t)(srcLimit-*src), 292 &value, 293 cnv->useFallback, flush); 294 if(match>0) { 295 /* advance src pointer for the consumed input */ 296 *src+=match-firstLength; 297 298 /* write result to target */ 299 ucnv_extWriteToU(cnv, cx, 300 value, 301 target, targetLimit, 302 offsets, srcIndex, 303 pErrorCode); 304 return TRUE; 305 } else if(match<0) { 306 /* save state for partial match */ 307 const char *s; 308 int32_t j; 309 310 /* copy the first code point */ 311 s=(const char *)cnv->toUBytes; 312 cnv->preToUFirstLength=(int8_t)firstLength; 313 for(j=0; j<firstLength; ++j) { 314 cnv->preToU[j]=*s++; 315 } 316 317 /* now copy the newly consumed input */ 318 s=*src; 319 match=-match; 320 for(; j<match; ++j) { 321 cnv->preToU[j]=*s++; 322 } 323 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 324 cnv->preToULength=(int8_t)match; 325 return TRUE; 326 } else /* match==0 no match */ { 327 return FALSE; 328 } 329 } 330 331 U_CFUNC UChar32 332 ucnv_extSimpleMatchToU(const int32_t *cx, 333 const char *source, int32_t length, 334 UBool useFallback) { 335 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 336 int32_t match; 337 338 if(length<=0) { 339 return 0xffff; 340 } 341 342 /* try to match */ 343 match=ucnv_extMatchToU(cx, -1, 344 source, length, 345 NULL, 0, 346 &value, 347 useFallback, TRUE); 348 if(match==length) { 349 /* write result for simple, single-character conversion */ 350 if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { 351 return UCNV_EXT_TO_U_GET_CODE_POINT(value); 352 } 353 } 354 355 /* 356 * return no match because 357 * - match>0 && value points to string: simple conversion cannot handle multiple code points 358 * - match>0 && match!=length: not all input consumed, forbidden for this function 359 * - match==0: no match found in the first place 360 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 361 */ 362 return 0xfffe; 363 } 364 365 /* 366 * continue partial match with new input 367 * never called for simple, single-character conversion 368 */ 369 U_CFUNC void 370 ucnv_extContinueMatchToU(UConverter *cnv, 371 UConverterToUnicodeArgs *pArgs, int32_t srcIndex, 372 UErrorCode *pErrorCode) { 373 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 374 int32_t match, length; 375 376 match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv), 377 cnv->preToU, cnv->preToULength, 378 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 379 &value, 380 cnv->useFallback, pArgs->flush); 381 if(match>0) { 382 if(match>=cnv->preToULength) { 383 /* advance src pointer for the consumed input */ 384 pArgs->source+=match-cnv->preToULength; 385 cnv->preToULength=0; 386 } else { 387 /* the match did not use all of preToU[] - keep the rest for replay */ 388 length=cnv->preToULength-match; 389 uprv_memmove(cnv->preToU, cnv->preToU+match, length); 390 cnv->preToULength=(int8_t)-length; 391 } 392 393 /* write result */ 394 ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes, 395 value, 396 &pArgs->target, pArgs->targetLimit, 397 &pArgs->offsets, srcIndex, 398 pErrorCode); 399 } else if(match<0) { 400 /* save state for partial match */ 401 const char *s; 402 int32_t j; 403 404 /* just _append_ the newly consumed input to preToU[] */ 405 s=pArgs->source; 406 match=-match; 407 for(j=cnv->preToULength; j<match; ++j) { 408 cnv->preToU[j]=*s++; 409 } 410 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 411 cnv->preToULength=(int8_t)match; 412 } else /* match==0 */ { 413 /* 414 * no match 415 * 416 * We need to split the previous input into two parts: 417 * 418 * 1. The first codepage character is unmappable - that's how we got into 419 * trying the extension data in the first place. 420 * We need to move it from the preToU buffer 421 * to the error buffer, set an error code, 422 * and prepare the rest of the previous input for 2. 423 * 424 * 2. The rest of the previous input must be converted once we 425 * come back from the callback for the first character. 426 * At that time, we have to try again from scratch to convert 427 * these input characters. 428 * The replay will be handled by the ucnv.c conversion code. 429 */ 430 431 /* move the first codepage character to the error field */ 432 uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); 433 cnv->toULength=cnv->preToUFirstLength; 434 435 /* move the rest up inside the buffer */ 436 length=cnv->preToULength-cnv->preToUFirstLength; 437 if(length>0) { 438 uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); 439 } 440 441 /* mark preToU for replay */ 442 cnv->preToULength=(int8_t)-length; 443 444 /* set the error code for unassigned */ 445 *pErrorCode=U_INVALID_CHAR_FOUND; 446 } 447 } 448 449 /* from Unicode ------------------------------------------------------------- */ 450 451 // Use roundtrips, "good one-way" mappings, and some normal fallbacks. 452 static inline UBool 453 extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) { 454 return 455 ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 || 456 FROM_U_USE_FALLBACK(useFallback, firstCP)) && 457 (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0; 458 } 459 460 /* 461 * @return index of the UChar, if found; else <0 462 */ 463 static inline int32_t 464 ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { 465 int32_t i, start, limit; 466 467 /* binary search */ 468 start=0; 469 limit=length; 470 for(;;) { 471 i=limit-start; 472 if(i<=1) { 473 break; /* done */ 474 } 475 /* start<limit-1 */ 476 477 if(i<=4) { 478 /* linear search for the last part */ 479 if(u<=fromUSection[start]) { 480 break; 481 } 482 if(++start<limit && u<=fromUSection[start]) { 483 break; 484 } 485 if(++start<limit && u<=fromUSection[start]) { 486 break; 487 } 488 /* always break at start==limit-1 */ 489 ++start; 490 break; 491 } 492 493 i=(start+limit)/2; 494 if(u<fromUSection[i]) { 495 limit=i; 496 } else { 497 start=i; 498 } 499 } 500 501 /* did we really find it? */ 502 if(start<limit && u==fromUSection[start]) { 503 return start; 504 } else { 505 return -1; /* not found */ 506 } 507 } 508 509 /* 510 * @param cx pointer to extension data; if NULL, returns 0 511 * @param firstCP the first code point before all the other UChars 512 * @param pre UChars that must match; !initialMatch: partial match with them 513 * @param preLength length of pre, >=0 514 * @param src UChars that can be used to complete a match 515 * @param srcLength length of src, >=0 516 * @param pMatchValue [out] output result value for the match from the data structure 517 * @param useFallback "use fallback" flag, usually from cnv->useFallback 518 * @param flush TRUE if the end of the input stream is reached 519 * @return >1: matched, return value=total match length (number of input units matched) 520 * 1: matched, no mapping but request for <subchar1> 521 * (only for the first code point) 522 * 0: no match 523 * <0: partial match, return value=negative total match length 524 * (partial matches are never returned for flush==TRUE) 525 * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) 526 * the matchLength is 2 if only firstCP matched, and >2 if firstCP and 527 * further code units matched 528 */ 529 static int32_t 530 ucnv_extMatchFromU(const int32_t *cx, 531 UChar32 firstCP, 532 const UChar *pre, int32_t preLength, 533 const UChar *src, int32_t srcLength, 534 uint32_t *pMatchValue, 535 UBool useFallback, UBool flush) { 536 const uint16_t *stage12, *stage3; 537 const uint32_t *stage3b; 538 539 const UChar *fromUTableUChars, *fromUSectionUChars; 540 const uint32_t *fromUTableValues, *fromUSectionValues; 541 542 uint32_t value, matchValue; 543 int32_t i, j, idx, length, matchLength; 544 UChar c; 545 546 if(cx==NULL) { 547 return 0; /* no extension data, no match */ 548 } 549 550 /* trie lookup of firstCP */ 551 idx=firstCP>>10; /* stage 1 index */ 552 if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { 553 return 0; /* the first code point is outside the trie */ 554 } 555 556 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 557 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 558 idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP); 559 560 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 561 value=stage3b[idx]; 562 if(value==0) { 563 return 0; 564 } 565 566 /* 567 * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0: 568 * Do not interpret values with reserved bits used, for forward compatibility, 569 * and do not even remember intermediate results with reserved bits used. 570 */ 571 572 if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { 573 /* partial match, enter the loop below */ 574 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 575 576 /* initialize */ 577 fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); 578 fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); 579 580 matchValue=0; 581 i=j=matchLength=0; 582 583 /* we must not remember fallback matches when not using fallbacks */ 584 585 /* match input units until there is a full match or the input is consumed */ 586 for(;;) { 587 /* go to the next section */ 588 fromUSectionUChars=fromUTableUChars+idx; 589 fromUSectionValues=fromUTableValues+idx; 590 591 /* read first pair of the section */ 592 length=*fromUSectionUChars++; 593 value=*fromUSectionValues++; 594 if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) { 595 /* remember longest match so far */ 596 matchValue=value; 597 matchLength=2+i+j; 598 } 599 600 /* match pre[] then src[] */ 601 if(i<preLength) { 602 c=pre[i++]; 603 } else if(j<srcLength) { 604 c=src[j++]; 605 } else { 606 /* all input consumed, partial match */ 607 if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) { 608 /* 609 * end of the entire input stream, stop with the longest match so far 610 * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS 611 * because it must fit into state buffers 612 */ 613 break; 614 } else { 615 /* continue with more input next time */ 616 return -(2+length); 617 } 618 } 619 620 /* search for the current UChar */ 621 idx=ucnv_extFindFromU(fromUSectionUChars, length, c); 622 if(idx<0) { 623 /* no match here, stop with the longest match so far */ 624 break; 625 } else { 626 value=fromUSectionValues[idx]; 627 if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 628 /* partial match, continue */ 629 idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); 630 } else { 631 if(extFromUUseMapping(useFallback, value, firstCP)) { 632 /* full match, stop with result */ 633 matchValue=value; 634 matchLength=2+i+j; 635 } else { 636 /* full match on fallback not taken, stop with the longest match so far */ 637 } 638 break; 639 } 640 } 641 } 642 643 if(matchLength==0) { 644 /* no match at all */ 645 return 0; 646 } 647 } else /* result from firstCP trie lookup */ { 648 if(extFromUUseMapping(useFallback, value, firstCP)) { 649 /* full match, stop with result */ 650 matchValue=value; 651 matchLength=2; 652 } else { 653 /* fallback not taken */ 654 return 0; 655 } 656 } 657 658 /* return result */ 659 if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { 660 return 1; /* assert matchLength==2 */ 661 } 662 663 *pMatchValue=matchValue; 664 return matchLength; 665 } 666 667 /* 668 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits 669 */ 670 static inline void 671 ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx, 672 uint32_t value, 673 char **target, const char *targetLimit, 674 int32_t **offsets, int32_t srcIndex, 675 UErrorCode *pErrorCode) { 676 uint8_t buffer[1+UCNV_EXT_MAX_BYTES]; 677 const uint8_t *result; 678 int32_t length, prevLength; 679 680 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 681 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 682 683 /* output the result */ 684 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 685 /* 686 * Generate a byte array and then write it below. 687 * This is not the fastest possible way, but it should be ok for 688 * extension mappings, and it is much simpler. 689 * Offset and overflow handling are only done once this way. 690 */ 691 uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */ 692 switch(length) { 693 case 3: 694 *p++=(uint8_t)(value>>16); 695 U_FALLTHROUGH; 696 case 2: 697 *p++=(uint8_t)(value>>8); 698 U_FALLTHROUGH; 699 case 1: 700 *p++=(uint8_t)value; 701 U_FALLTHROUGH; 702 default: 703 break; /* will never occur */ 704 } 705 result=buffer+1; 706 } else { 707 result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 708 } 709 710 /* with correct data we have length>0 */ 711 712 if((prevLength=cnv->fromUnicodeStatus)!=0) { 713 /* handle SI/SO stateful output */ 714 uint8_t shiftByte; 715 716 if(prevLength>1 && length==1) { 717 /* change from double-byte mode to single-byte */ 718 shiftByte=(uint8_t)UCNV_SI; 719 cnv->fromUnicodeStatus=1; 720 } else if(prevLength==1 && length>1) { 721 /* change from single-byte mode to double-byte */ 722 shiftByte=(uint8_t)UCNV_SO; 723 cnv->fromUnicodeStatus=2; 724 } else { 725 shiftByte=0; 726 } 727 728 if(shiftByte!=0) { 729 /* prepend the shift byte to the result bytes */ 730 buffer[0]=shiftByte; 731 if(result!=buffer+1) { 732 uprv_memcpy(buffer+1, result, length); 733 } 734 result=buffer; 735 ++length; 736 } 737 } 738 739 ucnv_fromUWriteBytes(cnv, (const char *)result, length, 740 target, targetLimit, 741 offsets, srcIndex, 742 pErrorCode); 743 } 744 745 /* 746 * target<targetLimit; set error code for overflow 747 */ 748 U_CFUNC UBool 749 ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, 750 UChar32 cp, 751 const UChar **src, const UChar *srcLimit, 752 char **target, const char *targetLimit, 753 int32_t **offsets, int32_t srcIndex, 754 UBool flush, 755 UErrorCode *pErrorCode) { 756 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 757 int32_t match; 758 759 /* try to match */ 760 match=ucnv_extMatchFromU(cx, cp, 761 NULL, 0, 762 *src, (int32_t)(srcLimit-*src), 763 &value, 764 cnv->useFallback, flush); 765 766 /* reject a match if the result is a single byte for DBCS-only */ 767 if( match>=2 && 768 !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 && 769 cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) 770 ) { 771 /* advance src pointer for the consumed input */ 772 *src+=match-2; /* remove 2 for the initial code point */ 773 774 /* write result to target */ 775 ucnv_extWriteFromU(cnv, cx, 776 value, 777 target, targetLimit, 778 offsets, srcIndex, 779 pErrorCode); 780 return TRUE; 781 } else if(match<0) { 782 /* save state for partial match */ 783 const UChar *s; 784 int32_t j; 785 786 /* copy the first code point */ 787 cnv->preFromUFirstCP=cp; 788 789 /* now copy the newly consumed input */ 790 s=*src; 791 match=-match-2; /* remove 2 for the initial code point */ 792 for(j=0; j<match; ++j) { 793 cnv->preFromU[j]=*s++; 794 } 795 *src=s; /* same as *src=srcLimit; because we reached the end of input */ 796 cnv->preFromULength=(int8_t)match; 797 return TRUE; 798 } else if(match==1) { 799 /* matched, no mapping but request for <subchar1> */ 800 cnv->useSubChar1=TRUE; 801 return FALSE; 802 } else /* match==0 no match */ { 803 return FALSE; 804 } 805 } 806 807 /* 808 * Used by ISO 2022 implementation. 809 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping 810 */ 811 U_CFUNC int32_t 812 ucnv_extSimpleMatchFromU(const int32_t *cx, 813 UChar32 cp, uint32_t *pValue, 814 UBool useFallback) { 815 uint32_t value; 816 int32_t match; 817 818 /* try to match */ 819 match=ucnv_extMatchFromU(cx, 820 cp, 821 NULL, 0, 822 NULL, 0, 823 &value, 824 useFallback, TRUE); 825 if(match>=2) { 826 /* write result for simple, single-character conversion */ 827 int32_t length; 828 int isRoundtrip; 829 830 isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value); 831 length=UCNV_EXT_FROM_U_GET_LENGTH(value); 832 value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value); 833 834 if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { 835 *pValue=value; 836 return isRoundtrip ? length : -length; 837 #if 0 /* not currently used */ 838 } else if(length==4) { 839 /* de-serialize a 4-byte result */ 840 const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; 841 *pValue= 842 ((uint32_t)result[0]<<24)| 843 ((uint32_t)result[1]<<16)| 844 ((uint32_t)result[2]<<8)| 845 result[3]; 846 return isRoundtrip ? 4 : -4; 847 #endif 848 } 849 } 850 851 /* 852 * return no match because 853 * - match>1 && resultLength>4: result too long for simple conversion 854 * - match==1: no match found, <subchar1> preferred 855 * - match==0: no match found in the first place 856 * - match<0: partial match, not supported for simple conversion (and flush==TRUE) 857 */ 858 return 0; 859 } 860 861 /* 862 * continue partial match with new input, requires cnv->preFromUFirstCP>=0 863 * never called for simple, single-character conversion 864 */ 865 U_CFUNC void 866 ucnv_extContinueMatchFromU(UConverter *cnv, 867 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, 868 UErrorCode *pErrorCode) { 869 uint32_t value = 0; /* initialize output-only param to 0 to silence gcc */ 870 int32_t match; 871 872 match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes, 873 cnv->preFromUFirstCP, 874 cnv->preFromU, cnv->preFromULength, 875 pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), 876 &value, 877 cnv->useFallback, pArgs->flush); 878 if(match>=2) { 879 match-=2; /* remove 2 for the initial code point */ 880 881 if(match>=cnv->preFromULength) { 882 /* advance src pointer for the consumed input */ 883 pArgs->source+=match-cnv->preFromULength; 884 cnv->preFromULength=0; 885 } else { 886 /* the match did not use all of preFromU[] - keep the rest for replay */ 887 int32_t length=cnv->preFromULength-match; 888 u_memmove(cnv->preFromU, cnv->preFromU+match, length); 889 cnv->preFromULength=(int8_t)-length; 890 } 891 892 /* finish the partial match */ 893 cnv->preFromUFirstCP=U_SENTINEL; 894 895 /* write result */ 896 ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes, 897 value, 898 &pArgs->target, pArgs->targetLimit, 899 &pArgs->offsets, srcIndex, 900 pErrorCode); 901 } else if(match<0) { 902 /* save state for partial match */ 903 const UChar *s; 904 int32_t j; 905 906 /* just _append_ the newly consumed input to preFromU[] */ 907 s=pArgs->source; 908 match=-match-2; /* remove 2 for the initial code point */ 909 for(j=cnv->preFromULength; j<match; ++j) { 910 U_ASSERT(j>=0); 911 cnv->preFromU[j]=*s++; 912 } 913 pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ 914 cnv->preFromULength=(int8_t)match; 915 } else /* match==0 or 1 */ { 916 /* 917 * no match 918 * 919 * We need to split the previous input into two parts: 920 * 921 * 1. The first code point is unmappable - that's how we got into 922 * trying the extension data in the first place. 923 * We need to move it from the preFromU buffer 924 * to the error buffer, set an error code, 925 * and prepare the rest of the previous input for 2. 926 * 927 * 2. The rest of the previous input must be converted once we 928 * come back from the callback for the first code point. 929 * At that time, we have to try again from scratch to convert 930 * these input characters. 931 * The replay will be handled by the ucnv.c conversion code. 932 */ 933 934 if(match==1) { 935 /* matched, no mapping but request for <subchar1> */ 936 cnv->useSubChar1=TRUE; 937 } 938 939 /* move the first code point to the error field */ 940 cnv->fromUChar32=cnv->preFromUFirstCP; 941 cnv->preFromUFirstCP=U_SENTINEL; 942 943 /* mark preFromU for replay */ 944 cnv->preFromULength=-cnv->preFromULength; 945 946 /* set the error code for unassigned */ 947 *pErrorCode=U_INVALID_CHAR_FOUND; 948 } 949 } 950 951 static UBool 952 extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { 953 if(which==UCNV_ROUNDTRIP_SET) { 954 // Add only code points for which the roundtrip flag is set. 955 // Do not add any fallbacks, even if ucnv_fromUnicode() would use them 956 // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet(). 957 // 958 // By analogy, also do not add "good one-way" mappings. 959 // 960 // Do not add entries with reserved bits set. 961 if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!= 962 UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) { 963 return FALSE; 964 } 965 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 966 // Do not add entries with reserved bits set. 967 if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) { 968 return FALSE; 969 } 970 } 971 // Do not add <subchar1> entries or other (future?) pseudo-entries 972 // with an output length of 0. 973 return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength; 974 } 975 976 static void 977 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, 978 const int32_t *cx, 979 const USetAdder *sa, 980 UConverterUnicodeSet which, 981 int32_t minLength, 982 UChar32 firstCP, 983 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, 984 int32_t sectionIndex, 985 UErrorCode *pErrorCode) { 986 const UChar *fromUSectionUChars; 987 const uint32_t *fromUSectionValues; 988 989 uint32_t value; 990 int32_t i, count; 991 992 fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex; 993 fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex; 994 995 /* read first pair of the section */ 996 count=*fromUSectionUChars++; 997 value=*fromUSectionValues++; 998 999 if(extSetUseMapping(which, minLength, value)) { 1000 if(length==U16_LENGTH(firstCP)) { 1001 /* add the initial code point */ 1002 sa->add(sa->set, firstCP); 1003 } else { 1004 /* add the string so far */ 1005 sa->addString(sa->set, s, length); 1006 } 1007 } 1008 1009 for(i=0; i<count; ++i) { 1010 /* append this code unit and recurse or add the string */ 1011 s[length]=fromUSectionUChars[i]; 1012 value=fromUSectionValues[i]; 1013 1014 if(value==0) { 1015 /* no mapping, do nothing */ 1016 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 1017 ucnv_extGetUnicodeSetString( 1018 sharedData, cx, sa, which, minLength, 1019 firstCP, s, length+1, 1020 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 1021 pErrorCode); 1022 } else if(extSetUseMapping(which, minLength, value)) { 1023 sa->addString(sa->set, s, length+1); 1024 } 1025 } 1026 } 1027 1028 U_CFUNC void 1029 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, 1030 const USetAdder *sa, 1031 UConverterUnicodeSet which, 1032 UConverterSetFilter filter, 1033 UErrorCode *pErrorCode) { 1034 const int32_t *cx; 1035 const uint16_t *stage12, *stage3, *ps2, *ps3; 1036 const uint32_t *stage3b; 1037 1038 uint32_t value; 1039 int32_t st1, stage1Length, st2, st3, minLength; 1040 1041 UChar s[UCNV_EXT_MAX_UCHARS]; 1042 UChar32 c; 1043 int32_t length; 1044 1045 cx=sharedData->mbcs.extIndexes; 1046 if(cx==NULL) { 1047 return; 1048 } 1049 1050 stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); 1051 stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); 1052 stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); 1053 1054 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; 1055 1056 /* enumerate the from-Unicode trie table */ 1057 c=0; /* keep track of the current code point while enumerating */ 1058 1059 if(filter==UCNV_SET_FILTER_2022_CN) { 1060 minLength=3; 1061 } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || 1062 filter!=UCNV_SET_FILTER_NONE 1063 ) { 1064 /* DBCS-only, ignore single-byte results */ 1065 minLength=2; 1066 } else { 1067 minLength=1; 1068 } 1069 1070 /* 1071 * the trie enumeration is almost the same as 1072 * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1 1073 */ 1074 for(st1=0; st1<stage1Length; ++st1) { 1075 st2=stage12[st1]; 1076 if(st2>stage1Length) { 1077 ps2=stage12+st2; 1078 for(st2=0; st2<64; ++st2) { 1079 if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) { 1080 /* read the stage 3 block */ 1081 ps3=stage3+st3; 1082 1083 do { 1084 value=stage3b[*ps3++]; 1085 if(value==0) { 1086 /* no mapping, do nothing */ 1087 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { 1088 // Recurse for partial results. 1089 length=0; 1090 U16_APPEND_UNSAFE(s, length, c); 1091 ucnv_extGetUnicodeSetString( 1092 sharedData, cx, sa, which, minLength, 1093 c, s, length, 1094 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), 1095 pErrorCode); 1096 } else if(extSetUseMapping(which, minLength, value)) { 1097 switch(filter) { 1098 case UCNV_SET_FILTER_2022_CN: 1099 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { 1100 continue; 1101 } 1102 break; 1103 case UCNV_SET_FILTER_SJIS: 1104 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { 1105 continue; 1106 } 1107 break; 1108 case UCNV_SET_FILTER_GR94DBCS: 1109 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1110 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && 1111 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1112 continue; 1113 } 1114 break; 1115 case UCNV_SET_FILTER_HZ: 1116 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && 1117 (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1118 (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { 1119 continue; 1120 } 1121 break; 1122 default: 1123 /* 1124 * UCNV_SET_FILTER_NONE, 1125 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength 1126 */ 1127 break; 1128 } 1129 sa->add(sa->set, c); 1130 } 1131 } while((++c&0xf)!=0); 1132 } else { 1133 c+=16; /* empty stage 3 block */ 1134 } 1135 } 1136 } else { 1137 c+=1024; /* empty stage 2 block */ 1138 } 1139 } 1140 } 1141 1142 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1143