1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2000-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvscsu.c 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000nov18 16 * created by: Markus W. Scherer 17 * 18 * This is an implementation of the Standard Compression Scheme for Unicode 19 * as defined in http://www.unicode.org/unicode/reports/tr6/ . 20 * Reserved commands and window settings are treated as illegal sequences and 21 * will result in callback calls. 22 */ 23 24 #include "unicode/utypes.h" 25 26 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 27 28 #include "unicode/ucnv.h" 29 #include "unicode/ucnv_cb.h" 30 #include "unicode/utf16.h" 31 #include "ucnv_bld.h" 32 #include "ucnv_cnv.h" 33 #include "cmemory.h" 34 35 /* SCSU definitions --------------------------------------------------------- */ 36 37 /* SCSU command byte values */ 38 enum { 39 SQ0=0x01, /* Quote from window pair 0 */ 40 SQ7=0x08, /* Quote from window pair 7 */ 41 SDX=0x0B, /* Define a window as extended */ 42 Srs=0x0C, /* reserved */ 43 SQU=0x0E, /* Quote a single Unicode character */ 44 SCU=0x0F, /* Change to Unicode mode */ 45 SC0=0x10, /* Select window 0 */ 46 SC7=0x17, /* Select window 7 */ 47 SD0=0x18, /* Define and select window 0 */ 48 SD7=0x1F, /* Define and select window 7 */ 49 50 UC0=0xE0, /* Select window 0 */ 51 UC7=0xE7, /* Select window 7 */ 52 UD0=0xE8, /* Define and select window 0 */ 53 UD7=0xEF, /* Define and select window 7 */ 54 UQU=0xF0, /* Quote a single Unicode character */ 55 UDX=0xF1, /* Define a Window as extended */ 56 Urs=0xF2 /* reserved */ 57 }; 58 59 enum { 60 /* 61 * Unicode code points from 3400 to E000 are not adressible by 62 * dynamic window, since in these areas no short run alphabets are 63 * found. Therefore add gapOffset to all values from gapThreshold. 64 */ 65 gapThreshold=0x68, 66 gapOffset=0xAC00, 67 68 /* values between reservedStart and fixedThreshold are reserved */ 69 reservedStart=0xA8, 70 71 /* use table of predefined fixed offsets for values from fixedThreshold */ 72 fixedThreshold=0xF9 73 }; 74 75 /* constant offsets for the 8 static windows */ 76 static const uint32_t staticOffsets[8]={ 77 0x0000, /* ASCII for quoted tags */ 78 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 79 0x0100, /* Latin Extended-A */ 80 0x0300, /* Combining Diacritical Marks */ 81 0x2000, /* General Punctuation */ 82 0x2080, /* Currency Symbols */ 83 0x2100, /* Letterlike Symbols and Number Forms */ 84 0x3000 /* CJK Symbols and punctuation */ 85 }; 86 87 /* initial offsets for the 8 dynamic (sliding) windows */ 88 static const uint32_t initialDynamicOffsets[8]={ 89 0x0080, /* Latin-1 */ 90 0x00C0, /* Latin Extended A */ 91 0x0400, /* Cyrillic */ 92 0x0600, /* Arabic */ 93 0x0900, /* Devanagari */ 94 0x3040, /* Hiragana */ 95 0x30A0, /* Katakana */ 96 0xFF00 /* Fullwidth ASCII */ 97 }; 98 99 /* Table of fixed predefined Offsets */ 100 static const uint32_t fixedOffsets[]={ 101 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 102 /* 0xFA */ 0x0250, /* IPA extensions */ 103 /* 0xFB */ 0x0370, /* Greek */ 104 /* 0xFC */ 0x0530, /* Armenian */ 105 /* 0xFD */ 0x3040, /* Hiragana */ 106 /* 0xFE */ 0x30A0, /* Katakana */ 107 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 108 }; 109 110 /* state values */ 111 enum { 112 readCommand, 113 quotePairOne, 114 quotePairTwo, 115 quoteOne, 116 definePairOne, 117 definePairTwo, 118 defineOne 119 }; 120 121 typedef struct SCSUData { 122 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 123 uint32_t toUDynamicOffsets[8]; 124 uint32_t fromUDynamicOffsets[8]; 125 126 /* state machine state - toUnicode */ 127 UBool toUIsSingleByteMode; 128 uint8_t toUState; 129 int8_t toUQuoteWindow, toUDynamicWindow; 130 uint8_t toUByteOne; 131 uint8_t toUPadding[3]; 132 133 /* state machine state - fromUnicode */ 134 UBool fromUIsSingleByteMode; 135 int8_t fromUDynamicWindow; 136 137 /* 138 * windowUse[] keeps track of the use of the dynamic windows: 139 * At nextWindowUseIndex there is the least recently used window, 140 * and the following windows (in a wrapping manner) are more and more 141 * recently used. 142 * At nextWindowUseIndex-1 there is the most recently used window. 143 */ 144 uint8_t locale; 145 int8_t nextWindowUseIndex; 146 int8_t windowUse[8]; 147 } SCSUData; 148 149 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 150 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 151 152 enum { 153 lGeneric, l_ja 154 }; 155 156 /* SCSU setup functions ----------------------------------------------------- */ 157 158 static void 159 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { 160 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 161 162 if(choice<=UCNV_RESET_TO_UNICODE) { 163 /* reset toUnicode */ 164 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); 165 166 scsu->toUIsSingleByteMode=TRUE; 167 scsu->toUState=readCommand; 168 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; 169 scsu->toUByteOne=0; 170 171 cnv->toULength=0; 172 } 173 if(choice!=UCNV_RESET_TO_UNICODE) { 174 /* reset fromUnicode */ 175 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); 176 177 scsu->fromUIsSingleByteMode=TRUE; 178 scsu->fromUDynamicWindow=0; 179 180 scsu->nextWindowUseIndex=0; 181 switch(scsu->locale) { 182 case l_ja: 183 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); 184 break; 185 default: 186 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); 187 break; 188 } 189 190 cnv->fromUChar32=0; 191 } 192 } 193 194 static void 195 _SCSUOpen(UConverter *cnv, 196 UConverterLoadArgs *pArgs, 197 UErrorCode *pErrorCode) { 198 const char *locale=pArgs->locale; 199 if(pArgs->onlyTestIsLoadable) { 200 return; 201 } 202 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); 203 if(cnv->extraInfo!=NULL) { 204 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { 205 ((SCSUData *)cnv->extraInfo)->locale=l_ja; 206 } else { 207 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; 208 } 209 _SCSUReset(cnv, UCNV_RESET_BOTH); 210 } else { 211 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 212 } 213 214 /* Set the substitution character U+fffd as a Unicode string. */ 215 cnv->subUChars[0]=0xfffd; 216 cnv->subCharLen=-1; 217 } 218 219 static void 220 _SCSUClose(UConverter *cnv) { 221 if(cnv->extraInfo!=NULL) { 222 if(!cnv->isExtraLocal) { 223 uprv_free(cnv->extraInfo); 224 } 225 cnv->extraInfo=NULL; 226 } 227 } 228 229 /* SCSU-to-Unicode conversion functions ------------------------------------- */ 230 231 static void 232 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 233 UErrorCode *pErrorCode) { 234 UConverter *cnv; 235 SCSUData *scsu; 236 const uint8_t *source, *sourceLimit; 237 UChar *target; 238 const UChar *targetLimit; 239 int32_t *offsets; 240 UBool isSingleByteMode; 241 uint8_t state, byteOne; 242 int8_t quoteWindow, dynamicWindow; 243 244 int32_t sourceIndex, nextSourceIndex; 245 246 uint8_t b; 247 248 /* set up the local pointers */ 249 cnv=pArgs->converter; 250 scsu=(SCSUData *)cnv->extraInfo; 251 252 source=(const uint8_t *)pArgs->source; 253 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 254 target=pArgs->target; 255 targetLimit=pArgs->targetLimit; 256 offsets=pArgs->offsets; 257 258 /* get the state machine state */ 259 isSingleByteMode=scsu->toUIsSingleByteMode; 260 state=scsu->toUState; 261 quoteWindow=scsu->toUQuoteWindow; 262 dynamicWindow=scsu->toUDynamicWindow; 263 byteOne=scsu->toUByteOne; 264 265 /* sourceIndex=-1 if the current character began in the previous buffer */ 266 sourceIndex=state==readCommand ? 0 : -1; 267 nextSourceIndex=0; 268 269 /* 270 * conversion "loop" 271 * 272 * For performance, this is not a normal C loop. 273 * Instead, there are two code blocks for the two SCSU modes. 274 * The function branches to either one, and a change of the mode is done with a goto to 275 * the other branch. 276 * 277 * Each branch has two conventional loops: 278 * - a fast-path loop for the most common codes in the mode 279 * - a loop for all other codes in the mode 280 * When the fast-path runs into a code that it cannot handle, its loop ends and it 281 * runs into the following loop to handle the other codes. 282 * The end of the input or output buffer is also handled by the slower loop. 283 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 284 * 285 * The callback handling is done by returning with an error code. 286 * The conversion framework actually calls the callback function. 287 */ 288 if(isSingleByteMode) { 289 /* fast path for single-byte mode */ 290 if(state==readCommand) { 291 fastSingle: 292 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 293 ++source; 294 ++nextSourceIndex; 295 if(b<=0x7f) { 296 /* write US-ASCII graphic character or DEL */ 297 *target++=(UChar)b; 298 if(offsets!=NULL) { 299 *offsets++=sourceIndex; 300 } 301 } else { 302 /* write from dynamic window */ 303 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 304 if(c<=0xffff) { 305 *target++=(UChar)c; 306 if(offsets!=NULL) { 307 *offsets++=sourceIndex; 308 } 309 } else { 310 /* output surrogate pair */ 311 *target++=(UChar)(0xd7c0+(c>>10)); 312 if(target<targetLimit) { 313 *target++=(UChar)(0xdc00|(c&0x3ff)); 314 if(offsets!=NULL) { 315 *offsets++=sourceIndex; 316 *offsets++=sourceIndex; 317 } 318 } else { 319 /* target overflow */ 320 if(offsets!=NULL) { 321 *offsets++=sourceIndex; 322 } 323 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 324 cnv->UCharErrorBufferLength=1; 325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 326 goto endloop; 327 } 328 } 329 } 330 sourceIndex=nextSourceIndex; 331 } 332 } 333 334 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 335 singleByteMode: 336 while(source<sourceLimit) { 337 if(target>=targetLimit) { 338 /* target is full */ 339 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 340 break; 341 } 342 b=*source++; 343 ++nextSourceIndex; 344 switch(state) { 345 case readCommand: 346 /* redundant conditions are commented out */ 347 /* here: b<0x20 because otherwise we would be in fastSingle */ 348 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 349 /* CR/LF/TAB/NUL */ 350 *target++=(UChar)b; 351 if(offsets!=NULL) { 352 *offsets++=sourceIndex; 353 } 354 sourceIndex=nextSourceIndex; 355 goto fastSingle; 356 } else if(SC0<=b) { 357 if(b<=SC7) { 358 dynamicWindow=(int8_t)(b-SC0); 359 sourceIndex=nextSourceIndex; 360 goto fastSingle; 361 } else /* if(SD0<=b && b<=SD7) */ { 362 dynamicWindow=(int8_t)(b-SD0); 363 state=defineOne; 364 } 365 } else if(/* SQ0<=b && */ b<=SQ7) { 366 quoteWindow=(int8_t)(b-SQ0); 367 state=quoteOne; 368 } else if(b==SDX) { 369 state=definePairOne; 370 } else if(b==SQU) { 371 state=quotePairOne; 372 } else if(b==SCU) { 373 sourceIndex=nextSourceIndex; 374 isSingleByteMode=FALSE; 375 goto fastUnicode; 376 } else /* Srs */ { 377 /* callback(illegal) */ 378 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 379 cnv->toUBytes[0]=b; 380 cnv->toULength=1; 381 goto endloop; 382 } 383 384 /* store the first byte of a multibyte sequence in toUBytes[] */ 385 cnv->toUBytes[0]=b; 386 cnv->toULength=1; 387 break; 388 case quotePairOne: 389 byteOne=b; 390 cnv->toUBytes[1]=b; 391 cnv->toULength=2; 392 state=quotePairTwo; 393 break; 394 case quotePairTwo: 395 *target++=(UChar)((byteOne<<8)|b); 396 if(offsets!=NULL) { 397 *offsets++=sourceIndex; 398 } 399 sourceIndex=nextSourceIndex; 400 state=readCommand; 401 goto fastSingle; 402 case quoteOne: 403 if(b<0x80) { 404 /* all static offsets are in the BMP */ 405 *target++=(UChar)(staticOffsets[quoteWindow]+b); 406 if(offsets!=NULL) { 407 *offsets++=sourceIndex; 408 } 409 } else { 410 /* write from dynamic window */ 411 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 412 if(c<=0xffff) { 413 *target++=(UChar)c; 414 if(offsets!=NULL) { 415 *offsets++=sourceIndex; 416 } 417 } else { 418 /* output surrogate pair */ 419 *target++=(UChar)(0xd7c0+(c>>10)); 420 if(target<targetLimit) { 421 *target++=(UChar)(0xdc00|(c&0x3ff)); 422 if(offsets!=NULL) { 423 *offsets++=sourceIndex; 424 *offsets++=sourceIndex; 425 } 426 } else { 427 /* target overflow */ 428 if(offsets!=NULL) { 429 *offsets++=sourceIndex; 430 } 431 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 432 cnv->UCharErrorBufferLength=1; 433 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 434 goto endloop; 435 } 436 } 437 } 438 sourceIndex=nextSourceIndex; 439 state=readCommand; 440 goto fastSingle; 441 case definePairOne: 442 dynamicWindow=(int8_t)((b>>5)&7); 443 byteOne=(uint8_t)(b&0x1f); 444 cnv->toUBytes[1]=b; 445 cnv->toULength=2; 446 state=definePairTwo; 447 break; 448 case definePairTwo: 449 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 450 sourceIndex=nextSourceIndex; 451 state=readCommand; 452 goto fastSingle; 453 case defineOne: 454 if(b==0) { 455 /* callback(illegal): Reserved window offset value 0 */ 456 cnv->toUBytes[1]=b; 457 cnv->toULength=2; 458 goto endloop; 459 } else if(b<gapThreshold) { 460 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 461 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 462 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 463 } else if(b>=fixedThreshold) { 464 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 465 } else { 466 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 467 cnv->toUBytes[1]=b; 468 cnv->toULength=2; 469 goto endloop; 470 } 471 sourceIndex=nextSourceIndex; 472 state=readCommand; 473 goto fastSingle; 474 } 475 } 476 } else { 477 /* fast path for Unicode mode */ 478 if(state==readCommand) { 479 fastUnicode: 480 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 481 *target++=(UChar)((b<<8)|source[1]); 482 if(offsets!=NULL) { 483 *offsets++=sourceIndex; 484 } 485 sourceIndex=nextSourceIndex; 486 nextSourceIndex+=2; 487 source+=2; 488 } 489 } 490 491 /* normal state machine for Unicode mode */ 492 /* unicodeByteMode: */ 493 while(source<sourceLimit) { 494 if(target>=targetLimit) { 495 /* target is full */ 496 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 497 break; 498 } 499 b=*source++; 500 ++nextSourceIndex; 501 switch(state) { 502 case readCommand: 503 if((uint8_t)(b-UC0)>(Urs-UC0)) { 504 byteOne=b; 505 cnv->toUBytes[0]=b; 506 cnv->toULength=1; 507 state=quotePairTwo; 508 } else if(/* UC0<=b && */ b<=UC7) { 509 dynamicWindow=(int8_t)(b-UC0); 510 sourceIndex=nextSourceIndex; 511 isSingleByteMode=TRUE; 512 goto fastSingle; 513 } else if(/* UD0<=b && */ b<=UD7) { 514 dynamicWindow=(int8_t)(b-UD0); 515 isSingleByteMode=TRUE; 516 cnv->toUBytes[0]=b; 517 cnv->toULength=1; 518 state=defineOne; 519 goto singleByteMode; 520 } else if(b==UDX) { 521 isSingleByteMode=TRUE; 522 cnv->toUBytes[0]=b; 523 cnv->toULength=1; 524 state=definePairOne; 525 goto singleByteMode; 526 } else if(b==UQU) { 527 cnv->toUBytes[0]=b; 528 cnv->toULength=1; 529 state=quotePairOne; 530 } else /* Urs */ { 531 /* callback(illegal) */ 532 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 533 cnv->toUBytes[0]=b; 534 cnv->toULength=1; 535 goto endloop; 536 } 537 break; 538 case quotePairOne: 539 byteOne=b; 540 cnv->toUBytes[1]=b; 541 cnv->toULength=2; 542 state=quotePairTwo; 543 break; 544 case quotePairTwo: 545 *target++=(UChar)((byteOne<<8)|b); 546 if(offsets!=NULL) { 547 *offsets++=sourceIndex; 548 } 549 sourceIndex=nextSourceIndex; 550 state=readCommand; 551 goto fastUnicode; 552 } 553 } 554 } 555 endloop: 556 557 /* set the converter state back into UConverter */ 558 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 559 /* reset to deal with the next character */ 560 state=readCommand; 561 } else if(state==readCommand) { 562 /* not in a multi-byte sequence, reset toULength */ 563 cnv->toULength=0; 564 } 565 scsu->toUIsSingleByteMode=isSingleByteMode; 566 scsu->toUState=state; 567 scsu->toUQuoteWindow=quoteWindow; 568 scsu->toUDynamicWindow=dynamicWindow; 569 scsu->toUByteOne=byteOne; 570 571 /* write back the updated pointers */ 572 pArgs->source=(const char *)source; 573 pArgs->target=target; 574 pArgs->offsets=offsets; 575 return; 576 } 577 578 /* 579 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. 580 * If a change is made in the original function, then either 581 * change this function the same way or 582 * re-copy the original function and remove the variables 583 * offsets, sourceIndex, and nextSourceIndex. 584 */ 585 static void 586 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, 587 UErrorCode *pErrorCode) { 588 UConverter *cnv; 589 SCSUData *scsu; 590 const uint8_t *source, *sourceLimit; 591 UChar *target; 592 const UChar *targetLimit; 593 UBool isSingleByteMode; 594 uint8_t state, byteOne; 595 int8_t quoteWindow, dynamicWindow; 596 597 uint8_t b; 598 599 /* set up the local pointers */ 600 cnv=pArgs->converter; 601 scsu=(SCSUData *)cnv->extraInfo; 602 603 source=(const uint8_t *)pArgs->source; 604 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 605 target=pArgs->target; 606 targetLimit=pArgs->targetLimit; 607 608 /* get the state machine state */ 609 isSingleByteMode=scsu->toUIsSingleByteMode; 610 state=scsu->toUState; 611 quoteWindow=scsu->toUQuoteWindow; 612 dynamicWindow=scsu->toUDynamicWindow; 613 byteOne=scsu->toUByteOne; 614 615 /* 616 * conversion "loop" 617 * 618 * For performance, this is not a normal C loop. 619 * Instead, there are two code blocks for the two SCSU modes. 620 * The function branches to either one, and a change of the mode is done with a goto to 621 * the other branch. 622 * 623 * Each branch has two conventional loops: 624 * - a fast-path loop for the most common codes in the mode 625 * - a loop for all other codes in the mode 626 * When the fast-path runs into a code that it cannot handle, its loop ends and it 627 * runs into the following loop to handle the other codes. 628 * The end of the input or output buffer is also handled by the slower loop. 629 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 630 * 631 * The callback handling is done by returning with an error code. 632 * The conversion framework actually calls the callback function. 633 */ 634 if(isSingleByteMode) { 635 /* fast path for single-byte mode */ 636 if(state==readCommand) { 637 fastSingle: 638 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 639 ++source; 640 if(b<=0x7f) { 641 /* write US-ASCII graphic character or DEL */ 642 *target++=(UChar)b; 643 } else { 644 /* write from dynamic window */ 645 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 646 if(c<=0xffff) { 647 *target++=(UChar)c; 648 } else { 649 /* output surrogate pair */ 650 *target++=(UChar)(0xd7c0+(c>>10)); 651 if(target<targetLimit) { 652 *target++=(UChar)(0xdc00|(c&0x3ff)); 653 } else { 654 /* target overflow */ 655 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 656 cnv->UCharErrorBufferLength=1; 657 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 658 goto endloop; 659 } 660 } 661 } 662 } 663 } 664 665 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 666 singleByteMode: 667 while(source<sourceLimit) { 668 if(target>=targetLimit) { 669 /* target is full */ 670 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 671 break; 672 } 673 b=*source++; 674 switch(state) { 675 case readCommand: 676 /* redundant conditions are commented out */ 677 /* here: b<0x20 because otherwise we would be in fastSingle */ 678 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 679 /* CR/LF/TAB/NUL */ 680 *target++=(UChar)b; 681 goto fastSingle; 682 } else if(SC0<=b) { 683 if(b<=SC7) { 684 dynamicWindow=(int8_t)(b-SC0); 685 goto fastSingle; 686 } else /* if(SD0<=b && b<=SD7) */ { 687 dynamicWindow=(int8_t)(b-SD0); 688 state=defineOne; 689 } 690 } else if(/* SQ0<=b && */ b<=SQ7) { 691 quoteWindow=(int8_t)(b-SQ0); 692 state=quoteOne; 693 } else if(b==SDX) { 694 state=definePairOne; 695 } else if(b==SQU) { 696 state=quotePairOne; 697 } else if(b==SCU) { 698 isSingleByteMode=FALSE; 699 goto fastUnicode; 700 } else /* Srs */ { 701 /* callback(illegal) */ 702 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 703 cnv->toUBytes[0]=b; 704 cnv->toULength=1; 705 goto endloop; 706 } 707 708 /* store the first byte of a multibyte sequence in toUBytes[] */ 709 cnv->toUBytes[0]=b; 710 cnv->toULength=1; 711 break; 712 case quotePairOne: 713 byteOne=b; 714 cnv->toUBytes[1]=b; 715 cnv->toULength=2; 716 state=quotePairTwo; 717 break; 718 case quotePairTwo: 719 *target++=(UChar)((byteOne<<8)|b); 720 state=readCommand; 721 goto fastSingle; 722 case quoteOne: 723 if(b<0x80) { 724 /* all static offsets are in the BMP */ 725 *target++=(UChar)(staticOffsets[quoteWindow]+b); 726 } else { 727 /* write from dynamic window */ 728 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 729 if(c<=0xffff) { 730 *target++=(UChar)c; 731 } else { 732 /* output surrogate pair */ 733 *target++=(UChar)(0xd7c0+(c>>10)); 734 if(target<targetLimit) { 735 *target++=(UChar)(0xdc00|(c&0x3ff)); 736 } else { 737 /* target overflow */ 738 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 739 cnv->UCharErrorBufferLength=1; 740 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 741 goto endloop; 742 } 743 } 744 } 745 state=readCommand; 746 goto fastSingle; 747 case definePairOne: 748 dynamicWindow=(int8_t)((b>>5)&7); 749 byteOne=(uint8_t)(b&0x1f); 750 cnv->toUBytes[1]=b; 751 cnv->toULength=2; 752 state=definePairTwo; 753 break; 754 case definePairTwo: 755 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 756 state=readCommand; 757 goto fastSingle; 758 case defineOne: 759 if(b==0) { 760 /* callback(illegal): Reserved window offset value 0 */ 761 cnv->toUBytes[1]=b; 762 cnv->toULength=2; 763 goto endloop; 764 } else if(b<gapThreshold) { 765 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 766 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 767 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 768 } else if(b>=fixedThreshold) { 769 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 770 } else { 771 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 772 cnv->toUBytes[1]=b; 773 cnv->toULength=2; 774 goto endloop; 775 } 776 state=readCommand; 777 goto fastSingle; 778 } 779 } 780 } else { 781 /* fast path for Unicode mode */ 782 if(state==readCommand) { 783 fastUnicode: 784 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 785 *target++=(UChar)((b<<8)|source[1]); 786 source+=2; 787 } 788 } 789 790 /* normal state machine for Unicode mode */ 791 /* unicodeByteMode: */ 792 while(source<sourceLimit) { 793 if(target>=targetLimit) { 794 /* target is full */ 795 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 796 break; 797 } 798 b=*source++; 799 switch(state) { 800 case readCommand: 801 if((uint8_t)(b-UC0)>(Urs-UC0)) { 802 byteOne=b; 803 cnv->toUBytes[0]=b; 804 cnv->toULength=1; 805 state=quotePairTwo; 806 } else if(/* UC0<=b && */ b<=UC7) { 807 dynamicWindow=(int8_t)(b-UC0); 808 isSingleByteMode=TRUE; 809 goto fastSingle; 810 } else if(/* UD0<=b && */ b<=UD7) { 811 dynamicWindow=(int8_t)(b-UD0); 812 isSingleByteMode=TRUE; 813 cnv->toUBytes[0]=b; 814 cnv->toULength=1; 815 state=defineOne; 816 goto singleByteMode; 817 } else if(b==UDX) { 818 isSingleByteMode=TRUE; 819 cnv->toUBytes[0]=b; 820 cnv->toULength=1; 821 state=definePairOne; 822 goto singleByteMode; 823 } else if(b==UQU) { 824 cnv->toUBytes[0]=b; 825 cnv->toULength=1; 826 state=quotePairOne; 827 } else /* Urs */ { 828 /* callback(illegal) */ 829 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 830 cnv->toUBytes[0]=b; 831 cnv->toULength=1; 832 goto endloop; 833 } 834 break; 835 case quotePairOne: 836 byteOne=b; 837 cnv->toUBytes[1]=b; 838 cnv->toULength=2; 839 state=quotePairTwo; 840 break; 841 case quotePairTwo: 842 *target++=(UChar)((byteOne<<8)|b); 843 state=readCommand; 844 goto fastUnicode; 845 } 846 } 847 } 848 endloop: 849 850 /* set the converter state back into UConverter */ 851 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 852 /* reset to deal with the next character */ 853 state=readCommand; 854 } else if(state==readCommand) { 855 /* not in a multi-byte sequence, reset toULength */ 856 cnv->toULength=0; 857 } 858 scsu->toUIsSingleByteMode=isSingleByteMode; 859 scsu->toUState=state; 860 scsu->toUQuoteWindow=quoteWindow; 861 scsu->toUDynamicWindow=dynamicWindow; 862 scsu->toUByteOne=byteOne; 863 864 /* write back the updated pointers */ 865 pArgs->source=(const char *)source; 866 pArgs->target=target; 867 return; 868 } 869 870 /* SCSU-from-Unicode conversion functions ----------------------------------- */ 871 872 /* 873 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve 874 * reasonable results. The lookahead is minimal. 875 * Many cases are simple: 876 * A character fits directly into the current mode, a dynamic or static window, 877 * or is not compressible. These cases are tested first. 878 * Real compression heuristics are applied to the rest, in code branches for 879 * single/Unicode mode and BMP/supplementary code points. 880 * The heuristics used here are extremely simple. 881 */ 882 883 /* get the number of the window that this character is in, or -1 */ 884 static int8_t 885 getWindow(const uint32_t offsets[8], uint32_t c) { 886 int i; 887 for(i=0; i<8; ++i) { 888 if((uint32_t)(c-offsets[i])<=0x7f) { 889 return (int8_t)(i); 890 } 891 } 892 return -1; 893 } 894 895 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ 896 static UBool 897 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { 898 return (UBool)(c<=offset+0x7f && 899 (c>=offset || (c<=0x7f && 900 (c>=0x20 || (1UL<<c)&0x2601)))); 901 /* binary 0010 0110 0000 0001, 902 check for b==0xd || b==0xa || b==9 || b==0 */ 903 } 904 905 /* 906 * getNextDynamicWindow returns the next dynamic window to be redefined 907 */ 908 static int8_t 909 getNextDynamicWindow(SCSUData *scsu) { 910 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; 911 if(++scsu->nextWindowUseIndex==8) { 912 scsu->nextWindowUseIndex=0; 913 } 914 return window; 915 } 916 917 /* 918 * useDynamicWindow() adjusts 919 * windowUse[] and nextWindowUseIndex for the algorithm to choose 920 * the next dynamic window to be defined; 921 * a subclass may override it and provide its own algorithm. 922 */ 923 static void 924 useDynamicWindow(SCSUData *scsu, int8_t window) { 925 /* 926 * move the existing window, which just became the most recently used one, 927 * up in windowUse[] to nextWindowUseIndex-1 928 */ 929 930 /* first, find the index of the window - backwards to favor the more recently used windows */ 931 int i, j; 932 933 i=scsu->nextWindowUseIndex; 934 do { 935 if(--i<0) { 936 i=7; 937 } 938 } while(scsu->windowUse[i]!=window); 939 940 /* now copy each windowUse[i+1] to [i] */ 941 j=i+1; 942 if(j==8) { 943 j=0; 944 } 945 while(j!=scsu->nextWindowUseIndex) { 946 scsu->windowUse[i]=scsu->windowUse[j]; 947 i=j; 948 if(++j==8) { j=0; } 949 } 950 951 /* finally, set the window into the most recently used index */ 952 scsu->windowUse[i]=window; 953 } 954 955 /* 956 * calculate the offset and the code for a dynamic window that contains the character 957 * takes fixed offsets into account 958 * the offset of the window is stored in the offset variable, 959 * the code is returned 960 * 961 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code 962 */ 963 static int 964 getDynamicOffset(uint32_t c, uint32_t *pOffset) { 965 int i; 966 967 for(i=0; i<7; ++i) { 968 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { 969 *pOffset=fixedOffsets[i]; 970 return 0xf9+i; 971 } 972 } 973 974 if(c<0x80) { 975 /* No dynamic window for US-ASCII. */ 976 return -1; 977 } else if(c<0x3400 || 978 (uint32_t)(c-0x10000)<(0x14000-0x10000) || 979 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) 980 ) { 981 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ 982 *pOffset=c&0x7fffff80; 983 return (int)(c>>7); 984 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { 985 /* For these characters we need to take the gapOffset into account. */ 986 *pOffset=c&0x7fffff80; 987 return (int)((c-gapOffset)>>7); 988 } else { 989 return -1; 990 } 991 } 992 993 /* 994 * Idea for compression: 995 * - save SCSUData and other state before really starting work 996 * - at endloop, see if compression could be better with just unicode mode 997 * - don't do this if a callback has been called 998 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning 999 * - different buffer handling! 1000 * 1001 * Drawback or need for corrective handling: 1002 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and 1003 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible 1004 * not only for compression but also for HTML/XML documents with following charset/encoding announcers. 1005 * 1006 * How to achieve both? 1007 * - Only replace the result after an SDX or SCU? 1008 */ 1009 1010 static void 1011 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1012 UErrorCode *pErrorCode) { 1013 UConverter *cnv; 1014 SCSUData *scsu; 1015 const UChar *source, *sourceLimit; 1016 uint8_t *target; 1017 int32_t targetCapacity; 1018 int32_t *offsets; 1019 1020 UBool isSingleByteMode; 1021 uint8_t dynamicWindow; 1022 uint32_t currentOffset; 1023 1024 uint32_t c, delta; 1025 1026 int32_t sourceIndex, nextSourceIndex; 1027 1028 int32_t length; 1029 1030 /* variables for compression heuristics */ 1031 uint32_t offset; 1032 UChar lead, trail; 1033 int code; 1034 int8_t window; 1035 1036 /* set up the local pointers */ 1037 cnv=pArgs->converter; 1038 scsu=(SCSUData *)cnv->extraInfo; 1039 1040 /* set up the local pointers */ 1041 source=pArgs->source; 1042 sourceLimit=pArgs->sourceLimit; 1043 target=(uint8_t *)pArgs->target; 1044 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1045 offsets=pArgs->offsets; 1046 1047 /* get the state machine state */ 1048 isSingleByteMode=scsu->fromUIsSingleByteMode; 1049 dynamicWindow=scsu->fromUDynamicWindow; 1050 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1051 1052 c=cnv->fromUChar32; 1053 1054 /* sourceIndex=-1 if the current character began in the previous buffer */ 1055 sourceIndex= c==0 ? 0 : -1; 1056 nextSourceIndex=0; 1057 1058 /* similar conversion "loop" as in toUnicode */ 1059 loop: 1060 if(isSingleByteMode) { 1061 if(c!=0 && targetCapacity>0) { 1062 goto getTrailSingle; 1063 } 1064 1065 /* state machine for single-byte mode */ 1066 /* singleByteMode: */ 1067 while(source<sourceLimit) { 1068 if(targetCapacity<=0) { 1069 /* target is full */ 1070 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1071 break; 1072 } 1073 c=*source++; 1074 ++nextSourceIndex; 1075 1076 if((c-0x20)<=0x5f) { 1077 /* pass US-ASCII graphic character through */ 1078 *target++=(uint8_t)c; 1079 if(offsets!=NULL) { 1080 *offsets++=sourceIndex; 1081 } 1082 --targetCapacity; 1083 } else if(c<0x20) { 1084 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1085 /* CR/LF/TAB/NUL */ 1086 *target++=(uint8_t)c; 1087 if(offsets!=NULL) { 1088 *offsets++=sourceIndex; 1089 } 1090 --targetCapacity; 1091 } else { 1092 /* quote C0 control character */ 1093 c|=SQ0<<8; 1094 length=2; 1095 goto outputBytes; 1096 } 1097 } else if((delta=c-currentOffset)<=0x7f) { 1098 /* use the current dynamic window */ 1099 *target++=(uint8_t)(delta|0x80); 1100 if(offsets!=NULL) { 1101 *offsets++=sourceIndex; 1102 } 1103 --targetCapacity; 1104 } else if(U16_IS_SURROGATE(c)) { 1105 if(U16_IS_SURROGATE_LEAD(c)) { 1106 getTrailSingle: 1107 lead=(UChar)c; 1108 if(source<sourceLimit) { 1109 /* test the following code unit */ 1110 trail=*source; 1111 if(U16_IS_TRAIL(trail)) { 1112 ++source; 1113 ++nextSourceIndex; 1114 c=U16_GET_SUPPLEMENTARY(c, trail); 1115 /* convert this surrogate code point */ 1116 /* exit this condition tree */ 1117 } else { 1118 /* this is an unmatched lead code unit (1st surrogate) */ 1119 /* callback(illegal) */ 1120 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1121 goto endloop; 1122 } 1123 } else { 1124 /* no more input */ 1125 break; 1126 } 1127 } else { 1128 /* this is an unmatched trail code unit (2nd surrogate) */ 1129 /* callback(illegal) */ 1130 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1131 goto endloop; 1132 } 1133 1134 /* compress supplementary character U+10000..U+10ffff */ 1135 if((delta=c-currentOffset)<=0x7f) { 1136 /* use the current dynamic window */ 1137 *target++=(uint8_t)(delta|0x80); 1138 if(offsets!=NULL) { 1139 *offsets++=sourceIndex; 1140 } 1141 --targetCapacity; 1142 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1143 /* there is a dynamic window that contains this character, change to it */ 1144 dynamicWindow=window; 1145 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1146 useDynamicWindow(scsu, dynamicWindow); 1147 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1148 length=2; 1149 goto outputBytes; 1150 } else if((code=getDynamicOffset(c, &offset))>=0) { 1151 /* might check if there are more characters in this window to come */ 1152 /* define an extended window with this character */ 1153 code-=0x200; 1154 dynamicWindow=getNextDynamicWindow(scsu); 1155 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1156 useDynamicWindow(scsu, dynamicWindow); 1157 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1158 length=4; 1159 goto outputBytes; 1160 } else { 1161 /* change to Unicode mode and output this (lead, trail) pair */ 1162 isSingleByteMode=FALSE; 1163 *target++=(uint8_t)SCU; 1164 if(offsets!=NULL) { 1165 *offsets++=sourceIndex; 1166 } 1167 --targetCapacity; 1168 c=((uint32_t)lead<<16)|trail; 1169 length=4; 1170 goto outputBytes; 1171 } 1172 } else if(c<0xa0) { 1173 /* quote C1 control character */ 1174 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1175 length=2; 1176 goto outputBytes; 1177 } else if(c==0xfeff || c>=0xfff0) { 1178 /* quote signature character=byte order mark and specials */ 1179 c|=SQU<<16; 1180 length=3; 1181 goto outputBytes; 1182 } else { 1183 /* compress all other BMP characters */ 1184 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1185 /* there is a window defined that contains this character - switch to it or quote from it? */ 1186 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1187 /* change to dynamic window */ 1188 dynamicWindow=window; 1189 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1190 useDynamicWindow(scsu, dynamicWindow); 1191 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1192 length=2; 1193 goto outputBytes; 1194 } else { 1195 /* quote from dynamic window */ 1196 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1197 length=2; 1198 goto outputBytes; 1199 } 1200 } else if((window=getWindow(staticOffsets, c))>=0) { 1201 /* quote from static window */ 1202 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1203 length=2; 1204 goto outputBytes; 1205 } else if((code=getDynamicOffset(c, &offset))>=0) { 1206 /* define a dynamic window with this character */ 1207 dynamicWindow=getNextDynamicWindow(scsu); 1208 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1209 useDynamicWindow(scsu, dynamicWindow); 1210 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1211 length=3; 1212 goto outputBytes; 1213 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1214 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1215 ) { 1216 /* 1217 * this character is not compressible (a BMP ideograph or similar); 1218 * switch to Unicode mode if this is the last character in the block 1219 * or there is at least one more ideograph following immediately 1220 */ 1221 isSingleByteMode=FALSE; 1222 c|=SCU<<16; 1223 length=3; 1224 goto outputBytes; 1225 } else { 1226 /* quote Unicode */ 1227 c|=SQU<<16; 1228 length=3; 1229 goto outputBytes; 1230 } 1231 } 1232 1233 /* normal end of conversion: prepare for a new character */ 1234 c=0; 1235 sourceIndex=nextSourceIndex; 1236 } 1237 } else { 1238 if(c!=0 && targetCapacity>0) { 1239 goto getTrailUnicode; 1240 } 1241 1242 /* state machine for Unicode mode */ 1243 /* unicodeByteMode: */ 1244 while(source<sourceLimit) { 1245 if(targetCapacity<=0) { 1246 /* target is full */ 1247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1248 break; 1249 } 1250 c=*source++; 1251 ++nextSourceIndex; 1252 1253 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1254 /* not compressible, write character directly */ 1255 if(targetCapacity>=2) { 1256 *target++=(uint8_t)(c>>8); 1257 *target++=(uint8_t)c; 1258 if(offsets!=NULL) { 1259 *offsets++=sourceIndex; 1260 *offsets++=sourceIndex; 1261 } 1262 targetCapacity-=2; 1263 } else { 1264 length=2; 1265 goto outputBytes; 1266 } 1267 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1268 /* compress BMP character if the following one is not an uncompressible ideograph */ 1269 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1270 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1271 /* ASCII digit or letter */ 1272 isSingleByteMode=TRUE; 1273 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1274 length=2; 1275 goto outputBytes; 1276 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1277 /* there is a dynamic window that contains this character, change to it */ 1278 isSingleByteMode=TRUE; 1279 dynamicWindow=window; 1280 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1281 useDynamicWindow(scsu, dynamicWindow); 1282 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1283 length=2; 1284 goto outputBytes; 1285 } else if((code=getDynamicOffset(c, &offset))>=0) { 1286 /* define a dynamic window with this character */ 1287 isSingleByteMode=TRUE; 1288 dynamicWindow=getNextDynamicWindow(scsu); 1289 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1290 useDynamicWindow(scsu, dynamicWindow); 1291 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1292 length=3; 1293 goto outputBytes; 1294 } 1295 } 1296 1297 /* don't know how to compress this character, just write it directly */ 1298 length=2; 1299 goto outputBytes; 1300 } else if(c<0xe000) { 1301 /* c is a surrogate */ 1302 if(U16_IS_SURROGATE_LEAD(c)) { 1303 getTrailUnicode: 1304 lead=(UChar)c; 1305 if(source<sourceLimit) { 1306 /* test the following code unit */ 1307 trail=*source; 1308 if(U16_IS_TRAIL(trail)) { 1309 ++source; 1310 ++nextSourceIndex; 1311 c=U16_GET_SUPPLEMENTARY(c, trail); 1312 /* convert this surrogate code point */ 1313 /* exit this condition tree */ 1314 } else { 1315 /* this is an unmatched lead code unit (1st surrogate) */ 1316 /* callback(illegal) */ 1317 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1318 goto endloop; 1319 } 1320 } else { 1321 /* no more input */ 1322 break; 1323 } 1324 } else { 1325 /* this is an unmatched trail code unit (2nd surrogate) */ 1326 /* callback(illegal) */ 1327 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1328 goto endloop; 1329 } 1330 1331 /* compress supplementary character */ 1332 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1333 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1334 ) { 1335 /* 1336 * there is a dynamic window that contains this character and 1337 * the following character is not uncompressible, 1338 * change to the window 1339 */ 1340 isSingleByteMode=TRUE; 1341 dynamicWindow=window; 1342 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1343 useDynamicWindow(scsu, dynamicWindow); 1344 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1345 length=2; 1346 goto outputBytes; 1347 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1348 (code=getDynamicOffset(c, &offset))>=0 1349 ) { 1350 /* two supplementary characters in (probably) the same window - define an extended one */ 1351 isSingleByteMode=TRUE; 1352 code-=0x200; 1353 dynamicWindow=getNextDynamicWindow(scsu); 1354 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1355 useDynamicWindow(scsu, dynamicWindow); 1356 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1357 length=4; 1358 goto outputBytes; 1359 } else { 1360 /* don't know how to compress this character, just write it directly */ 1361 c=((uint32_t)lead<<16)|trail; 1362 length=4; 1363 goto outputBytes; 1364 } 1365 } else /* 0xe000<=c<0xf300 */ { 1366 /* quote to avoid SCSU tags */ 1367 c|=UQU<<16; 1368 length=3; 1369 goto outputBytes; 1370 } 1371 1372 /* normal end of conversion: prepare for a new character */ 1373 c=0; 1374 sourceIndex=nextSourceIndex; 1375 } 1376 } 1377 endloop: 1378 1379 /* set the converter state back into UConverter */ 1380 scsu->fromUIsSingleByteMode=isSingleByteMode; 1381 scsu->fromUDynamicWindow=dynamicWindow; 1382 1383 cnv->fromUChar32=c; 1384 1385 /* write back the updated pointers */ 1386 pArgs->source=source; 1387 pArgs->target=(char *)target; 1388 pArgs->offsets=offsets; 1389 return; 1390 1391 outputBytes: 1392 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1393 /* from the first if in the loop we know that targetCapacity>0 */ 1394 if(length<=targetCapacity) { 1395 if(offsets==NULL) { 1396 switch(length) { 1397 /* each branch falls through to the next one */ 1398 case 4: 1399 *target++=(uint8_t)(c>>24); 1400 U_FALLTHROUGH; 1401 case 3: 1402 *target++=(uint8_t)(c>>16); 1403 U_FALLTHROUGH; 1404 case 2: 1405 *target++=(uint8_t)(c>>8); 1406 U_FALLTHROUGH; 1407 case 1: 1408 *target++=(uint8_t)c; 1409 U_FALLTHROUGH; 1410 default: 1411 /* will never occur */ 1412 break; 1413 } 1414 } else { 1415 switch(length) { 1416 /* each branch falls through to the next one */ 1417 case 4: 1418 *target++=(uint8_t)(c>>24); 1419 *offsets++=sourceIndex; 1420 U_FALLTHROUGH; 1421 case 3: 1422 *target++=(uint8_t)(c>>16); 1423 *offsets++=sourceIndex; 1424 U_FALLTHROUGH; 1425 case 2: 1426 *target++=(uint8_t)(c>>8); 1427 *offsets++=sourceIndex; 1428 U_FALLTHROUGH; 1429 case 1: 1430 *target++=(uint8_t)c; 1431 *offsets++=sourceIndex; 1432 U_FALLTHROUGH; 1433 default: 1434 /* will never occur */ 1435 break; 1436 } 1437 } 1438 targetCapacity-=length; 1439 1440 /* normal end of conversion: prepare for a new character */ 1441 c=0; 1442 sourceIndex=nextSourceIndex; 1443 goto loop; 1444 } else { 1445 uint8_t *p; 1446 1447 /* 1448 * We actually do this backwards here: 1449 * In order to save an intermediate variable, we output 1450 * first to the overflow buffer what does not fit into the 1451 * regular target. 1452 */ 1453 /* we know that 0<=targetCapacity<length<=4 */ 1454 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1455 length-=targetCapacity; 1456 p=(uint8_t *)cnv->charErrorBuffer; 1457 switch(length) { 1458 /* each branch falls through to the next one */ 1459 case 4: 1460 *p++=(uint8_t)(c>>24); 1461 U_FALLTHROUGH; 1462 case 3: 1463 *p++=(uint8_t)(c>>16); 1464 U_FALLTHROUGH; 1465 case 2: 1466 *p++=(uint8_t)(c>>8); 1467 U_FALLTHROUGH; 1468 case 1: 1469 *p=(uint8_t)c; 1470 U_FALLTHROUGH; 1471 default: 1472 /* will never occur */ 1473 break; 1474 } 1475 cnv->charErrorBufferLength=(int8_t)length; 1476 1477 /* now output what fits into the regular target */ 1478 c>>=8*length; /* length was reduced by targetCapacity */ 1479 switch(targetCapacity) { 1480 /* each branch falls through to the next one */ 1481 case 3: 1482 *target++=(uint8_t)(c>>16); 1483 if(offsets!=NULL) { 1484 *offsets++=sourceIndex; 1485 } 1486 U_FALLTHROUGH; 1487 case 2: 1488 *target++=(uint8_t)(c>>8); 1489 if(offsets!=NULL) { 1490 *offsets++=sourceIndex; 1491 } 1492 U_FALLTHROUGH; 1493 case 1: 1494 *target++=(uint8_t)c; 1495 if(offsets!=NULL) { 1496 *offsets++=sourceIndex; 1497 } 1498 U_FALLTHROUGH; 1499 default: 1500 break; 1501 } 1502 1503 /* target overflow */ 1504 targetCapacity=0; 1505 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1506 c=0; 1507 goto endloop; 1508 } 1509 } 1510 1511 /* 1512 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. 1513 * If a change is made in the original function, then either 1514 * change this function the same way or 1515 * re-copy the original function and remove the variables 1516 * offsets, sourceIndex, and nextSourceIndex. 1517 */ 1518 static void 1519 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, 1520 UErrorCode *pErrorCode) { 1521 UConverter *cnv; 1522 SCSUData *scsu; 1523 const UChar *source, *sourceLimit; 1524 uint8_t *target; 1525 int32_t targetCapacity; 1526 1527 UBool isSingleByteMode; 1528 uint8_t dynamicWindow; 1529 uint32_t currentOffset; 1530 1531 uint32_t c, delta; 1532 1533 int32_t length; 1534 1535 /* variables for compression heuristics */ 1536 uint32_t offset; 1537 UChar lead, trail; 1538 int code; 1539 int8_t window; 1540 1541 /* set up the local pointers */ 1542 cnv=pArgs->converter; 1543 scsu=(SCSUData *)cnv->extraInfo; 1544 1545 /* set up the local pointers */ 1546 source=pArgs->source; 1547 sourceLimit=pArgs->sourceLimit; 1548 target=(uint8_t *)pArgs->target; 1549 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1550 1551 /* get the state machine state */ 1552 isSingleByteMode=scsu->fromUIsSingleByteMode; 1553 dynamicWindow=scsu->fromUDynamicWindow; 1554 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1555 1556 c=cnv->fromUChar32; 1557 1558 /* similar conversion "loop" as in toUnicode */ 1559 loop: 1560 if(isSingleByteMode) { 1561 if(c!=0 && targetCapacity>0) { 1562 goto getTrailSingle; 1563 } 1564 1565 /* state machine for single-byte mode */ 1566 /* singleByteMode: */ 1567 while(source<sourceLimit) { 1568 if(targetCapacity<=0) { 1569 /* target is full */ 1570 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1571 break; 1572 } 1573 c=*source++; 1574 1575 if((c-0x20)<=0x5f) { 1576 /* pass US-ASCII graphic character through */ 1577 *target++=(uint8_t)c; 1578 --targetCapacity; 1579 } else if(c<0x20) { 1580 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1581 /* CR/LF/TAB/NUL */ 1582 *target++=(uint8_t)c; 1583 --targetCapacity; 1584 } else { 1585 /* quote C0 control character */ 1586 c|=SQ0<<8; 1587 length=2; 1588 goto outputBytes; 1589 } 1590 } else if((delta=c-currentOffset)<=0x7f) { 1591 /* use the current dynamic window */ 1592 *target++=(uint8_t)(delta|0x80); 1593 --targetCapacity; 1594 } else if(U16_IS_SURROGATE(c)) { 1595 if(U16_IS_SURROGATE_LEAD(c)) { 1596 getTrailSingle: 1597 lead=(UChar)c; 1598 if(source<sourceLimit) { 1599 /* test the following code unit */ 1600 trail=*source; 1601 if(U16_IS_TRAIL(trail)) { 1602 ++source; 1603 c=U16_GET_SUPPLEMENTARY(c, trail); 1604 /* convert this surrogate code point */ 1605 /* exit this condition tree */ 1606 } else { 1607 /* this is an unmatched lead code unit (1st surrogate) */ 1608 /* callback(illegal) */ 1609 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1610 goto endloop; 1611 } 1612 } else { 1613 /* no more input */ 1614 break; 1615 } 1616 } else { 1617 /* this is an unmatched trail code unit (2nd surrogate) */ 1618 /* callback(illegal) */ 1619 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1620 goto endloop; 1621 } 1622 1623 /* compress supplementary character U+10000..U+10ffff */ 1624 if((delta=c-currentOffset)<=0x7f) { 1625 /* use the current dynamic window */ 1626 *target++=(uint8_t)(delta|0x80); 1627 --targetCapacity; 1628 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1629 /* there is a dynamic window that contains this character, change to it */ 1630 dynamicWindow=window; 1631 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1632 useDynamicWindow(scsu, dynamicWindow); 1633 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1634 length=2; 1635 goto outputBytes; 1636 } else if((code=getDynamicOffset(c, &offset))>=0) { 1637 /* might check if there are more characters in this window to come */ 1638 /* define an extended window with this character */ 1639 code-=0x200; 1640 dynamicWindow=getNextDynamicWindow(scsu); 1641 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1642 useDynamicWindow(scsu, dynamicWindow); 1643 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1644 length=4; 1645 goto outputBytes; 1646 } else { 1647 /* change to Unicode mode and output this (lead, trail) pair */ 1648 isSingleByteMode=FALSE; 1649 *target++=(uint8_t)SCU; 1650 --targetCapacity; 1651 c=((uint32_t)lead<<16)|trail; 1652 length=4; 1653 goto outputBytes; 1654 } 1655 } else if(c<0xa0) { 1656 /* quote C1 control character */ 1657 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1658 length=2; 1659 goto outputBytes; 1660 } else if(c==0xfeff || c>=0xfff0) { 1661 /* quote signature character=byte order mark and specials */ 1662 c|=SQU<<16; 1663 length=3; 1664 goto outputBytes; 1665 } else { 1666 /* compress all other BMP characters */ 1667 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1668 /* there is a window defined that contains this character - switch to it or quote from it? */ 1669 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1670 /* change to dynamic window */ 1671 dynamicWindow=window; 1672 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1673 useDynamicWindow(scsu, dynamicWindow); 1674 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1675 length=2; 1676 goto outputBytes; 1677 } else { 1678 /* quote from dynamic window */ 1679 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1680 length=2; 1681 goto outputBytes; 1682 } 1683 } else if((window=getWindow(staticOffsets, c))>=0) { 1684 /* quote from static window */ 1685 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1686 length=2; 1687 goto outputBytes; 1688 } else if((code=getDynamicOffset(c, &offset))>=0) { 1689 /* define a dynamic window with this character */ 1690 dynamicWindow=getNextDynamicWindow(scsu); 1691 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1692 useDynamicWindow(scsu, dynamicWindow); 1693 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1694 length=3; 1695 goto outputBytes; 1696 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1697 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1698 ) { 1699 /* 1700 * this character is not compressible (a BMP ideograph or similar); 1701 * switch to Unicode mode if this is the last character in the block 1702 * or there is at least one more ideograph following immediately 1703 */ 1704 isSingleByteMode=FALSE; 1705 c|=SCU<<16; 1706 length=3; 1707 goto outputBytes; 1708 } else { 1709 /* quote Unicode */ 1710 c|=SQU<<16; 1711 length=3; 1712 goto outputBytes; 1713 } 1714 } 1715 1716 /* normal end of conversion: prepare for a new character */ 1717 c=0; 1718 } 1719 } else { 1720 if(c!=0 && targetCapacity>0) { 1721 goto getTrailUnicode; 1722 } 1723 1724 /* state machine for Unicode mode */ 1725 /* unicodeByteMode: */ 1726 while(source<sourceLimit) { 1727 if(targetCapacity<=0) { 1728 /* target is full */ 1729 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1730 break; 1731 } 1732 c=*source++; 1733 1734 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1735 /* not compressible, write character directly */ 1736 if(targetCapacity>=2) { 1737 *target++=(uint8_t)(c>>8); 1738 *target++=(uint8_t)c; 1739 targetCapacity-=2; 1740 } else { 1741 length=2; 1742 goto outputBytes; 1743 } 1744 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1745 /* compress BMP character if the following one is not an uncompressible ideograph */ 1746 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1747 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1748 /* ASCII digit or letter */ 1749 isSingleByteMode=TRUE; 1750 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1751 length=2; 1752 goto outputBytes; 1753 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1754 /* there is a dynamic window that contains this character, change to it */ 1755 isSingleByteMode=TRUE; 1756 dynamicWindow=window; 1757 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1758 useDynamicWindow(scsu, dynamicWindow); 1759 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1760 length=2; 1761 goto outputBytes; 1762 } else if((code=getDynamicOffset(c, &offset))>=0) { 1763 /* define a dynamic window with this character */ 1764 isSingleByteMode=TRUE; 1765 dynamicWindow=getNextDynamicWindow(scsu); 1766 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1767 useDynamicWindow(scsu, dynamicWindow); 1768 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1769 length=3; 1770 goto outputBytes; 1771 } 1772 } 1773 1774 /* don't know how to compress this character, just write it directly */ 1775 length=2; 1776 goto outputBytes; 1777 } else if(c<0xe000) { 1778 /* c is a surrogate */ 1779 if(U16_IS_SURROGATE_LEAD(c)) { 1780 getTrailUnicode: 1781 lead=(UChar)c; 1782 if(source<sourceLimit) { 1783 /* test the following code unit */ 1784 trail=*source; 1785 if(U16_IS_TRAIL(trail)) { 1786 ++source; 1787 c=U16_GET_SUPPLEMENTARY(c, trail); 1788 /* convert this surrogate code point */ 1789 /* exit this condition tree */ 1790 } else { 1791 /* this is an unmatched lead code unit (1st surrogate) */ 1792 /* callback(illegal) */ 1793 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1794 goto endloop; 1795 } 1796 } else { 1797 /* no more input */ 1798 break; 1799 } 1800 } else { 1801 /* this is an unmatched trail code unit (2nd surrogate) */ 1802 /* callback(illegal) */ 1803 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1804 goto endloop; 1805 } 1806 1807 /* compress supplementary character */ 1808 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1809 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1810 ) { 1811 /* 1812 * there is a dynamic window that contains this character and 1813 * the following character is not uncompressible, 1814 * change to the window 1815 */ 1816 isSingleByteMode=TRUE; 1817 dynamicWindow=window; 1818 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1819 useDynamicWindow(scsu, dynamicWindow); 1820 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1821 length=2; 1822 goto outputBytes; 1823 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1824 (code=getDynamicOffset(c, &offset))>=0 1825 ) { 1826 /* two supplementary characters in (probably) the same window - define an extended one */ 1827 isSingleByteMode=TRUE; 1828 code-=0x200; 1829 dynamicWindow=getNextDynamicWindow(scsu); 1830 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1831 useDynamicWindow(scsu, dynamicWindow); 1832 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1833 length=4; 1834 goto outputBytes; 1835 } else { 1836 /* don't know how to compress this character, just write it directly */ 1837 c=((uint32_t)lead<<16)|trail; 1838 length=4; 1839 goto outputBytes; 1840 } 1841 } else /* 0xe000<=c<0xf300 */ { 1842 /* quote to avoid SCSU tags */ 1843 c|=UQU<<16; 1844 length=3; 1845 goto outputBytes; 1846 } 1847 1848 /* normal end of conversion: prepare for a new character */ 1849 c=0; 1850 } 1851 } 1852 endloop: 1853 1854 /* set the converter state back into UConverter */ 1855 scsu->fromUIsSingleByteMode=isSingleByteMode; 1856 scsu->fromUDynamicWindow=dynamicWindow; 1857 1858 cnv->fromUChar32=c; 1859 1860 /* write back the updated pointers */ 1861 pArgs->source=source; 1862 pArgs->target=(char *)target; 1863 return; 1864 1865 outputBytes: 1866 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1867 /* from the first if in the loop we know that targetCapacity>0 */ 1868 if(length<=targetCapacity) { 1869 switch(length) { 1870 /* each branch falls through to the next one */ 1871 case 4: 1872 *target++=(uint8_t)(c>>24); 1873 U_FALLTHROUGH; 1874 case 3: 1875 *target++=(uint8_t)(c>>16); 1876 U_FALLTHROUGH; 1877 case 2: 1878 *target++=(uint8_t)(c>>8); 1879 U_FALLTHROUGH; 1880 case 1: 1881 *target++=(uint8_t)c; 1882 U_FALLTHROUGH; 1883 default: 1884 /* will never occur */ 1885 break; 1886 } 1887 targetCapacity-=length; 1888 1889 /* normal end of conversion: prepare for a new character */ 1890 c=0; 1891 goto loop; 1892 } else { 1893 uint8_t *p; 1894 1895 /* 1896 * We actually do this backwards here: 1897 * In order to save an intermediate variable, we output 1898 * first to the overflow buffer what does not fit into the 1899 * regular target. 1900 */ 1901 /* we know that 0<=targetCapacity<length<=4 */ 1902 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1903 length-=targetCapacity; 1904 p=(uint8_t *)cnv->charErrorBuffer; 1905 switch(length) { 1906 /* each branch falls through to the next one */ 1907 case 4: 1908 *p++=(uint8_t)(c>>24); 1909 U_FALLTHROUGH; 1910 case 3: 1911 *p++=(uint8_t)(c>>16); 1912 U_FALLTHROUGH; 1913 case 2: 1914 *p++=(uint8_t)(c>>8); 1915 U_FALLTHROUGH; 1916 case 1: 1917 *p=(uint8_t)c; 1918 U_FALLTHROUGH; 1919 default: 1920 /* will never occur */ 1921 break; 1922 } 1923 cnv->charErrorBufferLength=(int8_t)length; 1924 1925 /* now output what fits into the regular target */ 1926 c>>=8*length; /* length was reduced by targetCapacity */ 1927 switch(targetCapacity) { 1928 /* each branch falls through to the next one */ 1929 case 3: 1930 *target++=(uint8_t)(c>>16); 1931 U_FALLTHROUGH; 1932 case 2: 1933 *target++=(uint8_t)(c>>8); 1934 U_FALLTHROUGH; 1935 case 1: 1936 *target++=(uint8_t)c; 1937 U_FALLTHROUGH; 1938 default: 1939 break; 1940 } 1941 1942 /* target overflow */ 1943 targetCapacity=0; 1944 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1945 c=0; 1946 goto endloop; 1947 } 1948 } 1949 1950 /* miscellaneous ------------------------------------------------------------ */ 1951 1952 static const char * 1953 _SCSUGetName(const UConverter *cnv) { 1954 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 1955 1956 switch(scsu->locale) { 1957 case l_ja: 1958 return "SCSU,locale=ja"; 1959 default: 1960 return "SCSU"; 1961 } 1962 } 1963 1964 /* structure for SafeClone calculations */ 1965 struct cloneSCSUStruct 1966 { 1967 UConverter cnv; 1968 SCSUData mydata; 1969 }; 1970 1971 static UConverter * 1972 _SCSUSafeClone(const UConverter *cnv, 1973 void *stackBuffer, 1974 int32_t *pBufferSize, 1975 UErrorCode *status) 1976 { 1977 struct cloneSCSUStruct * localClone; 1978 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); 1979 1980 if (U_FAILURE(*status)){ 1981 return 0; 1982 } 1983 1984 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 1985 *pBufferSize = bufferSizeNeeded; 1986 return 0; 1987 } 1988 1989 localClone = (struct cloneSCSUStruct *)stackBuffer; 1990 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1991 1992 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); 1993 localClone->cnv.extraInfo = &localClone->mydata; 1994 localClone->cnv.isExtraLocal = TRUE; 1995 1996 return &localClone->cnv; 1997 } 1998 1999 2000 static const UConverterImpl _SCSUImpl={ 2001 UCNV_SCSU, 2002 2003 NULL, 2004 NULL, 2005 2006 _SCSUOpen, 2007 _SCSUClose, 2008 _SCSUReset, 2009 2010 _SCSUToUnicode, 2011 _SCSUToUnicodeWithOffsets, 2012 _SCSUFromUnicode, 2013 _SCSUFromUnicodeWithOffsets, 2014 NULL, 2015 2016 NULL, 2017 _SCSUGetName, 2018 NULL, 2019 _SCSUSafeClone, 2020 ucnv_getCompleteUnicodeSet 2021 }; 2022 2023 static const UConverterStaticData _SCSUStaticData={ 2024 sizeof(UConverterStaticData), 2025 "SCSU", 2026 1212, /* CCSID for SCSU */ 2027 UCNV_IBM, UCNV_SCSU, 2028 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ 2029 /* 2030 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode 2031 * substitution string. 2032 */ 2033 { 0x0e, 0xff, 0xfd, 0 }, 3, 2034 FALSE, FALSE, 2035 0, 2036 0, 2037 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 2038 }; 2039 2040 const UConverterSharedData _SCSUData= 2041 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl); 2042 2043 #endif 2044