1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvscsu.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000nov18 14 * created by: Markus W. Scherer 15 * 16 * This is an implementation of the Standard Compression Scheme for Unicode 17 * as defined in http://www.unicode.org/unicode/reports/tr6/ . 18 * Reserved commands and window settings are treated as illegal sequences and 19 * will result in callback calls. 20 */ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 25 26 #include "unicode/ucnv.h" 27 #include "unicode/ucnv_cb.h" 28 #include "unicode/utf16.h" 29 #include "ucnv_bld.h" 30 #include "ucnv_cnv.h" 31 #include "cmemory.h" 32 33 /* SCSU definitions --------------------------------------------------------- */ 34 35 /* SCSU command byte values */ 36 enum { 37 SQ0=0x01, /* Quote from window pair 0 */ 38 SQ7=0x08, /* Quote from window pair 7 */ 39 SDX=0x0B, /* Define a window as extended */ 40 Srs=0x0C, /* reserved */ 41 SQU=0x0E, /* Quote a single Unicode character */ 42 SCU=0x0F, /* Change to Unicode mode */ 43 SC0=0x10, /* Select window 0 */ 44 SC7=0x17, /* Select window 7 */ 45 SD0=0x18, /* Define and select window 0 */ 46 SD7=0x1F, /* Define and select window 7 */ 47 48 UC0=0xE0, /* Select window 0 */ 49 UC7=0xE7, /* Select window 7 */ 50 UD0=0xE8, /* Define and select window 0 */ 51 UD7=0xEF, /* Define and select window 7 */ 52 UQU=0xF0, /* Quote a single Unicode character */ 53 UDX=0xF1, /* Define a Window as extended */ 54 Urs=0xF2 /* reserved */ 55 }; 56 57 enum { 58 /* 59 * Unicode code points from 3400 to E000 are not adressible by 60 * dynamic window, since in these areas no short run alphabets are 61 * found. Therefore add gapOffset to all values from gapThreshold. 62 */ 63 gapThreshold=0x68, 64 gapOffset=0xAC00, 65 66 /* values between reservedStart and fixedThreshold are reserved */ 67 reservedStart=0xA8, 68 69 /* use table of predefined fixed offsets for values from fixedThreshold */ 70 fixedThreshold=0xF9 71 }; 72 73 /* constant offsets for the 8 static windows */ 74 static const uint32_t staticOffsets[8]={ 75 0x0000, /* ASCII for quoted tags */ 76 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 77 0x0100, /* Latin Extended-A */ 78 0x0300, /* Combining Diacritical Marks */ 79 0x2000, /* General Punctuation */ 80 0x2080, /* Currency Symbols */ 81 0x2100, /* Letterlike Symbols and Number Forms */ 82 0x3000 /* CJK Symbols and punctuation */ 83 }; 84 85 /* initial offsets for the 8 dynamic (sliding) windows */ 86 static const uint32_t initialDynamicOffsets[8]={ 87 0x0080, /* Latin-1 */ 88 0x00C0, /* Latin Extended A */ 89 0x0400, /* Cyrillic */ 90 0x0600, /* Arabic */ 91 0x0900, /* Devanagari */ 92 0x3040, /* Hiragana */ 93 0x30A0, /* Katakana */ 94 0xFF00 /* Fullwidth ASCII */ 95 }; 96 97 /* Table of fixed predefined Offsets */ 98 static const uint32_t fixedOffsets[]={ 99 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 100 /* 0xFA */ 0x0250, /* IPA extensions */ 101 /* 0xFB */ 0x0370, /* Greek */ 102 /* 0xFC */ 0x0530, /* Armenian */ 103 /* 0xFD */ 0x3040, /* Hiragana */ 104 /* 0xFE */ 0x30A0, /* Katakana */ 105 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 106 }; 107 108 /* state values */ 109 enum { 110 readCommand, 111 quotePairOne, 112 quotePairTwo, 113 quoteOne, 114 definePairOne, 115 definePairTwo, 116 defineOne 117 }; 118 119 typedef struct SCSUData { 120 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 121 uint32_t toUDynamicOffsets[8]; 122 uint32_t fromUDynamicOffsets[8]; 123 124 /* state machine state - toUnicode */ 125 UBool toUIsSingleByteMode; 126 uint8_t toUState; 127 int8_t toUQuoteWindow, toUDynamicWindow; 128 uint8_t toUByteOne; 129 uint8_t toUPadding[3]; 130 131 /* state machine state - fromUnicode */ 132 UBool fromUIsSingleByteMode; 133 int8_t fromUDynamicWindow; 134 135 /* 136 * windowUse[] keeps track of the use of the dynamic windows: 137 * At nextWindowUseIndex there is the least recently used window, 138 * and the following windows (in a wrapping manner) are more and more 139 * recently used. 140 * At nextWindowUseIndex-1 there is the most recently used window. 141 */ 142 uint8_t locale; 143 int8_t nextWindowUseIndex; 144 int8_t windowUse[8]; 145 } SCSUData; 146 147 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 148 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 149 150 enum { 151 lGeneric, l_ja 152 }; 153 154 /* SCSU setup functions ----------------------------------------------------- */ 155 156 static void 157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { 158 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 159 160 if(choice<=UCNV_RESET_TO_UNICODE) { 161 /* reset toUnicode */ 162 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); 163 164 scsu->toUIsSingleByteMode=TRUE; 165 scsu->toUState=readCommand; 166 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; 167 scsu->toUByteOne=0; 168 169 cnv->toULength=0; 170 } 171 if(choice!=UCNV_RESET_TO_UNICODE) { 172 /* reset fromUnicode */ 173 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); 174 175 scsu->fromUIsSingleByteMode=TRUE; 176 scsu->fromUDynamicWindow=0; 177 178 scsu->nextWindowUseIndex=0; 179 switch(scsu->locale) { 180 case l_ja: 181 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); 182 break; 183 default: 184 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); 185 break; 186 } 187 188 cnv->fromUChar32=0; 189 } 190 } 191 192 static void 193 _SCSUOpen(UConverter *cnv, 194 UConverterLoadArgs *pArgs, 195 UErrorCode *pErrorCode) { 196 const char *locale=pArgs->locale; 197 if(pArgs->onlyTestIsLoadable) { 198 return; 199 } 200 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); 201 if(cnv->extraInfo!=NULL) { 202 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { 203 ((SCSUData *)cnv->extraInfo)->locale=l_ja; 204 } else { 205 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; 206 } 207 _SCSUReset(cnv, UCNV_RESET_BOTH); 208 } else { 209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 210 } 211 212 /* Set the substitution character U+fffd as a Unicode string. */ 213 cnv->subUChars[0]=0xfffd; 214 cnv->subCharLen=-1; 215 } 216 217 static void 218 _SCSUClose(UConverter *cnv) { 219 if(cnv->extraInfo!=NULL) { 220 if(!cnv->isExtraLocal) { 221 uprv_free(cnv->extraInfo); 222 } 223 cnv->extraInfo=NULL; 224 } 225 } 226 227 /* SCSU-to-Unicode conversion functions ------------------------------------- */ 228 229 static void 230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 231 UErrorCode *pErrorCode) { 232 UConverter *cnv; 233 SCSUData *scsu; 234 const uint8_t *source, *sourceLimit; 235 UChar *target; 236 const UChar *targetLimit; 237 int32_t *offsets; 238 UBool isSingleByteMode; 239 uint8_t state, byteOne; 240 int8_t quoteWindow, dynamicWindow; 241 242 int32_t sourceIndex, nextSourceIndex; 243 244 uint8_t b; 245 246 /* set up the local pointers */ 247 cnv=pArgs->converter; 248 scsu=(SCSUData *)cnv->extraInfo; 249 250 source=(const uint8_t *)pArgs->source; 251 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 252 target=pArgs->target; 253 targetLimit=pArgs->targetLimit; 254 offsets=pArgs->offsets; 255 256 /* get the state machine state */ 257 isSingleByteMode=scsu->toUIsSingleByteMode; 258 state=scsu->toUState; 259 quoteWindow=scsu->toUQuoteWindow; 260 dynamicWindow=scsu->toUDynamicWindow; 261 byteOne=scsu->toUByteOne; 262 263 /* sourceIndex=-1 if the current character began in the previous buffer */ 264 sourceIndex=state==readCommand ? 0 : -1; 265 nextSourceIndex=0; 266 267 /* 268 * conversion "loop" 269 * 270 * For performance, this is not a normal C loop. 271 * Instead, there are two code blocks for the two SCSU modes. 272 * The function branches to either one, and a change of the mode is done with a goto to 273 * the other branch. 274 * 275 * Each branch has two conventional loops: 276 * - a fast-path loop for the most common codes in the mode 277 * - a loop for all other codes in the mode 278 * When the fast-path runs into a code that it cannot handle, its loop ends and it 279 * runs into the following loop to handle the other codes. 280 * The end of the input or output buffer is also handled by the slower loop. 281 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 282 * 283 * The callback handling is done by returning with an error code. 284 * The conversion framework actually calls the callback function. 285 */ 286 if(isSingleByteMode) { 287 /* fast path for single-byte mode */ 288 if(state==readCommand) { 289 fastSingle: 290 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 291 ++source; 292 ++nextSourceIndex; 293 if(b<=0x7f) { 294 /* write US-ASCII graphic character or DEL */ 295 *target++=(UChar)b; 296 if(offsets!=NULL) { 297 *offsets++=sourceIndex; 298 } 299 } else { 300 /* write from dynamic window */ 301 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 302 if(c<=0xffff) { 303 *target++=(UChar)c; 304 if(offsets!=NULL) { 305 *offsets++=sourceIndex; 306 } 307 } else { 308 /* output surrogate pair */ 309 *target++=(UChar)(0xd7c0+(c>>10)); 310 if(target<targetLimit) { 311 *target++=(UChar)(0xdc00|(c&0x3ff)); 312 if(offsets!=NULL) { 313 *offsets++=sourceIndex; 314 *offsets++=sourceIndex; 315 } 316 } else { 317 /* target overflow */ 318 if(offsets!=NULL) { 319 *offsets++=sourceIndex; 320 } 321 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 322 cnv->UCharErrorBufferLength=1; 323 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 324 goto endloop; 325 } 326 } 327 } 328 sourceIndex=nextSourceIndex; 329 } 330 } 331 332 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 333 singleByteMode: 334 while(source<sourceLimit) { 335 if(target>=targetLimit) { 336 /* target is full */ 337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 338 break; 339 } 340 b=*source++; 341 ++nextSourceIndex; 342 switch(state) { 343 case readCommand: 344 /* redundant conditions are commented out */ 345 /* here: b<0x20 because otherwise we would be in fastSingle */ 346 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 347 /* CR/LF/TAB/NUL */ 348 *target++=(UChar)b; 349 if(offsets!=NULL) { 350 *offsets++=sourceIndex; 351 } 352 sourceIndex=nextSourceIndex; 353 goto fastSingle; 354 } else if(SC0<=b) { 355 if(b<=SC7) { 356 dynamicWindow=(int8_t)(b-SC0); 357 sourceIndex=nextSourceIndex; 358 goto fastSingle; 359 } else /* if(SD0<=b && b<=SD7) */ { 360 dynamicWindow=(int8_t)(b-SD0); 361 state=defineOne; 362 } 363 } else if(/* SQ0<=b && */ b<=SQ7) { 364 quoteWindow=(int8_t)(b-SQ0); 365 state=quoteOne; 366 } else if(b==SDX) { 367 state=definePairOne; 368 } else if(b==SQU) { 369 state=quotePairOne; 370 } else if(b==SCU) { 371 sourceIndex=nextSourceIndex; 372 isSingleByteMode=FALSE; 373 goto fastUnicode; 374 } else /* Srs */ { 375 /* callback(illegal) */ 376 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 377 cnv->toUBytes[0]=b; 378 cnv->toULength=1; 379 goto endloop; 380 } 381 382 /* store the first byte of a multibyte sequence in toUBytes[] */ 383 cnv->toUBytes[0]=b; 384 cnv->toULength=1; 385 break; 386 case quotePairOne: 387 byteOne=b; 388 cnv->toUBytes[1]=b; 389 cnv->toULength=2; 390 state=quotePairTwo; 391 break; 392 case quotePairTwo: 393 *target++=(UChar)((byteOne<<8)|b); 394 if(offsets!=NULL) { 395 *offsets++=sourceIndex; 396 } 397 sourceIndex=nextSourceIndex; 398 state=readCommand; 399 goto fastSingle; 400 case quoteOne: 401 if(b<0x80) { 402 /* all static offsets are in the BMP */ 403 *target++=(UChar)(staticOffsets[quoteWindow]+b); 404 if(offsets!=NULL) { 405 *offsets++=sourceIndex; 406 } 407 } else { 408 /* write from dynamic window */ 409 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 410 if(c<=0xffff) { 411 *target++=(UChar)c; 412 if(offsets!=NULL) { 413 *offsets++=sourceIndex; 414 } 415 } else { 416 /* output surrogate pair */ 417 *target++=(UChar)(0xd7c0+(c>>10)); 418 if(target<targetLimit) { 419 *target++=(UChar)(0xdc00|(c&0x3ff)); 420 if(offsets!=NULL) { 421 *offsets++=sourceIndex; 422 *offsets++=sourceIndex; 423 } 424 } else { 425 /* target overflow */ 426 if(offsets!=NULL) { 427 *offsets++=sourceIndex; 428 } 429 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 430 cnv->UCharErrorBufferLength=1; 431 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 432 goto endloop; 433 } 434 } 435 } 436 sourceIndex=nextSourceIndex; 437 state=readCommand; 438 goto fastSingle; 439 case definePairOne: 440 dynamicWindow=(int8_t)((b>>5)&7); 441 byteOne=(uint8_t)(b&0x1f); 442 cnv->toUBytes[1]=b; 443 cnv->toULength=2; 444 state=definePairTwo; 445 break; 446 case definePairTwo: 447 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 448 sourceIndex=nextSourceIndex; 449 state=readCommand; 450 goto fastSingle; 451 case defineOne: 452 if(b==0) { 453 /* callback(illegal): Reserved window offset value 0 */ 454 cnv->toUBytes[1]=b; 455 cnv->toULength=2; 456 goto endloop; 457 } else if(b<gapThreshold) { 458 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 459 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 460 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 461 } else if(b>=fixedThreshold) { 462 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 463 } else { 464 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 465 cnv->toUBytes[1]=b; 466 cnv->toULength=2; 467 goto endloop; 468 } 469 sourceIndex=nextSourceIndex; 470 state=readCommand; 471 goto fastSingle; 472 } 473 } 474 } else { 475 /* fast path for Unicode mode */ 476 if(state==readCommand) { 477 fastUnicode: 478 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 479 *target++=(UChar)((b<<8)|source[1]); 480 if(offsets!=NULL) { 481 *offsets++=sourceIndex; 482 } 483 sourceIndex=nextSourceIndex; 484 nextSourceIndex+=2; 485 source+=2; 486 } 487 } 488 489 /* normal state machine for Unicode mode */ 490 /* unicodeByteMode: */ 491 while(source<sourceLimit) { 492 if(target>=targetLimit) { 493 /* target is full */ 494 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 495 break; 496 } 497 b=*source++; 498 ++nextSourceIndex; 499 switch(state) { 500 case readCommand: 501 if((uint8_t)(b-UC0)>(Urs-UC0)) { 502 byteOne=b; 503 cnv->toUBytes[0]=b; 504 cnv->toULength=1; 505 state=quotePairTwo; 506 } else if(/* UC0<=b && */ b<=UC7) { 507 dynamicWindow=(int8_t)(b-UC0); 508 sourceIndex=nextSourceIndex; 509 isSingleByteMode=TRUE; 510 goto fastSingle; 511 } else if(/* UD0<=b && */ b<=UD7) { 512 dynamicWindow=(int8_t)(b-UD0); 513 isSingleByteMode=TRUE; 514 cnv->toUBytes[0]=b; 515 cnv->toULength=1; 516 state=defineOne; 517 goto singleByteMode; 518 } else if(b==UDX) { 519 isSingleByteMode=TRUE; 520 cnv->toUBytes[0]=b; 521 cnv->toULength=1; 522 state=definePairOne; 523 goto singleByteMode; 524 } else if(b==UQU) { 525 cnv->toUBytes[0]=b; 526 cnv->toULength=1; 527 state=quotePairOne; 528 } else /* Urs */ { 529 /* callback(illegal) */ 530 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 531 cnv->toUBytes[0]=b; 532 cnv->toULength=1; 533 goto endloop; 534 } 535 break; 536 case quotePairOne: 537 byteOne=b; 538 cnv->toUBytes[1]=b; 539 cnv->toULength=2; 540 state=quotePairTwo; 541 break; 542 case quotePairTwo: 543 *target++=(UChar)((byteOne<<8)|b); 544 if(offsets!=NULL) { 545 *offsets++=sourceIndex; 546 } 547 sourceIndex=nextSourceIndex; 548 state=readCommand; 549 goto fastUnicode; 550 } 551 } 552 } 553 endloop: 554 555 /* set the converter state back into UConverter */ 556 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 557 /* reset to deal with the next character */ 558 state=readCommand; 559 } else if(state==readCommand) { 560 /* not in a multi-byte sequence, reset toULength */ 561 cnv->toULength=0; 562 } 563 scsu->toUIsSingleByteMode=isSingleByteMode; 564 scsu->toUState=state; 565 scsu->toUQuoteWindow=quoteWindow; 566 scsu->toUDynamicWindow=dynamicWindow; 567 scsu->toUByteOne=byteOne; 568 569 /* write back the updated pointers */ 570 pArgs->source=(const char *)source; 571 pArgs->target=target; 572 pArgs->offsets=offsets; 573 return; 574 } 575 576 /* 577 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. 578 * If a change is made in the original function, then either 579 * change this function the same way or 580 * re-copy the original function and remove the variables 581 * offsets, sourceIndex, and nextSourceIndex. 582 */ 583 static void 584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, 585 UErrorCode *pErrorCode) { 586 UConverter *cnv; 587 SCSUData *scsu; 588 const uint8_t *source, *sourceLimit; 589 UChar *target; 590 const UChar *targetLimit; 591 UBool isSingleByteMode; 592 uint8_t state, byteOne; 593 int8_t quoteWindow, dynamicWindow; 594 595 uint8_t b; 596 597 /* set up the local pointers */ 598 cnv=pArgs->converter; 599 scsu=(SCSUData *)cnv->extraInfo; 600 601 source=(const uint8_t *)pArgs->source; 602 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 603 target=pArgs->target; 604 targetLimit=pArgs->targetLimit; 605 606 /* get the state machine state */ 607 isSingleByteMode=scsu->toUIsSingleByteMode; 608 state=scsu->toUState; 609 quoteWindow=scsu->toUQuoteWindow; 610 dynamicWindow=scsu->toUDynamicWindow; 611 byteOne=scsu->toUByteOne; 612 613 /* 614 * conversion "loop" 615 * 616 * For performance, this is not a normal C loop. 617 * Instead, there are two code blocks for the two SCSU modes. 618 * The function branches to either one, and a change of the mode is done with a goto to 619 * the other branch. 620 * 621 * Each branch has two conventional loops: 622 * - a fast-path loop for the most common codes in the mode 623 * - a loop for all other codes in the mode 624 * When the fast-path runs into a code that it cannot handle, its loop ends and it 625 * runs into the following loop to handle the other codes. 626 * The end of the input or output buffer is also handled by the slower loop. 627 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 628 * 629 * The callback handling is done by returning with an error code. 630 * The conversion framework actually calls the callback function. 631 */ 632 if(isSingleByteMode) { 633 /* fast path for single-byte mode */ 634 if(state==readCommand) { 635 fastSingle: 636 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 637 ++source; 638 if(b<=0x7f) { 639 /* write US-ASCII graphic character or DEL */ 640 *target++=(UChar)b; 641 } else { 642 /* write from dynamic window */ 643 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 644 if(c<=0xffff) { 645 *target++=(UChar)c; 646 } else { 647 /* output surrogate pair */ 648 *target++=(UChar)(0xd7c0+(c>>10)); 649 if(target<targetLimit) { 650 *target++=(UChar)(0xdc00|(c&0x3ff)); 651 } else { 652 /* target overflow */ 653 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 654 cnv->UCharErrorBufferLength=1; 655 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 656 goto endloop; 657 } 658 } 659 } 660 } 661 } 662 663 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 664 singleByteMode: 665 while(source<sourceLimit) { 666 if(target>=targetLimit) { 667 /* target is full */ 668 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 669 break; 670 } 671 b=*source++; 672 switch(state) { 673 case readCommand: 674 /* redundant conditions are commented out */ 675 /* here: b<0x20 because otherwise we would be in fastSingle */ 676 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 677 /* CR/LF/TAB/NUL */ 678 *target++=(UChar)b; 679 goto fastSingle; 680 } else if(SC0<=b) { 681 if(b<=SC7) { 682 dynamicWindow=(int8_t)(b-SC0); 683 goto fastSingle; 684 } else /* if(SD0<=b && b<=SD7) */ { 685 dynamicWindow=(int8_t)(b-SD0); 686 state=defineOne; 687 } 688 } else if(/* SQ0<=b && */ b<=SQ7) { 689 quoteWindow=(int8_t)(b-SQ0); 690 state=quoteOne; 691 } else if(b==SDX) { 692 state=definePairOne; 693 } else if(b==SQU) { 694 state=quotePairOne; 695 } else if(b==SCU) { 696 isSingleByteMode=FALSE; 697 goto fastUnicode; 698 } else /* Srs */ { 699 /* callback(illegal) */ 700 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 701 cnv->toUBytes[0]=b; 702 cnv->toULength=1; 703 goto endloop; 704 } 705 706 /* store the first byte of a multibyte sequence in toUBytes[] */ 707 cnv->toUBytes[0]=b; 708 cnv->toULength=1; 709 break; 710 case quotePairOne: 711 byteOne=b; 712 cnv->toUBytes[1]=b; 713 cnv->toULength=2; 714 state=quotePairTwo; 715 break; 716 case quotePairTwo: 717 *target++=(UChar)((byteOne<<8)|b); 718 state=readCommand; 719 goto fastSingle; 720 case quoteOne: 721 if(b<0x80) { 722 /* all static offsets are in the BMP */ 723 *target++=(UChar)(staticOffsets[quoteWindow]+b); 724 } else { 725 /* write from dynamic window */ 726 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 727 if(c<=0xffff) { 728 *target++=(UChar)c; 729 } else { 730 /* output surrogate pair */ 731 *target++=(UChar)(0xd7c0+(c>>10)); 732 if(target<targetLimit) { 733 *target++=(UChar)(0xdc00|(c&0x3ff)); 734 } else { 735 /* target overflow */ 736 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 737 cnv->UCharErrorBufferLength=1; 738 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 739 goto endloop; 740 } 741 } 742 } 743 state=readCommand; 744 goto fastSingle; 745 case definePairOne: 746 dynamicWindow=(int8_t)((b>>5)&7); 747 byteOne=(uint8_t)(b&0x1f); 748 cnv->toUBytes[1]=b; 749 cnv->toULength=2; 750 state=definePairTwo; 751 break; 752 case definePairTwo: 753 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 754 state=readCommand; 755 goto fastSingle; 756 case defineOne: 757 if(b==0) { 758 /* callback(illegal): Reserved window offset value 0 */ 759 cnv->toUBytes[1]=b; 760 cnv->toULength=2; 761 goto endloop; 762 } else if(b<gapThreshold) { 763 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 764 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 765 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 766 } else if(b>=fixedThreshold) { 767 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 768 } else { 769 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 770 cnv->toUBytes[1]=b; 771 cnv->toULength=2; 772 goto endloop; 773 } 774 state=readCommand; 775 goto fastSingle; 776 } 777 } 778 } else { 779 /* fast path for Unicode mode */ 780 if(state==readCommand) { 781 fastUnicode: 782 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 783 *target++=(UChar)((b<<8)|source[1]); 784 source+=2; 785 } 786 } 787 788 /* normal state machine for Unicode mode */ 789 /* unicodeByteMode: */ 790 while(source<sourceLimit) { 791 if(target>=targetLimit) { 792 /* target is full */ 793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 794 break; 795 } 796 b=*source++; 797 switch(state) { 798 case readCommand: 799 if((uint8_t)(b-UC0)>(Urs-UC0)) { 800 byteOne=b; 801 cnv->toUBytes[0]=b; 802 cnv->toULength=1; 803 state=quotePairTwo; 804 } else if(/* UC0<=b && */ b<=UC7) { 805 dynamicWindow=(int8_t)(b-UC0); 806 isSingleByteMode=TRUE; 807 goto fastSingle; 808 } else if(/* UD0<=b && */ b<=UD7) { 809 dynamicWindow=(int8_t)(b-UD0); 810 isSingleByteMode=TRUE; 811 cnv->toUBytes[0]=b; 812 cnv->toULength=1; 813 state=defineOne; 814 goto singleByteMode; 815 } else if(b==UDX) { 816 isSingleByteMode=TRUE; 817 cnv->toUBytes[0]=b; 818 cnv->toULength=1; 819 state=definePairOne; 820 goto singleByteMode; 821 } else if(b==UQU) { 822 cnv->toUBytes[0]=b; 823 cnv->toULength=1; 824 state=quotePairOne; 825 } else /* Urs */ { 826 /* callback(illegal) */ 827 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 828 cnv->toUBytes[0]=b; 829 cnv->toULength=1; 830 goto endloop; 831 } 832 break; 833 case quotePairOne: 834 byteOne=b; 835 cnv->toUBytes[1]=b; 836 cnv->toULength=2; 837 state=quotePairTwo; 838 break; 839 case quotePairTwo: 840 *target++=(UChar)((byteOne<<8)|b); 841 state=readCommand; 842 goto fastUnicode; 843 } 844 } 845 } 846 endloop: 847 848 /* set the converter state back into UConverter */ 849 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 850 /* reset to deal with the next character */ 851 state=readCommand; 852 } else if(state==readCommand) { 853 /* not in a multi-byte sequence, reset toULength */ 854 cnv->toULength=0; 855 } 856 scsu->toUIsSingleByteMode=isSingleByteMode; 857 scsu->toUState=state; 858 scsu->toUQuoteWindow=quoteWindow; 859 scsu->toUDynamicWindow=dynamicWindow; 860 scsu->toUByteOne=byteOne; 861 862 /* write back the updated pointers */ 863 pArgs->source=(const char *)source; 864 pArgs->target=target; 865 return; 866 } 867 868 /* SCSU-from-Unicode conversion functions ----------------------------------- */ 869 870 /* 871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve 872 * reasonable results. The lookahead is minimal. 873 * Many cases are simple: 874 * A character fits directly into the current mode, a dynamic or static window, 875 * or is not compressible. These cases are tested first. 876 * Real compression heuristics are applied to the rest, in code branches for 877 * single/Unicode mode and BMP/supplementary code points. 878 * The heuristics used here are extremely simple. 879 */ 880 881 /* get the number of the window that this character is in, or -1 */ 882 static int8_t 883 getWindow(const uint32_t offsets[8], uint32_t c) { 884 int i; 885 for(i=0; i<8; ++i) { 886 if((uint32_t)(c-offsets[i])<=0x7f) { 887 return (int8_t)(i); 888 } 889 } 890 return -1; 891 } 892 893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ 894 static UBool 895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { 896 return (UBool)(c<=offset+0x7f && 897 (c>=offset || (c<=0x7f && 898 (c>=0x20 || (1UL<<c)&0x2601)))); 899 /* binary 0010 0110 0000 0001, 900 check for b==0xd || b==0xa || b==9 || b==0 */ 901 } 902 903 /* 904 * getNextDynamicWindow returns the next dynamic window to be redefined 905 */ 906 static int8_t 907 getNextDynamicWindow(SCSUData *scsu) { 908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; 909 if(++scsu->nextWindowUseIndex==8) { 910 scsu->nextWindowUseIndex=0; 911 } 912 return window; 913 } 914 915 /* 916 * useDynamicWindow() adjusts 917 * windowUse[] and nextWindowUseIndex for the algorithm to choose 918 * the next dynamic window to be defined; 919 * a subclass may override it and provide its own algorithm. 920 */ 921 static void 922 useDynamicWindow(SCSUData *scsu, int8_t window) { 923 /* 924 * move the existing window, which just became the most recently used one, 925 * up in windowUse[] to nextWindowUseIndex-1 926 */ 927 928 /* first, find the index of the window - backwards to favor the more recently used windows */ 929 int i, j; 930 931 i=scsu->nextWindowUseIndex; 932 do { 933 if(--i<0) { 934 i=7; 935 } 936 } while(scsu->windowUse[i]!=window); 937 938 /* now copy each windowUse[i+1] to [i] */ 939 j=i+1; 940 if(j==8) { 941 j=0; 942 } 943 while(j!=scsu->nextWindowUseIndex) { 944 scsu->windowUse[i]=scsu->windowUse[j]; 945 i=j; 946 if(++j==8) { j=0; } 947 } 948 949 /* finally, set the window into the most recently used index */ 950 scsu->windowUse[i]=window; 951 } 952 953 /* 954 * calculate the offset and the code for a dynamic window that contains the character 955 * takes fixed offsets into account 956 * the offset of the window is stored in the offset variable, 957 * the code is returned 958 * 959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code 960 */ 961 static int 962 getDynamicOffset(uint32_t c, uint32_t *pOffset) { 963 int i; 964 965 for(i=0; i<7; ++i) { 966 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { 967 *pOffset=fixedOffsets[i]; 968 return 0xf9+i; 969 } 970 } 971 972 if(c<0x80) { 973 /* No dynamic window for US-ASCII. */ 974 return -1; 975 } else if(c<0x3400 || 976 (uint32_t)(c-0x10000)<(0x14000-0x10000) || 977 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) 978 ) { 979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ 980 *pOffset=c&0x7fffff80; 981 return (int)(c>>7); 982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { 983 /* For these characters we need to take the gapOffset into account. */ 984 *pOffset=c&0x7fffff80; 985 return (int)((c-gapOffset)>>7); 986 } else { 987 return -1; 988 } 989 } 990 991 /* 992 * Idea for compression: 993 * - save SCSUData and other state before really starting work 994 * - at endloop, see if compression could be better with just unicode mode 995 * - don't do this if a callback has been called 996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning 997 * - different buffer handling! 998 * 999 * Drawback or need for corrective handling: 1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and 1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible 1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers. 1003 * 1004 * How to achieve both? 1005 * - Only replace the result after an SDX or SCU? 1006 */ 1007 1008 static void 1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1010 UErrorCode *pErrorCode) { 1011 UConverter *cnv; 1012 SCSUData *scsu; 1013 const UChar *source, *sourceLimit; 1014 uint8_t *target; 1015 int32_t targetCapacity; 1016 int32_t *offsets; 1017 1018 UBool isSingleByteMode; 1019 uint8_t dynamicWindow; 1020 uint32_t currentOffset; 1021 1022 uint32_t c, delta; 1023 1024 int32_t sourceIndex, nextSourceIndex; 1025 1026 int32_t length; 1027 1028 /* variables for compression heuristics */ 1029 uint32_t offset; 1030 UChar lead, trail; 1031 int code; 1032 int8_t window; 1033 1034 /* set up the local pointers */ 1035 cnv=pArgs->converter; 1036 scsu=(SCSUData *)cnv->extraInfo; 1037 1038 /* set up the local pointers */ 1039 source=pArgs->source; 1040 sourceLimit=pArgs->sourceLimit; 1041 target=(uint8_t *)pArgs->target; 1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1043 offsets=pArgs->offsets; 1044 1045 /* get the state machine state */ 1046 isSingleByteMode=scsu->fromUIsSingleByteMode; 1047 dynamicWindow=scsu->fromUDynamicWindow; 1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1049 1050 c=cnv->fromUChar32; 1051 1052 /* sourceIndex=-1 if the current character began in the previous buffer */ 1053 sourceIndex= c==0 ? 0 : -1; 1054 nextSourceIndex=0; 1055 1056 /* similar conversion "loop" as in toUnicode */ 1057 loop: 1058 if(isSingleByteMode) { 1059 if(c!=0 && targetCapacity>0) { 1060 goto getTrailSingle; 1061 } 1062 1063 /* state machine for single-byte mode */ 1064 /* singleByteMode: */ 1065 while(source<sourceLimit) { 1066 if(targetCapacity<=0) { 1067 /* target is full */ 1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1069 break; 1070 } 1071 c=*source++; 1072 ++nextSourceIndex; 1073 1074 if((c-0x20)<=0x5f) { 1075 /* pass US-ASCII graphic character through */ 1076 *target++=(uint8_t)c; 1077 if(offsets!=NULL) { 1078 *offsets++=sourceIndex; 1079 } 1080 --targetCapacity; 1081 } else if(c<0x20) { 1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1083 /* CR/LF/TAB/NUL */ 1084 *target++=(uint8_t)c; 1085 if(offsets!=NULL) { 1086 *offsets++=sourceIndex; 1087 } 1088 --targetCapacity; 1089 } else { 1090 /* quote C0 control character */ 1091 c|=SQ0<<8; 1092 length=2; 1093 goto outputBytes; 1094 } 1095 } else if((delta=c-currentOffset)<=0x7f) { 1096 /* use the current dynamic window */ 1097 *target++=(uint8_t)(delta|0x80); 1098 if(offsets!=NULL) { 1099 *offsets++=sourceIndex; 1100 } 1101 --targetCapacity; 1102 } else if(U16_IS_SURROGATE(c)) { 1103 if(U16_IS_SURROGATE_LEAD(c)) { 1104 getTrailSingle: 1105 lead=(UChar)c; 1106 if(source<sourceLimit) { 1107 /* test the following code unit */ 1108 trail=*source; 1109 if(U16_IS_TRAIL(trail)) { 1110 ++source; 1111 ++nextSourceIndex; 1112 c=U16_GET_SUPPLEMENTARY(c, trail); 1113 /* convert this surrogate code point */ 1114 /* exit this condition tree */ 1115 } else { 1116 /* this is an unmatched lead code unit (1st surrogate) */ 1117 /* callback(illegal) */ 1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1119 goto endloop; 1120 } 1121 } else { 1122 /* no more input */ 1123 break; 1124 } 1125 } else { 1126 /* this is an unmatched trail code unit (2nd surrogate) */ 1127 /* callback(illegal) */ 1128 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1129 goto endloop; 1130 } 1131 1132 /* compress supplementary character U+10000..U+10ffff */ 1133 if((delta=c-currentOffset)<=0x7f) { 1134 /* use the current dynamic window */ 1135 *target++=(uint8_t)(delta|0x80); 1136 if(offsets!=NULL) { 1137 *offsets++=sourceIndex; 1138 } 1139 --targetCapacity; 1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1141 /* there is a dynamic window that contains this character, change to it */ 1142 dynamicWindow=window; 1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1144 useDynamicWindow(scsu, dynamicWindow); 1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1146 length=2; 1147 goto outputBytes; 1148 } else if((code=getDynamicOffset(c, &offset))>=0) { 1149 /* might check if there are more characters in this window to come */ 1150 /* define an extended window with this character */ 1151 code-=0x200; 1152 dynamicWindow=getNextDynamicWindow(scsu); 1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1154 useDynamicWindow(scsu, dynamicWindow); 1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1156 length=4; 1157 goto outputBytes; 1158 } else { 1159 /* change to Unicode mode and output this (lead, trail) pair */ 1160 isSingleByteMode=FALSE; 1161 *target++=(uint8_t)SCU; 1162 if(offsets!=NULL) { 1163 *offsets++=sourceIndex; 1164 } 1165 --targetCapacity; 1166 c=((uint32_t)lead<<16)|trail; 1167 length=4; 1168 goto outputBytes; 1169 } 1170 } else if(c<0xa0) { 1171 /* quote C1 control character */ 1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1173 length=2; 1174 goto outputBytes; 1175 } else if(c==0xfeff || c>=0xfff0) { 1176 /* quote signature character=byte order mark and specials */ 1177 c|=SQU<<16; 1178 length=3; 1179 goto outputBytes; 1180 } else { 1181 /* compress all other BMP characters */ 1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1183 /* there is a window defined that contains this character - switch to it or quote from it? */ 1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1185 /* change to dynamic window */ 1186 dynamicWindow=window; 1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1188 useDynamicWindow(scsu, dynamicWindow); 1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1190 length=2; 1191 goto outputBytes; 1192 } else { 1193 /* quote from dynamic window */ 1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1195 length=2; 1196 goto outputBytes; 1197 } 1198 } else if((window=getWindow(staticOffsets, c))>=0) { 1199 /* quote from static window */ 1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1201 length=2; 1202 goto outputBytes; 1203 } else if((code=getDynamicOffset(c, &offset))>=0) { 1204 /* define a dynamic window with this character */ 1205 dynamicWindow=getNextDynamicWindow(scsu); 1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1207 useDynamicWindow(scsu, dynamicWindow); 1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1209 length=3; 1210 goto outputBytes; 1211 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1212 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1213 ) { 1214 /* 1215 * this character is not compressible (a BMP ideograph or similar); 1216 * switch to Unicode mode if this is the last character in the block 1217 * or there is at least one more ideograph following immediately 1218 */ 1219 isSingleByteMode=FALSE; 1220 c|=SCU<<16; 1221 length=3; 1222 goto outputBytes; 1223 } else { 1224 /* quote Unicode */ 1225 c|=SQU<<16; 1226 length=3; 1227 goto outputBytes; 1228 } 1229 } 1230 1231 /* normal end of conversion: prepare for a new character */ 1232 c=0; 1233 sourceIndex=nextSourceIndex; 1234 } 1235 } else { 1236 if(c!=0 && targetCapacity>0) { 1237 goto getTrailUnicode; 1238 } 1239 1240 /* state machine for Unicode mode */ 1241 /* unicodeByteMode: */ 1242 while(source<sourceLimit) { 1243 if(targetCapacity<=0) { 1244 /* target is full */ 1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1246 break; 1247 } 1248 c=*source++; 1249 ++nextSourceIndex; 1250 1251 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1252 /* not compressible, write character directly */ 1253 if(targetCapacity>=2) { 1254 *target++=(uint8_t)(c>>8); 1255 *target++=(uint8_t)c; 1256 if(offsets!=NULL) { 1257 *offsets++=sourceIndex; 1258 *offsets++=sourceIndex; 1259 } 1260 targetCapacity-=2; 1261 } else { 1262 length=2; 1263 goto outputBytes; 1264 } 1265 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1266 /* compress BMP character if the following one is not an uncompressible ideograph */ 1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1268 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1269 /* ASCII digit or letter */ 1270 isSingleByteMode=TRUE; 1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1272 length=2; 1273 goto outputBytes; 1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1275 /* there is a dynamic window that contains this character, change to it */ 1276 isSingleByteMode=TRUE; 1277 dynamicWindow=window; 1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1279 useDynamicWindow(scsu, dynamicWindow); 1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1281 length=2; 1282 goto outputBytes; 1283 } else if((code=getDynamicOffset(c, &offset))>=0) { 1284 /* define a dynamic window with this character */ 1285 isSingleByteMode=TRUE; 1286 dynamicWindow=getNextDynamicWindow(scsu); 1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1288 useDynamicWindow(scsu, dynamicWindow); 1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1290 length=3; 1291 goto outputBytes; 1292 } 1293 } 1294 1295 /* don't know how to compress this character, just write it directly */ 1296 length=2; 1297 goto outputBytes; 1298 } else if(c<0xe000) { 1299 /* c is a surrogate */ 1300 if(U16_IS_SURROGATE_LEAD(c)) { 1301 getTrailUnicode: 1302 lead=(UChar)c; 1303 if(source<sourceLimit) { 1304 /* test the following code unit */ 1305 trail=*source; 1306 if(U16_IS_TRAIL(trail)) { 1307 ++source; 1308 ++nextSourceIndex; 1309 c=U16_GET_SUPPLEMENTARY(c, trail); 1310 /* convert this surrogate code point */ 1311 /* exit this condition tree */ 1312 } else { 1313 /* this is an unmatched lead code unit (1st surrogate) */ 1314 /* callback(illegal) */ 1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1316 goto endloop; 1317 } 1318 } else { 1319 /* no more input */ 1320 break; 1321 } 1322 } else { 1323 /* this is an unmatched trail code unit (2nd surrogate) */ 1324 /* callback(illegal) */ 1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1326 goto endloop; 1327 } 1328 1329 /* compress supplementary character */ 1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1332 ) { 1333 /* 1334 * there is a dynamic window that contains this character and 1335 * the following character is not uncompressible, 1336 * change to the window 1337 */ 1338 isSingleByteMode=TRUE; 1339 dynamicWindow=window; 1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1341 useDynamicWindow(scsu, dynamicWindow); 1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1343 length=2; 1344 goto outputBytes; 1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1346 (code=getDynamicOffset(c, &offset))>=0 1347 ) { 1348 /* two supplementary characters in (probably) the same window - define an extended one */ 1349 isSingleByteMode=TRUE; 1350 code-=0x200; 1351 dynamicWindow=getNextDynamicWindow(scsu); 1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1353 useDynamicWindow(scsu, dynamicWindow); 1354 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1355 length=4; 1356 goto outputBytes; 1357 } else { 1358 /* don't know how to compress this character, just write it directly */ 1359 c=((uint32_t)lead<<16)|trail; 1360 length=4; 1361 goto outputBytes; 1362 } 1363 } else /* 0xe000<=c<0xf300 */ { 1364 /* quote to avoid SCSU tags */ 1365 c|=UQU<<16; 1366 length=3; 1367 goto outputBytes; 1368 } 1369 1370 /* normal end of conversion: prepare for a new character */ 1371 c=0; 1372 sourceIndex=nextSourceIndex; 1373 } 1374 } 1375 endloop: 1376 1377 /* set the converter state back into UConverter */ 1378 scsu->fromUIsSingleByteMode=isSingleByteMode; 1379 scsu->fromUDynamicWindow=dynamicWindow; 1380 1381 cnv->fromUChar32=c; 1382 1383 /* write back the updated pointers */ 1384 pArgs->source=source; 1385 pArgs->target=(char *)target; 1386 pArgs->offsets=offsets; 1387 return; 1388 1389 outputBytes: 1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1391 /* from the first if in the loop we know that targetCapacity>0 */ 1392 if(length<=targetCapacity) { 1393 if(offsets==NULL) { 1394 switch(length) { 1395 /* each branch falls through to the next one */ 1396 case 4: 1397 *target++=(uint8_t)(c>>24); 1398 case 3: /*fall through*/ 1399 *target++=(uint8_t)(c>>16); 1400 case 2: /*fall through*/ 1401 *target++=(uint8_t)(c>>8); 1402 case 1: /*fall through*/ 1403 *target++=(uint8_t)c; 1404 default: 1405 /* will never occur */ 1406 break; 1407 } 1408 } else { 1409 switch(length) { 1410 /* each branch falls through to the next one */ 1411 case 4: 1412 *target++=(uint8_t)(c>>24); 1413 *offsets++=sourceIndex; 1414 case 3: /*fall through*/ 1415 *target++=(uint8_t)(c>>16); 1416 *offsets++=sourceIndex; 1417 case 2: /*fall through*/ 1418 *target++=(uint8_t)(c>>8); 1419 *offsets++=sourceIndex; 1420 case 1: /*fall through*/ 1421 *target++=(uint8_t)c; 1422 *offsets++=sourceIndex; 1423 default: 1424 /* will never occur */ 1425 break; 1426 } 1427 } 1428 targetCapacity-=length; 1429 1430 /* normal end of conversion: prepare for a new character */ 1431 c=0; 1432 sourceIndex=nextSourceIndex; 1433 goto loop; 1434 } else { 1435 uint8_t *p; 1436 1437 /* 1438 * We actually do this backwards here: 1439 * In order to save an intermediate variable, we output 1440 * first to the overflow buffer what does not fit into the 1441 * regular target. 1442 */ 1443 /* we know that 0<=targetCapacity<length<=4 */ 1444 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1445 length-=targetCapacity; 1446 p=(uint8_t *)cnv->charErrorBuffer; 1447 switch(length) { 1448 /* each branch falls through to the next one */ 1449 case 4: 1450 *p++=(uint8_t)(c>>24); 1451 case 3: /*fall through*/ 1452 *p++=(uint8_t)(c>>16); 1453 case 2: /*fall through*/ 1454 *p++=(uint8_t)(c>>8); 1455 case 1: /*fall through*/ 1456 *p=(uint8_t)c; 1457 default: 1458 /* will never occur */ 1459 break; 1460 } 1461 cnv->charErrorBufferLength=(int8_t)length; 1462 1463 /* now output what fits into the regular target */ 1464 c>>=8*length; /* length was reduced by targetCapacity */ 1465 switch(targetCapacity) { 1466 /* each branch falls through to the next one */ 1467 case 3: 1468 *target++=(uint8_t)(c>>16); 1469 if(offsets!=NULL) { 1470 *offsets++=sourceIndex; 1471 } 1472 case 2: /*fall through*/ 1473 *target++=(uint8_t)(c>>8); 1474 if(offsets!=NULL) { 1475 *offsets++=sourceIndex; 1476 } 1477 case 1: /*fall through*/ 1478 *target++=(uint8_t)c; 1479 if(offsets!=NULL) { 1480 *offsets++=sourceIndex; 1481 } 1482 default: 1483 break; 1484 } 1485 1486 /* target overflow */ 1487 targetCapacity=0; 1488 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1489 c=0; 1490 goto endloop; 1491 } 1492 } 1493 1494 /* 1495 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. 1496 * If a change is made in the original function, then either 1497 * change this function the same way or 1498 * re-copy the original function and remove the variables 1499 * offsets, sourceIndex, and nextSourceIndex. 1500 */ 1501 static void 1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, 1503 UErrorCode *pErrorCode) { 1504 UConverter *cnv; 1505 SCSUData *scsu; 1506 const UChar *source, *sourceLimit; 1507 uint8_t *target; 1508 int32_t targetCapacity; 1509 1510 UBool isSingleByteMode; 1511 uint8_t dynamicWindow; 1512 uint32_t currentOffset; 1513 1514 uint32_t c, delta; 1515 1516 int32_t length; 1517 1518 /* variables for compression heuristics */ 1519 uint32_t offset; 1520 UChar lead, trail; 1521 int code; 1522 int8_t window; 1523 1524 /* set up the local pointers */ 1525 cnv=pArgs->converter; 1526 scsu=(SCSUData *)cnv->extraInfo; 1527 1528 /* set up the local pointers */ 1529 source=pArgs->source; 1530 sourceLimit=pArgs->sourceLimit; 1531 target=(uint8_t *)pArgs->target; 1532 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1533 1534 /* get the state machine state */ 1535 isSingleByteMode=scsu->fromUIsSingleByteMode; 1536 dynamicWindow=scsu->fromUDynamicWindow; 1537 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1538 1539 c=cnv->fromUChar32; 1540 1541 /* similar conversion "loop" as in toUnicode */ 1542 loop: 1543 if(isSingleByteMode) { 1544 if(c!=0 && targetCapacity>0) { 1545 goto getTrailSingle; 1546 } 1547 1548 /* state machine for single-byte mode */ 1549 /* singleByteMode: */ 1550 while(source<sourceLimit) { 1551 if(targetCapacity<=0) { 1552 /* target is full */ 1553 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1554 break; 1555 } 1556 c=*source++; 1557 1558 if((c-0x20)<=0x5f) { 1559 /* pass US-ASCII graphic character through */ 1560 *target++=(uint8_t)c; 1561 --targetCapacity; 1562 } else if(c<0x20) { 1563 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1564 /* CR/LF/TAB/NUL */ 1565 *target++=(uint8_t)c; 1566 --targetCapacity; 1567 } else { 1568 /* quote C0 control character */ 1569 c|=SQ0<<8; 1570 length=2; 1571 goto outputBytes; 1572 } 1573 } else if((delta=c-currentOffset)<=0x7f) { 1574 /* use the current dynamic window */ 1575 *target++=(uint8_t)(delta|0x80); 1576 --targetCapacity; 1577 } else if(U16_IS_SURROGATE(c)) { 1578 if(U16_IS_SURROGATE_LEAD(c)) { 1579 getTrailSingle: 1580 lead=(UChar)c; 1581 if(source<sourceLimit) { 1582 /* test the following code unit */ 1583 trail=*source; 1584 if(U16_IS_TRAIL(trail)) { 1585 ++source; 1586 c=U16_GET_SUPPLEMENTARY(c, trail); 1587 /* convert this surrogate code point */ 1588 /* exit this condition tree */ 1589 } else { 1590 /* this is an unmatched lead code unit (1st surrogate) */ 1591 /* callback(illegal) */ 1592 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1593 goto endloop; 1594 } 1595 } else { 1596 /* no more input */ 1597 break; 1598 } 1599 } else { 1600 /* this is an unmatched trail code unit (2nd surrogate) */ 1601 /* callback(illegal) */ 1602 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1603 goto endloop; 1604 } 1605 1606 /* compress supplementary character U+10000..U+10ffff */ 1607 if((delta=c-currentOffset)<=0x7f) { 1608 /* use the current dynamic window */ 1609 *target++=(uint8_t)(delta|0x80); 1610 --targetCapacity; 1611 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1612 /* there is a dynamic window that contains this character, change to it */ 1613 dynamicWindow=window; 1614 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1615 useDynamicWindow(scsu, dynamicWindow); 1616 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1617 length=2; 1618 goto outputBytes; 1619 } else if((code=getDynamicOffset(c, &offset))>=0) { 1620 /* might check if there are more characters in this window to come */ 1621 /* define an extended window with this character */ 1622 code-=0x200; 1623 dynamicWindow=getNextDynamicWindow(scsu); 1624 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1625 useDynamicWindow(scsu, dynamicWindow); 1626 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1627 length=4; 1628 goto outputBytes; 1629 } else { 1630 /* change to Unicode mode and output this (lead, trail) pair */ 1631 isSingleByteMode=FALSE; 1632 *target++=(uint8_t)SCU; 1633 --targetCapacity; 1634 c=((uint32_t)lead<<16)|trail; 1635 length=4; 1636 goto outputBytes; 1637 } 1638 } else if(c<0xa0) { 1639 /* quote C1 control character */ 1640 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1641 length=2; 1642 goto outputBytes; 1643 } else if(c==0xfeff || c>=0xfff0) { 1644 /* quote signature character=byte order mark and specials */ 1645 c|=SQU<<16; 1646 length=3; 1647 goto outputBytes; 1648 } else { 1649 /* compress all other BMP characters */ 1650 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1651 /* there is a window defined that contains this character - switch to it or quote from it? */ 1652 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1653 /* change to dynamic window */ 1654 dynamicWindow=window; 1655 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1656 useDynamicWindow(scsu, dynamicWindow); 1657 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1658 length=2; 1659 goto outputBytes; 1660 } else { 1661 /* quote from dynamic window */ 1662 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1663 length=2; 1664 goto outputBytes; 1665 } 1666 } else if((window=getWindow(staticOffsets, c))>=0) { 1667 /* quote from static window */ 1668 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1669 length=2; 1670 goto outputBytes; 1671 } else if((code=getDynamicOffset(c, &offset))>=0) { 1672 /* define a dynamic window with this character */ 1673 dynamicWindow=getNextDynamicWindow(scsu); 1674 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1675 useDynamicWindow(scsu, dynamicWindow); 1676 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1677 length=3; 1678 goto outputBytes; 1679 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1680 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1681 ) { 1682 /* 1683 * this character is not compressible (a BMP ideograph or similar); 1684 * switch to Unicode mode if this is the last character in the block 1685 * or there is at least one more ideograph following immediately 1686 */ 1687 isSingleByteMode=FALSE; 1688 c|=SCU<<16; 1689 length=3; 1690 goto outputBytes; 1691 } else { 1692 /* quote Unicode */ 1693 c|=SQU<<16; 1694 length=3; 1695 goto outputBytes; 1696 } 1697 } 1698 1699 /* normal end of conversion: prepare for a new character */ 1700 c=0; 1701 } 1702 } else { 1703 if(c!=0 && targetCapacity>0) { 1704 goto getTrailUnicode; 1705 } 1706 1707 /* state machine for Unicode mode */ 1708 /* unicodeByteMode: */ 1709 while(source<sourceLimit) { 1710 if(targetCapacity<=0) { 1711 /* target is full */ 1712 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1713 break; 1714 } 1715 c=*source++; 1716 1717 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1718 /* not compressible, write character directly */ 1719 if(targetCapacity>=2) { 1720 *target++=(uint8_t)(c>>8); 1721 *target++=(uint8_t)c; 1722 targetCapacity-=2; 1723 } else { 1724 length=2; 1725 goto outputBytes; 1726 } 1727 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1728 /* compress BMP character if the following one is not an uncompressible ideograph */ 1729 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1730 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1731 /* ASCII digit or letter */ 1732 isSingleByteMode=TRUE; 1733 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1734 length=2; 1735 goto outputBytes; 1736 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1737 /* there is a dynamic window that contains this character, change to it */ 1738 isSingleByteMode=TRUE; 1739 dynamicWindow=window; 1740 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1741 useDynamicWindow(scsu, dynamicWindow); 1742 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1743 length=2; 1744 goto outputBytes; 1745 } else if((code=getDynamicOffset(c, &offset))>=0) { 1746 /* define a dynamic window with this character */ 1747 isSingleByteMode=TRUE; 1748 dynamicWindow=getNextDynamicWindow(scsu); 1749 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1750 useDynamicWindow(scsu, dynamicWindow); 1751 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1752 length=3; 1753 goto outputBytes; 1754 } 1755 } 1756 1757 /* don't know how to compress this character, just write it directly */ 1758 length=2; 1759 goto outputBytes; 1760 } else if(c<0xe000) { 1761 /* c is a surrogate */ 1762 if(U16_IS_SURROGATE_LEAD(c)) { 1763 getTrailUnicode: 1764 lead=(UChar)c; 1765 if(source<sourceLimit) { 1766 /* test the following code unit */ 1767 trail=*source; 1768 if(U16_IS_TRAIL(trail)) { 1769 ++source; 1770 c=U16_GET_SUPPLEMENTARY(c, trail); 1771 /* convert this surrogate code point */ 1772 /* exit this condition tree */ 1773 } else { 1774 /* this is an unmatched lead code unit (1st surrogate) */ 1775 /* callback(illegal) */ 1776 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1777 goto endloop; 1778 } 1779 } else { 1780 /* no more input */ 1781 break; 1782 } 1783 } else { 1784 /* this is an unmatched trail code unit (2nd surrogate) */ 1785 /* callback(illegal) */ 1786 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1787 goto endloop; 1788 } 1789 1790 /* compress supplementary character */ 1791 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1792 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1793 ) { 1794 /* 1795 * there is a dynamic window that contains this character and 1796 * the following character is not uncompressible, 1797 * change to the window 1798 */ 1799 isSingleByteMode=TRUE; 1800 dynamicWindow=window; 1801 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1802 useDynamicWindow(scsu, dynamicWindow); 1803 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1804 length=2; 1805 goto outputBytes; 1806 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1807 (code=getDynamicOffset(c, &offset))>=0 1808 ) { 1809 /* two supplementary characters in (probably) the same window - define an extended one */ 1810 isSingleByteMode=TRUE; 1811 code-=0x200; 1812 dynamicWindow=getNextDynamicWindow(scsu); 1813 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1814 useDynamicWindow(scsu, dynamicWindow); 1815 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1816 length=4; 1817 goto outputBytes; 1818 } else { 1819 /* don't know how to compress this character, just write it directly */ 1820 c=((uint32_t)lead<<16)|trail; 1821 length=4; 1822 goto outputBytes; 1823 } 1824 } else /* 0xe000<=c<0xf300 */ { 1825 /* quote to avoid SCSU tags */ 1826 c|=UQU<<16; 1827 length=3; 1828 goto outputBytes; 1829 } 1830 1831 /* normal end of conversion: prepare for a new character */ 1832 c=0; 1833 } 1834 } 1835 endloop: 1836 1837 /* set the converter state back into UConverter */ 1838 scsu->fromUIsSingleByteMode=isSingleByteMode; 1839 scsu->fromUDynamicWindow=dynamicWindow; 1840 1841 cnv->fromUChar32=c; 1842 1843 /* write back the updated pointers */ 1844 pArgs->source=source; 1845 pArgs->target=(char *)target; 1846 return; 1847 1848 outputBytes: 1849 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1850 /* from the first if in the loop we know that targetCapacity>0 */ 1851 if(length<=targetCapacity) { 1852 switch(length) { 1853 /* each branch falls through to the next one */ 1854 case 4: 1855 *target++=(uint8_t)(c>>24); 1856 case 3: /*fall through*/ 1857 *target++=(uint8_t)(c>>16); 1858 case 2: /*fall through*/ 1859 *target++=(uint8_t)(c>>8); 1860 case 1: /*fall through*/ 1861 *target++=(uint8_t)c; 1862 default: 1863 /* will never occur */ 1864 break; 1865 } 1866 targetCapacity-=length; 1867 1868 /* normal end of conversion: prepare for a new character */ 1869 c=0; 1870 goto loop; 1871 } else { 1872 uint8_t *p; 1873 1874 /* 1875 * We actually do this backwards here: 1876 * In order to save an intermediate variable, we output 1877 * first to the overflow buffer what does not fit into the 1878 * regular target. 1879 */ 1880 /* we know that 0<=targetCapacity<length<=4 */ 1881 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1882 length-=targetCapacity; 1883 p=(uint8_t *)cnv->charErrorBuffer; 1884 switch(length) { 1885 /* each branch falls through to the next one */ 1886 case 4: 1887 *p++=(uint8_t)(c>>24); 1888 case 3: /*fall through*/ 1889 *p++=(uint8_t)(c>>16); 1890 case 2: /*fall through*/ 1891 *p++=(uint8_t)(c>>8); 1892 case 1: /*fall through*/ 1893 *p=(uint8_t)c; 1894 default: 1895 /* will never occur */ 1896 break; 1897 } 1898 cnv->charErrorBufferLength=(int8_t)length; 1899 1900 /* now output what fits into the regular target */ 1901 c>>=8*length; /* length was reduced by targetCapacity */ 1902 switch(targetCapacity) { 1903 /* each branch falls through to the next one */ 1904 case 3: 1905 *target++=(uint8_t)(c>>16); 1906 case 2: /*fall through*/ 1907 *target++=(uint8_t)(c>>8); 1908 case 1: /*fall through*/ 1909 *target++=(uint8_t)c; 1910 default: 1911 break; 1912 } 1913 1914 /* target overflow */ 1915 targetCapacity=0; 1916 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1917 c=0; 1918 goto endloop; 1919 } 1920 } 1921 1922 /* miscellaneous ------------------------------------------------------------ */ 1923 1924 static const char * 1925 _SCSUGetName(const UConverter *cnv) { 1926 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 1927 1928 switch(scsu->locale) { 1929 case l_ja: 1930 return "SCSU,locale=ja"; 1931 default: 1932 return "SCSU"; 1933 } 1934 } 1935 1936 /* structure for SafeClone calculations */ 1937 struct cloneSCSUStruct 1938 { 1939 UConverter cnv; 1940 SCSUData mydata; 1941 }; 1942 1943 static UConverter * 1944 _SCSUSafeClone(const UConverter *cnv, 1945 void *stackBuffer, 1946 int32_t *pBufferSize, 1947 UErrorCode *status) 1948 { 1949 struct cloneSCSUStruct * localClone; 1950 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); 1951 1952 if (U_FAILURE(*status)){ 1953 return 0; 1954 } 1955 1956 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 1957 *pBufferSize = bufferSizeNeeded; 1958 return 0; 1959 } 1960 1961 localClone = (struct cloneSCSUStruct *)stackBuffer; 1962 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1963 1964 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); 1965 localClone->cnv.extraInfo = &localClone->mydata; 1966 localClone->cnv.isExtraLocal = TRUE; 1967 1968 return &localClone->cnv; 1969 } 1970 1971 1972 static const UConverterImpl _SCSUImpl={ 1973 UCNV_SCSU, 1974 1975 NULL, 1976 NULL, 1977 1978 _SCSUOpen, 1979 _SCSUClose, 1980 _SCSUReset, 1981 1982 _SCSUToUnicode, 1983 _SCSUToUnicodeWithOffsets, 1984 _SCSUFromUnicode, 1985 _SCSUFromUnicodeWithOffsets, 1986 NULL, 1987 1988 NULL, 1989 _SCSUGetName, 1990 NULL, 1991 _SCSUSafeClone, 1992 ucnv_getCompleteUnicodeSet 1993 }; 1994 1995 static const UConverterStaticData _SCSUStaticData={ 1996 sizeof(UConverterStaticData), 1997 "SCSU", 1998 1212, /* CCSID for SCSU */ 1999 UCNV_IBM, UCNV_SCSU, 2000 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ 2001 /* 2002 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode 2003 * substitution string. 2004 */ 2005 { 0x0e, 0xff, 0xfd, 0 }, 3, 2006 FALSE, FALSE, 2007 0, 2008 0, 2009 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 2010 }; 2011 2012 const UConverterSharedData _SCSUData= 2013 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl); 2014 2015 #endif 2016