1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvscsu.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000nov18 14 * created by: Markus W. Scherer 15 * 16 * This is an implementation of the Standard Compression Scheme for Unicode 17 * as defined in http://www.unicode.org/unicode/reports/tr6/ . 18 * Reserved commands and window settings are treated as illegal sequences and 19 * will result in callback calls. 20 */ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_CONVERSION 25 26 #include "unicode/ucnv.h" 27 #include "unicode/ucnv_cb.h" 28 #include "ucnv_bld.h" 29 #include "ucnv_cnv.h" 30 #include "cmemory.h" 31 32 /* SCSU definitions --------------------------------------------------------- */ 33 34 /* SCSU command byte values */ 35 enum { 36 SQ0=0x01, /* Quote from window pair 0 */ 37 SQ7=0x08, /* Quote from window pair 7 */ 38 SDX=0x0B, /* Define a window as extended */ 39 Srs=0x0C, /* reserved */ 40 SQU=0x0E, /* Quote a single Unicode character */ 41 SCU=0x0F, /* Change to Unicode mode */ 42 SC0=0x10, /* Select window 0 */ 43 SC7=0x17, /* Select window 7 */ 44 SD0=0x18, /* Define and select window 0 */ 45 SD7=0x1F, /* Define and select window 7 */ 46 47 UC0=0xE0, /* Select window 0 */ 48 UC7=0xE7, /* Select window 7 */ 49 UD0=0xE8, /* Define and select window 0 */ 50 UD7=0xEF, /* Define and select window 7 */ 51 UQU=0xF0, /* Quote a single Unicode character */ 52 UDX=0xF1, /* Define a Window as extended */ 53 Urs=0xF2 /* reserved */ 54 }; 55 56 enum { 57 /* 58 * Unicode code points from 3400 to E000 are not adressible by 59 * dynamic window, since in these areas no short run alphabets are 60 * found. Therefore add gapOffset to all values from gapThreshold. 61 */ 62 gapThreshold=0x68, 63 gapOffset=0xAC00, 64 65 /* values between reservedStart and fixedThreshold are reserved */ 66 reservedStart=0xA8, 67 68 /* use table of predefined fixed offsets for values from fixedThreshold */ 69 fixedThreshold=0xF9 70 }; 71 72 /* constant offsets for the 8 static windows */ 73 static const uint32_t staticOffsets[8]={ 74 0x0000, /* ASCII for quoted tags */ 75 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 76 0x0100, /* Latin Extended-A */ 77 0x0300, /* Combining Diacritical Marks */ 78 0x2000, /* General Punctuation */ 79 0x2080, /* Currency Symbols */ 80 0x2100, /* Letterlike Symbols and Number Forms */ 81 0x3000 /* CJK Symbols and punctuation */ 82 }; 83 84 /* initial offsets for the 8 dynamic (sliding) windows */ 85 static const uint32_t initialDynamicOffsets[8]={ 86 0x0080, /* Latin-1 */ 87 0x00C0, /* Latin Extended A */ 88 0x0400, /* Cyrillic */ 89 0x0600, /* Arabic */ 90 0x0900, /* Devanagari */ 91 0x3040, /* Hiragana */ 92 0x30A0, /* Katakana */ 93 0xFF00 /* Fullwidth ASCII */ 94 }; 95 96 /* Table of fixed predefined Offsets */ 97 static const uint32_t fixedOffsets[]={ 98 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 99 /* 0xFA */ 0x0250, /* IPA extensions */ 100 /* 0xFB */ 0x0370, /* Greek */ 101 /* 0xFC */ 0x0530, /* Armenian */ 102 /* 0xFD */ 0x3040, /* Hiragana */ 103 /* 0xFE */ 0x30A0, /* Katakana */ 104 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 105 }; 106 107 /* state values */ 108 enum { 109 readCommand, 110 quotePairOne, 111 quotePairTwo, 112 quoteOne, 113 definePairOne, 114 definePairTwo, 115 defineOne 116 }; 117 118 typedef struct SCSUData { 119 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ 120 uint32_t toUDynamicOffsets[8]; 121 uint32_t fromUDynamicOffsets[8]; 122 123 /* state machine state - toUnicode */ 124 UBool toUIsSingleByteMode; 125 uint8_t toUState; 126 int8_t toUQuoteWindow, toUDynamicWindow; 127 uint8_t toUByteOne; 128 uint8_t toUPadding[3]; 129 130 /* state machine state - fromUnicode */ 131 UBool fromUIsSingleByteMode; 132 int8_t fromUDynamicWindow; 133 134 /* 135 * windowUse[] keeps track of the use of the dynamic windows: 136 * At nextWindowUseIndex there is the least recently used window, 137 * and the following windows (in a wrapping manner) are more and more 138 * recently used. 139 * At nextWindowUseIndex-1 there is the most recently used window. 140 */ 141 uint8_t locale; 142 int8_t nextWindowUseIndex; 143 int8_t windowUse[8]; 144 } SCSUData; 145 146 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 147 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 148 149 enum { 150 lGeneric, l_ja 151 }; 152 153 /* SCSU setup functions ----------------------------------------------------- */ 154 155 static void 156 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { 157 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 158 159 if(choice<=UCNV_RESET_TO_UNICODE) { 160 /* reset toUnicode */ 161 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); 162 163 scsu->toUIsSingleByteMode=TRUE; 164 scsu->toUState=readCommand; 165 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; 166 scsu->toUByteOne=0; 167 168 cnv->toULength=0; 169 } 170 if(choice!=UCNV_RESET_TO_UNICODE) { 171 /* reset fromUnicode */ 172 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); 173 174 scsu->fromUIsSingleByteMode=TRUE; 175 scsu->fromUDynamicWindow=0; 176 177 scsu->nextWindowUseIndex=0; 178 switch(scsu->locale) { 179 case l_ja: 180 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); 181 break; 182 default: 183 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); 184 break; 185 } 186 187 cnv->fromUChar32=0; 188 } 189 } 190 191 static void 192 _SCSUOpen(UConverter *cnv, 193 UConverterLoadArgs *pArgs, 194 UErrorCode *pErrorCode) { 195 const char *locale=pArgs->locale; 196 if(pArgs->onlyTestIsLoadable) { 197 return; 198 } 199 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); 200 if(cnv->extraInfo!=NULL) { 201 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { 202 ((SCSUData *)cnv->extraInfo)->locale=l_ja; 203 } else { 204 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; 205 } 206 _SCSUReset(cnv, UCNV_RESET_BOTH); 207 } else { 208 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 209 } 210 211 /* Set the substitution character U+fffd as a Unicode string. */ 212 cnv->subUChars[0]=0xfffd; 213 cnv->subCharLen=-1; 214 } 215 216 static void 217 _SCSUClose(UConverter *cnv) { 218 if(cnv->extraInfo!=NULL) { 219 if(!cnv->isExtraLocal) { 220 uprv_free(cnv->extraInfo); 221 } 222 cnv->extraInfo=NULL; 223 } 224 } 225 226 /* SCSU-to-Unicode conversion functions ------------------------------------- */ 227 228 static void 229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 230 UErrorCode *pErrorCode) { 231 UConverter *cnv; 232 SCSUData *scsu; 233 const uint8_t *source, *sourceLimit; 234 UChar *target; 235 const UChar *targetLimit; 236 int32_t *offsets; 237 UBool isSingleByteMode; 238 uint8_t state, byteOne; 239 int8_t quoteWindow, dynamicWindow; 240 241 int32_t sourceIndex, nextSourceIndex; 242 243 uint8_t b; 244 245 /* set up the local pointers */ 246 cnv=pArgs->converter; 247 scsu=(SCSUData *)cnv->extraInfo; 248 249 source=(const uint8_t *)pArgs->source; 250 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 251 target=pArgs->target; 252 targetLimit=pArgs->targetLimit; 253 offsets=pArgs->offsets; 254 255 /* get the state machine state */ 256 isSingleByteMode=scsu->toUIsSingleByteMode; 257 state=scsu->toUState; 258 quoteWindow=scsu->toUQuoteWindow; 259 dynamicWindow=scsu->toUDynamicWindow; 260 byteOne=scsu->toUByteOne; 261 262 /* sourceIndex=-1 if the current character began in the previous buffer */ 263 sourceIndex=state==readCommand ? 0 : -1; 264 nextSourceIndex=0; 265 266 /* 267 * conversion "loop" 268 * 269 * For performance, this is not a normal C loop. 270 * Instead, there are two code blocks for the two SCSU modes. 271 * The function branches to either one, and a change of the mode is done with a goto to 272 * the other branch. 273 * 274 * Each branch has two conventional loops: 275 * - a fast-path loop for the most common codes in the mode 276 * - a loop for all other codes in the mode 277 * When the fast-path runs into a code that it cannot handle, its loop ends and it 278 * runs into the following loop to handle the other codes. 279 * The end of the input or output buffer is also handled by the slower loop. 280 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 281 * 282 * The callback handling is done by returning with an error code. 283 * The conversion framework actually calls the callback function. 284 */ 285 if(isSingleByteMode) { 286 /* fast path for single-byte mode */ 287 if(state==readCommand) { 288 fastSingle: 289 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 290 ++source; 291 ++nextSourceIndex; 292 if(b<=0x7f) { 293 /* write US-ASCII graphic character or DEL */ 294 *target++=(UChar)b; 295 if(offsets!=NULL) { 296 *offsets++=sourceIndex; 297 } 298 } else { 299 /* write from dynamic window */ 300 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 301 if(c<=0xffff) { 302 *target++=(UChar)c; 303 if(offsets!=NULL) { 304 *offsets++=sourceIndex; 305 } 306 } else { 307 /* output surrogate pair */ 308 *target++=(UChar)(0xd7c0+(c>>10)); 309 if(target<targetLimit) { 310 *target++=(UChar)(0xdc00|(c&0x3ff)); 311 if(offsets!=NULL) { 312 *offsets++=sourceIndex; 313 *offsets++=sourceIndex; 314 } 315 } else { 316 /* target overflow */ 317 if(offsets!=NULL) { 318 *offsets++=sourceIndex; 319 } 320 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 321 cnv->UCharErrorBufferLength=1; 322 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 323 goto endloop; 324 } 325 } 326 } 327 sourceIndex=nextSourceIndex; 328 } 329 } 330 331 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 332 singleByteMode: 333 while(source<sourceLimit) { 334 if(target>=targetLimit) { 335 /* target is full */ 336 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 337 break; 338 } 339 b=*source++; 340 ++nextSourceIndex; 341 switch(state) { 342 case readCommand: 343 /* redundant conditions are commented out */ 344 /* here: b<0x20 because otherwise we would be in fastSingle */ 345 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 346 /* CR/LF/TAB/NUL */ 347 *target++=(UChar)b; 348 if(offsets!=NULL) { 349 *offsets++=sourceIndex; 350 } 351 sourceIndex=nextSourceIndex; 352 goto fastSingle; 353 } else if(SC0<=b) { 354 if(b<=SC7) { 355 dynamicWindow=(int8_t)(b-SC0); 356 sourceIndex=nextSourceIndex; 357 goto fastSingle; 358 } else /* if(SD0<=b && b<=SD7) */ { 359 dynamicWindow=(int8_t)(b-SD0); 360 state=defineOne; 361 } 362 } else if(/* SQ0<=b && */ b<=SQ7) { 363 quoteWindow=(int8_t)(b-SQ0); 364 state=quoteOne; 365 } else if(b==SDX) { 366 state=definePairOne; 367 } else if(b==SQU) { 368 state=quotePairOne; 369 } else if(b==SCU) { 370 sourceIndex=nextSourceIndex; 371 isSingleByteMode=FALSE; 372 goto fastUnicode; 373 } else /* Srs */ { 374 /* callback(illegal) */ 375 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 376 cnv->toUBytes[0]=b; 377 cnv->toULength=1; 378 goto endloop; 379 } 380 381 /* store the first byte of a multibyte sequence in toUBytes[] */ 382 cnv->toUBytes[0]=b; 383 cnv->toULength=1; 384 break; 385 case quotePairOne: 386 byteOne=b; 387 cnv->toUBytes[1]=b; 388 cnv->toULength=2; 389 state=quotePairTwo; 390 break; 391 case quotePairTwo: 392 *target++=(UChar)((byteOne<<8)|b); 393 if(offsets!=NULL) { 394 *offsets++=sourceIndex; 395 } 396 sourceIndex=nextSourceIndex; 397 state=readCommand; 398 goto fastSingle; 399 case quoteOne: 400 if(b<0x80) { 401 /* all static offsets are in the BMP */ 402 *target++=(UChar)(staticOffsets[quoteWindow]+b); 403 if(offsets!=NULL) { 404 *offsets++=sourceIndex; 405 } 406 } else { 407 /* write from dynamic window */ 408 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 409 if(c<=0xffff) { 410 *target++=(UChar)c; 411 if(offsets!=NULL) { 412 *offsets++=sourceIndex; 413 } 414 } else { 415 /* output surrogate pair */ 416 *target++=(UChar)(0xd7c0+(c>>10)); 417 if(target<targetLimit) { 418 *target++=(UChar)(0xdc00|(c&0x3ff)); 419 if(offsets!=NULL) { 420 *offsets++=sourceIndex; 421 *offsets++=sourceIndex; 422 } 423 } else { 424 /* target overflow */ 425 if(offsets!=NULL) { 426 *offsets++=sourceIndex; 427 } 428 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 429 cnv->UCharErrorBufferLength=1; 430 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 431 goto endloop; 432 } 433 } 434 } 435 sourceIndex=nextSourceIndex; 436 state=readCommand; 437 goto fastSingle; 438 case definePairOne: 439 dynamicWindow=(int8_t)((b>>5)&7); 440 byteOne=(uint8_t)(b&0x1f); 441 cnv->toUBytes[1]=b; 442 cnv->toULength=2; 443 state=definePairTwo; 444 break; 445 case definePairTwo: 446 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 447 sourceIndex=nextSourceIndex; 448 state=readCommand; 449 goto fastSingle; 450 case defineOne: 451 if(b==0) { 452 /* callback(illegal): Reserved window offset value 0 */ 453 cnv->toUBytes[1]=b; 454 cnv->toULength=2; 455 goto endloop; 456 } else if(b<gapThreshold) { 457 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 458 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 459 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 460 } else if(b>=fixedThreshold) { 461 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 462 } else { 463 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 464 cnv->toUBytes[1]=b; 465 cnv->toULength=2; 466 goto endloop; 467 } 468 sourceIndex=nextSourceIndex; 469 state=readCommand; 470 goto fastSingle; 471 } 472 } 473 } else { 474 /* fast path for Unicode mode */ 475 if(state==readCommand) { 476 fastUnicode: 477 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 478 *target++=(UChar)((b<<8)|source[1]); 479 if(offsets!=NULL) { 480 *offsets++=sourceIndex; 481 } 482 sourceIndex=nextSourceIndex; 483 nextSourceIndex+=2; 484 source+=2; 485 } 486 } 487 488 /* normal state machine for Unicode mode */ 489 /* unicodeByteMode: */ 490 while(source<sourceLimit) { 491 if(target>=targetLimit) { 492 /* target is full */ 493 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 494 break; 495 } 496 b=*source++; 497 ++nextSourceIndex; 498 switch(state) { 499 case readCommand: 500 if((uint8_t)(b-UC0)>(Urs-UC0)) { 501 byteOne=b; 502 cnv->toUBytes[0]=b; 503 cnv->toULength=1; 504 state=quotePairTwo; 505 } else if(/* UC0<=b && */ b<=UC7) { 506 dynamicWindow=(int8_t)(b-UC0); 507 sourceIndex=nextSourceIndex; 508 isSingleByteMode=TRUE; 509 goto fastSingle; 510 } else if(/* UD0<=b && */ b<=UD7) { 511 dynamicWindow=(int8_t)(b-UD0); 512 isSingleByteMode=TRUE; 513 cnv->toUBytes[0]=b; 514 cnv->toULength=1; 515 state=defineOne; 516 goto singleByteMode; 517 } else if(b==UDX) { 518 isSingleByteMode=TRUE; 519 cnv->toUBytes[0]=b; 520 cnv->toULength=1; 521 state=definePairOne; 522 goto singleByteMode; 523 } else if(b==UQU) { 524 cnv->toUBytes[0]=b; 525 cnv->toULength=1; 526 state=quotePairOne; 527 } else /* Urs */ { 528 /* callback(illegal) */ 529 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 530 cnv->toUBytes[0]=b; 531 cnv->toULength=1; 532 goto endloop; 533 } 534 break; 535 case quotePairOne: 536 byteOne=b; 537 cnv->toUBytes[1]=b; 538 cnv->toULength=2; 539 state=quotePairTwo; 540 break; 541 case quotePairTwo: 542 *target++=(UChar)((byteOne<<8)|b); 543 if(offsets!=NULL) { 544 *offsets++=sourceIndex; 545 } 546 sourceIndex=nextSourceIndex; 547 state=readCommand; 548 goto fastUnicode; 549 } 550 } 551 } 552 endloop: 553 554 /* set the converter state back into UConverter */ 555 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 556 /* reset to deal with the next character */ 557 state=readCommand; 558 } else if(state==readCommand) { 559 /* not in a multi-byte sequence, reset toULength */ 560 cnv->toULength=0; 561 } 562 scsu->toUIsSingleByteMode=isSingleByteMode; 563 scsu->toUState=state; 564 scsu->toUQuoteWindow=quoteWindow; 565 scsu->toUDynamicWindow=dynamicWindow; 566 scsu->toUByteOne=byteOne; 567 568 /* write back the updated pointers */ 569 pArgs->source=(const char *)source; 570 pArgs->target=target; 571 pArgs->offsets=offsets; 572 return; 573 } 574 575 /* 576 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. 577 * If a change is made in the original function, then either 578 * change this function the same way or 579 * re-copy the original function and remove the variables 580 * offsets, sourceIndex, and nextSourceIndex. 581 */ 582 static void 583 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, 584 UErrorCode *pErrorCode) { 585 UConverter *cnv; 586 SCSUData *scsu; 587 const uint8_t *source, *sourceLimit; 588 UChar *target; 589 const UChar *targetLimit; 590 UBool isSingleByteMode; 591 uint8_t state, byteOne; 592 int8_t quoteWindow, dynamicWindow; 593 594 uint8_t b; 595 596 /* set up the local pointers */ 597 cnv=pArgs->converter; 598 scsu=(SCSUData *)cnv->extraInfo; 599 600 source=(const uint8_t *)pArgs->source; 601 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 602 target=pArgs->target; 603 targetLimit=pArgs->targetLimit; 604 605 /* get the state machine state */ 606 isSingleByteMode=scsu->toUIsSingleByteMode; 607 state=scsu->toUState; 608 quoteWindow=scsu->toUQuoteWindow; 609 dynamicWindow=scsu->toUDynamicWindow; 610 byteOne=scsu->toUByteOne; 611 612 /* 613 * conversion "loop" 614 * 615 * For performance, this is not a normal C loop. 616 * Instead, there are two code blocks for the two SCSU modes. 617 * The function branches to either one, and a change of the mode is done with a goto to 618 * the other branch. 619 * 620 * Each branch has two conventional loops: 621 * - a fast-path loop for the most common codes in the mode 622 * - a loop for all other codes in the mode 623 * When the fast-path runs into a code that it cannot handle, its loop ends and it 624 * runs into the following loop to handle the other codes. 625 * The end of the input or output buffer is also handled by the slower loop. 626 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 627 * 628 * The callback handling is done by returning with an error code. 629 * The conversion framework actually calls the callback function. 630 */ 631 if(isSingleByteMode) { 632 /* fast path for single-byte mode */ 633 if(state==readCommand) { 634 fastSingle: 635 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 636 ++source; 637 if(b<=0x7f) { 638 /* write US-ASCII graphic character or DEL */ 639 *target++=(UChar)b; 640 } else { 641 /* write from dynamic window */ 642 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 643 if(c<=0xffff) { 644 *target++=(UChar)c; 645 } else { 646 /* output surrogate pair */ 647 *target++=(UChar)(0xd7c0+(c>>10)); 648 if(target<targetLimit) { 649 *target++=(UChar)(0xdc00|(c&0x3ff)); 650 } else { 651 /* target overflow */ 652 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 653 cnv->UCharErrorBufferLength=1; 654 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 655 goto endloop; 656 } 657 } 658 } 659 } 660 } 661 662 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 663 singleByteMode: 664 while(source<sourceLimit) { 665 if(target>=targetLimit) { 666 /* target is full */ 667 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 668 break; 669 } 670 b=*source++; 671 switch(state) { 672 case readCommand: 673 /* redundant conditions are commented out */ 674 /* here: b<0x20 because otherwise we would be in fastSingle */ 675 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 676 /* CR/LF/TAB/NUL */ 677 *target++=(UChar)b; 678 goto fastSingle; 679 } else if(SC0<=b) { 680 if(b<=SC7) { 681 dynamicWindow=(int8_t)(b-SC0); 682 goto fastSingle; 683 } else /* if(SD0<=b && b<=SD7) */ { 684 dynamicWindow=(int8_t)(b-SD0); 685 state=defineOne; 686 } 687 } else if(/* SQ0<=b && */ b<=SQ7) { 688 quoteWindow=(int8_t)(b-SQ0); 689 state=quoteOne; 690 } else if(b==SDX) { 691 state=definePairOne; 692 } else if(b==SQU) { 693 state=quotePairOne; 694 } else if(b==SCU) { 695 isSingleByteMode=FALSE; 696 goto fastUnicode; 697 } else /* Srs */ { 698 /* callback(illegal) */ 699 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 700 cnv->toUBytes[0]=b; 701 cnv->toULength=1; 702 goto endloop; 703 } 704 705 /* store the first byte of a multibyte sequence in toUBytes[] */ 706 cnv->toUBytes[0]=b; 707 cnv->toULength=1; 708 break; 709 case quotePairOne: 710 byteOne=b; 711 cnv->toUBytes[1]=b; 712 cnv->toULength=2; 713 state=quotePairTwo; 714 break; 715 case quotePairTwo: 716 *target++=(UChar)((byteOne<<8)|b); 717 state=readCommand; 718 goto fastSingle; 719 case quoteOne: 720 if(b<0x80) { 721 /* all static offsets are in the BMP */ 722 *target++=(UChar)(staticOffsets[quoteWindow]+b); 723 } else { 724 /* write from dynamic window */ 725 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 726 if(c<=0xffff) { 727 *target++=(UChar)c; 728 } else { 729 /* output surrogate pair */ 730 *target++=(UChar)(0xd7c0+(c>>10)); 731 if(target<targetLimit) { 732 *target++=(UChar)(0xdc00|(c&0x3ff)); 733 } else { 734 /* target overflow */ 735 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); 736 cnv->UCharErrorBufferLength=1; 737 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 738 goto endloop; 739 } 740 } 741 } 742 state=readCommand; 743 goto fastSingle; 744 case definePairOne: 745 dynamicWindow=(int8_t)((b>>5)&7); 746 byteOne=(uint8_t)(b&0x1f); 747 cnv->toUBytes[1]=b; 748 cnv->toULength=2; 749 state=definePairTwo; 750 break; 751 case definePairTwo: 752 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 753 state=readCommand; 754 goto fastSingle; 755 case defineOne: 756 if(b==0) { 757 /* callback(illegal): Reserved window offset value 0 */ 758 cnv->toUBytes[1]=b; 759 cnv->toULength=2; 760 goto endloop; 761 } else if(b<gapThreshold) { 762 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 763 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 764 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 765 } else if(b>=fixedThreshold) { 766 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 767 } else { 768 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 769 cnv->toUBytes[1]=b; 770 cnv->toULength=2; 771 goto endloop; 772 } 773 state=readCommand; 774 goto fastSingle; 775 } 776 } 777 } else { 778 /* fast path for Unicode mode */ 779 if(state==readCommand) { 780 fastUnicode: 781 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 782 *target++=(UChar)((b<<8)|source[1]); 783 source+=2; 784 } 785 } 786 787 /* normal state machine for Unicode mode */ 788 /* unicodeByteMode: */ 789 while(source<sourceLimit) { 790 if(target>=targetLimit) { 791 /* target is full */ 792 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 793 break; 794 } 795 b=*source++; 796 switch(state) { 797 case readCommand: 798 if((uint8_t)(b-UC0)>(Urs-UC0)) { 799 byteOne=b; 800 cnv->toUBytes[0]=b; 801 cnv->toULength=1; 802 state=quotePairTwo; 803 } else if(/* UC0<=b && */ b<=UC7) { 804 dynamicWindow=(int8_t)(b-UC0); 805 isSingleByteMode=TRUE; 806 goto fastSingle; 807 } else if(/* UD0<=b && */ b<=UD7) { 808 dynamicWindow=(int8_t)(b-UD0); 809 isSingleByteMode=TRUE; 810 cnv->toUBytes[0]=b; 811 cnv->toULength=1; 812 state=defineOne; 813 goto singleByteMode; 814 } else if(b==UDX) { 815 isSingleByteMode=TRUE; 816 cnv->toUBytes[0]=b; 817 cnv->toULength=1; 818 state=definePairOne; 819 goto singleByteMode; 820 } else if(b==UQU) { 821 cnv->toUBytes[0]=b; 822 cnv->toULength=1; 823 state=quotePairOne; 824 } else /* Urs */ { 825 /* callback(illegal) */ 826 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 827 cnv->toUBytes[0]=b; 828 cnv->toULength=1; 829 goto endloop; 830 } 831 break; 832 case quotePairOne: 833 byteOne=b; 834 cnv->toUBytes[1]=b; 835 cnv->toULength=2; 836 state=quotePairTwo; 837 break; 838 case quotePairTwo: 839 *target++=(UChar)((byteOne<<8)|b); 840 state=readCommand; 841 goto fastUnicode; 842 } 843 } 844 } 845 endloop: 846 847 /* set the converter state back into UConverter */ 848 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 849 /* reset to deal with the next character */ 850 state=readCommand; 851 } else if(state==readCommand) { 852 /* not in a multi-byte sequence, reset toULength */ 853 cnv->toULength=0; 854 } 855 scsu->toUIsSingleByteMode=isSingleByteMode; 856 scsu->toUState=state; 857 scsu->toUQuoteWindow=quoteWindow; 858 scsu->toUDynamicWindow=dynamicWindow; 859 scsu->toUByteOne=byteOne; 860 861 /* write back the updated pointers */ 862 pArgs->source=(const char *)source; 863 pArgs->target=target; 864 return; 865 } 866 867 /* SCSU-from-Unicode conversion functions ----------------------------------- */ 868 869 /* 870 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve 871 * reasonable results. The lookahead is minimal. 872 * Many cases are simple: 873 * A character fits directly into the current mode, a dynamic or static window, 874 * or is not compressible. These cases are tested first. 875 * Real compression heuristics are applied to the rest, in code branches for 876 * single/Unicode mode and BMP/supplementary code points. 877 * The heuristics used here are extremely simple. 878 */ 879 880 /* get the number of the window that this character is in, or -1 */ 881 static int8_t 882 getWindow(const uint32_t offsets[8], uint32_t c) { 883 int i; 884 for(i=0; i<8; ++i) { 885 if((uint32_t)(c-offsets[i])<=0x7f) { 886 return (int8_t)(i); 887 } 888 } 889 return -1; 890 } 891 892 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ 893 static UBool 894 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { 895 return (UBool)(c<=offset+0x7f && 896 (c>=offset || (c<=0x7f && 897 (c>=0x20 || (1UL<<c)&0x2601)))); 898 /* binary 0010 0110 0000 0001, 899 check for b==0xd || b==0xa || b==9 || b==0 */ 900 } 901 902 /* 903 * getNextDynamicWindow returns the next dynamic window to be redefined 904 */ 905 static int8_t 906 getNextDynamicWindow(SCSUData *scsu) { 907 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; 908 if(++scsu->nextWindowUseIndex==8) { 909 scsu->nextWindowUseIndex=0; 910 } 911 return window; 912 } 913 914 /* 915 * useDynamicWindow() adjusts 916 * windowUse[] and nextWindowUseIndex for the algorithm to choose 917 * the next dynamic window to be defined; 918 * a subclass may override it and provide its own algorithm. 919 */ 920 static void 921 useDynamicWindow(SCSUData *scsu, int8_t window) { 922 /* 923 * move the existing window, which just became the most recently used one, 924 * up in windowUse[] to nextWindowUseIndex-1 925 */ 926 927 /* first, find the index of the window - backwards to favor the more recently used windows */ 928 int i, j; 929 930 i=scsu->nextWindowUseIndex; 931 do { 932 if(--i<0) { 933 i=7; 934 } 935 } while(scsu->windowUse[i]!=window); 936 937 /* now copy each windowUse[i+1] to [i] */ 938 j=i+1; 939 if(j==8) { 940 j=0; 941 } 942 while(j!=scsu->nextWindowUseIndex) { 943 scsu->windowUse[i]=scsu->windowUse[j]; 944 i=j; 945 if(++j==8) { j=0; } 946 } 947 948 /* finally, set the window into the most recently used index */ 949 scsu->windowUse[i]=window; 950 } 951 952 /* 953 * calculate the offset and the code for a dynamic window that contains the character 954 * takes fixed offsets into account 955 * the offset of the window is stored in the offset variable, 956 * the code is returned 957 * 958 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code 959 */ 960 static int 961 getDynamicOffset(uint32_t c, uint32_t *pOffset) { 962 int i; 963 964 for(i=0; i<7; ++i) { 965 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { 966 *pOffset=fixedOffsets[i]; 967 return 0xf9+i; 968 } 969 } 970 971 if(c<0x80) { 972 /* No dynamic window for US-ASCII. */ 973 return -1; 974 } else if(c<0x3400 || 975 (uint32_t)(c-0x10000)<(0x14000-0x10000) || 976 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) 977 ) { 978 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ 979 *pOffset=c&0x7fffff80; 980 return (int)(c>>7); 981 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { 982 /* For these characters we need to take the gapOffset into account. */ 983 *pOffset=c&0x7fffff80; 984 return (int)((c-gapOffset)>>7); 985 } else { 986 return -1; 987 } 988 } 989 990 /* 991 * Idea for compression: 992 * - save SCSUData and other state before really starting work 993 * - at endloop, see if compression could be better with just unicode mode 994 * - don't do this if a callback has been called 995 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning 996 * - different buffer handling! 997 * 998 * Drawback or need for corrective handling: 999 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and 1000 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible 1001 * not only for compression but also for HTML/XML documents with following charset/encoding announcers. 1002 * 1003 * How to achieve both? 1004 * - Only replace the result after an SDX or SCU? 1005 */ 1006 1007 static void 1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1009 UErrorCode *pErrorCode) { 1010 UConverter *cnv; 1011 SCSUData *scsu; 1012 const UChar *source, *sourceLimit; 1013 uint8_t *target; 1014 int32_t targetCapacity; 1015 int32_t *offsets; 1016 1017 UBool isSingleByteMode; 1018 uint8_t dynamicWindow; 1019 uint32_t currentOffset; 1020 1021 uint32_t c, delta; 1022 1023 int32_t sourceIndex, nextSourceIndex; 1024 1025 int32_t length; 1026 1027 /* variables for compression heuristics */ 1028 uint32_t offset; 1029 UChar lead, trail; 1030 int code; 1031 int8_t window; 1032 1033 /* set up the local pointers */ 1034 cnv=pArgs->converter; 1035 scsu=(SCSUData *)cnv->extraInfo; 1036 1037 /* set up the local pointers */ 1038 source=pArgs->source; 1039 sourceLimit=pArgs->sourceLimit; 1040 target=(uint8_t *)pArgs->target; 1041 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1042 offsets=pArgs->offsets; 1043 1044 /* get the state machine state */ 1045 isSingleByteMode=scsu->fromUIsSingleByteMode; 1046 dynamicWindow=scsu->fromUDynamicWindow; 1047 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1048 1049 c=cnv->fromUChar32; 1050 1051 /* sourceIndex=-1 if the current character began in the previous buffer */ 1052 sourceIndex= c==0 ? 0 : -1; 1053 nextSourceIndex=0; 1054 1055 /* similar conversion "loop" as in toUnicode */ 1056 loop: 1057 if(isSingleByteMode) { 1058 if(c!=0 && targetCapacity>0) { 1059 goto getTrailSingle; 1060 } 1061 1062 /* state machine for single-byte mode */ 1063 /* singleByteMode: */ 1064 while(source<sourceLimit) { 1065 if(targetCapacity<=0) { 1066 /* target is full */ 1067 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1068 break; 1069 } 1070 c=*source++; 1071 ++nextSourceIndex; 1072 1073 if((c-0x20)<=0x5f) { 1074 /* pass US-ASCII graphic character through */ 1075 *target++=(uint8_t)c; 1076 if(offsets!=NULL) { 1077 *offsets++=sourceIndex; 1078 } 1079 --targetCapacity; 1080 } else if(c<0x20) { 1081 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1082 /* CR/LF/TAB/NUL */ 1083 *target++=(uint8_t)c; 1084 if(offsets!=NULL) { 1085 *offsets++=sourceIndex; 1086 } 1087 --targetCapacity; 1088 } else { 1089 /* quote C0 control character */ 1090 c|=SQ0<<8; 1091 length=2; 1092 goto outputBytes; 1093 } 1094 } else if((delta=c-currentOffset)<=0x7f) { 1095 /* use the current dynamic window */ 1096 *target++=(uint8_t)(delta|0x80); 1097 if(offsets!=NULL) { 1098 *offsets++=sourceIndex; 1099 } 1100 --targetCapacity; 1101 } else if(UTF_IS_SURROGATE(c)) { 1102 if(UTF_IS_SURROGATE_FIRST(c)) { 1103 getTrailSingle: 1104 lead=(UChar)c; 1105 if(source<sourceLimit) { 1106 /* test the following code unit */ 1107 trail=*source; 1108 if(UTF_IS_SECOND_SURROGATE(trail)) { 1109 ++source; 1110 ++nextSourceIndex; 1111 c=UTF16_GET_PAIR_VALUE(c, trail); 1112 /* convert this surrogate code point */ 1113 /* exit this condition tree */ 1114 } else { 1115 /* this is an unmatched lead code unit (1st surrogate) */ 1116 /* callback(illegal) */ 1117 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1118 goto endloop; 1119 } 1120 } else { 1121 /* no more input */ 1122 break; 1123 } 1124 } else { 1125 /* this is an unmatched trail code unit (2nd surrogate) */ 1126 /* callback(illegal) */ 1127 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1128 goto endloop; 1129 } 1130 1131 /* compress supplementary character U+10000..U+10ffff */ 1132 if((delta=c-currentOffset)<=0x7f) { 1133 /* use the current dynamic window */ 1134 *target++=(uint8_t)(delta|0x80); 1135 if(offsets!=NULL) { 1136 *offsets++=sourceIndex; 1137 } 1138 --targetCapacity; 1139 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1140 /* there is a dynamic window that contains this character, change to it */ 1141 dynamicWindow=window; 1142 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1143 useDynamicWindow(scsu, dynamicWindow); 1144 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1145 length=2; 1146 goto outputBytes; 1147 } else if((code=getDynamicOffset(c, &offset))>=0) { 1148 /* might check if there are more characters in this window to come */ 1149 /* define an extended window with this character */ 1150 code-=0x200; 1151 dynamicWindow=getNextDynamicWindow(scsu); 1152 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1153 useDynamicWindow(scsu, dynamicWindow); 1154 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1155 length=4; 1156 goto outputBytes; 1157 } else { 1158 /* change to Unicode mode and output this (lead, trail) pair */ 1159 isSingleByteMode=FALSE; 1160 *target++=(uint8_t)SCU; 1161 if(offsets!=NULL) { 1162 *offsets++=sourceIndex; 1163 } 1164 --targetCapacity; 1165 c=((uint32_t)lead<<16)|trail; 1166 length=4; 1167 goto outputBytes; 1168 } 1169 } else if(c<0xa0) { 1170 /* quote C1 control character */ 1171 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1172 length=2; 1173 goto outputBytes; 1174 } else if(c==0xfeff || c>=0xfff0) { 1175 /* quote signature character=byte order mark and specials */ 1176 c|=SQU<<16; 1177 length=3; 1178 goto outputBytes; 1179 } else { 1180 /* compress all other BMP characters */ 1181 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1182 /* there is a window defined that contains this character - switch to it or quote from it? */ 1183 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1184 /* change to dynamic window */ 1185 dynamicWindow=window; 1186 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1187 useDynamicWindow(scsu, dynamicWindow); 1188 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1189 length=2; 1190 goto outputBytes; 1191 } else { 1192 /* quote from dynamic window */ 1193 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1194 length=2; 1195 goto outputBytes; 1196 } 1197 } else if((window=getWindow(staticOffsets, c))>=0) { 1198 /* quote from static window */ 1199 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1200 length=2; 1201 goto outputBytes; 1202 } else if((code=getDynamicOffset(c, &offset))>=0) { 1203 /* define a dynamic window with this character */ 1204 dynamicWindow=getNextDynamicWindow(scsu); 1205 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1206 useDynamicWindow(scsu, dynamicWindow); 1207 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1208 length=3; 1209 goto outputBytes; 1210 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1211 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1212 ) { 1213 /* 1214 * this character is not compressible (a BMP ideograph or similar); 1215 * switch to Unicode mode if this is the last character in the block 1216 * or there is at least one more ideograph following immediately 1217 */ 1218 isSingleByteMode=FALSE; 1219 c|=SCU<<16; 1220 length=3; 1221 goto outputBytes; 1222 } else { 1223 /* quote Unicode */ 1224 c|=SQU<<16; 1225 length=3; 1226 goto outputBytes; 1227 } 1228 } 1229 1230 /* normal end of conversion: prepare for a new character */ 1231 c=0; 1232 sourceIndex=nextSourceIndex; 1233 } 1234 } else { 1235 if(c!=0 && targetCapacity>0) { 1236 goto getTrailUnicode; 1237 } 1238 1239 /* state machine for Unicode mode */ 1240 /* unicodeByteMode: */ 1241 while(source<sourceLimit) { 1242 if(targetCapacity<=0) { 1243 /* target is full */ 1244 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1245 break; 1246 } 1247 c=*source++; 1248 ++nextSourceIndex; 1249 1250 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1251 /* not compressible, write character directly */ 1252 if(targetCapacity>=2) { 1253 *target++=(uint8_t)(c>>8); 1254 *target++=(uint8_t)c; 1255 if(offsets!=NULL) { 1256 *offsets++=sourceIndex; 1257 *offsets++=sourceIndex; 1258 } 1259 targetCapacity-=2; 1260 } else { 1261 length=2; 1262 goto outputBytes; 1263 } 1264 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1265 /* compress BMP character if the following one is not an uncompressible ideograph */ 1266 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1267 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1268 /* ASCII digit or letter */ 1269 isSingleByteMode=TRUE; 1270 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1271 length=2; 1272 goto outputBytes; 1273 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1274 /* there is a dynamic window that contains this character, change to it */ 1275 isSingleByteMode=TRUE; 1276 dynamicWindow=window; 1277 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1278 useDynamicWindow(scsu, dynamicWindow); 1279 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1280 length=2; 1281 goto outputBytes; 1282 } else if((code=getDynamicOffset(c, &offset))>=0) { 1283 /* define a dynamic window with this character */ 1284 isSingleByteMode=TRUE; 1285 dynamicWindow=getNextDynamicWindow(scsu); 1286 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1287 useDynamicWindow(scsu, dynamicWindow); 1288 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1289 length=3; 1290 goto outputBytes; 1291 } 1292 } 1293 1294 /* don't know how to compress this character, just write it directly */ 1295 length=2; 1296 goto outputBytes; 1297 } else if(c<0xe000) { 1298 /* c is a surrogate */ 1299 if(UTF_IS_SURROGATE_FIRST(c)) { 1300 getTrailUnicode: 1301 lead=(UChar)c; 1302 if(source<sourceLimit) { 1303 /* test the following code unit */ 1304 trail=*source; 1305 if(UTF_IS_SECOND_SURROGATE(trail)) { 1306 ++source; 1307 ++nextSourceIndex; 1308 c=UTF16_GET_PAIR_VALUE(c, trail); 1309 /* convert this surrogate code point */ 1310 /* exit this condition tree */ 1311 } else { 1312 /* this is an unmatched lead code unit (1st surrogate) */ 1313 /* callback(illegal) */ 1314 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1315 goto endloop; 1316 } 1317 } else { 1318 /* no more input */ 1319 break; 1320 } 1321 } else { 1322 /* this is an unmatched trail code unit (2nd surrogate) */ 1323 /* callback(illegal) */ 1324 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1325 goto endloop; 1326 } 1327 1328 /* compress supplementary character */ 1329 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1330 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1331 ) { 1332 /* 1333 * there is a dynamic window that contains this character and 1334 * the following character is not uncompressible, 1335 * change to the window 1336 */ 1337 isSingleByteMode=TRUE; 1338 dynamicWindow=window; 1339 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1340 useDynamicWindow(scsu, dynamicWindow); 1341 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1342 length=2; 1343 goto outputBytes; 1344 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1345 (code=getDynamicOffset(c, &offset))>=0 1346 ) { 1347 /* two supplementary characters in (probably) the same window - define an extended one */ 1348 isSingleByteMode=TRUE; 1349 code-=0x200; 1350 dynamicWindow=getNextDynamicWindow(scsu); 1351 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1352 useDynamicWindow(scsu, dynamicWindow); 1353 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1354 length=4; 1355 goto outputBytes; 1356 } else { 1357 /* don't know how to compress this character, just write it directly */ 1358 c=((uint32_t)lead<<16)|trail; 1359 length=4; 1360 goto outputBytes; 1361 } 1362 } else /* 0xe000<=c<0xf300 */ { 1363 /* quote to avoid SCSU tags */ 1364 c|=UQU<<16; 1365 length=3; 1366 goto outputBytes; 1367 } 1368 1369 /* normal end of conversion: prepare for a new character */ 1370 c=0; 1371 sourceIndex=nextSourceIndex; 1372 } 1373 } 1374 endloop: 1375 1376 /* set the converter state back into UConverter */ 1377 scsu->fromUIsSingleByteMode=isSingleByteMode; 1378 scsu->fromUDynamicWindow=dynamicWindow; 1379 1380 cnv->fromUChar32=c; 1381 1382 /* write back the updated pointers */ 1383 pArgs->source=source; 1384 pArgs->target=(char *)target; 1385 pArgs->offsets=offsets; 1386 return; 1387 1388 outputBytes: 1389 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1390 /* from the first if in the loop we know that targetCapacity>0 */ 1391 if(length<=targetCapacity) { 1392 if(offsets==NULL) { 1393 switch(length) { 1394 /* each branch falls through to the next one */ 1395 case 4: 1396 *target++=(uint8_t)(c>>24); 1397 case 3: 1398 *target++=(uint8_t)(c>>16); 1399 case 2: 1400 *target++=(uint8_t)(c>>8); 1401 case 1: 1402 *target++=(uint8_t)c; 1403 default: 1404 /* will never occur */ 1405 break; 1406 } 1407 } else { 1408 switch(length) { 1409 /* each branch falls through to the next one */ 1410 case 4: 1411 *target++=(uint8_t)(c>>24); 1412 *offsets++=sourceIndex; 1413 case 3: 1414 *target++=(uint8_t)(c>>16); 1415 *offsets++=sourceIndex; 1416 case 2: 1417 *target++=(uint8_t)(c>>8); 1418 *offsets++=sourceIndex; 1419 case 1: 1420 *target++=(uint8_t)c; 1421 *offsets++=sourceIndex; 1422 default: 1423 /* will never occur */ 1424 break; 1425 } 1426 } 1427 targetCapacity-=length; 1428 1429 /* normal end of conversion: prepare for a new character */ 1430 c=0; 1431 sourceIndex=nextSourceIndex; 1432 goto loop; 1433 } else { 1434 uint8_t *p; 1435 1436 /* 1437 * We actually do this backwards here: 1438 * In order to save an intermediate variable, we output 1439 * first to the overflow buffer what does not fit into the 1440 * regular target. 1441 */ 1442 /* we know that 0<=targetCapacity<length<=4 */ 1443 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1444 length-=targetCapacity; 1445 p=(uint8_t *)cnv->charErrorBuffer; 1446 switch(length) { 1447 /* each branch falls through to the next one */ 1448 case 4: 1449 *p++=(uint8_t)(c>>24); 1450 case 3: 1451 *p++=(uint8_t)(c>>16); 1452 case 2: 1453 *p++=(uint8_t)(c>>8); 1454 case 1: 1455 *p=(uint8_t)c; 1456 default: 1457 /* will never occur */ 1458 break; 1459 } 1460 cnv->charErrorBufferLength=(int8_t)length; 1461 1462 /* now output what fits into the regular target */ 1463 c>>=8*length; /* length was reduced by targetCapacity */ 1464 switch(targetCapacity) { 1465 /* each branch falls through to the next one */ 1466 case 3: 1467 *target++=(uint8_t)(c>>16); 1468 if(offsets!=NULL) { 1469 *offsets++=sourceIndex; 1470 } 1471 case 2: 1472 *target++=(uint8_t)(c>>8); 1473 if(offsets!=NULL) { 1474 *offsets++=sourceIndex; 1475 } 1476 case 1: 1477 *target++=(uint8_t)c; 1478 if(offsets!=NULL) { 1479 *offsets++=sourceIndex; 1480 } 1481 default: 1482 break; 1483 } 1484 1485 /* target overflow */ 1486 targetCapacity=0; 1487 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1488 c=0; 1489 goto endloop; 1490 } 1491 } 1492 1493 /* 1494 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. 1495 * If a change is made in the original function, then either 1496 * change this function the same way or 1497 * re-copy the original function and remove the variables 1498 * offsets, sourceIndex, and nextSourceIndex. 1499 */ 1500 static void 1501 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, 1502 UErrorCode *pErrorCode) { 1503 UConverter *cnv; 1504 SCSUData *scsu; 1505 const UChar *source, *sourceLimit; 1506 uint8_t *target; 1507 int32_t targetCapacity; 1508 1509 UBool isSingleByteMode; 1510 uint8_t dynamicWindow; 1511 uint32_t currentOffset; 1512 1513 uint32_t c, delta; 1514 1515 int32_t length; 1516 1517 /* variables for compression heuristics */ 1518 uint32_t offset; 1519 UChar lead, trail; 1520 int code; 1521 int8_t window; 1522 1523 /* set up the local pointers */ 1524 cnv=pArgs->converter; 1525 scsu=(SCSUData *)cnv->extraInfo; 1526 1527 /* set up the local pointers */ 1528 source=pArgs->source; 1529 sourceLimit=pArgs->sourceLimit; 1530 target=(uint8_t *)pArgs->target; 1531 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1532 1533 /* get the state machine state */ 1534 isSingleByteMode=scsu->fromUIsSingleByteMode; 1535 dynamicWindow=scsu->fromUDynamicWindow; 1536 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1537 1538 c=cnv->fromUChar32; 1539 1540 /* similar conversion "loop" as in toUnicode */ 1541 loop: 1542 if(isSingleByteMode) { 1543 if(c!=0 && targetCapacity>0) { 1544 goto getTrailSingle; 1545 } 1546 1547 /* state machine for single-byte mode */ 1548 /* singleByteMode: */ 1549 while(source<sourceLimit) { 1550 if(targetCapacity<=0) { 1551 /* target is full */ 1552 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1553 break; 1554 } 1555 c=*source++; 1556 1557 if((c-0x20)<=0x5f) { 1558 /* pass US-ASCII graphic character through */ 1559 *target++=(uint8_t)c; 1560 --targetCapacity; 1561 } else if(c<0x20) { 1562 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1563 /* CR/LF/TAB/NUL */ 1564 *target++=(uint8_t)c; 1565 --targetCapacity; 1566 } else { 1567 /* quote C0 control character */ 1568 c|=SQ0<<8; 1569 length=2; 1570 goto outputBytes; 1571 } 1572 } else if((delta=c-currentOffset)<=0x7f) { 1573 /* use the current dynamic window */ 1574 *target++=(uint8_t)(delta|0x80); 1575 --targetCapacity; 1576 } else if(UTF_IS_SURROGATE(c)) { 1577 if(UTF_IS_SURROGATE_FIRST(c)) { 1578 getTrailSingle: 1579 lead=(UChar)c; 1580 if(source<sourceLimit) { 1581 /* test the following code unit */ 1582 trail=*source; 1583 if(UTF_IS_SECOND_SURROGATE(trail)) { 1584 ++source; 1585 c=UTF16_GET_PAIR_VALUE(c, trail); 1586 /* convert this surrogate code point */ 1587 /* exit this condition tree */ 1588 } else { 1589 /* this is an unmatched lead code unit (1st surrogate) */ 1590 /* callback(illegal) */ 1591 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1592 goto endloop; 1593 } 1594 } else { 1595 /* no more input */ 1596 break; 1597 } 1598 } else { 1599 /* this is an unmatched trail code unit (2nd surrogate) */ 1600 /* callback(illegal) */ 1601 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1602 goto endloop; 1603 } 1604 1605 /* compress supplementary character U+10000..U+10ffff */ 1606 if((delta=c-currentOffset)<=0x7f) { 1607 /* use the current dynamic window */ 1608 *target++=(uint8_t)(delta|0x80); 1609 --targetCapacity; 1610 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1611 /* there is a dynamic window that contains this character, change to it */ 1612 dynamicWindow=window; 1613 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1614 useDynamicWindow(scsu, dynamicWindow); 1615 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1616 length=2; 1617 goto outputBytes; 1618 } else if((code=getDynamicOffset(c, &offset))>=0) { 1619 /* might check if there are more characters in this window to come */ 1620 /* define an extended window with this character */ 1621 code-=0x200; 1622 dynamicWindow=getNextDynamicWindow(scsu); 1623 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1624 useDynamicWindow(scsu, dynamicWindow); 1625 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1626 length=4; 1627 goto outputBytes; 1628 } else { 1629 /* change to Unicode mode and output this (lead, trail) pair */ 1630 isSingleByteMode=FALSE; 1631 *target++=(uint8_t)SCU; 1632 --targetCapacity; 1633 c=((uint32_t)lead<<16)|trail; 1634 length=4; 1635 goto outputBytes; 1636 } 1637 } else if(c<0xa0) { 1638 /* quote C1 control character */ 1639 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1640 length=2; 1641 goto outputBytes; 1642 } else if(c==0xfeff || c>=0xfff0) { 1643 /* quote signature character=byte order mark and specials */ 1644 c|=SQU<<16; 1645 length=3; 1646 goto outputBytes; 1647 } else { 1648 /* compress all other BMP characters */ 1649 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1650 /* there is a window defined that contains this character - switch to it or quote from it? */ 1651 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1652 /* change to dynamic window */ 1653 dynamicWindow=window; 1654 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1655 useDynamicWindow(scsu, dynamicWindow); 1656 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1657 length=2; 1658 goto outputBytes; 1659 } else { 1660 /* quote from dynamic window */ 1661 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1662 length=2; 1663 goto outputBytes; 1664 } 1665 } else if((window=getWindow(staticOffsets, c))>=0) { 1666 /* quote from static window */ 1667 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1668 length=2; 1669 goto outputBytes; 1670 } else if((code=getDynamicOffset(c, &offset))>=0) { 1671 /* define a dynamic window with this character */ 1672 dynamicWindow=getNextDynamicWindow(scsu); 1673 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1674 useDynamicWindow(scsu, dynamicWindow); 1675 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1676 length=3; 1677 goto outputBytes; 1678 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && 1679 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1680 ) { 1681 /* 1682 * this character is not compressible (a BMP ideograph or similar); 1683 * switch to Unicode mode if this is the last character in the block 1684 * or there is at least one more ideograph following immediately 1685 */ 1686 isSingleByteMode=FALSE; 1687 c|=SCU<<16; 1688 length=3; 1689 goto outputBytes; 1690 } else { 1691 /* quote Unicode */ 1692 c|=SQU<<16; 1693 length=3; 1694 goto outputBytes; 1695 } 1696 } 1697 1698 /* normal end of conversion: prepare for a new character */ 1699 c=0; 1700 } 1701 } else { 1702 if(c!=0 && targetCapacity>0) { 1703 goto getTrailUnicode; 1704 } 1705 1706 /* state machine for Unicode mode */ 1707 /* unicodeByteMode: */ 1708 while(source<sourceLimit) { 1709 if(targetCapacity<=0) { 1710 /* target is full */ 1711 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1712 break; 1713 } 1714 c=*source++; 1715 1716 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { 1717 /* not compressible, write character directly */ 1718 if(targetCapacity>=2) { 1719 *target++=(uint8_t)(c>>8); 1720 *target++=(uint8_t)c; 1721 targetCapacity-=2; 1722 } else { 1723 length=2; 1724 goto outputBytes; 1725 } 1726 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { 1727 /* compress BMP character if the following one is not an uncompressible ideograph */ 1728 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1729 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { 1730 /* ASCII digit or letter */ 1731 isSingleByteMode=TRUE; 1732 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1733 length=2; 1734 goto outputBytes; 1735 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1736 /* there is a dynamic window that contains this character, change to it */ 1737 isSingleByteMode=TRUE; 1738 dynamicWindow=window; 1739 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1740 useDynamicWindow(scsu, dynamicWindow); 1741 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1742 length=2; 1743 goto outputBytes; 1744 } else if((code=getDynamicOffset(c, &offset))>=0) { 1745 /* define a dynamic window with this character */ 1746 isSingleByteMode=TRUE; 1747 dynamicWindow=getNextDynamicWindow(scsu); 1748 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1749 useDynamicWindow(scsu, dynamicWindow); 1750 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1751 length=3; 1752 goto outputBytes; 1753 } 1754 } 1755 1756 /* don't know how to compress this character, just write it directly */ 1757 length=2; 1758 goto outputBytes; 1759 } else if(c<0xe000) { 1760 /* c is a surrogate */ 1761 if(UTF_IS_SURROGATE_FIRST(c)) { 1762 getTrailUnicode: 1763 lead=(UChar)c; 1764 if(source<sourceLimit) { 1765 /* test the following code unit */ 1766 trail=*source; 1767 if(UTF_IS_SECOND_SURROGATE(trail)) { 1768 ++source; 1769 c=UTF16_GET_PAIR_VALUE(c, trail); 1770 /* convert this surrogate code point */ 1771 /* exit this condition tree */ 1772 } else { 1773 /* this is an unmatched lead code unit (1st surrogate) */ 1774 /* callback(illegal) */ 1775 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1776 goto endloop; 1777 } 1778 } else { 1779 /* no more input */ 1780 break; 1781 } 1782 } else { 1783 /* this is an unmatched trail code unit (2nd surrogate) */ 1784 /* callback(illegal) */ 1785 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1786 goto endloop; 1787 } 1788 1789 /* compress supplementary character */ 1790 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1791 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1792 ) { 1793 /* 1794 * there is a dynamic window that contains this character and 1795 * the following character is not uncompressible, 1796 * change to the window 1797 */ 1798 isSingleByteMode=TRUE; 1799 dynamicWindow=window; 1800 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1801 useDynamicWindow(scsu, dynamicWindow); 1802 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1803 length=2; 1804 goto outputBytes; 1805 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1806 (code=getDynamicOffset(c, &offset))>=0 1807 ) { 1808 /* two supplementary characters in (probably) the same window - define an extended one */ 1809 isSingleByteMode=TRUE; 1810 code-=0x200; 1811 dynamicWindow=getNextDynamicWindow(scsu); 1812 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1813 useDynamicWindow(scsu, dynamicWindow); 1814 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1815 length=4; 1816 goto outputBytes; 1817 } else { 1818 /* don't know how to compress this character, just write it directly */ 1819 c=((uint32_t)lead<<16)|trail; 1820 length=4; 1821 goto outputBytes; 1822 } 1823 } else /* 0xe000<=c<0xf300 */ { 1824 /* quote to avoid SCSU tags */ 1825 c|=UQU<<16; 1826 length=3; 1827 goto outputBytes; 1828 } 1829 1830 /* normal end of conversion: prepare for a new character */ 1831 c=0; 1832 } 1833 } 1834 endloop: 1835 1836 /* set the converter state back into UConverter */ 1837 scsu->fromUIsSingleByteMode=isSingleByteMode; 1838 scsu->fromUDynamicWindow=dynamicWindow; 1839 1840 cnv->fromUChar32=c; 1841 1842 /* write back the updated pointers */ 1843 pArgs->source=source; 1844 pArgs->target=(char *)target; 1845 return; 1846 1847 outputBytes: 1848 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1849 /* from the first if in the loop we know that targetCapacity>0 */ 1850 if(length<=targetCapacity) { 1851 switch(length) { 1852 /* each branch falls through to the next one */ 1853 case 4: 1854 *target++=(uint8_t)(c>>24); 1855 case 3: 1856 *target++=(uint8_t)(c>>16); 1857 case 2: 1858 *target++=(uint8_t)(c>>8); 1859 case 1: 1860 *target++=(uint8_t)c; 1861 default: 1862 /* will never occur */ 1863 break; 1864 } 1865 targetCapacity-=length; 1866 1867 /* normal end of conversion: prepare for a new character */ 1868 c=0; 1869 goto loop; 1870 } else { 1871 uint8_t *p; 1872 1873 /* 1874 * We actually do this backwards here: 1875 * In order to save an intermediate variable, we output 1876 * first to the overflow buffer what does not fit into the 1877 * regular target. 1878 */ 1879 /* we know that 0<=targetCapacity<length<=4 */ 1880 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1881 length-=targetCapacity; 1882 p=(uint8_t *)cnv->charErrorBuffer; 1883 switch(length) { 1884 /* each branch falls through to the next one */ 1885 case 4: 1886 *p++=(uint8_t)(c>>24); 1887 case 3: 1888 *p++=(uint8_t)(c>>16); 1889 case 2: 1890 *p++=(uint8_t)(c>>8); 1891 case 1: 1892 *p=(uint8_t)c; 1893 default: 1894 /* will never occur */ 1895 break; 1896 } 1897 cnv->charErrorBufferLength=(int8_t)length; 1898 1899 /* now output what fits into the regular target */ 1900 c>>=8*length; /* length was reduced by targetCapacity */ 1901 switch(targetCapacity) { 1902 /* each branch falls through to the next one */ 1903 case 3: 1904 *target++=(uint8_t)(c>>16); 1905 case 2: 1906 *target++=(uint8_t)(c>>8); 1907 case 1: 1908 *target++=(uint8_t)c; 1909 default: 1910 break; 1911 } 1912 1913 /* target overflow */ 1914 targetCapacity=0; 1915 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1916 c=0; 1917 goto endloop; 1918 } 1919 } 1920 1921 /* miscellaneous ------------------------------------------------------------ */ 1922 1923 static const char * 1924 _SCSUGetName(const UConverter *cnv) { 1925 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 1926 1927 switch(scsu->locale) { 1928 case l_ja: 1929 return "SCSU,locale=ja"; 1930 default: 1931 return "SCSU"; 1932 } 1933 } 1934 1935 /* structure for SafeClone calculations */ 1936 struct cloneSCSUStruct 1937 { 1938 UConverter cnv; 1939 SCSUData mydata; 1940 }; 1941 1942 static UConverter * 1943 _SCSUSafeClone(const UConverter *cnv, 1944 void *stackBuffer, 1945 int32_t *pBufferSize, 1946 UErrorCode *status) 1947 { 1948 struct cloneSCSUStruct * localClone; 1949 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); 1950 1951 if (U_FAILURE(*status)){ 1952 return 0; 1953 } 1954 1955 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 1956 *pBufferSize = bufferSizeNeeded; 1957 return 0; 1958 } 1959 1960 localClone = (struct cloneSCSUStruct *)stackBuffer; 1961 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 1962 1963 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); 1964 localClone->cnv.extraInfo = &localClone->mydata; 1965 localClone->cnv.isExtraLocal = TRUE; 1966 1967 return &localClone->cnv; 1968 } 1969 1970 1971 static const UConverterImpl _SCSUImpl={ 1972 UCNV_SCSU, 1973 1974 NULL, 1975 NULL, 1976 1977 _SCSUOpen, 1978 _SCSUClose, 1979 _SCSUReset, 1980 1981 _SCSUToUnicode, 1982 _SCSUToUnicodeWithOffsets, 1983 _SCSUFromUnicode, 1984 _SCSUFromUnicodeWithOffsets, 1985 NULL, 1986 1987 NULL, 1988 _SCSUGetName, 1989 NULL, 1990 _SCSUSafeClone, 1991 ucnv_getCompleteUnicodeSet 1992 }; 1993 1994 static const UConverterStaticData _SCSUStaticData={ 1995 sizeof(UConverterStaticData), 1996 "SCSU", 1997 1212, /* CCSID for SCSU */ 1998 UCNV_IBM, UCNV_SCSU, 1999 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ 2000 /* 2001 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode 2002 * substitution string. 2003 */ 2004 { 0x0e, 0xff, 0xfd, 0 }, 3, 2005 FALSE, FALSE, 2006 0, 2007 0, 2008 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 2009 }; 2010 2011 const UConverterSharedData _SCSUData={ 2012 sizeof(UConverterSharedData), ~((uint32_t)0), 2013 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, 2014 0 2015 }; 2016 2017 #endif 2018