Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2015, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvscsu.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000nov18
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is an implementation of the Standard Compression Scheme for Unicode
     17 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
     18 *   Reserved commands and window settings are treated as illegal sequences and
     19 *   will result in callback calls.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 
     24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     25 
     26 #include "unicode/ucnv.h"
     27 #include "unicode/ucnv_cb.h"
     28 #include "unicode/utf16.h"
     29 #include "ucnv_bld.h"
     30 #include "ucnv_cnv.h"
     31 #include "cmemory.h"
     32 
     33 /* SCSU definitions --------------------------------------------------------- */
     34 
     35 /* SCSU command byte values */
        /*
         * Names starting with "S" are the tag bytes tested by the decoder in single-byte mode,
         * names starting with "U" are the tag bytes tested in Unicode mode.
         * For each run of eight per-window tags only the first and the last value are named;
         * the decoder derives the window number as (byte - first value of the run).
         */
     36 enum {
     37     SQ0=0x01, /* Quote from window pair 0 */
     38     SQ7=0x08, /* Quote from window pair 7 */
     39     SDX=0x0B, /* Define a window as extended */
     40     Srs=0x0C, /* reserved */
     41     SQU=0x0E, /* Quote a single Unicode character */
     42     SCU=0x0F, /* Change to Unicode mode */
     43     SC0=0x10, /* Select window 0 */
     44     SC7=0x17, /* Select window 7 */
     45     SD0=0x18, /* Define and select window 0 */
     46     SD7=0x1F, /* Define and select window 7 */
     47 
     48     UC0=0xE0, /* Select window 0 */
     49     UC7=0xE7, /* Select window 7 */
     50     UD0=0xE8, /* Define and select window 0 */
     51     UD7=0xEF, /* Define and select window 7 */
     52     UQU=0xF0, /* Quote a single Unicode character */
     53     UDX=0xF1, /* Define a Window as extended */
     54     Urs=0xF2  /* reserved */
     55 };
     56 
     57 enum {
            /*
             * Thresholds for the offset byte x that follows a "define window" tag (SD0..SD7, UD0..UD7).
             * As used by the decoder's defineOne state:
             *   x==0                          reserved (illegal)
             *   x<gapThreshold                window offset = x<<7
             *   gapThreshold<=x<reservedStart window offset = (x<<7)+gapOffset
             *   reservedStart<=x<fixedThreshold reserved (illegal)
             *   x>=fixedThreshold             window offset = fixedOffsets[x-fixedThreshold]
             */
     58     /*
     59      * Unicode code points from 3400 to E000 are not addressable by
     60      * dynamic window, since in these areas no short run alphabets are
     61      * found. Therefore add gapOffset to all values from gapThreshold.
     62      */
     63     gapThreshold=0x68,
     64     gapOffset=0xAC00,
     65 
     66     /* values between reservedStart and fixedThreshold are reserved */
     67     reservedStart=0xA8,
     68 
     69     /* use table of predefined fixed offsets for values from fixedThreshold */
     70     fixedThreshold=0xF9
     71 };
     72 
     73 /* constant offsets for the 8 static windows */
        /*
         * Indexed by the quote window number (tag byte - SQ0).
         * The decoder adds a quoted byte below 0x80 to the selected entry.
         */
     74 static const uint32_t staticOffsets[8]={
     75     0x0000, /* ASCII for quoted tags */
     76     0x0080, /* Latin - 1 Supplement (for access to punctuation) */
     77     0x0100, /* Latin Extended-A */
     78     0x0300, /* Combining Diacritical Marks */
     79     0x2000, /* General Punctuation */
     80     0x2080, /* Currency Symbols */
     81     0x2100, /* Letterlike Symbols and Number Forms */
     82     0x3000  /* CJK Symbols and punctuation */
     83 };
     84 
     85 /* initial offsets for the 8 dynamic (sliding) windows */
        /*
         * _SCSUReset copies this table into both SCSUData.toUDynamicOffsets and
         * SCSUData.fromUDynamicOffsets.
         */
     86 static const uint32_t initialDynamicOffsets[8]={
     87     0x0080, /* Latin-1 */
     88     0x00C0, /* Latin Extended A */
     89     0x0400, /* Cyrillic */
     90     0x0600, /* Arabic */
     91     0x0900, /* Devanagari */
     92     0x3040, /* Hiragana */
     93     0x30A0, /* Katakana */
     94     0xFF00  /* Fullwidth ASCII */
     95 };
     96 
     97 /* Table of fixed predefined Offsets */
        /*
         * Indexed by (window offset byte - fixedThreshold); the comment in front of each
         * entry is the offset byte value that selects it (0xF9..0xFF, 7 entries).
         */
     98 static const uint32_t fixedOffsets[]={
     99     /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    100     /* 0xFA */ 0x0250, /* IPA extensions */
    101     /* 0xFB */ 0x0370, /* Greek */
    102     /* 0xFC */ 0x0530, /* Armenian */
    103     /* 0xFD */ 0x3040, /* Hiragana */
    104     /* 0xFE */ 0x30A0, /* Katakana */
    105     /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
    106 };
    107 
    108 /* state values */
        /*
         * States of the SCSU-to-Unicode state machine (stored in SCSUData.toUState).
         * Every state other than readCommand means that a multi-byte sequence is pending.
         */
    109 enum {
    110     readCommand,    /* next byte is a character byte or a tag byte */
    111     quotePairOne,   /* after SQU/UQU: next byte is the high byte of a 16-bit unit */
    112     quotePairTwo,   /* next byte is the low byte of a 16-bit unit; the high byte is saved in byteOne */
    113     quoteOne,       /* after SQ0..SQ7: next byte is the quoted byte */
    114     definePairOne,  /* after SDX/UDX: next byte holds the window number (top 3 bits) and 5 offset bits */
    115     definePairTwo,  /* after SDX/UDX: next byte holds the remaining offset bits */
    116     defineOne       /* after SD0..SD7/UD0..UD7: next byte is the window offset byte */
    117 };
    118 
        /*
         * Per-converter SCSU state. _SCSUOpen allocates one instance and stores it in
         * UConverter.extraInfo; _SCSUReset initializes the toUnicode and fromUnicode parts.
         */
    119 typedef struct SCSUData {
    120     /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
    121     uint32_t toUDynamicOffsets[8];
    122     uint32_t fromUDynamicOffsets[8];
    123 
    124     /* state machine state - toUnicode */
    125     UBool toUIsSingleByteMode;  /* TRUE: single-byte mode, FALSE: Unicode mode */
    126     uint8_t toUState;           /* one of the state values readCommand..defineOne */
    127     int8_t toUQuoteWindow, toUDynamicWindow;    /* window numbers 0..7 for quoting / for the active dynamic window */
    128     uint8_t toUByteOne;         /* saved first byte of a pending two-byte sequence */
    129     uint8_t toUPadding[3];      /* not referenced by the code visible here; presumably alignment padding */
    130 
    131     /* state machine state - fromUnicode */
    132     UBool fromUIsSingleByteMode;
    133     int8_t fromUDynamicWindow;
    134 
    135     /*
    136      * windowUse[] keeps track of the use of the dynamic windows:
    137      * At nextWindowUseIndex there is the least recently used window,
    138      * and the following windows (in a wrapping manner) are more and more
    139      * recently used.
    140      * At nextWindowUseIndex-1 there is the most recently used window.
    141      */
    142     uint8_t locale;             /* lGeneric or l_ja, set by _SCSUOpen; selects the initial windowUse[] order */
    143     int8_t nextWindowUseIndex;
    144     int8_t windowUse[8];
    145 } SCSUData;
    146 
        /*
         * Initial contents for SCSUData.windowUse[]:
         * _SCSUReset copies initialWindowUse_ja when SCSUData.locale is l_ja,
         * and initialWindowUse otherwise.
         */
    147 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
    148 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
    149 
        /*
         * Values for SCSUData.locale: _SCSUOpen stores l_ja when the locale ID is "ja"
         * or starts with "ja_", and lGeneric for everything else.
         */
    150 enum {
    151     lGeneric, l_ja
    152 };
    153 
    154 /* SCSU setup functions ----------------------------------------------------- */
    155 
    156 static void
    157 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
    158     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
    159 
    160     if(choice<=UCNV_RESET_TO_UNICODE) {
    161         /* reset toUnicode */
    162         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
    163 
    164         scsu->toUIsSingleByteMode=TRUE;
    165         scsu->toUState=readCommand;
    166         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
    167         scsu->toUByteOne=0;
    168 
    169         cnv->toULength=0;
    170     }
    171     if(choice!=UCNV_RESET_TO_UNICODE) {
    172         /* reset fromUnicode */
    173         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
    174 
    175         scsu->fromUIsSingleByteMode=TRUE;
    176         scsu->fromUDynamicWindow=0;
    177 
    178         scsu->nextWindowUseIndex=0;
    179         switch(scsu->locale) {
    180         case l_ja:
    181             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
    182             break;
    183         default:
    184             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
    185             break;
    186         }
    187 
    188         cnv->fromUChar32=0;
    189     }
    190 }
    191 
    192 static void
    193 _SCSUOpen(UConverter *cnv,
    194           UConverterLoadArgs *pArgs,
    195           UErrorCode *pErrorCode) {
    196     const char *locale=pArgs->locale;
    197     if(pArgs->onlyTestIsLoadable) {
    198         return;
    199     }
    200     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
    201     if(cnv->extraInfo!=NULL) {
    202         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
    203             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
    204         } else {
    205             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
    206         }
    207         _SCSUReset(cnv, UCNV_RESET_BOTH);
    208     } else {
    209         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    210     }
    211 
    212     /* Set the substitution character U+fffd as a Unicode string. */
    213     cnv->subUChars[0]=0xfffd;
    214     cnv->subCharLen=-1;
    215 }
    216 
    217 static void
    218 _SCSUClose(UConverter *cnv) {
    219     if(cnv->extraInfo!=NULL) {
    220         if(!cnv->isExtraLocal) {
    221             uprv_free(cnv->extraInfo);
    222         }
    223         cnv->extraInfo=NULL;
    224     }
    225 }
    226 
    227 /* SCSU-to-Unicode conversion functions ------------------------------------- */
    228 
    229 static void
    230 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    231                           UErrorCode *pErrorCode) {
    232     UConverter *cnv;
    233     SCSUData *scsu;
    234     const uint8_t *source, *sourceLimit;
    235     UChar *target;
    236     const UChar *targetLimit;
    237     int32_t *offsets;
    238     UBool isSingleByteMode;
    239     uint8_t state, byteOne;
    240     int8_t quoteWindow, dynamicWindow;
    241 
    242     int32_t sourceIndex, nextSourceIndex;
    243 
    244     uint8_t b;
    245 
    246     /* set up the local pointers */
    247     cnv=pArgs->converter;
    248     scsu=(SCSUData *)cnv->extraInfo;
    249 
    250     source=(const uint8_t *)pArgs->source;
    251     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    252     target=pArgs->target;
    253     targetLimit=pArgs->targetLimit;
    254     offsets=pArgs->offsets;
    255 
    256     /* get the state machine state */
    257     isSingleByteMode=scsu->toUIsSingleByteMode;
    258     state=scsu->toUState;
    259     quoteWindow=scsu->toUQuoteWindow;
    260     dynamicWindow=scsu->toUDynamicWindow;
    261     byteOne=scsu->toUByteOne;
    262 
    263     /* sourceIndex=-1 if the current character began in the previous buffer */
    264     sourceIndex=state==readCommand ? 0 : -1;
    265     nextSourceIndex=0;
    266 
    267     /*
    268      * conversion "loop"
    269      *
    270      * For performance, this is not a normal C loop.
    271      * Instead, there are two code blocks for the two SCSU modes.
    272      * The function branches to either one, and a change of the mode is done with a goto to
    273      * the other branch.
    274      *
    275      * Each branch has two conventional loops:
    276      * - a fast-path loop for the most common codes in the mode
    277      * - a loop for all other codes in the mode
    278      * When the fast-path runs into a code that it cannot handle, its loop ends and it
    279      * runs into the following loop to handle the other codes.
    280      * The end of the input or output buffer is also handled by the slower loop.
    281      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
    282      *
    283      * The callback handling is done by returning with an error code.
    284      * The conversion framework actually calls the callback function.
    285      */
    286     if(isSingleByteMode) {
    287         /* fast path for single-byte mode */
    288         if(state==readCommand) {
    289 fastSingle:
    290             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
    291                 ++source;
    292                 ++nextSourceIndex;
    293                 if(b<=0x7f) {
    294                     /* write US-ASCII graphic character or DEL */
    295                     *target++=(UChar)b;
    296                     if(offsets!=NULL) {
    297                         *offsets++=sourceIndex;
    298                     }
    299                 } else {
    300                     /* write from dynamic window */
    301                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
    302                     if(c<=0xffff) {
    303                         *target++=(UChar)c;
    304                         if(offsets!=NULL) {
    305                             *offsets++=sourceIndex;
    306                         }
    307                     } else {
    308                         /* output surrogate pair */
    309                         *target++=(UChar)(0xd7c0+(c>>10));
    310                         if(target<targetLimit) {
    311                             *target++=(UChar)(0xdc00|(c&0x3ff));
    312                             if(offsets!=NULL) {
    313                                 *offsets++=sourceIndex;
    314                                 *offsets++=sourceIndex;
    315                             }
    316                         } else {
    317                             /* target overflow */
    318                             if(offsets!=NULL) {
    319                                 *offsets++=sourceIndex;
    320                             }
    321                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    322                             cnv->UCharErrorBufferLength=1;
    323                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    324                             goto endloop;
    325                         }
    326                     }
    327                 }
    328                 sourceIndex=nextSourceIndex;
    329             }
    330         }
    331 
    332         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
    333 singleByteMode:
    334         while(source<sourceLimit) {
    335             if(target>=targetLimit) {
    336                 /* target is full */
    337                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    338                 break;
    339             }
    340             b=*source++;
    341             ++nextSourceIndex;
    342             switch(state) {
    343             case readCommand:
    344                 /* redundant conditions are commented out */
    345                 /* here: b<0x20 because otherwise we would be in fastSingle */
    346                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
    347                     /* CR/LF/TAB/NUL */
    348                     *target++=(UChar)b;
    349                     if(offsets!=NULL) {
    350                         *offsets++=sourceIndex;
    351                     }
    352                     sourceIndex=nextSourceIndex;
    353                     goto fastSingle;
    354                 } else if(SC0<=b) {
    355                     if(b<=SC7) {
    356                         dynamicWindow=(int8_t)(b-SC0);
    357                         sourceIndex=nextSourceIndex;
    358                         goto fastSingle;
    359                     } else /* if(SD0<=b && b<=SD7) */ {
    360                         dynamicWindow=(int8_t)(b-SD0);
    361                         state=defineOne;
    362                     }
    363                 } else if(/* SQ0<=b && */ b<=SQ7) {
    364                     quoteWindow=(int8_t)(b-SQ0);
    365                     state=quoteOne;
    366                 } else if(b==SDX) {
    367                     state=definePairOne;
    368                 } else if(b==SQU) {
    369                     state=quotePairOne;
    370                 } else if(b==SCU) {
    371                     sourceIndex=nextSourceIndex;
    372                     isSingleByteMode=FALSE;
    373                     goto fastUnicode;
    374                 } else /* Srs */ {
    375                     /* callback(illegal) */
    376                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    377                     cnv->toUBytes[0]=b;
    378                     cnv->toULength=1;
    379                     goto endloop;
    380                 }
    381 
    382                 /* store the first byte of a multibyte sequence in toUBytes[] */
    383                 cnv->toUBytes[0]=b;
    384                 cnv->toULength=1;
    385                 break;
    386             case quotePairOne:
    387                 byteOne=b;
    388                 cnv->toUBytes[1]=b;
    389                 cnv->toULength=2;
    390                 state=quotePairTwo;
    391                 break;
    392             case quotePairTwo:
    393                 *target++=(UChar)((byteOne<<8)|b);
    394                 if(offsets!=NULL) {
    395                     *offsets++=sourceIndex;
    396                 }
    397                 sourceIndex=nextSourceIndex;
    398                 state=readCommand;
    399                 goto fastSingle;
    400             case quoteOne:
    401                 if(b<0x80) {
    402                     /* all static offsets are in the BMP */
    403                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
    404                     if(offsets!=NULL) {
    405                         *offsets++=sourceIndex;
    406                     }
    407                 } else {
    408                     /* write from dynamic window */
    409                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
    410                     if(c<=0xffff) {
    411                         *target++=(UChar)c;
    412                         if(offsets!=NULL) {
    413                             *offsets++=sourceIndex;
    414                         }
    415                     } else {
    416                         /* output surrogate pair */
    417                         *target++=(UChar)(0xd7c0+(c>>10));
    418                         if(target<targetLimit) {
    419                             *target++=(UChar)(0xdc00|(c&0x3ff));
    420                             if(offsets!=NULL) {
    421                                 *offsets++=sourceIndex;
    422                                 *offsets++=sourceIndex;
    423                             }
    424                         } else {
    425                             /* target overflow */
    426                             if(offsets!=NULL) {
    427                                 *offsets++=sourceIndex;
    428                             }
    429                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    430                             cnv->UCharErrorBufferLength=1;
    431                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    432                             goto endloop;
    433                         }
    434                     }
    435                 }
    436                 sourceIndex=nextSourceIndex;
    437                 state=readCommand;
    438                 goto fastSingle;
    439             case definePairOne:
    440                 dynamicWindow=(int8_t)((b>>5)&7);
    441                 byteOne=(uint8_t)(b&0x1f);
    442                 cnv->toUBytes[1]=b;
    443                 cnv->toULength=2;
    444                 state=definePairTwo;
    445                 break;
    446             case definePairTwo:
    447                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
    448                 sourceIndex=nextSourceIndex;
    449                 state=readCommand;
    450                 goto fastSingle;
    451             case defineOne:
    452                 if(b==0) {
    453                     /* callback(illegal): Reserved window offset value 0 */
    454                     cnv->toUBytes[1]=b;
    455                     cnv->toULength=2;
    456                     goto endloop;
    457                 } else if(b<gapThreshold) {
    458                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
    459                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
    460                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
    461                 } else if(b>=fixedThreshold) {
    462                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
    463                 } else {
    464                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
    465                     cnv->toUBytes[1]=b;
    466                     cnv->toULength=2;
    467                     goto endloop;
    468                 }
    469                 sourceIndex=nextSourceIndex;
    470                 state=readCommand;
    471                 goto fastSingle;
    472             }
    473         }
    474     } else {
    475         /* fast path for Unicode mode */
    476         if(state==readCommand) {
    477 fastUnicode:
    478             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
    479                 *target++=(UChar)((b<<8)|source[1]);
    480                 if(offsets!=NULL) {
    481                     *offsets++=sourceIndex;
    482                 }
    483                 sourceIndex=nextSourceIndex;
    484                 nextSourceIndex+=2;
    485                 source+=2;
    486             }
    487         }
    488 
    489         /* normal state machine for Unicode mode */
    490 /* unicodeByteMode: */
    491         while(source<sourceLimit) {
    492             if(target>=targetLimit) {
    493                 /* target is full */
    494                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    495                 break;
    496             }
    497             b=*source++;
    498             ++nextSourceIndex;
    499             switch(state) {
    500             case readCommand:
    501                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
    502                     byteOne=b;
    503                     cnv->toUBytes[0]=b;
    504                     cnv->toULength=1;
    505                     state=quotePairTwo;
    506                 } else if(/* UC0<=b && */ b<=UC7) {
    507                     dynamicWindow=(int8_t)(b-UC0);
    508                     sourceIndex=nextSourceIndex;
    509                     isSingleByteMode=TRUE;
    510                     goto fastSingle;
    511                 } else if(/* UD0<=b && */ b<=UD7) {
    512                     dynamicWindow=(int8_t)(b-UD0);
    513                     isSingleByteMode=TRUE;
    514                     cnv->toUBytes[0]=b;
    515                     cnv->toULength=1;
    516                     state=defineOne;
    517                     goto singleByteMode;
    518                 } else if(b==UDX) {
    519                     isSingleByteMode=TRUE;
    520                     cnv->toUBytes[0]=b;
    521                     cnv->toULength=1;
    522                     state=definePairOne;
    523                     goto singleByteMode;
    524                 } else if(b==UQU) {
    525                     cnv->toUBytes[0]=b;
    526                     cnv->toULength=1;
    527                     state=quotePairOne;
    528                 } else /* Urs */ {
    529                     /* callback(illegal) */
    530                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    531                     cnv->toUBytes[0]=b;
    532                     cnv->toULength=1;
    533                     goto endloop;
    534                 }
    535                 break;
    536             case quotePairOne:
    537                 byteOne=b;
    538                 cnv->toUBytes[1]=b;
    539                 cnv->toULength=2;
    540                 state=quotePairTwo;
    541                 break;
    542             case quotePairTwo:
    543                 *target++=(UChar)((byteOne<<8)|b);
    544                 if(offsets!=NULL) {
    545                     *offsets++=sourceIndex;
    546                 }
    547                 sourceIndex=nextSourceIndex;
    548                 state=readCommand;
    549                 goto fastUnicode;
    550             }
    551         }
    552     }
    553 endloop:
    554 
    555     /* set the converter state back into UConverter */
    556     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
    557         /* reset to deal with the next character */
    558         state=readCommand;
    559     } else if(state==readCommand) {
    560         /* not in a multi-byte sequence, reset toULength */
    561         cnv->toULength=0;
    562     }
    563     scsu->toUIsSingleByteMode=isSingleByteMode;
    564     scsu->toUState=state;
    565     scsu->toUQuoteWindow=quoteWindow;
    566     scsu->toUDynamicWindow=dynamicWindow;
    567     scsu->toUByteOne=byteOne;
    568 
    569     /* write back the updated pointers */
    570     pArgs->source=(const char *)source;
    571     pArgs->target=target;
    572     pArgs->offsets=offsets;
    573     return;
    574 }
    575 
    576 /*
    577  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
    578  * If a change is made in the original function, then either
    579  * change this function the same way or
    580  * re-copy the original function and remove the variables
    581  * offsets, sourceIndex, and nextSourceIndex.
    582  */
    583 static void
    584 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
    585                UErrorCode *pErrorCode) {
    586     UConverter *cnv;
    587     SCSUData *scsu;
    588     const uint8_t *source, *sourceLimit;
    589     UChar *target;
    590     const UChar *targetLimit;
    591     UBool isSingleByteMode;
    592     uint8_t state, byteOne;
    593     int8_t quoteWindow, dynamicWindow;
    594 
    595     uint8_t b;
    596 
    597     /* set up the local pointers */
    598     cnv=pArgs->converter;
    599     scsu=(SCSUData *)cnv->extraInfo;
    600 
    601     source=(const uint8_t *)pArgs->source;
    602     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    603     target=pArgs->target;
    604     targetLimit=pArgs->targetLimit;
    605 
    606     /* get the state machine state */
    607     isSingleByteMode=scsu->toUIsSingleByteMode;
    608     state=scsu->toUState;
    609     quoteWindow=scsu->toUQuoteWindow;
    610     dynamicWindow=scsu->toUDynamicWindow;
    611     byteOne=scsu->toUByteOne;
    612 
    613     /*
    614      * conversion "loop"
    615      *
    616      * For performance, this is not a normal C loop.
    617      * Instead, there are two code blocks for the two SCSU modes.
    618      * The function branches to either one, and a change of the mode is done with a goto to
    619      * the other branch.
    620      *
    621      * Each branch has two conventional loops:
    622      * - a fast-path loop for the most common codes in the mode
    623      * - a loop for all other codes in the mode
    624      * When the fast-path runs into a code that it cannot handle, its loop ends and it
    625      * runs into the following loop to handle the other codes.
    626      * The end of the input or output buffer is also handled by the slower loop.
    627      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
    628      *
    629      * The callback handling is done by returning with an error code.
    630      * The conversion framework actually calls the callback function.
    631      */
    632     if(isSingleByteMode) {
    633         /* fast path for single-byte mode */
    634         if(state==readCommand) {
    635 fastSingle:
    636             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
    637                 ++source;
    638                 if(b<=0x7f) {
    639                     /* write US-ASCII graphic character or DEL */
    640                     *target++=(UChar)b;
    641                 } else {
    642                     /* write from dynamic window */
    643                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
    644                     if(c<=0xffff) {
    645                         *target++=(UChar)c;
    646                     } else {
    647                         /* output surrogate pair */
    648                         *target++=(UChar)(0xd7c0+(c>>10));
    649                         if(target<targetLimit) {
    650                             *target++=(UChar)(0xdc00|(c&0x3ff));
    651                         } else {
    652                             /* target overflow */
    653                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    654                             cnv->UCharErrorBufferLength=1;
    655                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    656                             goto endloop;
    657                         }
    658                     }
    659                 }
    660             }
    661         }
    662 
    663         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
    664 singleByteMode:
    665         while(source<sourceLimit) {
    666             if(target>=targetLimit) {
    667                 /* target is full */
    668                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    669                 break;
    670             }
    671             b=*source++;
    672             switch(state) {
    673             case readCommand:
    674                 /* redundant conditions are commented out */
    675                 /* here: b<0x20 because otherwise we would be in fastSingle */
    676                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
    677                     /* CR/LF/TAB/NUL */
    678                     *target++=(UChar)b;
    679                     goto fastSingle;
    680                 } else if(SC0<=b) {
    681                     if(b<=SC7) {
    682                         dynamicWindow=(int8_t)(b-SC0);
    683                         goto fastSingle;
    684                     } else /* if(SD0<=b && b<=SD7) */ {
    685                         dynamicWindow=(int8_t)(b-SD0);
    686                         state=defineOne;
    687                     }
    688                 } else if(/* SQ0<=b && */ b<=SQ7) {
    689                     quoteWindow=(int8_t)(b-SQ0);
    690                     state=quoteOne;
    691                 } else if(b==SDX) {
    692                     state=definePairOne;
    693                 } else if(b==SQU) {
    694                     state=quotePairOne;
    695                 } else if(b==SCU) {
    696                     isSingleByteMode=FALSE;
    697                     goto fastUnicode;
    698                 } else /* Srs */ {
    699                     /* callback(illegal) */
    700                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    701                     cnv->toUBytes[0]=b;
    702                     cnv->toULength=1;
    703                     goto endloop;
    704                 }
    705 
    706                 /* store the first byte of a multibyte sequence in toUBytes[] */
    707                 cnv->toUBytes[0]=b;
    708                 cnv->toULength=1;
    709                 break;
    710             case quotePairOne:
    711                 byteOne=b;
    712                 cnv->toUBytes[1]=b;
    713                 cnv->toULength=2;
    714                 state=quotePairTwo;
    715                 break;
    716             case quotePairTwo:
    717                 *target++=(UChar)((byteOne<<8)|b);
    718                 state=readCommand;
    719                 goto fastSingle;
    720             case quoteOne:
    721                 if(b<0x80) {
    722                     /* all static offsets are in the BMP */
    723                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
    724                 } else {
    725                     /* write from dynamic window */
    726                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
    727                     if(c<=0xffff) {
    728                         *target++=(UChar)c;
    729                     } else {
    730                         /* output surrogate pair */
    731                         *target++=(UChar)(0xd7c0+(c>>10));
    732                         if(target<targetLimit) {
    733                             *target++=(UChar)(0xdc00|(c&0x3ff));
    734                         } else {
    735                             /* target overflow */
    736                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    737                             cnv->UCharErrorBufferLength=1;
    738                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    739                             goto endloop;
    740                         }
    741                     }
    742                 }
    743                 state=readCommand;
    744                 goto fastSingle;
    745             case definePairOne:
    746                 dynamicWindow=(int8_t)((b>>5)&7);
    747                 byteOne=(uint8_t)(b&0x1f);
    748                 cnv->toUBytes[1]=b;
    749                 cnv->toULength=2;
    750                 state=definePairTwo;
    751                 break;
    752             case definePairTwo:
    753                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
    754                 state=readCommand;
    755                 goto fastSingle;
    756             case defineOne:
    757                 if(b==0) {
    758                     /* callback(illegal): Reserved window offset value 0 */
    759                     cnv->toUBytes[1]=b;
    760                     cnv->toULength=2;
    761                     goto endloop;
    762                 } else if(b<gapThreshold) {
    763                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
    764                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
    765                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
    766                 } else if(b>=fixedThreshold) {
    767                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
    768                 } else {
    769                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
    770                     cnv->toUBytes[1]=b;
    771                     cnv->toULength=2;
    772                     goto endloop;
    773                 }
    774                 state=readCommand;
    775                 goto fastSingle;
    776             }
    777         }
    778     } else {
    779         /* fast path for Unicode mode */
    780         if(state==readCommand) {
    781 fastUnicode:
    782             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
    783                 *target++=(UChar)((b<<8)|source[1]);
    784                 source+=2;
    785             }
    786         }
    787 
    788         /* normal state machine for Unicode mode */
    789 /* unicodeByteMode: */
    790         while(source<sourceLimit) {
    791             if(target>=targetLimit) {
    792                 /* target is full */
    793                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    794                 break;
    795             }
    796             b=*source++;
    797             switch(state) {
    798             case readCommand:
    799                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
    800                     byteOne=b;
    801                     cnv->toUBytes[0]=b;
    802                     cnv->toULength=1;
    803                     state=quotePairTwo;
    804                 } else if(/* UC0<=b && */ b<=UC7) {
    805                     dynamicWindow=(int8_t)(b-UC0);
    806                     isSingleByteMode=TRUE;
    807                     goto fastSingle;
    808                 } else if(/* UD0<=b && */ b<=UD7) {
    809                     dynamicWindow=(int8_t)(b-UD0);
    810                     isSingleByteMode=TRUE;
    811                     cnv->toUBytes[0]=b;
    812                     cnv->toULength=1;
    813                     state=defineOne;
    814                     goto singleByteMode;
    815                 } else if(b==UDX) {
    816                     isSingleByteMode=TRUE;
    817                     cnv->toUBytes[0]=b;
    818                     cnv->toULength=1;
    819                     state=definePairOne;
    820                     goto singleByteMode;
    821                 } else if(b==UQU) {
    822                     cnv->toUBytes[0]=b;
    823                     cnv->toULength=1;
    824                     state=quotePairOne;
    825                 } else /* Urs */ {
    826                     /* callback(illegal) */
    827                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    828                     cnv->toUBytes[0]=b;
    829                     cnv->toULength=1;
    830                     goto endloop;
    831                 }
    832                 break;
    833             case quotePairOne:
    834                 byteOne=b;
    835                 cnv->toUBytes[1]=b;
    836                 cnv->toULength=2;
    837                 state=quotePairTwo;
    838                 break;
    839             case quotePairTwo:
    840                 *target++=(UChar)((byteOne<<8)|b);
    841                 state=readCommand;
    842                 goto fastUnicode;
    843             }
    844         }
    845     }
    846 endloop:
    847 
    848     /* set the converter state back into UConverter */
    849     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
    850         /* reset to deal with the next character */
    851         state=readCommand;
    852     } else if(state==readCommand) {
    853         /* not in a multi-byte sequence, reset toULength */
    854         cnv->toULength=0;
    855     }
    856     scsu->toUIsSingleByteMode=isSingleByteMode;
    857     scsu->toUState=state;
    858     scsu->toUQuoteWindow=quoteWindow;
    859     scsu->toUDynamicWindow=dynamicWindow;
    860     scsu->toUByteOne=byteOne;
    861 
    862     /* write back the updated pointers */
    863     pArgs->source=(const char *)source;
    864     pArgs->target=target;
    865     return;
    866 }
    867 
    868 /* SCSU-from-Unicode conversion functions ----------------------------------- */
    869 
    870 /*
    871  * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
    872  * reasonable results. The lookahead is minimal.
    873  * Many cases are simple:
    874  * A character fits directly into the current mode, a dynamic or static window,
    875  * or is not compressible. These cases are tested first.
    876  * Real compression heuristics are applied to the rest, in code branches for
    877  * single/Unicode mode and BMP/supplementary code points.
    878  * The heuristics used here are extremely simple.
    879  */
    880 
    /*
     * Look through the eight window offsets for a window whose 128-code-point
     * range [offset, offset+0x7f] contains c.
     * Returns the index (0..7) of the first matching window, or -1 if none matches.
     */
    static int8_t
    getWindow(const uint32_t offsets[8], uint32_t c) {
        int8_t found=-1;
        int w=0;

        while(w<8 && found<0) {
            /* unsigned subtraction wraps around for c<offsets[w], so one comparison checks both bounds */
            uint32_t distance=c-offsets[w];
            if(distance<=0x7f) {
                found=(int8_t)w;
            }
            ++w;
        }
        return found;
    }
    892 
    893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
    894 static UBool
    895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
    896     return (UBool)(c<=offset+0x7f &&
    897           (c>=offset || (c<=0x7f &&
    898                         (c>=0x20 || (1UL<<c)&0x2601))));
    899                                 /* binary 0010 0110 0000 0001,
    900                                    check for b==0xd || b==0xa || b==9 || b==0 */
    901 }
    902 
    903 /*
    904  * getNextDynamicWindow returns the next dynamic window to be redefined
    905  */
    906 static int8_t
    907 getNextDynamicWindow(SCSUData *scsu) {
    908     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
    909     if(++scsu->nextWindowUseIndex==8) {
    910         scsu->nextWindowUseIndex=0;
    911     }
    912     return window;
    913 }
    914 
    915 /*
    916  * useDynamicWindow() adjusts
    917  * windowUse[] and nextWindowUseIndex for the algorithm to choose
    918  * the next dynamic window to be defined;
    919  * a subclass may override it and provide its own algorithm.
    920  */
    921 static void
    922 useDynamicWindow(SCSUData *scsu, int8_t window) {
    923     /*
    924      * move the existing window, which just became the most recently used one,
    925      * up in windowUse[] to nextWindowUseIndex-1
    926      */
    927 
    928     /* first, find the index of the window - backwards to favor the more recently used windows */
    929     int i, j;
    930 
    931     i=scsu->nextWindowUseIndex;
    932     do {
    933         if(--i<0) {
    934             i=7;
    935         }
    936     } while(scsu->windowUse[i]!=window);
    937 
    938     /* now copy each windowUse[i+1] to [i] */
    939     j=i+1;
    940     if(j==8) {
    941         j=0;
    942     }
    943     while(j!=scsu->nextWindowUseIndex) {
    944         scsu->windowUse[i]=scsu->windowUse[j];
    945         i=j;
    946         if(++j==8) { j=0; }
    947     }
    948 
    949     /* finally, set the window into the most recently used index */
    950     scsu->windowUse[i]=window;
    951 }
    952 
    953 /*
    954  * calculate the offset and the code for a dynamic window that contains the character
    955  * takes fixed offsets into account
    956  * the offset of the window is stored in the offset variable,
    957  * the code is returned
    958  *
    959  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
    960  */
    961 static int
    962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
    963     int i;
    964 
    965     for(i=0; i<7; ++i) {
    966         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
    967             *pOffset=fixedOffsets[i];
    968             return 0xf9+i;
    969         }
    970     }
    971 
    972     if(c<0x80) {
    973         /* No dynamic window for US-ASCII. */
    974         return -1;
    975     } else if(c<0x3400 ||
    976               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
    977               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
    978     ) {
    979         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
    980         *pOffset=c&0x7fffff80;
    981         return (int)(c>>7);
    982     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
    983         /* For these characters we need to take the gapOffset into account. */
    984         *pOffset=c&0x7fffff80;
    985         return (int)((c-gapOffset)>>7);
    986     } else {
    987         return -1;
    988     }
    989 }
    990 
    991 /*
    992  * Idea for compression:
    993  *  - save SCSUData and other state before really starting work
    994  *  - at endloop, see if compression could be better with just unicode mode
    995  *  - don't do this if a callback has been called
    996  *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
    997  *  - different buffer handling!
    998  *
    999  * Drawback or need for corrective handling:
   1000  * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
   1001  * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
   1002  * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
   1003  *
   1004  * How to achieve both?
   1005  *  - Only replace the result after an SDX or SCU?
   1006  */
   1007 
   1008 static void
   1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   1010                             UErrorCode *pErrorCode) {
   1011     UConverter *cnv;
   1012     SCSUData *scsu;
   1013     const UChar *source, *sourceLimit;
   1014     uint8_t *target;
   1015     int32_t targetCapacity;
   1016     int32_t *offsets;
   1017 
   1018     UBool isSingleByteMode;
   1019     uint8_t dynamicWindow;
   1020     uint32_t currentOffset;
   1021 
   1022     uint32_t c, delta;
   1023 
   1024     int32_t sourceIndex, nextSourceIndex;
   1025 
   1026     int32_t length;
   1027 
   1028     /* variables for compression heuristics */
   1029     uint32_t offset;
   1030     UChar lead, trail;
   1031     int code;
   1032     int8_t window;
   1033 
   1034     /* set up the local pointers */
   1035     cnv=pArgs->converter;
   1036     scsu=(SCSUData *)cnv->extraInfo;
   1037 
   1038     /* set up the local pointers */
   1039     source=pArgs->source;
   1040     sourceLimit=pArgs->sourceLimit;
   1041     target=(uint8_t *)pArgs->target;
   1042     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1043     offsets=pArgs->offsets;
   1044 
   1045     /* get the state machine state */
   1046     isSingleByteMode=scsu->fromUIsSingleByteMode;
   1047     dynamicWindow=scsu->fromUDynamicWindow;
   1048     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1049 
   1050     c=cnv->fromUChar32;
   1051 
   1052     /* sourceIndex=-1 if the current character began in the previous buffer */
   1053     sourceIndex= c==0 ? 0 : -1;
   1054     nextSourceIndex=0;
   1055 
   1056     /* similar conversion "loop" as in toUnicode */
   1057 loop:
   1058     if(isSingleByteMode) {
   1059         if(c!=0 && targetCapacity>0) {
   1060             goto getTrailSingle;
   1061         }
   1062 
   1063         /* state machine for single-byte mode */
   1064 /* singleByteMode: */
   1065         while(source<sourceLimit) {
   1066             if(targetCapacity<=0) {
   1067                 /* target is full */
   1068                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1069                 break;
   1070             }
   1071             c=*source++;
   1072             ++nextSourceIndex;
   1073 
   1074             if((c-0x20)<=0x5f) {
   1075                 /* pass US-ASCII graphic character through */
   1076                 *target++=(uint8_t)c;
   1077                 if(offsets!=NULL) {
   1078                     *offsets++=sourceIndex;
   1079                 }
   1080                 --targetCapacity;
   1081             } else if(c<0x20) {
   1082                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   1083                     /* CR/LF/TAB/NUL */
   1084                     *target++=(uint8_t)c;
   1085                     if(offsets!=NULL) {
   1086                         *offsets++=sourceIndex;
   1087                     }
   1088                     --targetCapacity;
   1089                 } else {
   1090                     /* quote C0 control character */
   1091                     c|=SQ0<<8;
   1092                     length=2;
   1093                     goto outputBytes;
   1094                 }
   1095             } else if((delta=c-currentOffset)<=0x7f) {
   1096                 /* use the current dynamic window */
   1097                 *target++=(uint8_t)(delta|0x80);
   1098                 if(offsets!=NULL) {
   1099                     *offsets++=sourceIndex;
   1100                 }
   1101                 --targetCapacity;
   1102             } else if(U16_IS_SURROGATE(c)) {
   1103                 if(U16_IS_SURROGATE_LEAD(c)) {
   1104 getTrailSingle:
   1105                     lead=(UChar)c;
   1106                     if(source<sourceLimit) {
   1107                         /* test the following code unit */
   1108                         trail=*source;
   1109                         if(U16_IS_TRAIL(trail)) {
   1110                             ++source;
   1111                             ++nextSourceIndex;
   1112                             c=U16_GET_SUPPLEMENTARY(c, trail);
   1113                             /* convert this surrogate code point */
   1114                             /* exit this condition tree */
   1115                         } else {
   1116                             /* this is an unmatched lead code unit (1st surrogate) */
   1117                             /* callback(illegal) */
   1118                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1119                             goto endloop;
   1120                         }
   1121                     } else {
   1122                         /* no more input */
   1123                         break;
   1124                     }
   1125                 } else {
   1126                     /* this is an unmatched trail code unit (2nd surrogate) */
   1127                     /* callback(illegal) */
   1128                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1129                     goto endloop;
   1130                 }
   1131 
   1132                 /* compress supplementary character U+10000..U+10ffff */
   1133                 if((delta=c-currentOffset)<=0x7f) {
   1134                     /* use the current dynamic window */
   1135                     *target++=(uint8_t)(delta|0x80);
   1136                     if(offsets!=NULL) {
   1137                         *offsets++=sourceIndex;
   1138                     }
   1139                     --targetCapacity;
   1140                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1141                     /* there is a dynamic window that contains this character, change to it */
   1142                     dynamicWindow=window;
   1143                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1144                     useDynamicWindow(scsu, dynamicWindow);
   1145                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1146                     length=2;
   1147                     goto outputBytes;
   1148                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1149                     /* might check if there are more characters in this window to come */
   1150                     /* define an extended window with this character */
   1151                     code-=0x200;
   1152                     dynamicWindow=getNextDynamicWindow(scsu);
   1153                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1154                     useDynamicWindow(scsu, dynamicWindow);
   1155                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1156                     length=4;
   1157                     goto outputBytes;
   1158                 } else {
   1159                     /* change to Unicode mode and output this (lead, trail) pair */
   1160                     isSingleByteMode=FALSE;
   1161                     *target++=(uint8_t)SCU;
   1162                     if(offsets!=NULL) {
   1163                         *offsets++=sourceIndex;
   1164                     }
   1165                     --targetCapacity;
   1166                     c=((uint32_t)lead<<16)|trail;
   1167                     length=4;
   1168                     goto outputBytes;
   1169                 }
   1170             } else if(c<0xa0) {
   1171                 /* quote C1 control character */
   1172                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
   1173                 length=2;
   1174                 goto outputBytes;
   1175             } else if(c==0xfeff || c>=0xfff0) {
   1176                 /* quote signature character=byte order mark and specials */
   1177                 c|=SQU<<16;
   1178                 length=3;
   1179                 goto outputBytes;
   1180             } else {
   1181                 /* compress all other BMP characters */
   1182                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1183                     /* there is a window defined that contains this character - switch to it or quote from it? */
   1184                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
   1185                         /* change to dynamic window */
   1186                         dynamicWindow=window;
   1187                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1188                         useDynamicWindow(scsu, dynamicWindow);
   1189                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1190                         length=2;
   1191                         goto outputBytes;
   1192                     } else {
   1193                         /* quote from dynamic window */
   1194                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
   1195                         length=2;
   1196                         goto outputBytes;
   1197                     }
   1198                 } else if((window=getWindow(staticOffsets, c))>=0) {
   1199                     /* quote from static window */
   1200                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
   1201                     length=2;
   1202                     goto outputBytes;
   1203                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1204                     /* define a dynamic window with this character */
   1205                     dynamicWindow=getNextDynamicWindow(scsu);
   1206                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1207                     useDynamicWindow(scsu, dynamicWindow);
   1208                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1209                     length=3;
   1210                     goto outputBytes;
   1211                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
   1212                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1213                 ) {
   1214                     /*
   1215                      * this character is not compressible (a BMP ideograph or similar);
   1216                      * switch to Unicode mode if this is the last character in the block
   1217                      * or there is at least one more ideograph following immediately
   1218                      */
   1219                     isSingleByteMode=FALSE;
   1220                     c|=SCU<<16;
   1221                     length=3;
   1222                     goto outputBytes;
   1223                 } else {
   1224                     /* quote Unicode */
   1225                     c|=SQU<<16;
   1226                     length=3;
   1227                     goto outputBytes;
   1228                 }
   1229             }
   1230 
   1231             /* normal end of conversion: prepare for a new character */
   1232             c=0;
   1233             sourceIndex=nextSourceIndex;
   1234         }
   1235     } else {
   1236         if(c!=0 && targetCapacity>0) {
   1237             goto getTrailUnicode;
   1238         }
   1239 
   1240         /* state machine for Unicode mode */
   1241 /* unicodeByteMode: */
   1242         while(source<sourceLimit) {
   1243             if(targetCapacity<=0) {
   1244                 /* target is full */
   1245                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1246                 break;
   1247             }
   1248             c=*source++;
   1249             ++nextSourceIndex;
   1250 
   1251             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
   1252                 /* not compressible, write character directly */
   1253                 if(targetCapacity>=2) {
   1254                     *target++=(uint8_t)(c>>8);
   1255                     *target++=(uint8_t)c;
   1256                     if(offsets!=NULL) {
   1257                         *offsets++=sourceIndex;
   1258                         *offsets++=sourceIndex;
   1259                     }
   1260                     targetCapacity-=2;
   1261                 } else {
   1262                     length=2;
   1263                     goto outputBytes;
   1264                 }
   1265             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
   1266                 /* compress BMP character if the following one is not an uncompressible ideograph */
   1267                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
   1268                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
   1269                         /* ASCII digit or letter */
   1270                         isSingleByteMode=TRUE;
   1271                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
   1272                         length=2;
   1273                         goto outputBytes;
   1274                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1275                         /* there is a dynamic window that contains this character, change to it */
   1276                         isSingleByteMode=TRUE;
   1277                         dynamicWindow=window;
   1278                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1279                         useDynamicWindow(scsu, dynamicWindow);
   1280                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1281                         length=2;
   1282                         goto outputBytes;
   1283                     } else if((code=getDynamicOffset(c, &offset))>=0) {
   1284                         /* define a dynamic window with this character */
   1285                         isSingleByteMode=TRUE;
   1286                         dynamicWindow=getNextDynamicWindow(scsu);
   1287                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1288                         useDynamicWindow(scsu, dynamicWindow);
   1289                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1290                         length=3;
   1291                         goto outputBytes;
   1292                     }
   1293                 }
   1294 
   1295                 /* don't know how to compress this character, just write it directly */
   1296                 length=2;
   1297                 goto outputBytes;
   1298             } else if(c<0xe000) {
   1299                 /* c is a surrogate */
   1300                 if(U16_IS_SURROGATE_LEAD(c)) {
   1301 getTrailUnicode:
   1302                     lead=(UChar)c;
   1303                     if(source<sourceLimit) {
   1304                         /* test the following code unit */
   1305                         trail=*source;
   1306                         if(U16_IS_TRAIL(trail)) {
   1307                             ++source;
   1308                             ++nextSourceIndex;
   1309                             c=U16_GET_SUPPLEMENTARY(c, trail);
   1310                             /* convert this surrogate code point */
   1311                             /* exit this condition tree */
   1312                         } else {
   1313                             /* this is an unmatched lead code unit (1st surrogate) */
   1314                             /* callback(illegal) */
   1315                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1316                             goto endloop;
   1317                         }
   1318                     } else {
   1319                         /* no more input */
   1320                         break;
   1321                     }
   1322                 } else {
   1323                     /* this is an unmatched trail code unit (2nd surrogate) */
   1324                     /* callback(illegal) */
   1325                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1326                     goto endloop;
   1327                 }
   1328 
   1329                 /* compress supplementary character */
   1330                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
   1331                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1332                 ) {
   1333                     /*
   1334                      * there is a dynamic window that contains this character and
   1335                      * the following character is not uncompressible,
   1336                      * change to the window
   1337                      */
   1338                     isSingleByteMode=TRUE;
   1339                     dynamicWindow=window;
   1340                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1341                     useDynamicWindow(scsu, dynamicWindow);
   1342                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1343                     length=2;
   1344                     goto outputBytes;
   1345                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
   1346                           (code=getDynamicOffset(c, &offset))>=0
   1347                 ) {
   1348                     /* two supplementary characters in (probably) the same window - define an extended one */
   1349                     isSingleByteMode=TRUE;
   1350                     code-=0x200;
   1351                     dynamicWindow=getNextDynamicWindow(scsu);
   1352                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1353                     useDynamicWindow(scsu, dynamicWindow);
   1354                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1355                     length=4;
   1356                     goto outputBytes;
   1357                 } else {
   1358                     /* don't know how to compress this character, just write it directly */
   1359                     c=((uint32_t)lead<<16)|trail;
   1360                     length=4;
   1361                     goto outputBytes;
   1362                 }
   1363             } else /* 0xe000<=c<0xf300 */ {
   1364                 /* quote to avoid SCSU tags */
   1365                 c|=UQU<<16;
   1366                 length=3;
   1367                 goto outputBytes;
   1368             }
   1369 
   1370             /* normal end of conversion: prepare for a new character */
   1371             c=0;
   1372             sourceIndex=nextSourceIndex;
   1373         }
   1374     }
   1375 endloop:
   1376 
   1377     /* set the converter state back into UConverter */
   1378     scsu->fromUIsSingleByteMode=isSingleByteMode;
   1379     scsu->fromUDynamicWindow=dynamicWindow;
   1380 
   1381     cnv->fromUChar32=c;
   1382 
   1383     /* write back the updated pointers */
   1384     pArgs->source=source;
   1385     pArgs->target=(char *)target;
   1386     pArgs->offsets=offsets;
   1387     return;
   1388 
   1389 outputBytes:
   1390     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
   1391     /* from the first if in the loop we know that targetCapacity>0 */
   1392     if(length<=targetCapacity) {
   1393         if(offsets==NULL) {
   1394             switch(length) {
   1395                 /* each branch falls through to the next one */
   1396             case 4:
   1397                 *target++=(uint8_t)(c>>24);
   1398             case 3: /*fall through*/
   1399                 *target++=(uint8_t)(c>>16);
   1400             case 2: /*fall through*/
   1401                 *target++=(uint8_t)(c>>8);
   1402             case 1: /*fall through*/
   1403                 *target++=(uint8_t)c;
   1404             default:
   1405                 /* will never occur */
   1406                 break;
   1407             }
   1408         } else {
   1409             switch(length) {
   1410                 /* each branch falls through to the next one */
   1411             case 4:
   1412                 *target++=(uint8_t)(c>>24);
   1413                 *offsets++=sourceIndex;
   1414             case 3: /*fall through*/
   1415                 *target++=(uint8_t)(c>>16);
   1416                 *offsets++=sourceIndex;
   1417             case 2: /*fall through*/
   1418                 *target++=(uint8_t)(c>>8);
   1419                 *offsets++=sourceIndex;
   1420             case 1: /*fall through*/
   1421                 *target++=(uint8_t)c;
   1422                 *offsets++=sourceIndex;
   1423             default:
   1424                 /* will never occur */
   1425                 break;
   1426             }
   1427         }
   1428         targetCapacity-=length;
   1429 
   1430         /* normal end of conversion: prepare for a new character */
   1431         c=0;
   1432         sourceIndex=nextSourceIndex;
   1433         goto loop;
   1434     } else {
   1435         uint8_t *p;
   1436 
   1437         /*
   1438          * We actually do this backwards here:
   1439          * In order to save an intermediate variable, we output
   1440          * first to the overflow buffer what does not fit into the
   1441          * regular target.
   1442          */
   1443         /* we know that 0<=targetCapacity<length<=4 */
   1444         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
   1445         length-=targetCapacity;
   1446         p=(uint8_t *)cnv->charErrorBuffer;
   1447         switch(length) {
   1448             /* each branch falls through to the next one */
   1449         case 4:
   1450             *p++=(uint8_t)(c>>24);
   1451         case 3: /*fall through*/
   1452             *p++=(uint8_t)(c>>16);
   1453         case 2: /*fall through*/
   1454             *p++=(uint8_t)(c>>8);
   1455         case 1: /*fall through*/
   1456             *p=(uint8_t)c;
   1457         default:
   1458             /* will never occur */
   1459             break;
   1460         }
   1461         cnv->charErrorBufferLength=(int8_t)length;
   1462 
   1463         /* now output what fits into the regular target */
   1464         c>>=8*length; /* length was reduced by targetCapacity */
   1465         switch(targetCapacity) {
   1466             /* each branch falls through to the next one */
   1467         case 3:
   1468             *target++=(uint8_t)(c>>16);
   1469             if(offsets!=NULL) {
   1470                 *offsets++=sourceIndex;
   1471             }
   1472         case 2: /*fall through*/
   1473             *target++=(uint8_t)(c>>8);
   1474             if(offsets!=NULL) {
   1475                 *offsets++=sourceIndex;
   1476             }
   1477         case 1: /*fall through*/
   1478             *target++=(uint8_t)c;
   1479             if(offsets!=NULL) {
   1480                 *offsets++=sourceIndex;
   1481             }
   1482         default:
   1483             break;
   1484         }
   1485 
   1486         /* target overflow */
   1487         targetCapacity=0;
   1488         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1489         c=0;
   1490         goto endloop;
   1491     }
   1492 }
   1493 
   1494 /*
         * Encode UTF-16 text from pArgs->source (up to pArgs->sourceLimit) as SCSU
         * bytes into pArgs->target (up to pArgs->targetLimit).
         *
         * The encoder state (single-byte vs. Unicode mode, current dynamic window,
         * dynamic window offsets) lives in the SCSUData at cnv->extraInfo; mode and
         * window are written back before returning. cnv->fromUChar32 carries a code
         * unit that could not be finished (a lead surrogate at the end of the input,
         * or an unmatched surrogate) out of and back into this function.
         *
         * *pErrorCode is set to
         *   U_BUFFER_OVERFLOW_ERROR  when the target is full; bytes of the current
         *                            character that did not fit are stored in
         *                            cnv->charErrorBuffer
         *   U_ILLEGAL_CHAR_FOUND     when an unmatched surrogate code unit is read
         *
   1495  * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
   1496  * If a change is made in the original function, then either
   1497  * change this function the same way or
   1498  * re-copy the original function and remove the variables
   1499  * offsets, sourceIndex, and nextSourceIndex.
         *
         * Note: the overflow branch below guards its right shift against a shift
         * count of 32; _SCSUFromUnicodeWithOffsets has the same `c>>=8*length`
         * expression and needs the same guard.
   1500  */
   1501 static void
   1502 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
   1503                  UErrorCode *pErrorCode) {
   1504     UConverter *cnv;
   1505     SCSUData *scsu;
   1506     const UChar *source, *sourceLimit;
   1507     uint8_t *target;
   1508     int32_t targetCapacity;
   1509 
   1510     UBool isSingleByteMode;
   1511     uint8_t dynamicWindow;
   1512     uint32_t currentOffset;
   1513 
   1514     uint32_t c, delta;
   1515 
   1516     int32_t length;
   1517 
   1518     /* variables for compression heuristics */
   1519     uint32_t offset;
   1520     UChar lead, trail;
   1521     int code;
   1522     int8_t window;
   1523 
   1524     /* set up the local pointers */
   1525     cnv=pArgs->converter;
   1526     scsu=(SCSUData *)cnv->extraInfo;
   1527 
   1528     /* set up the local pointers */
   1529     source=pArgs->source;
   1530     sourceLimit=pArgs->sourceLimit;
   1531     target=(uint8_t *)pArgs->target;
   1532     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1533 
   1534     /* get the state machine state */
   1535     isSingleByteMode=scsu->fromUIsSingleByteMode;
   1536     dynamicWindow=scsu->fromUDynamicWindow;
   1537     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1538 
            /* a non-zero value is a code unit carried over from a previous call */
   1539     c=cnv->fromUChar32;
   1540 
   1541     /* similar conversion "loop" as in toUnicode */
   1542 loop:
   1543     if(isSingleByteMode) {
                /* resume a pending lead surrogate at the point where its trail is looked up */
   1544         if(c!=0 && targetCapacity>0) {
   1545             goto getTrailSingle;
   1546         }
   1547 
   1548         /* state machine for single-byte mode */
   1549 /* singleByteMode: */
   1550         while(source<sourceLimit) {
   1551             if(targetCapacity<=0) {
   1552                 /* target is full */
   1553                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1554                 break;
   1555             }
   1556             c=*source++;
   1557 
                    /* c is unsigned, so this is a single range test for 0x20<=c<=0x7f */
   1558             if((c-0x20)<=0x5f) {
   1559                 /* pass US-ASCII graphic character through */
   1560                 *target++=(uint8_t)c;
   1561                 --targetCapacity;
   1562             } else if(c<0x20) {
   1563                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   1564                     /* CR/LF/TAB/NUL */
   1565                     *target++=(uint8_t)c;
   1566                     --targetCapacity;
   1567                 } else {
   1568                     /* quote C0 control character */
   1569                     c|=SQ0<<8;
   1570                     length=2;
   1571                     goto outputBytes;
   1572                 }
                    /* unsigned subtraction: c<currentOffset wraps around and fails the test, too */
   1573             } else if((delta=c-currentOffset)<=0x7f) {
   1574                 /* use the current dynamic window */
   1575                 *target++=(uint8_t)(delta|0x80);
   1576                 --targetCapacity;
   1577             } else if(U16_IS_SURROGATE(c)) {
   1578                 if(U16_IS_SURROGATE_LEAD(c)) {
   1579 getTrailSingle:
   1580                     lead=(UChar)c;
   1581                     if(source<sourceLimit) {
   1582                         /* test the following code unit */
   1583                         trail=*source;
   1584                         if(U16_IS_TRAIL(trail)) {
   1585                             ++source;
   1586                             c=U16_GET_SUPPLEMENTARY(c, trail);
   1587                             /* convert this surrogate code point */
   1588                             /* exit this condition tree */
   1589                         } else {
   1590                             /* this is an unmatched lead code unit (1st surrogate) */
   1591                             /* callback(illegal) */
   1592                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1593                             goto endloop;
   1594                         }
   1595                     } else {
   1596                         /* no more input */
                                /* c still holds the lead surrogate and is saved in cnv->fromUChar32 at endloop */
   1597                         break;
   1598                     }
   1599                 } else {
   1600                     /* this is an unmatched trail code unit (2nd surrogate) */
   1601                     /* callback(illegal) */
   1602                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1603                     goto endloop;
   1604                 }
   1605 
   1606                 /* compress supplementary character U+10000..U+10ffff */
   1607                 if((delta=c-currentOffset)<=0x7f) {
   1608                     /* use the current dynamic window */
   1609                     *target++=(uint8_t)(delta|0x80);
   1610                     --targetCapacity;
   1611                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1612                     /* there is a dynamic window that contains this character, change to it */
   1613                     dynamicWindow=window;
   1614                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1615                     useDynamicWindow(scsu, dynamicWindow);
   1616                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1617                     length=2;
   1618                     goto outputBytes;
   1619                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1620                     /* might check if there are more characters in this window to come */
   1621                     /* define an extended window with this character */
                            /* NOTE(review): presumably getDynamicOffset() reports supplementary offsets as 0x200+n - confirm there */
   1622                     code-=0x200;
   1623                     dynamicWindow=getNextDynamicWindow(scsu);
   1624                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1625                     useDynamicWindow(scsu, dynamicWindow);
   1626                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1627                     length=4;
   1628                     goto outputBytes;
   1629                 } else {
   1630                     /* change to Unicode mode and output this (lead, trail) pair */
   1631                     isSingleByteMode=FALSE;
                            /* writing SCU directly is safe: targetCapacity>0 is guaranteed on every path to here */
   1632                     *target++=(uint8_t)SCU;
   1633                     --targetCapacity;
   1634                     c=((uint32_t)lead<<16)|trail;
   1635                     length=4;
                            /* targetCapacity may now be 0 with length==4, see the overflow branch of outputBytes */
   1636                     goto outputBytes;
   1637                 }
   1638             } else if(c<0xa0) {
   1639                 /* quote C1 control character */
   1640                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
   1641                 length=2;
   1642                 goto outputBytes;
   1643             } else if(c==0xfeff || c>=0xfff0) {
   1644                 /* quote signature character=byte order mark and specials */
   1645                 c|=SQU<<16;
   1646                 length=3;
   1647                 goto outputBytes;
   1648             } else {
   1649                 /* compress all other BMP characters */
   1650                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1651                     /* there is a window defined that contains this character - switch to it or quote from it? */
                            /* peek at the next code unit without consuming it */
   1652                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
   1653                         /* change to dynamic window */
   1654                         dynamicWindow=window;
   1655                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1656                         useDynamicWindow(scsu, dynamicWindow);
   1657                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1658                         length=2;
   1659                         goto outputBytes;
   1660                     } else {
   1661                         /* quote from dynamic window */
   1662                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
   1663                         length=2;
   1664                         goto outputBytes;
   1665                     }
   1666                 } else if((window=getWindow(staticOffsets, c))>=0) {
   1667                     /* quote from static window */
   1668                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
   1669                     length=2;
   1670                     goto outputBytes;
   1671                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1672                     /* define a dynamic window with this character */
   1673                     dynamicWindow=getNextDynamicWindow(scsu);
   1674                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1675                     useDynamicWindow(scsu, dynamicWindow);
   1676                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1677                     length=3;
   1678                     goto outputBytes;
   1679                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
   1680                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1681                 ) {
   1682                     /*
   1683                      * this character is not compressible (a BMP ideograph or similar);
   1684                      * switch to Unicode mode if this is the last character in the block
   1685                      * or there is at least one more ideograph following immediately
   1686                      */
   1687                     isSingleByteMode=FALSE;
   1688                     c|=SCU<<16;
   1689                     length=3;
   1690                     goto outputBytes;
   1691                 } else {
   1692                     /* quote Unicode */
   1693                     c|=SQU<<16;
   1694                     length=3;
   1695                     goto outputBytes;
   1696                 }
   1697             }
   1698 
   1699             /* normal end of conversion: prepare for a new character */
   1700             c=0;
   1701         }
   1702     } else {
                /* resume a pending lead surrogate at the point where its trail is looked up */
   1703         if(c!=0 && targetCapacity>0) {
   1704             goto getTrailUnicode;
   1705         }
   1706 
   1707         /* state machine for Unicode mode */
   1708 /* unicodeByteMode: */
   1709         while(source<sourceLimit) {
   1710             if(targetCapacity<=0) {
   1711                 /* target is full */
   1712                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1713                 break;
   1714             }
   1715             c=*source++;
   1716 
   1717             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
   1718                 /* not compressible, write character directly */
   1719                 if(targetCapacity>=2) {
   1720                     *target++=(uint8_t)(c>>8);
   1721                     *target++=(uint8_t)c;
   1722                     targetCapacity-=2;
   1723                 } else {
                            /* only one byte fits: let outputBytes split c between target and overflow buffer */
   1724                     length=2;
   1725                     goto outputBytes;
   1726                 }
   1727             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
   1728                 /* compress BMP character if the following one is not an uncompressible ideograph */
   1729                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
   1730                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
   1731                         /* ASCII digit or letter */
   1732                         isSingleByteMode=TRUE;
                                /* the trailing |c is redundant with |=; the result is (UC0+dynamicWindow)<<8 | c */
   1733                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
   1734                         length=2;
   1735                         goto outputBytes;
   1736                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1737                         /* there is a dynamic window that contains this character, change to it */
   1738                         isSingleByteMode=TRUE;
   1739                         dynamicWindow=window;
   1740                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1741                         useDynamicWindow(scsu, dynamicWindow);
   1742                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1743                         length=2;
   1744                         goto outputBytes;
   1745                     } else if((code=getDynamicOffset(c, &offset))>=0) {
   1746                         /* define a dynamic window with this character */
   1747                         isSingleByteMode=TRUE;
   1748                         dynamicWindow=getNextDynamicWindow(scsu);
   1749                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1750                         useDynamicWindow(scsu, dynamicWindow);
   1751                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1752                         length=3;
   1753                         goto outputBytes;
   1754                     }
   1755                 }
   1756 
   1757                 /* don't know how to compress this character, just write it directly */
   1758                 length=2;
   1759                 goto outputBytes;
   1760             } else if(c<0xe000) {
   1761                 /* c is a surrogate */
   1762                 if(U16_IS_SURROGATE_LEAD(c)) {
   1763 getTrailUnicode:
   1764                     lead=(UChar)c;
   1765                     if(source<sourceLimit) {
   1766                         /* test the following code unit */
   1767                         trail=*source;
   1768                         if(U16_IS_TRAIL(trail)) {
   1769                             ++source;
   1770                             c=U16_GET_SUPPLEMENTARY(c, trail);
   1771                             /* convert this surrogate code point */
   1772                             /* exit this condition tree */
   1773                         } else {
   1774                             /* this is an unmatched lead code unit (1st surrogate) */
   1775                             /* callback(illegal) */
   1776                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1777                             goto endloop;
   1778                         }
   1779                     } else {
   1780                         /* no more input */
                                /* c still holds the lead surrogate and is saved in cnv->fromUChar32 at endloop */
   1781                         break;
   1782                     }
   1783                 } else {
   1784                     /* this is an unmatched trail code unit (2nd surrogate) */
   1785                     /* callback(illegal) */
   1786                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1787                     goto endloop;
   1788                 }
   1789 
   1790                 /* compress supplementary character */
   1791                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
   1792                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1793                 ) {
   1794                     /*
   1795                      * there is a dynamic window that contains this character and
   1796                      * the following character is not uncompressible,
   1797                      * change to the window
   1798                      */
   1799                     isSingleByteMode=TRUE;
   1800                     dynamicWindow=window;
   1801                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1802                     useDynamicWindow(scsu, dynamicWindow);
   1803                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1804                     length=2;
   1805                     goto outputBytes;
   1806                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
   1807                           (code=getDynamicOffset(c, &offset))>=0
   1808                 ) {
   1809                     /* two supplementary characters in (probably) the same window - define an extended one */
   1810                     isSingleByteMode=TRUE;
   1811                     code-=0x200;
   1812                     dynamicWindow=getNextDynamicWindow(scsu);
   1813                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1814                     useDynamicWindow(scsu, dynamicWindow);
   1815                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1816                     length=4;
   1817                     goto outputBytes;
   1818                 } else {
   1819                     /* don't know how to compress this character, just write it directly */
   1820                     c=((uint32_t)lead<<16)|trail;
   1821                     length=4;
   1822                     goto outputBytes;
   1823                 }
   1824             } else /* 0xe000<=c<0xf300 */ {
   1825                 /* quote to avoid SCSU tags */
   1826                 c|=UQU<<16;
   1827                 length=3;
   1828                 goto outputBytes;
   1829             }
   1830 
   1831             /* normal end of conversion: prepare for a new character */
   1832             c=0;
   1833         }
   1834     }
   1835 endloop:
   1836 
   1837     /* set the converter state back into UConverter */
   1838     scsu->fromUIsSingleByteMode=isSingleByteMode;
   1839     scsu->fromUDynamicWindow=dynamicWindow;
   1840 
            /* 0 after a complete character, otherwise the pending/offending code unit */
   1841     cnv->fromUChar32=c;
   1842 
   1843     /* write back the updated pointers */
   1844     pArgs->source=source;
   1845     pArgs->target=(char *)target;
   1846     return;
   1847 
   1848 outputBytes:
   1849     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
            /* c holds the bytes right-aligned, first output byte in the most significant position */
   1850     /* from the first if in the loop we know that targetCapacity>0 */
   1851     if(length<=targetCapacity) {
   1852         switch(length) {
   1853             /* each branch falls through to the next one */
   1854         case 4:
   1855             *target++=(uint8_t)(c>>24);
   1856         case 3: /*fall through*/
   1857             *target++=(uint8_t)(c>>16);
   1858         case 2: /*fall through*/
   1859             *target++=(uint8_t)(c>>8);
   1860         case 1: /*fall through*/
   1861             *target++=(uint8_t)c;
   1862         default:
   1863             /* will never occur */
   1864             break;
   1865         }
   1866         targetCapacity-=length;
   1867 
   1868         /* normal end of conversion: prepare for a new character */
   1869         c=0;
                /* re-enter at loop (not the inner while) so that a mode change made above takes effect */
   1870         goto loop;
   1871     } else {
   1872         uint8_t *p;
   1873 
   1874         /*
   1875          * We actually do this backwards here:
   1876          * In order to save an intermediate variable, we output
   1877          * first to the overflow buffer what does not fit into the
   1878          * regular target.
   1879          */
   1880         /* we know that 0<=targetCapacity<length<=4 */
   1881         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
   1882         length-=targetCapacity;
   1883         p=(uint8_t *)cnv->charErrorBuffer;
   1884         switch(length) {
   1885             /* each branch falls through to the next one */
   1886         case 4:
   1887             *p++=(uint8_t)(c>>24);
   1888         case 3: /*fall through*/
   1889             *p++=(uint8_t)(c>>16);
   1890         case 2: /*fall through*/
   1891             *p++=(uint8_t)(c>>8);
   1892         case 1: /*fall through*/
   1893             *p=(uint8_t)c;
   1894         default:
   1895             /* will never occur */
   1896             break;
   1897         }
   1898         cnv->charErrorBufferLength=(int8_t)length;
   1899 
   1900         /* now output what fits into the regular target */
                /*
                 * length==4 at this point implies targetCapacity==0 (the SCU case above):
                 * nothing is written to the target, and shifting the 32-bit c by 32 bits
                 * would be undefined behavior in C, so skip the shift in that case.
                 */
   1901         c=(length==4) ? 0 : c>>(8*length); /* length was reduced by targetCapacity */
   1902         switch(targetCapacity) {
   1903             /* each branch falls through to the next one */
   1904         case 3:
   1905             *target++=(uint8_t)(c>>16);
   1906         case 2: /*fall through*/
   1907             *target++=(uint8_t)(c>>8);
   1908         case 1: /*fall through*/
   1909             *target++=(uint8_t)c;
   1910         default:
   1911             break;
   1912         }
   1913 
   1914         /* target overflow */
   1915         targetCapacity=0;
   1916         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1917         c=0;
   1918         goto endloop;
   1919     }
   1920 }
   1921 
   1922 /* miscellaneous ------------------------------------------------------------ */
   1923 
   1924 /* Return the converter name; only the l_ja locale setting has a name of its own. */
   1925 static const char *
   1926 _SCSUGetName(const UConverter *cnv) {
   1927     const SCSUData *state=(const SCSUData *)cnv->extraInfo;
   1928 
   1929     if(state->locale==l_ja) {
   1930         return "SCSU,locale=ja";
   1931     }
   1932     /* every other locale value maps to the plain name */
   1933     return "SCSU";
   1934 }
   1935 
   1936 /* Memory layout used by _SCSUSafeClone(): a UConverter plus storage for its SCSUData. */
   1937 struct cloneSCSUStruct {
   1938     UConverter cnv;     /* the cloned converter object */
   1939     SCSUData mydata;    /* SCSU state that cnv.extraInfo is pointed at by _SCSUSafeClone() */
   1940 };
   1942 
   1943 /*
   1944  * Clone hook: ucnv.c/ucnv_safeClone() has already copied the main UConverter into
   1945  * stackBuffer; this adds a private copy of the SCSUData next to it.
   1946  * *pBufferSize==0 is a preflight request: the needed size is stored and NULL is
   1947  * returned. NULL is also returned when *status already indicates a failure.
   1948  */
   1949 static UConverter *
   1950 _SCSUSafeClone(const UConverter *cnv,
   1951                void *stackBuffer,
   1952                int32_t *pBufferSize,
   1953                UErrorCode *status)
   1954 {
   1955     struct cloneSCSUStruct *clone=(struct cloneSCSUStruct *)stackBuffer;
   1956 
   1957     if(U_FAILURE(*status)) {
   1958         return NULL;
   1959     }
   1960     if(*pBufferSize==0) {
   1961         *pBufferSize=(int32_t)sizeof(struct cloneSCSUStruct);
   1962         return NULL;
   1963     }
   1964 
   1965     /* copy the SCSU state into the clone block and point the clone at that copy */
   1966     uprv_memcpy(&clone->mydata, cnv->extraInfo, sizeof(SCSUData));
   1967     clone->cnv.extraInfo=&clone->mydata;
   1968     clone->cnv.isExtraLocal=TRUE;
   1969     return &clone->cnv;
        }
   1970 
   1971 /*
         * Table of the SCSU implementation functions; _SCSUData below points to it.
         * The entries are positional initializers for UConverterImpl, so their order
         * must match the member order of that struct (declared outside this file).
         * NOTE(review): which member each NULL leaves unset cannot be seen from here -
         * check the UConverterImpl declaration before adding or moving entries.
         */
   1972 static const UConverterImpl _SCSUImpl={
   1973     UCNV_SCSU,
   1974 
   1975     NULL,
   1976     NULL,
   1977 
   1978     _SCSUOpen,
   1979     _SCSUClose,
   1980     _SCSUReset,
   1981 
            /* the four conversion functions: each direction without and with offsets */
   1982     _SCSUToUnicode,
   1983     _SCSUToUnicodeWithOffsets,
   1984     _SCSUFromUnicode,
   1985     _SCSUFromUnicodeWithOffsets,
   1986     NULL,
   1987 
   1988     NULL,
   1989     _SCSUGetName,
   1990     NULL,
   1991     _SCSUSafeClone,
   1992     ucnv_getCompleteUnicodeSet
   1993 };
   1994 /*
         * Static description of the SCSU converter: name "SCSU", CCSID 1212,
         * 1 to 3 bytes per UChar. The entries are positional initializers for
         * UConverterStaticData (declared outside this file).
         */
   1995 static const UConverterStaticData _SCSUStaticData={
   1996     sizeof(UConverterStaticData),
   1997     "SCSU",
   1998     1212, /* CCSID for SCSU */
   1999     UCNV_IBM, UCNV_SCSU,
   2000     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
   2001     /*
   2002      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
   2003      * substitution string.
   2004      */
            /* 0x0e is SQU (quote a single Unicode character), followed by U+fffd as 0xff 0xfd; length 3 */
   2005     { 0x0e, 0xff, 0xfd, 0 }, 3,
   2006     FALSE, FALSE,
   2007     0,
   2008     0,
   2009     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   2010 };
   2011 /*
         * Shared data object for the SCSU converter; it ties together the static
         * description (_SCSUStaticData) and the function table (_SCSUImpl).
         * Unlike the two tables above it is not declared static, so it is visible
         * to other translation units.
         * NOTE(review): the meaning of the remaining positional values (~0, NULL, FALSE, 0)
         * is defined by the UConverterSharedData declaration - confirm there.
         */
   2012 const UConverterSharedData _SCSUData={
   2013     sizeof(UConverterSharedData), ~((uint32_t)0),
   2014     NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
   2015     0
   2016 };
   2017 
   2018 #endif
   2019