Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvscsu.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000nov18
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is an implementation of the Standard Compression Scheme for Unicode
     17 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
     18 *   Reserved commands and window settings are treated as illegal sequences and
     19 *   will result in callback calls.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 
     24 #if !UCONFIG_NO_CONVERSION
     25 
     26 #include "unicode/ucnv.h"
     27 #include "unicode/ucnv_cb.h"
     28 #include "ucnv_bld.h"
     29 #include "ucnv_cnv.h"
     30 #include "cmemory.h"
     31 
     32 /* SCSU definitions --------------------------------------------------------- */
     33 
     34 /* SCSU command byte values */
     35 enum {
     36     SQ0=0x01, /* Quote from window pair 0 */
     37     SQ7=0x08, /* Quote from window pair 7 */
     38     SDX=0x0B, /* Define a window as extended */
     39     Srs=0x0C, /* reserved */
     40     SQU=0x0E, /* Quote a single Unicode character */
     41     SCU=0x0F, /* Change to Unicode mode */
     42     SC0=0x10, /* Select window 0 */
     43     SC7=0x17, /* Select window 7 */
     44     SD0=0x18, /* Define and select window 0 */
     45     SD7=0x1F, /* Define and select window 7 */
     46 
     47     UC0=0xE0, /* Select window 0 */
     48     UC7=0xE7, /* Select window 7 */
     49     UD0=0xE8, /* Define and select window 0 */
     50     UD7=0xEF, /* Define and select window 7 */
     51     UQU=0xF0, /* Quote a single Unicode character */
     52     UDX=0xF1, /* Define a Window as extended */
     53     Urs=0xF2  /* reserved */
     54 };
     55 
     56 enum {
     57     /*
     58      * Unicode code points from 3400 to E000 are not adressible by
     59      * dynamic window, since in these areas no short run alphabets are
     60      * found. Therefore add gapOffset to all values from gapThreshold.
     61      */
     62     gapThreshold=0x68,
     63     gapOffset=0xAC00,
     64 
     65     /* values between reservedStart and fixedThreshold are reserved */
     66     reservedStart=0xA8,
     67 
     68     /* use table of predefined fixed offsets for values from fixedThreshold */
     69     fixedThreshold=0xF9
     70 };
     71 
     72 /* constant offsets for the 8 static windows */
     73 static const uint32_t staticOffsets[8]={
     74     0x0000, /* ASCII for quoted tags */
     75     0x0080, /* Latin - 1 Supplement (for access to punctuation) */
     76     0x0100, /* Latin Extended-A */
     77     0x0300, /* Combining Diacritical Marks */
     78     0x2000, /* General Punctuation */
     79     0x2080, /* Currency Symbols */
     80     0x2100, /* Letterlike Symbols and Number Forms */
     81     0x3000  /* CJK Symbols and punctuation */
     82 };
     83 
     84 /* initial offsets for the 8 dynamic (sliding) windows */
     85 static const uint32_t initialDynamicOffsets[8]={
     86     0x0080, /* Latin-1 */
     87     0x00C0, /* Latin Extended A */
     88     0x0400, /* Cyrillic */
     89     0x0600, /* Arabic */
     90     0x0900, /* Devanagari */
     91     0x3040, /* Hiragana */
     92     0x30A0, /* Katakana */
     93     0xFF00  /* Fullwidth ASCII */
     94 };
     95 
     96 /* Table of fixed predefined Offsets */
     97 static const uint32_t fixedOffsets[]={
     98     /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
     99     /* 0xFA */ 0x0250, /* IPA extensions */
    100     /* 0xFB */ 0x0370, /* Greek */
    101     /* 0xFC */ 0x0530, /* Armenian */
    102     /* 0xFD */ 0x3040, /* Hiragana */
    103     /* 0xFE */ 0x30A0, /* Katakana */
    104     /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
    105 };
    106 
    107 /* state values */
    108 enum {
    109     readCommand,
    110     quotePairOne,
    111     quotePairTwo,
    112     quoteOne,
    113     definePairOne,
    114     definePairTwo,
    115     defineOne
    116 };
    117 
    118 typedef struct SCSUData {
    119     /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
    120     uint32_t toUDynamicOffsets[8];
    121     uint32_t fromUDynamicOffsets[8];
    122 
    123     /* state machine state - toUnicode */
    124     UBool toUIsSingleByteMode;
    125     uint8_t toUState;
    126     int8_t toUQuoteWindow, toUDynamicWindow;
    127     uint8_t toUByteOne;
    128     uint8_t toUPadding[3];
    129 
    130     /* state machine state - fromUnicode */
    131     UBool fromUIsSingleByteMode;
    132     int8_t fromUDynamicWindow;
    133 
    134     /*
    135      * windowUse[] keeps track of the use of the dynamic windows:
    136      * At nextWindowUseIndex there is the least recently used window,
    137      * and the following windows (in a wrapping manner) are more and more
    138      * recently used.
    139      * At nextWindowUseIndex-1 there is the most recently used window.
    140      */
    141     uint8_t locale;
    142     int8_t nextWindowUseIndex;
    143     int8_t windowUse[8];
    144 } SCSUData;
    145 
    146 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
    147 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
    148 
    149 enum {
    150     lGeneric, l_ja
    151 };
    152 
    153 /* SCSU setup functions ----------------------------------------------------- */
    154 
    155 static void
    156 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
    157     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
    158 
    159     if(choice<=UCNV_RESET_TO_UNICODE) {
    160         /* reset toUnicode */
    161         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
    162 
    163         scsu->toUIsSingleByteMode=TRUE;
    164         scsu->toUState=readCommand;
    165         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
    166         scsu->toUByteOne=0;
    167 
    168         cnv->toULength=0;
    169     }
    170     if(choice!=UCNV_RESET_TO_UNICODE) {
    171         /* reset fromUnicode */
    172         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
    173 
    174         scsu->fromUIsSingleByteMode=TRUE;
    175         scsu->fromUDynamicWindow=0;
    176 
    177         scsu->nextWindowUseIndex=0;
    178         switch(scsu->locale) {
    179         case l_ja:
    180             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
    181             break;
    182         default:
    183             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
    184             break;
    185         }
    186 
    187         cnv->fromUChar32=0;
    188     }
    189 }
    190 
    191 static void
    192 _SCSUOpen(UConverter *cnv,
    193           UConverterLoadArgs *pArgs,
    194           UErrorCode *pErrorCode) {
    195     const char *locale=pArgs->locale;
    196     if(pArgs->onlyTestIsLoadable) {
    197         return;
    198     }
    199     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
    200     if(cnv->extraInfo!=NULL) {
    201         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
    202             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
    203         } else {
    204             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
    205         }
    206         _SCSUReset(cnv, UCNV_RESET_BOTH);
    207     } else {
    208         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    209     }
    210 
    211     /* Set the substitution character U+fffd as a Unicode string. */
    212     cnv->subUChars[0]=0xfffd;
    213     cnv->subCharLen=-1;
    214 }
    215 
    216 static void
    217 _SCSUClose(UConverter *cnv) {
    218     if(cnv->extraInfo!=NULL) {
    219         if(!cnv->isExtraLocal) {
    220             uprv_free(cnv->extraInfo);
    221         }
    222         cnv->extraInfo=NULL;
    223     }
    224 }
    225 
    226 /* SCSU-to-Unicode conversion functions ------------------------------------- */
    227 
    228 static void
    229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    230                           UErrorCode *pErrorCode) {
    231     UConverter *cnv;
    232     SCSUData *scsu;
    233     const uint8_t *source, *sourceLimit;
    234     UChar *target;
    235     const UChar *targetLimit;
    236     int32_t *offsets;
    237     UBool isSingleByteMode;
    238     uint8_t state, byteOne;
    239     int8_t quoteWindow, dynamicWindow;
    240 
    241     int32_t sourceIndex, nextSourceIndex;
    242 
    243     uint8_t b;
    244 
    245     /* set up the local pointers */
    246     cnv=pArgs->converter;
    247     scsu=(SCSUData *)cnv->extraInfo;
    248 
    249     source=(const uint8_t *)pArgs->source;
    250     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    251     target=pArgs->target;
    252     targetLimit=pArgs->targetLimit;
    253     offsets=pArgs->offsets;
    254 
    255     /* get the state machine state */
    256     isSingleByteMode=scsu->toUIsSingleByteMode;
    257     state=scsu->toUState;
    258     quoteWindow=scsu->toUQuoteWindow;
    259     dynamicWindow=scsu->toUDynamicWindow;
    260     byteOne=scsu->toUByteOne;
    261 
    262     /* sourceIndex=-1 if the current character began in the previous buffer */
    263     sourceIndex=state==readCommand ? 0 : -1;
    264     nextSourceIndex=0;
    265 
    266     /*
    267      * conversion "loop"
    268      *
    269      * For performance, this is not a normal C loop.
    270      * Instead, there are two code blocks for the two SCSU modes.
    271      * The function branches to either one, and a change of the mode is done with a goto to
    272      * the other branch.
    273      *
    274      * Each branch has two conventional loops:
    275      * - a fast-path loop for the most common codes in the mode
    276      * - a loop for all other codes in the mode
    277      * When the fast-path runs into a code that it cannot handle, its loop ends and it
    278      * runs into the following loop to handle the other codes.
    279      * The end of the input or output buffer is also handled by the slower loop.
    280      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
    281      *
    282      * The callback handling is done by returning with an error code.
    283      * The conversion framework actually calls the callback function.
    284      */
    285     if(isSingleByteMode) {
    286         /* fast path for single-byte mode */
    287         if(state==readCommand) {
    288 fastSingle:
    289             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
    290                 ++source;
    291                 ++nextSourceIndex;
    292                 if(b<=0x7f) {
    293                     /* write US-ASCII graphic character or DEL */
    294                     *target++=(UChar)b;
    295                     if(offsets!=NULL) {
    296                         *offsets++=sourceIndex;
    297                     }
    298                 } else {
    299                     /* write from dynamic window */
    300                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
    301                     if(c<=0xffff) {
    302                         *target++=(UChar)c;
    303                         if(offsets!=NULL) {
    304                             *offsets++=sourceIndex;
    305                         }
    306                     } else {
    307                         /* output surrogate pair */
    308                         *target++=(UChar)(0xd7c0+(c>>10));
    309                         if(target<targetLimit) {
    310                             *target++=(UChar)(0xdc00|(c&0x3ff));
    311                             if(offsets!=NULL) {
    312                                 *offsets++=sourceIndex;
    313                                 *offsets++=sourceIndex;
    314                             }
    315                         } else {
    316                             /* target overflow */
    317                             if(offsets!=NULL) {
    318                                 *offsets++=sourceIndex;
    319                             }
    320                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    321                             cnv->UCharErrorBufferLength=1;
    322                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    323                             goto endloop;
    324                         }
    325                     }
    326                 }
    327                 sourceIndex=nextSourceIndex;
    328             }
    329         }
    330 
    331         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
    332 singleByteMode:
    333         while(source<sourceLimit) {
    334             if(target>=targetLimit) {
    335                 /* target is full */
    336                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    337                 break;
    338             }
    339             b=*source++;
    340             ++nextSourceIndex;
    341             switch(state) {
    342             case readCommand:
    343                 /* redundant conditions are commented out */
    344                 /* here: b<0x20 because otherwise we would be in fastSingle */
    345                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
    346                     /* CR/LF/TAB/NUL */
    347                     *target++=(UChar)b;
    348                     if(offsets!=NULL) {
    349                         *offsets++=sourceIndex;
    350                     }
    351                     sourceIndex=nextSourceIndex;
    352                     goto fastSingle;
    353                 } else if(SC0<=b) {
    354                     if(b<=SC7) {
    355                         dynamicWindow=(int8_t)(b-SC0);
    356                         sourceIndex=nextSourceIndex;
    357                         goto fastSingle;
    358                     } else /* if(SD0<=b && b<=SD7) */ {
    359                         dynamicWindow=(int8_t)(b-SD0);
    360                         state=defineOne;
    361                     }
    362                 } else if(/* SQ0<=b && */ b<=SQ7) {
    363                     quoteWindow=(int8_t)(b-SQ0);
    364                     state=quoteOne;
    365                 } else if(b==SDX) {
    366                     state=definePairOne;
    367                 } else if(b==SQU) {
    368                     state=quotePairOne;
    369                 } else if(b==SCU) {
    370                     sourceIndex=nextSourceIndex;
    371                     isSingleByteMode=FALSE;
    372                     goto fastUnicode;
    373                 } else /* Srs */ {
    374                     /* callback(illegal) */
    375                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    376                     cnv->toUBytes[0]=b;
    377                     cnv->toULength=1;
    378                     goto endloop;
    379                 }
    380 
    381                 /* store the first byte of a multibyte sequence in toUBytes[] */
    382                 cnv->toUBytes[0]=b;
    383                 cnv->toULength=1;
    384                 break;
    385             case quotePairOne:
    386                 byteOne=b;
    387                 cnv->toUBytes[1]=b;
    388                 cnv->toULength=2;
    389                 state=quotePairTwo;
    390                 break;
    391             case quotePairTwo:
    392                 *target++=(UChar)((byteOne<<8)|b);
    393                 if(offsets!=NULL) {
    394                     *offsets++=sourceIndex;
    395                 }
    396                 sourceIndex=nextSourceIndex;
    397                 state=readCommand;
    398                 goto fastSingle;
    399             case quoteOne:
    400                 if(b<0x80) {
    401                     /* all static offsets are in the BMP */
    402                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
    403                     if(offsets!=NULL) {
    404                         *offsets++=sourceIndex;
    405                     }
    406                 } else {
    407                     /* write from dynamic window */
    408                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
    409                     if(c<=0xffff) {
    410                         *target++=(UChar)c;
    411                         if(offsets!=NULL) {
    412                             *offsets++=sourceIndex;
    413                         }
    414                     } else {
    415                         /* output surrogate pair */
    416                         *target++=(UChar)(0xd7c0+(c>>10));
    417                         if(target<targetLimit) {
    418                             *target++=(UChar)(0xdc00|(c&0x3ff));
    419                             if(offsets!=NULL) {
    420                                 *offsets++=sourceIndex;
    421                                 *offsets++=sourceIndex;
    422                             }
    423                         } else {
    424                             /* target overflow */
    425                             if(offsets!=NULL) {
    426                                 *offsets++=sourceIndex;
    427                             }
    428                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    429                             cnv->UCharErrorBufferLength=1;
    430                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    431                             goto endloop;
    432                         }
    433                     }
    434                 }
    435                 sourceIndex=nextSourceIndex;
    436                 state=readCommand;
    437                 goto fastSingle;
    438             case definePairOne:
    439                 dynamicWindow=(int8_t)((b>>5)&7);
    440                 byteOne=(uint8_t)(b&0x1f);
    441                 cnv->toUBytes[1]=b;
    442                 cnv->toULength=2;
    443                 state=definePairTwo;
    444                 break;
    445             case definePairTwo:
    446                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
    447                 sourceIndex=nextSourceIndex;
    448                 state=readCommand;
    449                 goto fastSingle;
    450             case defineOne:
    451                 if(b==0) {
    452                     /* callback(illegal): Reserved window offset value 0 */
    453                     cnv->toUBytes[1]=b;
    454                     cnv->toULength=2;
    455                     goto endloop;
    456                 } else if(b<gapThreshold) {
    457                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
    458                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
    459                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
    460                 } else if(b>=fixedThreshold) {
    461                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
    462                 } else {
    463                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
    464                     cnv->toUBytes[1]=b;
    465                     cnv->toULength=2;
    466                     goto endloop;
    467                 }
    468                 sourceIndex=nextSourceIndex;
    469                 state=readCommand;
    470                 goto fastSingle;
    471             }
    472         }
    473     } else {
    474         /* fast path for Unicode mode */
    475         if(state==readCommand) {
    476 fastUnicode:
    477             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
    478                 *target++=(UChar)((b<<8)|source[1]);
    479                 if(offsets!=NULL) {
    480                     *offsets++=sourceIndex;
    481                 }
    482                 sourceIndex=nextSourceIndex;
    483                 nextSourceIndex+=2;
    484                 source+=2;
    485             }
    486         }
    487 
    488         /* normal state machine for Unicode mode */
    489 /* unicodeByteMode: */
    490         while(source<sourceLimit) {
    491             if(target>=targetLimit) {
    492                 /* target is full */
    493                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    494                 break;
    495             }
    496             b=*source++;
    497             ++nextSourceIndex;
    498             switch(state) {
    499             case readCommand:
    500                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
    501                     byteOne=b;
    502                     cnv->toUBytes[0]=b;
    503                     cnv->toULength=1;
    504                     state=quotePairTwo;
    505                 } else if(/* UC0<=b && */ b<=UC7) {
    506                     dynamicWindow=(int8_t)(b-UC0);
    507                     sourceIndex=nextSourceIndex;
    508                     isSingleByteMode=TRUE;
    509                     goto fastSingle;
    510                 } else if(/* UD0<=b && */ b<=UD7) {
    511                     dynamicWindow=(int8_t)(b-UD0);
    512                     isSingleByteMode=TRUE;
    513                     cnv->toUBytes[0]=b;
    514                     cnv->toULength=1;
    515                     state=defineOne;
    516                     goto singleByteMode;
    517                 } else if(b==UDX) {
    518                     isSingleByteMode=TRUE;
    519                     cnv->toUBytes[0]=b;
    520                     cnv->toULength=1;
    521                     state=definePairOne;
    522                     goto singleByteMode;
    523                 } else if(b==UQU) {
    524                     cnv->toUBytes[0]=b;
    525                     cnv->toULength=1;
    526                     state=quotePairOne;
    527                 } else /* Urs */ {
    528                     /* callback(illegal) */
    529                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    530                     cnv->toUBytes[0]=b;
    531                     cnv->toULength=1;
    532                     goto endloop;
    533                 }
    534                 break;
    535             case quotePairOne:
    536                 byteOne=b;
    537                 cnv->toUBytes[1]=b;
    538                 cnv->toULength=2;
    539                 state=quotePairTwo;
    540                 break;
    541             case quotePairTwo:
    542                 *target++=(UChar)((byteOne<<8)|b);
    543                 if(offsets!=NULL) {
    544                     *offsets++=sourceIndex;
    545                 }
    546                 sourceIndex=nextSourceIndex;
    547                 state=readCommand;
    548                 goto fastUnicode;
    549             }
    550         }
    551     }
    552 endloop:
    553 
    554     /* set the converter state back into UConverter */
    555     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
    556         /* reset to deal with the next character */
    557         state=readCommand;
    558     } else if(state==readCommand) {
    559         /* not in a multi-byte sequence, reset toULength */
    560         cnv->toULength=0;
    561     }
    562     scsu->toUIsSingleByteMode=isSingleByteMode;
    563     scsu->toUState=state;
    564     scsu->toUQuoteWindow=quoteWindow;
    565     scsu->toUDynamicWindow=dynamicWindow;
    566     scsu->toUByteOne=byteOne;
    567 
    568     /* write back the updated pointers */
    569     pArgs->source=(const char *)source;
    570     pArgs->target=target;
    571     pArgs->offsets=offsets;
    572     return;
    573 }
    574 
    575 /*
    576  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
    577  * If a change is made in the original function, then either
    578  * change this function the same way or
    579  * re-copy the original function and remove the variables
    580  * offsets, sourceIndex, and nextSourceIndex.
    581  */
    582 static void
    583 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
    584                UErrorCode *pErrorCode) {
    585     UConverter *cnv;
    586     SCSUData *scsu;
    587     const uint8_t *source, *sourceLimit;
    588     UChar *target;
    589     const UChar *targetLimit;
    590     UBool isSingleByteMode;
    591     uint8_t state, byteOne;
    592     int8_t quoteWindow, dynamicWindow;
    593 
    594     uint8_t b;
    595 
    596     /* set up the local pointers */
    597     cnv=pArgs->converter;
    598     scsu=(SCSUData *)cnv->extraInfo;
    599 
    600     source=(const uint8_t *)pArgs->source;
    601     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    602     target=pArgs->target;
    603     targetLimit=pArgs->targetLimit;
    604 
    605     /* get the state machine state */
    606     isSingleByteMode=scsu->toUIsSingleByteMode;
    607     state=scsu->toUState;
    608     quoteWindow=scsu->toUQuoteWindow;
    609     dynamicWindow=scsu->toUDynamicWindow;
    610     byteOne=scsu->toUByteOne;
    611 
    612     /*
    613      * conversion "loop"
    614      *
    615      * For performance, this is not a normal C loop.
    616      * Instead, there are two code blocks for the two SCSU modes.
    617      * The function branches to either one, and a change of the mode is done with a goto to
    618      * the other branch.
    619      *
    620      * Each branch has two conventional loops:
    621      * - a fast-path loop for the most common codes in the mode
    622      * - a loop for all other codes in the mode
    623      * When the fast-path runs into a code that it cannot handle, its loop ends and it
    624      * runs into the following loop to handle the other codes.
    625      * The end of the input or output buffer is also handled by the slower loop.
    626      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
    627      *
    628      * The callback handling is done by returning with an error code.
    629      * The conversion framework actually calls the callback function.
    630      */
    631     if(isSingleByteMode) {
    632         /* fast path for single-byte mode */
    633         if(state==readCommand) {
    634 fastSingle:
    635             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
    636                 ++source;
    637                 if(b<=0x7f) {
    638                     /* write US-ASCII graphic character or DEL */
    639                     *target++=(UChar)b;
    640                 } else {
    641                     /* write from dynamic window */
    642                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
    643                     if(c<=0xffff) {
    644                         *target++=(UChar)c;
    645                     } else {
    646                         /* output surrogate pair */
    647                         *target++=(UChar)(0xd7c0+(c>>10));
    648                         if(target<targetLimit) {
    649                             *target++=(UChar)(0xdc00|(c&0x3ff));
    650                         } else {
    651                             /* target overflow */
    652                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    653                             cnv->UCharErrorBufferLength=1;
    654                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    655                             goto endloop;
    656                         }
    657                     }
    658                 }
    659             }
    660         }
    661 
    662         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
    663 singleByteMode:
    664         while(source<sourceLimit) {
    665             if(target>=targetLimit) {
    666                 /* target is full */
    667                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    668                 break;
    669             }
    670             b=*source++;
    671             switch(state) {
    672             case readCommand:
    673                 /* redundant conditions are commented out */
    674                 /* here: b<0x20 because otherwise we would be in fastSingle */
    675                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
    676                     /* CR/LF/TAB/NUL */
    677                     *target++=(UChar)b;
    678                     goto fastSingle;
    679                 } else if(SC0<=b) {
    680                     if(b<=SC7) {
    681                         dynamicWindow=(int8_t)(b-SC0);
    682                         goto fastSingle;
    683                     } else /* if(SD0<=b && b<=SD7) */ {
    684                         dynamicWindow=(int8_t)(b-SD0);
    685                         state=defineOne;
    686                     }
    687                 } else if(/* SQ0<=b && */ b<=SQ7) {
    688                     quoteWindow=(int8_t)(b-SQ0);
    689                     state=quoteOne;
    690                 } else if(b==SDX) {
    691                     state=definePairOne;
    692                 } else if(b==SQU) {
    693                     state=quotePairOne;
    694                 } else if(b==SCU) {
    695                     isSingleByteMode=FALSE;
    696                     goto fastUnicode;
    697                 } else /* Srs */ {
    698                     /* callback(illegal) */
    699                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    700                     cnv->toUBytes[0]=b;
    701                     cnv->toULength=1;
    702                     goto endloop;
    703                 }
    704 
    705                 /* store the first byte of a multibyte sequence in toUBytes[] */
    706                 cnv->toUBytes[0]=b;
    707                 cnv->toULength=1;
    708                 break;
    709             case quotePairOne:
    710                 byteOne=b;
    711                 cnv->toUBytes[1]=b;
    712                 cnv->toULength=2;
    713                 state=quotePairTwo;
    714                 break;
    715             case quotePairTwo:
    716                 *target++=(UChar)((byteOne<<8)|b);
    717                 state=readCommand;
    718                 goto fastSingle;
    719             case quoteOne:
    720                 if(b<0x80) {
    721                     /* all static offsets are in the BMP */
    722                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
    723                 } else {
    724                     /* write from dynamic window */
    725                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
    726                     if(c<=0xffff) {
    727                         *target++=(UChar)c;
    728                     } else {
    729                         /* output surrogate pair */
    730                         *target++=(UChar)(0xd7c0+(c>>10));
    731                         if(target<targetLimit) {
    732                             *target++=(UChar)(0xdc00|(c&0x3ff));
    733                         } else {
    734                             /* target overflow */
    735                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
    736                             cnv->UCharErrorBufferLength=1;
    737                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    738                             goto endloop;
    739                         }
    740                     }
    741                 }
    742                 state=readCommand;
    743                 goto fastSingle;
    744             case definePairOne:
    745                 dynamicWindow=(int8_t)((b>>5)&7);
    746                 byteOne=(uint8_t)(b&0x1f);
    747                 cnv->toUBytes[1]=b;
    748                 cnv->toULength=2;
    749                 state=definePairTwo;
    750                 break;
    751             case definePairTwo:
    752                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
    753                 state=readCommand;
    754                 goto fastSingle;
    755             case defineOne:
    756                 if(b==0) {
    757                     /* callback(illegal): Reserved window offset value 0 */
    758                     cnv->toUBytes[1]=b;
    759                     cnv->toULength=2;
    760                     goto endloop;
    761                 } else if(b<gapThreshold) {
    762                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
    763                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
    764                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
    765                 } else if(b>=fixedThreshold) {
    766                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
    767                 } else {
    768                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
    769                     cnv->toUBytes[1]=b;
    770                     cnv->toULength=2;
    771                     goto endloop;
    772                 }
    773                 state=readCommand;
    774                 goto fastSingle;
    775             }
    776         }
    777     } else {
    778         /* fast path for Unicode mode */
    779         if(state==readCommand) {
    780 fastUnicode:
    781             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
    782                 *target++=(UChar)((b<<8)|source[1]);
    783                 source+=2;
    784             }
    785         }
    786 
    787         /* normal state machine for Unicode mode */
    788 /* unicodeByteMode: */
    789         while(source<sourceLimit) {
    790             if(target>=targetLimit) {
    791                 /* target is full */
    792                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    793                 break;
    794             }
    795             b=*source++;
    796             switch(state) {
    797             case readCommand:
    798                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
    799                     byteOne=b;
    800                     cnv->toUBytes[0]=b;
    801                     cnv->toULength=1;
    802                     state=quotePairTwo;
    803                 } else if(/* UC0<=b && */ b<=UC7) {
    804                     dynamicWindow=(int8_t)(b-UC0);
    805                     isSingleByteMode=TRUE;
    806                     goto fastSingle;
    807                 } else if(/* UD0<=b && */ b<=UD7) {
    808                     dynamicWindow=(int8_t)(b-UD0);
    809                     isSingleByteMode=TRUE;
    810                     cnv->toUBytes[0]=b;
    811                     cnv->toULength=1;
    812                     state=defineOne;
    813                     goto singleByteMode;
    814                 } else if(b==UDX) {
    815                     isSingleByteMode=TRUE;
    816                     cnv->toUBytes[0]=b;
    817                     cnv->toULength=1;
    818                     state=definePairOne;
    819                     goto singleByteMode;
    820                 } else if(b==UQU) {
    821                     cnv->toUBytes[0]=b;
    822                     cnv->toULength=1;
    823                     state=quotePairOne;
    824                 } else /* Urs */ {
    825                     /* callback(illegal) */
    826                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    827                     cnv->toUBytes[0]=b;
    828                     cnv->toULength=1;
    829                     goto endloop;
    830                 }
    831                 break;
    832             case quotePairOne:
    833                 byteOne=b;
    834                 cnv->toUBytes[1]=b;
    835                 cnv->toULength=2;
    836                 state=quotePairTwo;
    837                 break;
    838             case quotePairTwo:
    839                 *target++=(UChar)((byteOne<<8)|b);
    840                 state=readCommand;
    841                 goto fastUnicode;
    842             }
    843         }
    844     }
    845 endloop:
    846 
    847     /* set the converter state back into UConverter */
    848     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
    849         /* reset to deal with the next character */
    850         state=readCommand;
    851     } else if(state==readCommand) {
    852         /* not in a multi-byte sequence, reset toULength */
    853         cnv->toULength=0;
    854     }
    855     scsu->toUIsSingleByteMode=isSingleByteMode;
    856     scsu->toUState=state;
    857     scsu->toUQuoteWindow=quoteWindow;
    858     scsu->toUDynamicWindow=dynamicWindow;
    859     scsu->toUByteOne=byteOne;
    860 
    861     /* write back the updated pointers */
    862     pArgs->source=(const char *)source;
    863     pArgs->target=target;
    864     return;
    865 }
    866 
    867 /* SCSU-from-Unicode conversion functions ----------------------------------- */
    868 
    869 /*
    870  * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
    871  * reasonable results. The lookahead is minimal.
    872  * Many cases are simple:
    873  * A character fits directly into the current mode, a dynamic or static window,
    874  * or is not compressible. These cases are tested first.
    875  * Real compression heuristics are applied to the rest, in code branches for
    876  * single/Unicode mode and BMP/supplementary code points.
    877  * The heuristics used here are extremely simple.
    878  */
    879 
    880 /* get the number of the window that this character is in, or -1 */
    881 static int8_t
    882 getWindow(const uint32_t offsets[8], uint32_t c) {
    883     int i;
    884     for(i=0; i<8; ++i) {
    885         if((uint32_t)(c-offsets[i])<=0x7f) {
    886             return (int8_t)(i);
    887         }
    888     }
    889     return -1;
    890 }
    891 
    892 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
    893 static UBool
    894 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
    895     return (UBool)(c<=offset+0x7f &&
    896           (c>=offset || (c<=0x7f &&
    897                         (c>=0x20 || (1UL<<c)&0x2601))));
    898                                 /* binary 0010 0110 0000 0001,
    899                                    check for b==0xd || b==0xa || b==9 || b==0 */
    900 }
    901 
    902 /*
    903  * getNextDynamicWindow returns the next dynamic window to be redefined
    904  */
    905 static int8_t
    906 getNextDynamicWindow(SCSUData *scsu) {
    907     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
    908     if(++scsu->nextWindowUseIndex==8) {
    909         scsu->nextWindowUseIndex=0;
    910     }
    911     return window;
    912 }
    913 
    914 /*
    915  * useDynamicWindow() adjusts
    916  * windowUse[] and nextWindowUseIndex for the algorithm to choose
    917  * the next dynamic window to be defined;
    918  * a subclass may override it and provide its own algorithm.
    919  */
    920 static void
    921 useDynamicWindow(SCSUData *scsu, int8_t window) {
    922     /*
    923      * move the existing window, which just became the most recently used one,
    924      * up in windowUse[] to nextWindowUseIndex-1
    925      */
    926 
    927     /* first, find the index of the window - backwards to favor the more recently used windows */
    928     int i, j;
    929 
    930     i=scsu->nextWindowUseIndex;
    931     do {
    932         if(--i<0) {
    933             i=7;
    934         }
    935     } while(scsu->windowUse[i]!=window);
    936 
    937     /* now copy each windowUse[i+1] to [i] */
    938     j=i+1;
    939     if(j==8) {
    940         j=0;
    941     }
    942     while(j!=scsu->nextWindowUseIndex) {
    943         scsu->windowUse[i]=scsu->windowUse[j];
    944         i=j;
    945         if(++j==8) { j=0; }
    946     }
    947 
    948     /* finally, set the window into the most recently used index */
    949     scsu->windowUse[i]=window;
    950 }
    951 
    952 /*
    953  * calculate the offset and the code for a dynamic window that contains the character
    954  * takes fixed offsets into account
    955  * the offset of the window is stored in the offset variable,
    956  * the code is returned
    957  *
    958  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
    959  */
    960 static int
    961 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
    962     int i;
    963 
    964     for(i=0; i<7; ++i) {
    965         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
    966             *pOffset=fixedOffsets[i];
    967             return 0xf9+i;
    968         }
    969     }
    970 
    971     if(c<0x80) {
    972         /* No dynamic window for US-ASCII. */
    973         return -1;
    974     } else if(c<0x3400 ||
    975               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
    976               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
    977     ) {
    978         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
    979         *pOffset=c&0x7fffff80;
    980         return (int)(c>>7);
    981     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
    982         /* For these characters we need to take the gapOffset into account. */
    983         *pOffset=c&0x7fffff80;
    984         return (int)((c-gapOffset)>>7);
    985     } else {
    986         return -1;
    987     }
    988 }
    989 
    990 /*
    991  * Idea for compression:
    992  *  - save SCSUData and other state before really starting work
    993  *  - at endloop, see if compression could be better with just unicode mode
    994  *  - don't do this if a callback has been called
    995  *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
    996  *  - different buffer handling!
    997  *
    998  * Drawback or need for corrective handling:
    999  * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
   1000  * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
   1001  * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
   1002  *
   1003  * How to achieve both?
   1004  *  - Only replace the result after an SDX or SCU?
   1005  */
   1006 
   1007 static void
   1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   1009                             UErrorCode *pErrorCode) {
   1010     UConverter *cnv;
   1011     SCSUData *scsu;
   1012     const UChar *source, *sourceLimit;
   1013     uint8_t *target;
   1014     int32_t targetCapacity;
   1015     int32_t *offsets;
   1016 
   1017     UBool isSingleByteMode;
   1018     uint8_t dynamicWindow;
   1019     uint32_t currentOffset;
   1020 
   1021     uint32_t c, delta;
   1022 
   1023     int32_t sourceIndex, nextSourceIndex;
   1024 
   1025     int32_t length;
   1026 
   1027     /* variables for compression heuristics */
   1028     uint32_t offset;
   1029     UChar lead, trail;
   1030     int code;
   1031     int8_t window;
   1032 
   1033     /* set up the local pointers */
   1034     cnv=pArgs->converter;
   1035     scsu=(SCSUData *)cnv->extraInfo;
   1036 
   1037     /* set up the local pointers */
   1038     source=pArgs->source;
   1039     sourceLimit=pArgs->sourceLimit;
   1040     target=(uint8_t *)pArgs->target;
   1041     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1042     offsets=pArgs->offsets;
   1043 
   1044     /* get the state machine state */
   1045     isSingleByteMode=scsu->fromUIsSingleByteMode;
   1046     dynamicWindow=scsu->fromUDynamicWindow;
   1047     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1048 
   1049     c=cnv->fromUChar32;
   1050 
   1051     /* sourceIndex=-1 if the current character began in the previous buffer */
   1052     sourceIndex= c==0 ? 0 : -1;
   1053     nextSourceIndex=0;
   1054 
   1055     /* similar conversion "loop" as in toUnicode */
   1056 loop:
   1057     if(isSingleByteMode) {
   1058         if(c!=0 && targetCapacity>0) {
   1059             goto getTrailSingle;
   1060         }
   1061 
   1062         /* state machine for single-byte mode */
   1063 /* singleByteMode: */
   1064         while(source<sourceLimit) {
   1065             if(targetCapacity<=0) {
   1066                 /* target is full */
   1067                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1068                 break;
   1069             }
   1070             c=*source++;
   1071             ++nextSourceIndex;
   1072 
   1073             if((c-0x20)<=0x5f) {
   1074                 /* pass US-ASCII graphic character through */
   1075                 *target++=(uint8_t)c;
   1076                 if(offsets!=NULL) {
   1077                     *offsets++=sourceIndex;
   1078                 }
   1079                 --targetCapacity;
   1080             } else if(c<0x20) {
   1081                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   1082                     /* CR/LF/TAB/NUL */
   1083                     *target++=(uint8_t)c;
   1084                     if(offsets!=NULL) {
   1085                         *offsets++=sourceIndex;
   1086                     }
   1087                     --targetCapacity;
   1088                 } else {
   1089                     /* quote C0 control character */
   1090                     c|=SQ0<<8;
   1091                     length=2;
   1092                     goto outputBytes;
   1093                 }
   1094             } else if((delta=c-currentOffset)<=0x7f) {
   1095                 /* use the current dynamic window */
   1096                 *target++=(uint8_t)(delta|0x80);
   1097                 if(offsets!=NULL) {
   1098                     *offsets++=sourceIndex;
   1099                 }
   1100                 --targetCapacity;
   1101             } else if(UTF_IS_SURROGATE(c)) {
   1102                 if(UTF_IS_SURROGATE_FIRST(c)) {
   1103 getTrailSingle:
   1104                     lead=(UChar)c;
   1105                     if(source<sourceLimit) {
   1106                         /* test the following code unit */
   1107                         trail=*source;
   1108                         if(UTF_IS_SECOND_SURROGATE(trail)) {
   1109                             ++source;
   1110                             ++nextSourceIndex;
   1111                             c=UTF16_GET_PAIR_VALUE(c, trail);
   1112                             /* convert this surrogate code point */
   1113                             /* exit this condition tree */
   1114                         } else {
   1115                             /* this is an unmatched lead code unit (1st surrogate) */
   1116                             /* callback(illegal) */
   1117                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1118                             goto endloop;
   1119                         }
   1120                     } else {
   1121                         /* no more input */
   1122                         break;
   1123                     }
   1124                 } else {
   1125                     /* this is an unmatched trail code unit (2nd surrogate) */
   1126                     /* callback(illegal) */
   1127                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1128                     goto endloop;
   1129                 }
   1130 
   1131                 /* compress supplementary character U+10000..U+10ffff */
   1132                 if((delta=c-currentOffset)<=0x7f) {
   1133                     /* use the current dynamic window */
   1134                     *target++=(uint8_t)(delta|0x80);
   1135                     if(offsets!=NULL) {
   1136                         *offsets++=sourceIndex;
   1137                     }
   1138                     --targetCapacity;
   1139                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1140                     /* there is a dynamic window that contains this character, change to it */
   1141                     dynamicWindow=window;
   1142                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1143                     useDynamicWindow(scsu, dynamicWindow);
   1144                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1145                     length=2;
   1146                     goto outputBytes;
   1147                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1148                     /* might check if there are more characters in this window to come */
   1149                     /* define an extended window with this character */
   1150                     code-=0x200;
   1151                     dynamicWindow=getNextDynamicWindow(scsu);
   1152                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1153                     useDynamicWindow(scsu, dynamicWindow);
   1154                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1155                     length=4;
   1156                     goto outputBytes;
   1157                 } else {
   1158                     /* change to Unicode mode and output this (lead, trail) pair */
   1159                     isSingleByteMode=FALSE;
   1160                     *target++=(uint8_t)SCU;
   1161                     if(offsets!=NULL) {
   1162                         *offsets++=sourceIndex;
   1163                     }
   1164                     --targetCapacity;
   1165                     c=((uint32_t)lead<<16)|trail;
   1166                     length=4;
   1167                     goto outputBytes;
   1168                 }
   1169             } else if(c<0xa0) {
   1170                 /* quote C1 control character */
   1171                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
   1172                 length=2;
   1173                 goto outputBytes;
   1174             } else if(c==0xfeff || c>=0xfff0) {
   1175                 /* quote signature character=byte order mark and specials */
   1176                 c|=SQU<<16;
   1177                 length=3;
   1178                 goto outputBytes;
   1179             } else {
   1180                 /* compress all other BMP characters */
   1181                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1182                     /* there is a window defined that contains this character - switch to it or quote from it? */
   1183                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
   1184                         /* change to dynamic window */
   1185                         dynamicWindow=window;
   1186                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1187                         useDynamicWindow(scsu, dynamicWindow);
   1188                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1189                         length=2;
   1190                         goto outputBytes;
   1191                     } else {
   1192                         /* quote from dynamic window */
   1193                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
   1194                         length=2;
   1195                         goto outputBytes;
   1196                     }
   1197                 } else if((window=getWindow(staticOffsets, c))>=0) {
   1198                     /* quote from static window */
   1199                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
   1200                     length=2;
   1201                     goto outputBytes;
   1202                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1203                     /* define a dynamic window with this character */
   1204                     dynamicWindow=getNextDynamicWindow(scsu);
   1205                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1206                     useDynamicWindow(scsu, dynamicWindow);
   1207                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1208                     length=3;
   1209                     goto outputBytes;
   1210                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
   1211                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1212                 ) {
   1213                     /*
   1214                      * this character is not compressible (a BMP ideograph or similar);
   1215                      * switch to Unicode mode if this is the last character in the block
   1216                      * or there is at least one more ideograph following immediately
   1217                      */
   1218                     isSingleByteMode=FALSE;
   1219                     c|=SCU<<16;
   1220                     length=3;
   1221                     goto outputBytes;
   1222                 } else {
   1223                     /* quote Unicode */
   1224                     c|=SQU<<16;
   1225                     length=3;
   1226                     goto outputBytes;
   1227                 }
   1228             }
   1229 
   1230             /* normal end of conversion: prepare for a new character */
   1231             c=0;
   1232             sourceIndex=nextSourceIndex;
   1233         }
   1234     } else {
   1235         if(c!=0 && targetCapacity>0) {
   1236             goto getTrailUnicode;
   1237         }
   1238 
   1239         /* state machine for Unicode mode */
   1240 /* unicodeByteMode: */
   1241         while(source<sourceLimit) {
   1242             if(targetCapacity<=0) {
   1243                 /* target is full */
   1244                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1245                 break;
   1246             }
   1247             c=*source++;
   1248             ++nextSourceIndex;
   1249 
   1250             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
   1251                 /* not compressible, write character directly */
   1252                 if(targetCapacity>=2) {
   1253                     *target++=(uint8_t)(c>>8);
   1254                     *target++=(uint8_t)c;
   1255                     if(offsets!=NULL) {
   1256                         *offsets++=sourceIndex;
   1257                         *offsets++=sourceIndex;
   1258                     }
   1259                     targetCapacity-=2;
   1260                 } else {
   1261                     length=2;
   1262                     goto outputBytes;
   1263                 }
   1264             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
   1265                 /* compress BMP character if the following one is not an uncompressible ideograph */
   1266                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
   1267                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
   1268                         /* ASCII digit or letter */
   1269                         isSingleByteMode=TRUE;
   1270                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
   1271                         length=2;
   1272                         goto outputBytes;
   1273                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1274                         /* there is a dynamic window that contains this character, change to it */
   1275                         isSingleByteMode=TRUE;
   1276                         dynamicWindow=window;
   1277                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1278                         useDynamicWindow(scsu, dynamicWindow);
   1279                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1280                         length=2;
   1281                         goto outputBytes;
   1282                     } else if((code=getDynamicOffset(c, &offset))>=0) {
   1283                         /* define a dynamic window with this character */
   1284                         isSingleByteMode=TRUE;
   1285                         dynamicWindow=getNextDynamicWindow(scsu);
   1286                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1287                         useDynamicWindow(scsu, dynamicWindow);
   1288                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1289                         length=3;
   1290                         goto outputBytes;
   1291                     }
   1292                 }
   1293 
   1294                 /* don't know how to compress this character, just write it directly */
   1295                 length=2;
   1296                 goto outputBytes;
   1297             } else if(c<0xe000) {
   1298                 /* c is a surrogate */
   1299                 if(UTF_IS_SURROGATE_FIRST(c)) {
   1300 getTrailUnicode:
   1301                     lead=(UChar)c;
   1302                     if(source<sourceLimit) {
   1303                         /* test the following code unit */
   1304                         trail=*source;
   1305                         if(UTF_IS_SECOND_SURROGATE(trail)) {
   1306                             ++source;
   1307                             ++nextSourceIndex;
   1308                             c=UTF16_GET_PAIR_VALUE(c, trail);
   1309                             /* convert this surrogate code point */
   1310                             /* exit this condition tree */
   1311                         } else {
   1312                             /* this is an unmatched lead code unit (1st surrogate) */
   1313                             /* callback(illegal) */
   1314                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1315                             goto endloop;
   1316                         }
   1317                     } else {
   1318                         /* no more input */
   1319                         break;
   1320                     }
   1321                 } else {
   1322                     /* this is an unmatched trail code unit (2nd surrogate) */
   1323                     /* callback(illegal) */
   1324                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1325                     goto endloop;
   1326                 }
   1327 
   1328                 /* compress supplementary character */
   1329                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
   1330                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1331                 ) {
   1332                     /*
   1333                      * there is a dynamic window that contains this character and
   1334                      * the following character is not uncompressible,
   1335                      * change to the window
   1336                      */
   1337                     isSingleByteMode=TRUE;
   1338                     dynamicWindow=window;
   1339                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1340                     useDynamicWindow(scsu, dynamicWindow);
   1341                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1342                     length=2;
   1343                     goto outputBytes;
   1344                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
   1345                           (code=getDynamicOffset(c, &offset))>=0
   1346                 ) {
   1347                     /* two supplementary characters in (probably) the same window - define an extended one */
   1348                     isSingleByteMode=TRUE;
   1349                     code-=0x200;
   1350                     dynamicWindow=getNextDynamicWindow(scsu);
   1351                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1352                     useDynamicWindow(scsu, dynamicWindow);
   1353                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1354                     length=4;
   1355                     goto outputBytes;
   1356                 } else {
   1357                     /* don't know how to compress this character, just write it directly */
   1358                     c=((uint32_t)lead<<16)|trail;
   1359                     length=4;
   1360                     goto outputBytes;
   1361                 }
   1362             } else /* 0xe000<=c<0xf300 */ {
   1363                 /* quote to avoid SCSU tags */
   1364                 c|=UQU<<16;
   1365                 length=3;
   1366                 goto outputBytes;
   1367             }
   1368 
   1369             /* normal end of conversion: prepare for a new character */
   1370             c=0;
   1371             sourceIndex=nextSourceIndex;
   1372         }
   1373     }
   1374 endloop:
   1375 
   1376     /* set the converter state back into UConverter */
   1377     scsu->fromUIsSingleByteMode=isSingleByteMode;
   1378     scsu->fromUDynamicWindow=dynamicWindow;
   1379 
   1380     cnv->fromUChar32=c;
   1381 
   1382     /* write back the updated pointers */
   1383     pArgs->source=source;
   1384     pArgs->target=(char *)target;
   1385     pArgs->offsets=offsets;
   1386     return;
   1387 
   1388 outputBytes:
   1389     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
   1390     /* from the first if in the loop we know that targetCapacity>0 */
   1391     if(length<=targetCapacity) {
   1392         if(offsets==NULL) {
   1393             switch(length) {
   1394                 /* each branch falls through to the next one */
   1395             case 4:
   1396                 *target++=(uint8_t)(c>>24);
   1397             case 3:
   1398                 *target++=(uint8_t)(c>>16);
   1399             case 2:
   1400                 *target++=(uint8_t)(c>>8);
   1401             case 1:
   1402                 *target++=(uint8_t)c;
   1403             default:
   1404                 /* will never occur */
   1405                 break;
   1406             }
   1407         } else {
   1408             switch(length) {
   1409                 /* each branch falls through to the next one */
   1410             case 4:
   1411                 *target++=(uint8_t)(c>>24);
   1412                 *offsets++=sourceIndex;
   1413             case 3:
   1414                 *target++=(uint8_t)(c>>16);
   1415                 *offsets++=sourceIndex;
   1416             case 2:
   1417                 *target++=(uint8_t)(c>>8);
   1418                 *offsets++=sourceIndex;
   1419             case 1:
   1420                 *target++=(uint8_t)c;
   1421                 *offsets++=sourceIndex;
   1422             default:
   1423                 /* will never occur */
   1424                 break;
   1425             }
   1426         }
   1427         targetCapacity-=length;
   1428 
   1429         /* normal end of conversion: prepare for a new character */
   1430         c=0;
   1431         sourceIndex=nextSourceIndex;
   1432         goto loop;
   1433     } else {
   1434         uint8_t *p;
   1435 
   1436         /*
   1437          * We actually do this backwards here:
   1438          * In order to save an intermediate variable, we output
   1439          * first to the overflow buffer what does not fit into the
   1440          * regular target.
   1441          */
   1442         /* we know that 0<=targetCapacity<length<=4 */
   1443         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
   1444         length-=targetCapacity;
   1445         p=(uint8_t *)cnv->charErrorBuffer;
   1446         switch(length) {
   1447             /* each branch falls through to the next one */
   1448         case 4:
   1449             *p++=(uint8_t)(c>>24);
   1450         case 3:
   1451             *p++=(uint8_t)(c>>16);
   1452         case 2:
   1453             *p++=(uint8_t)(c>>8);
   1454         case 1:
   1455             *p=(uint8_t)c;
   1456         default:
   1457             /* will never occur */
   1458             break;
   1459         }
   1460         cnv->charErrorBufferLength=(int8_t)length;
   1461 
   1462         /* now output what fits into the regular target */
   1463         c>>=8*length; /* length was reduced by targetCapacity */
   1464         switch(targetCapacity) {
   1465             /* each branch falls through to the next one */
   1466         case 3:
   1467             *target++=(uint8_t)(c>>16);
   1468             if(offsets!=NULL) {
   1469                 *offsets++=sourceIndex;
   1470             }
   1471         case 2:
   1472             *target++=(uint8_t)(c>>8);
   1473             if(offsets!=NULL) {
   1474                 *offsets++=sourceIndex;
   1475             }
   1476         case 1:
   1477             *target++=(uint8_t)c;
   1478             if(offsets!=NULL) {
   1479                 *offsets++=sourceIndex;
   1480             }
   1481         default:
   1482             break;
   1483         }
   1484 
   1485         /* target overflow */
   1486         targetCapacity=0;
   1487         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1488         c=0;
   1489         goto endloop;
   1490     }
   1491 }
   1492 
   1493 /*
   1494  * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
   1495  * If a change is made in the original function, then either
   1496  * change this function the same way or
   1497  * re-copy the original function and remove the variables
   1498  * offsets, sourceIndex, and nextSourceIndex.
   1499  */
   1500 static void
   1501 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
   1502                  UErrorCode *pErrorCode) {
   1503     UConverter *cnv;
   1504     SCSUData *scsu;
   1505     const UChar *source, *sourceLimit;
   1506     uint8_t *target;
   1507     int32_t targetCapacity;
   1508 
   1509     UBool isSingleByteMode;
   1510     uint8_t dynamicWindow;
   1511     uint32_t currentOffset;
   1512 
   1513     uint32_t c, delta;
   1514 
   1515     int32_t length;
   1516 
   1517     /* variables for compression heuristics */
   1518     uint32_t offset;
   1519     UChar lead, trail;
   1520     int code;
   1521     int8_t window;
   1522 
   1523     /* set up the local pointers */
   1524     cnv=pArgs->converter;
   1525     scsu=(SCSUData *)cnv->extraInfo;
   1526 
   1527     /* set up the local pointers */
   1528     source=pArgs->source;
   1529     sourceLimit=pArgs->sourceLimit;
   1530     target=(uint8_t *)pArgs->target;
   1531     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   1532 
   1533     /* get the state machine state */
   1534     isSingleByteMode=scsu->fromUIsSingleByteMode;
   1535     dynamicWindow=scsu->fromUDynamicWindow;
   1536     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1537 
   1538     c=cnv->fromUChar32;
   1539 
   1540     /* similar conversion "loop" as in toUnicode */
   1541 loop:
   1542     if(isSingleByteMode) {
   1543         if(c!=0 && targetCapacity>0) {
   1544             goto getTrailSingle;
   1545         }
   1546 
   1547         /* state machine for single-byte mode */
   1548 /* singleByteMode: */
   1549         while(source<sourceLimit) {
   1550             if(targetCapacity<=0) {
   1551                 /* target is full */
   1552                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1553                 break;
   1554             }
   1555             c=*source++;
   1556 
   1557             if((c-0x20)<=0x5f) {
   1558                 /* pass US-ASCII graphic character through */
   1559                 *target++=(uint8_t)c;
   1560                 --targetCapacity;
   1561             } else if(c<0x20) {
   1562                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
   1563                     /* CR/LF/TAB/NUL */
   1564                     *target++=(uint8_t)c;
   1565                     --targetCapacity;
   1566                 } else {
   1567                     /* quote C0 control character */
   1568                     c|=SQ0<<8;
   1569                     length=2;
   1570                     goto outputBytes;
   1571                 }
   1572             } else if((delta=c-currentOffset)<=0x7f) {
   1573                 /* use the current dynamic window */
   1574                 *target++=(uint8_t)(delta|0x80);
   1575                 --targetCapacity;
   1576             } else if(UTF_IS_SURROGATE(c)) {
   1577                 if(UTF_IS_SURROGATE_FIRST(c)) {
   1578 getTrailSingle:
   1579                     lead=(UChar)c;
   1580                     if(source<sourceLimit) {
   1581                         /* test the following code unit */
   1582                         trail=*source;
   1583                         if(UTF_IS_SECOND_SURROGATE(trail)) {
   1584                             ++source;
   1585                             c=UTF16_GET_PAIR_VALUE(c, trail);
   1586                             /* convert this surrogate code point */
   1587                             /* exit this condition tree */
   1588                         } else {
   1589                             /* this is an unmatched lead code unit (1st surrogate) */
   1590                             /* callback(illegal) */
   1591                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1592                             goto endloop;
   1593                         }
   1594                     } else {
   1595                         /* no more input */
   1596                         break;
   1597                     }
   1598                 } else {
   1599                     /* this is an unmatched trail code unit (2nd surrogate) */
   1600                     /* callback(illegal) */
   1601                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1602                     goto endloop;
   1603                 }
   1604 
   1605                 /* compress supplementary character U+10000..U+10ffff */
   1606                 if((delta=c-currentOffset)<=0x7f) {
   1607                     /* use the current dynamic window */
   1608                     *target++=(uint8_t)(delta|0x80);
   1609                     --targetCapacity;
   1610                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1611                     /* there is a dynamic window that contains this character, change to it */
   1612                     dynamicWindow=window;
   1613                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1614                     useDynamicWindow(scsu, dynamicWindow);
   1615                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1616                     length=2;
   1617                     goto outputBytes;
   1618                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1619                     /* might check if there are more characters in this window to come */
   1620                     /* define an extended window with this character */
   1621                     code-=0x200;
   1622                     dynamicWindow=getNextDynamicWindow(scsu);
   1623                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1624                     useDynamicWindow(scsu, dynamicWindow);
   1625                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1626                     length=4;
   1627                     goto outputBytes;
   1628                 } else {
   1629                     /* change to Unicode mode and output this (lead, trail) pair */
   1630                     isSingleByteMode=FALSE;
   1631                     *target++=(uint8_t)SCU;
   1632                     --targetCapacity;
   1633                     c=((uint32_t)lead<<16)|trail;
   1634                     length=4;
   1635                     goto outputBytes;
   1636                 }
   1637             } else if(c<0xa0) {
   1638                 /* quote C1 control character */
   1639                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
   1640                 length=2;
   1641                 goto outputBytes;
   1642             } else if(c==0xfeff || c>=0xfff0) {
   1643                 /* quote signature character=byte order mark and specials */
   1644                 c|=SQU<<16;
   1645                 length=3;
   1646                 goto outputBytes;
   1647             } else {
   1648                 /* compress all other BMP characters */
   1649                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1650                     /* there is a window defined that contains this character - switch to it or quote from it? */
   1651                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
   1652                         /* change to dynamic window */
   1653                         dynamicWindow=window;
   1654                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1655                         useDynamicWindow(scsu, dynamicWindow);
   1656                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1657                         length=2;
   1658                         goto outputBytes;
   1659                     } else {
   1660                         /* quote from dynamic window */
   1661                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
   1662                         length=2;
   1663                         goto outputBytes;
   1664                     }
   1665                 } else if((window=getWindow(staticOffsets, c))>=0) {
   1666                     /* quote from static window */
   1667                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
   1668                     length=2;
   1669                     goto outputBytes;
   1670                 } else if((code=getDynamicOffset(c, &offset))>=0) {
   1671                     /* define a dynamic window with this character */
   1672                     dynamicWindow=getNextDynamicWindow(scsu);
   1673                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1674                     useDynamicWindow(scsu, dynamicWindow);
   1675                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1676                     length=3;
   1677                     goto outputBytes;
   1678                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
   1679                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1680                 ) {
   1681                     /*
   1682                      * this character is not compressible (a BMP ideograph or similar);
   1683                      * switch to Unicode mode if this is the last character in the block
   1684                      * or there is at least one more ideograph following immediately
   1685                      */
   1686                     isSingleByteMode=FALSE;
   1687                     c|=SCU<<16;
   1688                     length=3;
   1689                     goto outputBytes;
   1690                 } else {
   1691                     /* quote Unicode */
   1692                     c|=SQU<<16;
   1693                     length=3;
   1694                     goto outputBytes;
   1695                 }
   1696             }
   1697 
   1698             /* normal end of conversion: prepare for a new character */
   1699             c=0;
   1700         }
   1701     } else {
   1702         if(c!=0 && targetCapacity>0) {
   1703             goto getTrailUnicode;
   1704         }
   1705 
   1706         /* state machine for Unicode mode */
   1707 /* unicodeByteMode: */
   1708         while(source<sourceLimit) {
   1709             if(targetCapacity<=0) {
   1710                 /* target is full */
   1711                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1712                 break;
   1713             }
   1714             c=*source++;
   1715 
   1716             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
   1717                 /* not compressible, write character directly */
   1718                 if(targetCapacity>=2) {
   1719                     *target++=(uint8_t)(c>>8);
   1720                     *target++=(uint8_t)c;
   1721                     targetCapacity-=2;
   1722                 } else {
   1723                     length=2;
   1724                     goto outputBytes;
   1725                 }
   1726             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
   1727                 /* compress BMP character if the following one is not an uncompressible ideograph */
   1728                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
   1729                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
   1730                         /* ASCII digit or letter */
   1731                         isSingleByteMode=TRUE;
   1732                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
   1733                         length=2;
   1734                         goto outputBytes;
   1735                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
   1736                         /* there is a dynamic window that contains this character, change to it */
   1737                         isSingleByteMode=TRUE;
   1738                         dynamicWindow=window;
   1739                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1740                         useDynamicWindow(scsu, dynamicWindow);
   1741                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1742                         length=2;
   1743                         goto outputBytes;
   1744                     } else if((code=getDynamicOffset(c, &offset))>=0) {
   1745                         /* define a dynamic window with this character */
   1746                         isSingleByteMode=TRUE;
   1747                         dynamicWindow=getNextDynamicWindow(scsu);
   1748                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1749                         useDynamicWindow(scsu, dynamicWindow);
   1750                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1751                         length=3;
   1752                         goto outputBytes;
   1753                     }
   1754                 }
   1755 
   1756                 /* don't know how to compress this character, just write it directly */
   1757                 length=2;
   1758                 goto outputBytes;
   1759             } else if(c<0xe000) {
   1760                 /* c is a surrogate */
   1761                 if(UTF_IS_SURROGATE_FIRST(c)) {
   1762 getTrailUnicode:
   1763                     lead=(UChar)c;
   1764                     if(source<sourceLimit) {
   1765                         /* test the following code unit */
   1766                         trail=*source;
   1767                         if(UTF_IS_SECOND_SURROGATE(trail)) {
   1768                             ++source;
   1769                             c=UTF16_GET_PAIR_VALUE(c, trail);
   1770                             /* convert this surrogate code point */
   1771                             /* exit this condition tree */
   1772                         } else {
   1773                             /* this is an unmatched lead code unit (1st surrogate) */
   1774                             /* callback(illegal) */
   1775                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1776                             goto endloop;
   1777                         }
   1778                     } else {
   1779                         /* no more input */
   1780                         break;
   1781                     }
   1782                 } else {
   1783                     /* this is an unmatched trail code unit (2nd surrogate) */
   1784                     /* callback(illegal) */
   1785                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1786                     goto endloop;
   1787                 }
   1788 
   1789                 /* compress supplementary character */
   1790                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
   1791                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
   1792                 ) {
   1793                     /*
   1794                      * there is a dynamic window that contains this character and
   1795                      * the following character is not uncompressible,
   1796                      * change to the window
   1797                      */
   1798                     isSingleByteMode=TRUE;
   1799                     dynamicWindow=window;
   1800                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
   1801                     useDynamicWindow(scsu, dynamicWindow);
   1802                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
   1803                     length=2;
   1804                     goto outputBytes;
   1805                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
   1806                           (code=getDynamicOffset(c, &offset))>=0
   1807                 ) {
   1808                     /* two supplementary characters in (probably) the same window - define an extended one */
   1809                     isSingleByteMode=TRUE;
   1810                     code-=0x200;
   1811                     dynamicWindow=getNextDynamicWindow(scsu);
   1812                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
   1813                     useDynamicWindow(scsu, dynamicWindow);
   1814                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
   1815                     length=4;
   1816                     goto outputBytes;
   1817                 } else {
   1818                     /* don't know how to compress this character, just write it directly */
   1819                     c=((uint32_t)lead<<16)|trail;
   1820                     length=4;
   1821                     goto outputBytes;
   1822                 }
   1823             } else /* 0xe000<=c<0xf300 */ {
   1824                 /* quote to avoid SCSU tags */
   1825                 c|=UQU<<16;
   1826                 length=3;
   1827                 goto outputBytes;
   1828             }
   1829 
   1830             /* normal end of conversion: prepare for a new character */
   1831             c=0;
   1832         }
   1833     }
   1834 endloop:
   1835 
   1836     /* set the converter state back into UConverter */
   1837     scsu->fromUIsSingleByteMode=isSingleByteMode;
   1838     scsu->fromUDynamicWindow=dynamicWindow;
   1839 
   1840     cnv->fromUChar32=c;
   1841 
   1842     /* write back the updated pointers */
   1843     pArgs->source=source;
   1844     pArgs->target=(char *)target;
   1845     return;
   1846 
   1847 outputBytes:
   1848     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
   1849     /* from the first if in the loop we know that targetCapacity>0 */
   1850     if(length<=targetCapacity) {
   1851         switch(length) {
   1852             /* each branch falls through to the next one */
   1853         case 4:
   1854             *target++=(uint8_t)(c>>24);
   1855         case 3:
   1856             *target++=(uint8_t)(c>>16);
   1857         case 2:
   1858             *target++=(uint8_t)(c>>8);
   1859         case 1:
   1860             *target++=(uint8_t)c;
   1861         default:
   1862             /* will never occur */
   1863             break;
   1864         }
   1865         targetCapacity-=length;
   1866 
   1867         /* normal end of conversion: prepare for a new character */
   1868         c=0;
   1869         goto loop;
   1870     } else {
   1871         uint8_t *p;
   1872 
   1873         /*
   1874          * We actually do this backwards here:
   1875          * In order to save an intermediate variable, we output
   1876          * first to the overflow buffer what does not fit into the
   1877          * regular target.
   1878          */
   1879         /* we know that 0<=targetCapacity<length<=4 */
   1880         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
   1881         length-=targetCapacity;
   1882         p=(uint8_t *)cnv->charErrorBuffer;
   1883         switch(length) {
   1884             /* each branch falls through to the next one */
   1885         case 4:
   1886             *p++=(uint8_t)(c>>24);
   1887         case 3:
   1888             *p++=(uint8_t)(c>>16);
   1889         case 2:
   1890             *p++=(uint8_t)(c>>8);
   1891         case 1:
   1892             *p=(uint8_t)c;
   1893         default:
   1894             /* will never occur */
   1895             break;
   1896         }
   1897         cnv->charErrorBufferLength=(int8_t)length;
   1898 
   1899         /* now output what fits into the regular target */
   1900         c>>=8*length; /* length was reduced by targetCapacity */
   1901         switch(targetCapacity) {
   1902             /* each branch falls through to the next one */
   1903         case 3:
   1904             *target++=(uint8_t)(c>>16);
   1905         case 2:
   1906             *target++=(uint8_t)(c>>8);
   1907         case 1:
   1908             *target++=(uint8_t)c;
   1909         default:
   1910             break;
   1911         }
   1912 
   1913         /* target overflow */
   1914         targetCapacity=0;
   1915         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1916         c=0;
   1917         goto endloop;
   1918     }
   1919 }
   1920 
   1921 /* miscellaneous ------------------------------------------------------------ */
   1922 
   1923 static const char *
   1924 _SCSUGetName(const UConverter *cnv) {
   1925     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
   1926 
   1927     switch(scsu->locale) {
   1928     case l_ja:
   1929         return "SCSU,locale=ja";
   1930     default:
   1931         return "SCSU";
   1932     }
   1933 }
   1934 
   1935 /* structure for SafeClone calculations */
   1936 struct cloneSCSUStruct
   1937 {
   1938     UConverter cnv;
   1939     SCSUData mydata;
   1940 };
   1941 
   1942 static UConverter *
   1943 _SCSUSafeClone(const UConverter *cnv,
   1944                void *stackBuffer,
   1945                int32_t *pBufferSize,
   1946                UErrorCode *status)
   1947 {
   1948     struct cloneSCSUStruct * localClone;
   1949     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
   1950 
   1951     if (U_FAILURE(*status)){
   1952         return 0;
   1953     }
   1954 
   1955     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
   1956         *pBufferSize = bufferSizeNeeded;
   1957         return 0;
   1958     }
   1959 
   1960     localClone = (struct cloneSCSUStruct *)stackBuffer;
   1961     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
   1962 
   1963     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
   1964     localClone->cnv.extraInfo = &localClone->mydata;
   1965     localClone->cnv.isExtraLocal = TRUE;
   1966 
   1967     return &localClone->cnv;
   1968 }
   1969 
   1970 
   1971 static const UConverterImpl _SCSUImpl={
   1972     UCNV_SCSU,
   1973 
   1974     NULL,
   1975     NULL,
   1976 
   1977     _SCSUOpen,
   1978     _SCSUClose,
   1979     _SCSUReset,
   1980 
   1981     _SCSUToUnicode,
   1982     _SCSUToUnicodeWithOffsets,
   1983     _SCSUFromUnicode,
   1984     _SCSUFromUnicodeWithOffsets,
   1985     NULL,
   1986 
   1987     NULL,
   1988     _SCSUGetName,
   1989     NULL,
   1990     _SCSUSafeClone,
   1991     ucnv_getCompleteUnicodeSet
   1992 };
   1993 
   1994 static const UConverterStaticData _SCSUStaticData={
   1995     sizeof(UConverterStaticData),
   1996     "SCSU",
   1997     1212, /* CCSID for SCSU */
   1998     UCNV_IBM, UCNV_SCSU,
   1999     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
   2000     /*
   2001      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
   2002      * substitution string.
   2003      */
   2004     { 0x0e, 0xff, 0xfd, 0 }, 3,
   2005     FALSE, FALSE,
   2006     0,
   2007     0,
   2008     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   2009 };
   2010 
   2011 const UConverterSharedData _SCSUData={
   2012     sizeof(UConverterSharedData), ~((uint32_t)0),
   2013     NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
   2014     0
   2015 };
   2016 
   2017 #endif
   2018