Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2000-2015, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv2022.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2000feb03
     12 *   created by: Markus W. Scherer
     13 *
     14 *   Change history:
     15 *
     16 *   06/29/2000  helena  Major rewrite of the callback APIs.
     17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
     18 *                       Changed implementation of toUnicode
     19 *                       function
     20 *   08/21/2000  Ram     Added support for ISO-2022-KR
     21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
     22 *                       ucnvebdc.c
     23 *   09/20/2000  Ram     Added support for ISO-2022-CN
     24 *                       Added implementations for getNextUChar()
     25 *                       for specific 2022 country variants.
     26 *   10/31/2000  Ram     Implemented offsets logic functions
     27 */
     28 
     29 #include "unicode/utypes.h"
     30 
     31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     32 
     33 #include "unicode/ucnv.h"
     34 #include "unicode/uset.h"
     35 #include "unicode/ucnv_err.h"
     36 #include "unicode/ucnv_cb.h"
     37 #include "unicode/utf16.h"
     38 #include "ucnv_imp.h"
     39 #include "ucnv_bld.h"
     40 #include "ucnv_cnv.h"
     41 #include "ucnvmbcs.h"
     42 #include "cstring.h"
     43 #include "cmemory.h"
     44 #include "uassert.h"
     45 
     46 #ifdef U_ENABLE_GENERIC_ISO_2022
     47 /*
     48  * I am disabling the generic ISO-2022 converter after proposing to do so on
     49  * the icu mailing list two days ago.
     50  *
     51  * Reasons:
     52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
     53  *    its designation sequences, single shifts with return to the previous state,
     54  *    switch-with-no-return to UTF-16BE or similar, etc.
     55  *    This is unlike the language-specific variants like ISO-2022-JP which
     56  *    require a much smaller repertoire of ISO-2022 features.
     57  *    These variants continue to be supported.
     58  * 2. I believe that no one is really using the generic ISO-2022 converter
     59  *    but rather always one of the language-specific variants.
     60  *    Note that ICU's generic ISO-2022 converter has always output one escape
     61  *    sequence followed by UTF-8 for the whole stream.
     62  * 3. Switching between subcharsets is extremely slow, because each time
     63  *    the previous converter is closed and a new one opened,
     64  *    without any kind of caching, least-recently-used list, etc.
     65  * 4. The code is currently buggy, and given the above it does not seem
     66  *    reasonable to spend the time on maintenance.
     67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
     68  *    This means, for example, that when ISO-8859-7 is designated, the following
     69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
     70  *    The ICU ISO-2022 converter does not handle this - and has no information
     71  *    about which subconverter would have to be shifted vs. which is designed
     72  *    for 7-bit ISO-2022.
     73  *
     74  * Markus Scherer 2003-dec-03
     75  */
     76 #endif
     77 
     78 #if !UCONFIG_ONLY_HTML_CONVERSION
     79 static const char SHIFT_IN_STR[]  = "\x0F";
     80 // static const char SHIFT_OUT_STR[] = "\x0E";
     81 #endif
     82 
     83 #define CR      0x0D
     84 #define LF      0x0A
     85 #define H_TAB   0x09
     86 #define V_TAB   0x0B
     87 #define SPACE   0x20
     88 
     89 enum {
     90     HWKANA_START=0xff61,
     91     HWKANA_END=0xff9f
     92 };
     93 
     94 /*
     95  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
     96  * as bytes 21..7E. (Subtract 0x80.)
     97  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
     98  * as bytes 20..7F. (Subtract 0x80.)
     99  * Do not encode C1 control codes with native bytes 80..9F
    100  * as bytes 00..1F (C0 control codes).
    101  */
    102 enum {
    103     GR94_START=0xa1,
    104     GR94_END=0xfe,
    105     GR96_START=0xa0,
    106     GR96_END=0xff
    107 };
    108 
    109 /*
    110  * ISO 2022 control codes must not be converted from Unicode
    111  * because they would mess up the byte stream.
    112  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
    113  * corresponding to SO, SI, and ESC.
         *
         * IS_2022_CONTROL(c) is true only when c is below 0x20 and its bit is set
         * in that mask; the (c)<0x20 test is evaluated first, so the shift count
         * stays below 32 for every non-negative c.
         * The macro argument is expanded twice, so it must not have side effects.
         * NOTE(review): a negative c also passes the (c)<0x20 test and would then
         * be used as the shift count; callers presumably pass only non-negative
         * code units or code points -- confirm.
    114  */
    115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
    116 
    117 /* for ISO-2022-JP and -CN implementations */
    118 typedef enum  {
                /*
                 * Charset identifiers shared by the JP and CN variants.
                 * The JP group and the CN group deliberately reuse the numeric
                 * values 1..3 (e.g. ISO8859_1 and GB2312_1 are both 1), so a value
                 * is only meaningful together with the variant it belongs to.
                 * The small values serve two purposes visible in this file:
                 * they are bit numbers for CSM() in jpCharsetMasks[], and
                 * _ISO2022Open uses them as indices into myConverterArray[].
                 */
    119         /* shared values */
    120         INVALID_STATE=-1,
    121         ASCII = 0,
    122 
    123         SS2_STATE=0x10,
    124         SS3_STATE,
    125 
    126         /* JP */
    127         ISO8859_1 = 1 ,
    128         ISO8859_7 = 2 ,
    129         JISX201  = 3,
    130         JISX208 = 4,
    131         JISX212 = 5,
    132         GB2312  =6,
    133         KSC5601 =7,
    134         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
    135 
    136         /* CN */
    137         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    138         GB2312_1=1,
    139         ISO_IR_165=2,
    140         CNS_11643=3,
    141 
    142         /*
    143          * these are used in StateEnum and ISO2022State variables,
    144          * but CNS_11643 must be used to index into myConverterArray[]
    145          */
                /* (0x20..0x27 lie beyond UCNV_2022_MAX_CONVERTERS, hence the rule above) */
    146         CNS_11643_0=0x20,
    147         CNS_11643_1,
    148         CNS_11643_2,
    149         CNS_11643_3,
    150         CNS_11643_4,
    151         CNS_11643_5,
    152         CNS_11643_6,
    153         CNS_11643_7
    154 } StateEnum;
    155 
    156 /* is the StateEnum charset value for a DBCS charset? */
        /*
         * In the HTML-only configuration only JISX208 counts as DBCS; otherwise
         * the whole enum range JISX208..KSC5601 (JISX208, JISX212, GB2312, KSC5601)
         * does. The argument is expanded twice in the full configuration.
         */
    157 #if UCONFIG_ONLY_HTML_CONVERSION
    158 #define IS_JP_DBCS(cs) (JISX208==(cs))
    159 #else
    160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
    161 #endif
    162 
        /*
         * CSM(cs) yields the single bit with bit number cs; it is used to build
         * and test the entries of jpCharsetMasks[]. Those entries are uint16_t,
         * so cs has to stay below 16 for the bit to fit (the largest JP value
         * used there is HWKANA_7BIT=8).
         */
    163 #define CSM(cs) ((uint16_t)1<<(cs))
    164 
    165 /*
    166  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
    167  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
    168  *
    169  * Note: The converter uses some leniency:
    170  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
    171  *   all versions, not just JIS7 and JIS8.
    172  * - ICU does not distinguish between different versions of JIS X 0208.
    173  */
    174 #if UCONFIG_ONLY_HTML_CONVERSION
    175 enum { MAX_JA_VERSION=0 };
    176 #else
    177 enum { MAX_JA_VERSION=4 };
    178 #endif
    179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    180     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    181 #if !UCONFIG_ONLY_HTML_CONVERSION
    182     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    183     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    185     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
    186 #endif
    187 };
    188 
    189 typedef enum {
    190         ASCII1=0,
    191         LATIN1,
    192         SBCS,
    193         DBCS,
    194         MBCS,
    195         HWKANA
    196 }Cnv2022Type;
    197 
    198 typedef struct ISO2022State {
            /*
             * Designation/shift state of an ISO 2022 byte stream.
             * UConverterDataISO2022 keeps two of these: toU2022State and
             * fromU2022State. The cs[] entries hold StateEnum values (see the
             * note in StateEnum about CNS_11643_0..7); int8_t is wide enough
             * for every StateEnum constant (-1..0x27).
             */
    199     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    200     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    201     int8_t prevG;       /* g before single shift (SS2 or SS3) */
    202 } ISO2022State;
    203 
    204 #define UCNV_OPTIONS_VERSION_MASK 0xf
    205 #define UCNV_2022_MAX_CONVERTERS 10
    206 
    207 typedef struct{
            /*
             * Per-converter state; _ISO2022Open allocates it with uprv_malloc(),
             * zero-fills it and stores it in cnv->extraInfo.
             *
             * myConverterArray  shared data of the sub-charset tables, stored at
             *                   StateEnum indices by _ISO2022Open (e.g. JISX208,
             *                   JISX212, GB2312_1); unused slots stay NULL from
             *                   the zero-fill.
             * currentConverter  opened with ucnv_open() for the ko variant.
             * currentType       set to ASCII1 when the converter is opened.
             * toU2022State,
             * fromU2022State    presumably the shift state for each conversion
             *                   direction -- confirm against the conversion code.
             * key               NOTE(review): not written in the visible part of
             *                   the file apart from the zero-fill; likely the
             *                   accumulated escape-sequence key described in the
             *                   state-table comment -- confirm.
             * version           pArgs->options & UCNV_OPTIONS_VERSION_MASK.
             * name              "ISO_2022,locale=<xx>,version=<digit>"; the ja
             *                   form is 28 characters plus the terminator, which
             *                   fits in 30 bytes.
             * locale            two-letter locale ("ja", "ko") plus terminator.
             */
    208     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    209     UConverter *currentConverter;
    210     Cnv2022Type currentType;
    211     ISO2022State toU2022State, fromU2022State;
    212     uint32_t key;
    213     uint32_t version;
    214 #ifdef U_ENABLE_GENERIC_ISO_2022
    215     UBool isFirstBuffer;
    216 #endif
    217     UBool isEmptySegment;
    218     char name[30];
    219     char locale[3];
    220 }UConverterDataISO2022;
    221 
    222 /* Protos */
    223 /* ISO-2022 ----------------------------------------------------------------- */
    224 
    225 /*Forward declaration */
    226 U_CFUNC void
    227 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
    228                       UErrorCode * err);
    229 U_CFUNC void
    230 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
    231                                     UErrorCode * err);
    232 
    233 #define ESC_2022 0x1B /*ESC*/
    234 
    235 typedef enum
    236 {
    237         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    238         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    239         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    240         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
    241 } UCNV_TableStates_2022;
    242 
    243 /*
    244 * The way these state transition arrays work is:
    245 * ex : ESC$B is the sequence for JISX208
    246 *      a) First Iteration: char is ESC
    247 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
    248 *             int x = normalize_esq_chars_2022[27] which is equal to 1
    249 *         ii) Search for this value in escSeqStateTable_Key_2022[]
    250 *             value of x is stored at escSeqStateTable_Key_2022[0]
    251 *        iii) Save this index as offset
    252 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
    253 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
    254 *     b) Switch on this state and continue to next char
    255 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
    256 *             which is normalize_esq_chars_2022[36] == 4
    257 *         ii) x is currently 1(from above)
    258 *               x<<=5 -- x is now 32
    259 *               x+=normalize_esq_chars_2022[36]
    260 *               now x is 36
    261 *        iii) Search for this value in escSeqStateTable_Key_2022[]
    262 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
    263 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
    264 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
    265 *     c) Switch on this state and continue to next char
    266 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
    267 *        ii) x is currently 36 (from above)
    268 *            x<<=5 -- x is now 1152
    269 *            x+=normalize_esq_chars_2022[66]
    270 *            now x is 1161
    271 *       iii) Search for this value in escSeqStateTable_Key_2022[]
    272 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
    273 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
    274 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
    275 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
    276 */
    277 
    278 
    279 /*Below are the 3 arrays depicting a state transition table*/
    280 static const int8_t normalize_esq_chars_2022[256] = {
    281 /*       0      1       2       3       4      5       6        7       8       9           */
    282 
    283          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
    286         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
    287         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
    288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    289         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
    290         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
    291         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
    292         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    294         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    296         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    298         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    299         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    300         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    301         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    302         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    303         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    304         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    305         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    306         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    307         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    308         ,0     ,0      ,0      ,0      ,0      ,0
    309 };
    310 
    311 #ifdef U_ENABLE_GENERIC_ISO_2022
    312 /*
    313  * When the generic ISO-2022 converter is completely removed, not just disabled
    314  * per #ifdef, then the following state table and the associated tables that are
    315  * dimensioned with MAX_STATES_2022 should be trimmed.
    316  *
    317  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
    318  * the associated escape sequences starting with ESC ( B should be removed.
    319  * This includes the ones with key values 1097 and all of the ones above 1000000.
    320  *
    321  * For the latter, the tables can simply be truncated.
    322  * For the former, since the tables must be kept parallel, it is probably best
    323  * to simply duplicate an adjacent table cell, parallel in all tables.
    324  *
    325  * It may make sense to restructure the tables, especially by using small search
    326  * tables for the variants instead of indexing them parallel to the table here.
    327  */
    328 #endif
    329 
    330 #define MAX_STATES_2022 74
    331 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    332 /*   0           1           2           3           4           5           6           7           8           9           */
    333 
    334      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    335     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    336     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    337     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    338     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    339     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    340     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    341     ,35947631   ,35947635   ,35947636   ,35947638
    342 };
    343 
    344 #ifdef U_ENABLE_GENERIC_ISO_2022
    345 
    346 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    347  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
    348 
    349      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    350     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    351     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    352     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    353     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    354     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    355     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    356     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
    357 };
    358 
    359 #endif
    360 
    361 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
    362 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
    363      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    364     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    365     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    366     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    367     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    368     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    369     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    371 };
    372 
    373 /* Type def for refactoring changeState_2022 code*/
    374 typedef enum{
    375 #ifdef U_ENABLE_GENERIC_ISO_2022
    376     ISO_2022=0,
    377 #endif
    378     ISO_2022_JP=1,
    379 #if !UCONFIG_ONLY_HTML_CONVERSION
    380     ISO_2022_KR=2,
    381     ISO_2022_CN=3
    382 #endif
    383 } Variant2022;
    384 
    385 /*********** ISO 2022 Converter Protos ***********/
    386 static void
    387 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
    388 
    389 static void
    390  _ISO2022Close(UConverter *converter);
    391 
    392 static void
    393 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
    394 
    395 static const char*
    396 _ISO2022getName(const UConverter* cnv);
    397 
    398 static void
    399 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
    400 
    401 static UConverter *
    402 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
    403 
    404 #ifdef U_ENABLE_GENERIC_ISO_2022
    405 static void
    406 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
    407 #endif
    408 
    409 namespace {
    410 
    411 /*const UConverterSharedData _ISO2022Data;*/
    412 extern const UConverterSharedData _ISO2022JPData;
    413 
    414 #if !UCONFIG_ONLY_HTML_CONVERSION
    415 extern const UConverterSharedData _ISO2022KRData;
    416 extern const UConverterSharedData _ISO2022CNData;
    417 #endif
    418 
    419 }  // namespace
    420 
    421 /*************** Converter implementations ******************/
    422 
    423 /* The purpose of this function is to get around gcc compiler warnings. */
    424 static inline void
    425 fromUWriteUInt8(UConverter *cnv,
    426                  const char *bytes, int32_t length,
    427                  uint8_t **target, const char *targetLimit,
    428                  int32_t **offsets,
    429                  int32_t sourceIndex,
    430                  UErrorCode *pErrorCode)
    431 {
    432     char *targetChars = (char *)*target;
    433     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
    434                          offsets, sourceIndex, pErrorCode);
    435     *target = (uint8_t*)targetChars;
    436 
    437 }
    438 
    439 static inline void
    440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
    441     if(myConverterData->version == 1) {
    442         UConverter *cnv = myConverterData->currentConverter;
    443 
    444         cnv->toUnicodeStatus=0;     /* offset */
    445         cnv->mode=0;                /* state */
    446         cnv->toULength=0;           /* byteIndex */
    447     }
    448 }
    449 
    450 static inline void
    451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
    452    /* in ISO-2022-KR the designator sequence appears only once
    453     * in a file so we append it only once
    454     */
    455     if( converter->charErrorBufferLength==0){
    456 
    457         converter->charErrorBufferLength = 4;
    458         converter->charErrorBuffer[0] = 0x1b;
    459         converter->charErrorBuffer[1] = 0x24;
    460         converter->charErrorBuffer[2] = 0x29;
    461         converter->charErrorBuffer[3] = 0x43;
    462     }
    463     if(myConverterData->version == 1) {
    464         UConverter *cnv = myConverterData->currentConverter;
    465 
    466         cnv->fromUChar32=0;
    467         cnv->fromUnicodeStatus=1;   /* prevLength */
    468     }
    469 }
    470 
    471 static void
    472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
    473 
    474     char myLocale[6]={' ',' ',' ',' ',' ',' '};
    475 
    476     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
    477     if(cnv->extraInfo != NULL) {
    478         UConverterNamePieces stackPieces;
    479         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
    480         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
    481         uint32_t version;
    482 
    483         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
    484 
    485         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
    486         myConverterData->currentType = ASCII1;
    487         cnv->fromUnicodeStatus =FALSE;
    488         if(pArgs->locale){
    489             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
    490         }
    491         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
    492         myConverterData->version = version;
    493         /* Begin Google-specific change. */
    494         /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
    495         /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
    496         if((myLocale[0]=='j' &&
    497             (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
    498              myLocale[1]=='s') &&
    499             (myLocale[2]=='_' || myLocale[2]=='\0')))
    500         {
    501             /* open the required converters and cache them */
    502             if(version>MAX_JA_VERSION) {
    503                 // ICU 55 fails to open a converter for an unsupported version.
    504                 // Previously, it fell back to version 0, but that would yield
    505                 // unexpected behavior.
    506                 *errorCode = U_MISSING_RESOURCE_ERROR;
    507                 return;
    508             }
    509             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
    510                 myConverterData->myConverterArray[ISO8859_7] =
    511                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
    512             }
    513             if (myLocale[1]=='k') {  /* Use KDDI's version. */
    514                 myConverterData->myConverterArray[JISX208]  =
    515                     ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
    516             } else if (myLocale[1]=='s') {  /* Use SoftBank's version. */
    517                 myConverterData->myConverterArray[JISX208]  =
    518                     ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
    519             } else {
    520                 /*
    521                  * Change for http://b/issue?id=937017 :
    522                  * Restore JIS X 0208 ISO-2022-JP mappings from before
    523                  * sharing the table with the Shift-JIS converter
    524                  * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797).
    525                  * TODO(mscherer): Create and use a new, unified Google Shift-JIS
    526                  * table for both Shift-JIS and ISO-2022-JP.
    527                  */
    528                 myConverterData->myConverterArray[JISX208]  =
    529                     ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode);
    530             }
    531             /* End Google-specific change. */
    532             if(jpCharsetMasks[version]&CSM(JISX212)) {
    533                 myConverterData->myConverterArray[JISX212] =
    534                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
    535             }
    536             if(jpCharsetMasks[version]&CSM(GB2312)) {
    537                 myConverterData->myConverterArray[GB2312] =
    538                     /* BEGIN android-changed */
    539                     ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
    540                     /* END android-changed */
    541             }
    542             if(jpCharsetMasks[version]&CSM(KSC5601)) {
    543                 myConverterData->myConverterArray[KSC5601] =
    544                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
    545             }
    546 
    547             /* set the function pointers to appropriate funtions */
    548             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
    549             uprv_strcpy(myConverterData->locale,"ja");
    550 
    551             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
    552             size_t len = uprv_strlen(myConverterData->name);
    553             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
    554             myConverterData->name[len+1]='\0';
    555         }
    556 #if !UCONFIG_ONLY_HTML_CONVERSION
    557         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
    558             (myLocale[2]=='_' || myLocale[2]=='\0'))
    559         {
    560             if(version>1) {
    561                 // ICU 55 fails to open a converter for an unsupported version.
    562                 // Previously, it fell back to version 0, but that would yield
    563                 // unexpected behavior.
    564                 *errorCode = U_MISSING_RESOURCE_ERROR;
    565                 return;
    566             }
    567             const char *cnvName;
    568             if(version==1) {
    569                 cnvName="icu-internal-25546";
    570             } else {
    571                 /* BEGIN android-changed */
    572                 cnvName="ksc_5601";
    573                 /* END android-changed */
    574                 myConverterData->version=version=0;
    575             }
    576             if(pArgs->onlyTestIsLoadable) {
    577                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
    578                 uprv_free(cnv->extraInfo);
    579                 cnv->extraInfo=NULL;
    580                 return;
    581             } else {
    582                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
    583                 if (U_FAILURE(*errorCode)) {
    584                     _ISO2022Close(cnv);
    585                     return;
    586                 }
    587 
    588                 if(version==1) {
    589                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
    590                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
    591                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
    592                 }else{
    593                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
    594                 }
    595 
    596                 /* initialize the state variables */
    597                 setInitialStateToUnicodeKR(cnv, myConverterData);
    598                 setInitialStateFromUnicodeKR(cnv, myConverterData);
    599 
    600                 /* set the function pointers to appropriate functions */
    601                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
    602                 uprv_strcpy(myConverterData->locale,"ko");
    603             }
    604         }
    605         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
    606             (myLocale[2]=='_' || myLocale[2]=='\0'))
    607         {
    608             if(version>2) {
    609                 // ICU 55 fails to open a converter for an unsupported version.
    610                 // Previously, it fell back to version 0, but that would yield
    611                 // unexpected behavior.
    612                 *errorCode = U_MISSING_RESOURCE_ERROR;
    613                 return;
    614             }
    615 
    616             /* open the required converters and cache them */
    617             /* BEGIN android-changed */
    618             myConverterData->myConverterArray[GB2312_1] =
    619                 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
    620             if(version==1) {
    621                 myConverterData->myConverterArray[ISO_IR_165] =
    622                     ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
    623             }
    624             myConverterData->myConverterArray[CNS_11643] =
    625                 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
    626             /* END android-changed */
    627 
    628 
    629             /* set the function pointers to appropriate functions */
    630             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
    631             uprv_strcpy(myConverterData->locale,"cn");
    632 
    633             if (version==0){
    634                 myConverterData->version = 0;
    635                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
    636             }else if (version==1){
    637                 myConverterData->version = 1;
    638                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
    639             }else {
    640                 myConverterData->version = 2;
    641                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
    642             }
    643         }
    644 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
    645         else{
    646 #ifdef U_ENABLE_GENERIC_ISO_2022
    647             myConverterData->isFirstBuffer = TRUE;
    648 
    649             /* append the UTF-8 escape sequence */
    650             cnv->charErrorBufferLength = 3;
    651             cnv->charErrorBuffer[0] = 0x1b;
    652             cnv->charErrorBuffer[1] = 0x25;
    653             cnv->charErrorBuffer[2] = 0x42;
    654 
    655             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
    656             /* initialize the state variables */
    657             uprv_strcpy(myConverterData->name,"ISO_2022");
    658 #else
    659             *errorCode = U_MISSING_RESOURCE_ERROR;
    660             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
    661             // data loading error code.
    662             return;
    663 #endif
    664         }
    665 
    666         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
    667 
    668         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
    669             _ISO2022Close(cnv);
    670         }
    671     } else {
    672         *errorCode = U_MEMORY_ALLOCATION_ERROR;
    673     }
    674 }
    675 
    676 
    677 static void
    678 _ISO2022Close(UConverter *converter) {
    679     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
    680     UConverterSharedData **array = myData->myConverterArray;
    681     int32_t i;
    682 
    683     if (converter->extraInfo != NULL) {
    684         /*close the array of converter pointers and free the memory*/
    685         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
    686             if(array[i]!=NULL) {
    687                 ucnv_unloadSharedDataIfReady(array[i]);
    688             }
    689         }
    690 
    691         ucnv_close(myData->currentConverter);
    692 
    693         if(!converter->isExtraLocal){
    694             uprv_free (converter->extraInfo);
    695             converter->extraInfo = NULL;
    696         }
    697     }
    698 }
    699 
    700 static void
    701 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
    702     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
    703     if(choice<=UCNV_RESET_TO_UNICODE) {
    704         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
    705         myConverterData->key = 0;
    706         myConverterData->isEmptySegment = FALSE;
    707     }
    708     if(choice!=UCNV_RESET_TO_UNICODE) {
    709         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
    710     }
    711 #ifdef U_ENABLE_GENERIC_ISO_2022
    712     if(myConverterData->locale[0] == 0){
    713         if(choice<=UCNV_RESET_TO_UNICODE) {
    714             myConverterData->isFirstBuffer = TRUE;
    715             myConverterData->key = 0;
    716             if (converter->mode == UCNV_SO){
    717                 ucnv_close (myConverterData->currentConverter);
    718                 myConverterData->currentConverter=NULL;
    719             }
    720             converter->mode = UCNV_SI;
    721         }
    722         if(choice!=UCNV_RESET_TO_UNICODE) {
    723             /* re-append UTF-8 escape sequence */
    724             converter->charErrorBufferLength = 3;
    725             converter->charErrorBuffer[0] = 0x1b;
    726             converter->charErrorBuffer[1] = 0x28;
    727             converter->charErrorBuffer[2] = 0x42;
    728         }
    729     }
    730     else
    731 #endif
    732     {
    733         /* reset the state variables */
    734         if(myConverterData->locale[0] == 'k'){
    735             if(choice<=UCNV_RESET_TO_UNICODE) {
    736                 setInitialStateToUnicodeKR(converter, myConverterData);
    737             }
    738             if(choice!=UCNV_RESET_TO_UNICODE) {
    739                 setInitialStateFromUnicodeKR(converter, myConverterData);
    740             }
    741         }
    742     }
    743 }
    744 
    745 static const char*
    746 _ISO2022getName(const UConverter* cnv){
    747     if(cnv->extraInfo){
    748         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
    749         return myData->name;
    750     }
    751     return NULL;
    752 }
    753 
    754 
    755 /*************** to unicode *******************/
    756 /****************************************************************************
    757  * Recognized escape sequences are
    758  * <ESC>(B  ASCII
    759  * <ESC>.A  ISO-8859-1
    760  * <ESC>.F  ISO-8859-7
    761  * <ESC>(J  JISX-201
    762  * <ESC>(I  JISX-201
    763  * <ESC>$B  JISX-208
    764  * <ESC>$@  JISX-208
    765  * <ESC>$(D JISX-212
    766  * <ESC>$A  GB2312
    767  * <ESC>$(C KSC5601
    768  */
    769 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
    770 /*      0                1               2               3               4               5               6               7               8               9    */
    771     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    772     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    773     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    774     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    775     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    776     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    777     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    778     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    779 };
    780 
    781 #if !UCONFIG_ONLY_HTML_CONVERSION
    782 /*************** to unicode *******************/
    783 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
    784 /*      0                1               2               3               4               5               6               7               8               9    */
    785      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    786     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    787     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    788     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    789     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    790     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    791     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    792     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    793 };
    794 #endif
    795 
    796 
    797 static UCNV_TableStates_2022
    798 getKey_2022(char c,int32_t* key,int32_t* offset){
    799     int32_t togo;
    800     int32_t low = 0;
    801     int32_t hi = MAX_STATES_2022;
    802     int32_t oldmid=0;
    803 
    804     togo = normalize_esq_chars_2022[(uint8_t)c];
    805     if(togo == 0) {
    806         /* not a valid character anywhere in an escape sequence */
    807         *key = 0;
    808         *offset = 0;
    809         return INVALID_2022;
    810     }
    811     togo = (*key << 5) + togo;
    812 
    813     while (hi != low)  /*binary search*/{
    814 
    815         int32_t mid = (hi+low) >> 1; /*Finds median*/
    816 
    817         if (mid == oldmid)
    818             break;
    819 
    820         if (escSeqStateTable_Key_2022[mid] > togo){
    821             hi = mid;
    822         }
    823         else if (escSeqStateTable_Key_2022[mid] < togo){
    824             low = mid;
    825         }
    826         else /*we found it*/{
    827             *key = togo;
    828             *offset = mid;
    829             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
    830         }
    831         oldmid = mid;
    832 
    833     }
    834 
    835     *key = 0;
    836     *offset = 0;
    837     return INVALID_2022;
    838 }
    839 
    840 /*runs through a state machine to determine the escape sequence - codepage correspondance
    841  */
    842 static void
    843 changeState_2022(UConverter* _this,
    844                 const char** source,
    845                 const char* sourceLimit,
    846                 Variant2022 var,
    847                 UErrorCode* err){
    848     UCNV_TableStates_2022 value;
    849     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    850     uint32_t key = myData2022->key;
    851     int32_t offset = 0;
    852     int8_t initialToULength = _this->toULength;
    853     char c;
    854 
    855     value = VALID_NON_TERMINAL_2022;
    856     while (*source < sourceLimit) {
    857         c = *(*source)++;
    858         _this->toUBytes[_this->toULength++]=(uint8_t)c;
    859         value = getKey_2022(c,(int32_t *) &key, &offset);
    860 
    861         switch (value){
    862 
    863         case VALID_NON_TERMINAL_2022 :
    864             /* continue with the loop */
    865             break;
    866 
    867         case VALID_TERMINAL_2022:
    868             key = 0;
    869             goto DONE;
    870 
    871         case INVALID_2022:
    872             goto DONE;
    873 
    874         case VALID_MAYBE_TERMINAL_2022:
    875 #ifdef U_ENABLE_GENERIC_ISO_2022
    876             /* ESC ( B is ambiguous only for ISO_2022 itself */
    877             if(var == ISO_2022) {
    878                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
    879                 _this->toULength = 0;
    880 
    881                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
    882 
    883                 /* continue with the loop */
    884                 value = VALID_NON_TERMINAL_2022;
    885                 break;
    886             } else
    887 #endif
    888             {
    889                 /* not ISO_2022 itself, finish here */
    890                 value = VALID_TERMINAL_2022;
    891                 key = 0;
    892                 goto DONE;
    893             }
    894         }
    895     }
    896 
    897 DONE:
    898     myData2022->key = key;
    899 
    900     if (value == VALID_NON_TERMINAL_2022) {
    901         /* indicate that the escape sequence is incomplete: key!=0 */
    902         return;
    903     } else if (value == INVALID_2022 ) {
    904         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    905     } else /* value == VALID_TERMINAL_2022 */ {
    906         switch(var){
    907 #ifdef U_ENABLE_GENERIC_ISO_2022
    908         case ISO_2022:
    909         {
    910             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
    911             if(chosenConverterName == NULL) {
    912                 /* SS2 or SS3 */
    913                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
    914                 _this->toUCallbackReason = UCNV_UNASSIGNED;
    915                 return;
    916             }
    917 
    918             _this->mode = UCNV_SI;
    919             ucnv_close(myData2022->currentConverter);
    920             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
    921             if(U_SUCCESS(*err)) {
    922                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
    923                 _this->mode = UCNV_SO;
    924             }
    925             break;
    926         }
    927 #endif
    928         case ISO_2022_JP:
    929             {
    930                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
    931                 switch(tempState) {
    932                 case INVALID_STATE:
    933                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
    934                     break;
    935                 case SS2_STATE:
    936                     if(myData2022->toU2022State.cs[2]!=0) {
    937                         if(myData2022->toU2022State.g<2) {
    938                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
    939                         }
    940                         myData2022->toU2022State.g=2;
    941                     } else {
    942                         /* illegal to have SS2 before a matching designator */
    943                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    944                     }
    945                     break;
    946                 /* case SS3_STATE: not used in ISO-2022-JP-x */
    947                 case ISO8859_1:
    948                 case ISO8859_7:
    949                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
    950                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
    951                     } else {
    952                         /* G2 charset for SS2 */
    953                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
    954                     }
    955                     break;
    956                 default:
    957                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
    958                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
    959                     } else {
    960                         /* G0 charset */
    961                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
    962                     }
    963                     break;
    964                 }
    965             }
    966             break;
    967 #if !UCONFIG_ONLY_HTML_CONVERSION
    968         case ISO_2022_CN:
    969             {
    970                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
    971                 switch(tempState) {
    972                 case INVALID_STATE:
    973                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
    974                     break;
    975                 case SS2_STATE:
    976                     if(myData2022->toU2022State.cs[2]!=0) {
    977                         if(myData2022->toU2022State.g<2) {
    978                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
    979                         }
    980                         myData2022->toU2022State.g=2;
    981                     } else {
    982                         /* illegal to have SS2 before a matching designator */
    983                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    984                     }
    985                     break;
    986                 case SS3_STATE:
    987                     if(myData2022->toU2022State.cs[3]!=0) {
    988                         if(myData2022->toU2022State.g<2) {
    989                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
    990                         }
    991                         myData2022->toU2022State.g=3;
    992                     } else {
    993                         /* illegal to have SS3 before a matching designator */
    994                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    995                     }
    996                     break;
    997                 case ISO_IR_165:
    998                     if(myData2022->version==0) {
    999                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1000                         break;
   1001                     }
   1002                     /*fall through*/
   1003                 case GB2312_1:
   1004                     /*fall through*/
   1005                 case CNS_11643_1:
   1006                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
   1007                     break;
   1008                 case CNS_11643_2:
   1009                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
   1010                     break;
   1011                 default:
   1012                     /* other CNS 11643 planes */
   1013                     if(myData2022->version==0) {
   1014                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1015                     } else {
   1016                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
   1017                     }
   1018                     break;
   1019                 }
   1020             }
   1021             break;
   1022         case ISO_2022_KR:
   1023             if(offset==0x30){
   1024                 /* nothing to be done, just accept this one escape sequence */
   1025             } else {
   1026                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
   1027             }
   1028             break;
   1029 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
   1030 
   1031         default:
   1032             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   1033             break;
   1034         }
   1035     }
   1036     if(U_SUCCESS(*err)) {
   1037         _this->toULength = 0;
   1038     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
   1039         if(_this->toULength>1) {
   1040             /*
   1041              * Ticket 5691: consistent illegal sequences:
   1042              * - We include at least the first byte (ESC) in the illegal sequence.
   1043              * - If any of the non-initial bytes could be the start of a character,
   1044              *   we stop the illegal sequence before the first one of those.
   1045              *   In escape sequences, all following bytes are "printable", that is,
   1046              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
   1047              *   they are valid single/lead bytes.
   1048              *   For simplicity, we always only report the initial ESC byte as the
   1049              *   illegal sequence and back out all other bytes we looked at.
   1050              */
   1051             /* Back out some bytes. */
   1052             int8_t backOutDistance=_this->toULength-1;
   1053             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
   1054             if(backOutDistance<=bytesFromThisBuffer) {
   1055                 /* same as initialToULength<=1 */
   1056                 *source-=backOutDistance;
   1057             } else {
   1058                 /* Back out bytes from the previous buffer: Need to replay them. */
   1059                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   1060                 /* same as -(initialToULength-1) */
   1061                 /* preToULength is negative! */
   1062                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
   1063                 *source-=bytesFromThisBuffer;
   1064             }
   1065             _this->toULength=1;
   1066         }
   1067     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
   1068         _this->toUCallbackReason = UCNV_UNASSIGNED;
   1069     }
   1070 }
   1071 
   1072 #if !UCONFIG_ONLY_HTML_CONVERSION
   1073 /*Checks the characters of the buffer against valid 2022 escape sequences
   1074 *if the match we return a pointer to the initial start of the sequence otherwise
   1075 *we return sourceLimit
   1076 */
   1077 /*for 2022 looks ahead in the stream
   1078  *to determine the longest possible convertible
   1079  *data stream
   1080  */
   1081 static inline const char*
   1082 getEndOfBuffer_2022(const char** source,
   1083                    const char* sourceLimit,
   1084                    UBool /*flush*/){
   1085 
   1086     const char* mySource = *source;
   1087 
   1088 #ifdef U_ENABLE_GENERIC_ISO_2022
   1089     if (*source >= sourceLimit)
   1090         return sourceLimit;
   1091 
   1092     do{
   1093 
   1094         if (*mySource == ESC_2022){
   1095             int8_t i;
   1096             int32_t key = 0;
   1097             int32_t offset;
   1098             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
   1099 
   1100             /* Kludge: I could not
   1101             * figure out the reason for validating an escape sequence
   1102             * twice - once here and once in changeState_2022().
   1103             * is it possible to have an ESC character in a ISO2022
   1104             * byte stream which is valid in a code page? Is it legal?
   1105             */
   1106             for (i=0;
   1107             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
   1108             i++) {
   1109                 value =  getKey_2022(*(mySource+i), &key, &offset);
   1110             }
   1111             if (value > 0 || *mySource==ESC_2022)
   1112                 return mySource;
   1113 
   1114             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
   1115                 return sourceLimit;
   1116         }
   1117     }while (++mySource < sourceLimit);
   1118 
   1119     return sourceLimit;
   1120 #else
   1121     while(mySource < sourceLimit && *mySource != ESC_2022) {
   1122         ++mySource;
   1123     }
   1124     return mySource;
   1125 #endif
   1126 }
   1127 #endif
   1128 
   1129 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
   1130  * any future change in _MBCSFromUChar32() function should be reflected here.
   1131  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
   1132  */
   1133 static inline int32_t
   1134 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
   1135                                          UChar32 c,
   1136                                          uint32_t* value,
   1137                                          UBool useFallback,
   1138                                          int outputType)
   1139 {
   1140     const int32_t *cx;
   1141     const uint16_t *table;
   1142     uint32_t stage2Entry;
   1143     uint32_t myValue;
   1144     int32_t length;
   1145     const uint8_t *p;
   1146     /*
   1147      * TODO(markus): Use and require new, faster MBCS conversion table structures.
   1148      * Use internal version of ucnv_open() that verifies that the new structures are available,
   1149      * else U_INTERNAL_PROGRAM_ERROR.
   1150      */
   1151     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   1152     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   1153         table=sharedData->mbcs.fromUnicodeTable;
   1154         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   1155         /* get the bytes and the length for the output */
   1156         if(outputType==MBCS_OUTPUT_2){
   1157             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   1158             if(myValue<=0xff) {
   1159                 length=1;
   1160             } else {
   1161                 length=2;
   1162             }
   1163         } else /* outputType==MBCS_OUTPUT_3 */ {
   1164             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   1165             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   1166             if(myValue<=0xff) {
   1167                 length=1;
   1168             } else if(myValue<=0xffff) {
   1169                 length=2;
   1170             } else {
   1171                 length=3;
   1172             }
   1173         }
   1174         /* is this code point assigned, or do we use fallbacks? */
   1175         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
   1176             /* assigned */
   1177             *value=myValue;
   1178             return length;
   1179         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
   1180             /*
   1181              * We allow a 0 byte output if the "assigned" bit is set for this entry.
   1182              * There is no way with this data structure for fallback output
   1183              * to be a zero byte.
   1184              */
   1185             *value=myValue;
   1186             return -length;
   1187         }
   1188     }
   1189 
   1190     cx=sharedData->mbcs.extIndexes;
   1191     if(cx!=NULL) {
   1192         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
   1193     }
   1194 
   1195     /* unassigned */
   1196     return 0;
   1197 }
   1198 
   1199 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
   1200  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
   1201  * @param retval pointer to output byte
   1202  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
   1203  */
   1204 static inline int32_t
   1205 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
   1206                                        UChar32 c,
   1207                                        uint32_t* retval,
   1208                                        UBool useFallback)
   1209 {
   1210     const uint16_t *table;
   1211     int32_t value;
   1212     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   1213     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   1214         return 0;
   1215     }
   1216     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   1217     table=sharedData->mbcs.fromUnicodeTable;
   1218     /* get the byte for the output */
   1219     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   1220     /* is this code point assigned, or do we use fallbacks? */
   1221     *retval=(uint32_t)(value&0xff);
   1222     if(value>=0xf00) {
   1223         return 1;  /* roundtrip */
   1224     } else if(useFallback ? value>=0x800 : value>=0xc00) {
   1225         return -1;  /* fallback taken */
   1226     } else {
   1227         return 0;  /* no mapping */
   1228     }
   1229 }
   1230 
   1231 /*
   1232  * Check that the result is a 2-byte value with each byte in the range A1..FE
   1233  * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
   1234  * to move it to the ISO 2022 range 21..7E.
   1235  * Return 0 if out of range.
   1236  */
   1237 static inline uint32_t
   1238 _2022FromGR94DBCS(uint32_t value) {
   1239     if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
   1240         (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
   1241     ) {
   1242         return value - 0x8080;  /* shift down to 21..7e byte range */
   1243     } else {
   1244         return 0;  /* not valid for ISO 2022 */
   1245     }
   1246 }
   1247 
   1248 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
   1249 /*
   1250  * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
   1251  * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
   1252  * unchanged.
   1253  */
   1254 static inline uint32_t
   1255 _2022ToGR94DBCS(uint32_t value) {
   1256     uint32_t returnValue = value + 0x8080;
   1257     if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
   1258         (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
   1259         return returnValue;
   1260     } else {
   1261         return value;
   1262     }
   1263 }
   1264 #endif
   1265 
   1266 #ifdef U_ENABLE_GENERIC_ISO_2022
   1267 
   1268 /**********************************************************************************
   1269 *  ISO-2022 Converter
   1270 *
   1271 *
   1272 */
   1273 
   1274 static void
   1275 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
   1276                                                            UErrorCode* err){
   1277     const char* mySourceLimit, *realSourceLimit;
   1278     const char* sourceStart;
   1279     const UChar* myTargetStart;
   1280     UConverter* saveThis;
   1281     UConverterDataISO2022* myData;
   1282     int8_t length;
   1283 
   1284     saveThis = args->converter;
   1285     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
   1286 
   1287     realSourceLimit = args->sourceLimit;
   1288     while (args->source < realSourceLimit) {
   1289         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
   1290             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
   1291             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
   1292 
   1293             if(args->source < mySourceLimit) {
   1294                 if(myData->currentConverter==NULL) {
   1295                     myData->currentConverter = ucnv_open("ASCII",err);
   1296                     if(U_FAILURE(*err)){
   1297                         return;
   1298                     }
   1299 
   1300                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
   1301                     saveThis->mode = UCNV_SO;
   1302                 }
   1303 
   1304                 /* convert to before the ESC or until the end of the buffer */
   1305                 myData->isFirstBuffer=FALSE;
   1306                 sourceStart = args->source;
   1307                 myTargetStart = args->target;
   1308                 args->converter = myData->currentConverter;
   1309                 ucnv_toUnicode(args->converter,
   1310                     &args->target,
   1311                     args->targetLimit,
   1312                     &args->source,
   1313                     mySourceLimit,
   1314                     args->offsets,
   1315                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
   1316                     err);
   1317                 args->converter = saveThis;
   1318 
   1319                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
   1320                     /* move the overflow buffer */
   1321                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
   1322                     myData->currentConverter->UCharErrorBufferLength = 0;
   1323                     if(length > 0) {
   1324                         uprv_memcpy(saveThis->UCharErrorBuffer,
   1325                                     myData->currentConverter->UCharErrorBuffer,
   1326                                     length*U_SIZEOF_UCHAR);
   1327                     }
   1328                     return;
   1329                 }
   1330 
   1331                 /*
   1332                  * At least one of:
   1333                  * -Error while converting
   1334                  * -Done with entire buffer
   1335                  * -Need to write offsets or update the current offset
   1336                  *  (leave that up to the code in ucnv.c)
   1337                  *
   1338                  * or else we just stopped at an ESC byte and continue with changeState_2022()
   1339                  */
   1340                 if (U_FAILURE(*err) ||
   1341                     (args->source == realSourceLimit) ||
   1342                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
   1343                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
   1344                 ) {
   1345                     /* copy partial or error input for truncated detection and error handling */
   1346                     if(U_FAILURE(*err)) {
   1347                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
   1348                         if(length > 0) {
   1349                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
   1350                         }
   1351                     } else {
   1352                         length = saveThis->toULength = myData->currentConverter->toULength;
   1353                         if(length > 0) {
   1354                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
   1355                             if(args->source < mySourceLimit) {
   1356                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
   1357                             }
   1358                         }
   1359                     }
   1360                     return;
   1361                 }
   1362             }
   1363         }
   1364 
   1365         sourceStart = args->source;
   1366         changeState_2022(args->converter,
   1367                &(args->source),
   1368                realSourceLimit,
   1369                ISO_2022,
   1370                err);
   1371         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
   1372             /* let the ucnv.c code update its current offset */
   1373             return;
   1374         }
   1375     }
   1376 }
   1377 
   1378 #endif
   1379 
   1380 /*
   1381  * To Unicode Callback helper function
   1382  */
   1383 static void
   1384 toUnicodeCallback(UConverter *cnv,
   1385                   const uint32_t sourceChar, const uint32_t targetUniChar,
   1386                   UErrorCode* err){
   1387     if(sourceChar>0xff){
   1388         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
   1389         cnv->toUBytes[1] = (uint8_t)sourceChar;
   1390         cnv->toULength = 2;
   1391     }
   1392     else{
   1393         cnv->toUBytes[0] =(char) sourceChar;
   1394         cnv->toULength = 1;
   1395     }
   1396 
   1397     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
   1398         *err = U_INVALID_CHAR_FOUND;
   1399     }
   1400     else{
   1401         *err = U_ILLEGAL_CHAR_FOUND;
   1402     }
   1403 }
   1404 
   1405 /**************************************ISO-2022-JP*************************************************/
   1406 
   1407 /************************************** IMPORTANT **************************************************
   1408 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
   1409 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
   1410 * The converter iterates over each Unicode codepoint
   1411 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
   1412 * processed one char at a time it would make sense to reduce the extra processing a canned converter
   1413 * would do as far as possible.
   1414 *
   1415 * If the implementation of these macros or structure of sharedData struct change in the future, make
   1416 * sure that ISO-2022 is also changed.
   1417 ***************************************************************************************************
   1418 */
   1419 
   1420 /***************************************************************************************************
   1421 * Rules for ISO-2022-jp encoding
   1422 * (i)   Escape sequences must be fully contained within a line; they should not
   1423 *       span new lines or CRs
   1424 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
   1425 *       JIS-Roman character escape sequence should follow before the line terminates
   1426 * (iii) If the first character on the line is represented by two bytes then a two
   1427 *       byte character escape sequence should precede it
   1428 * (iv)  If no escape sequence is encountered then the characters are ASCII
   1429 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
   1430 *       and invoked with SS2 (ESC N).
   1431 * (vi)  If there is any G0 designation in text, there must be a switch to
   1432 *       ASCII or to JIS X 0201-Roman before a space character (but not
   1433 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
   1434 *       characters such as tab or CRLF.
   1435 * (vii) Supported encodings:
   1436 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
   1437 *
   1438 *  source : RFC-1554
   1439 *
   1440 *          JISX201, JISX208,JISX212 : new .cnv data files created
   1441 *          KSC5601 : alias to ibm-949 mapping table
   1442 *          GB2312 : alias to ibm-1386 mapping table
   1443 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
   1444 *          ISO-8859-7 : alias to ibm-9409 mapping table
   1445 */
   1446 
   1447 /* preference order of JP charsets */
   1448 static const StateEnum jpCharsetPref[]={
   1449     ASCII,
   1450     JISX201,
   1451     ISO8859_1,
   1452     ISO8859_7,
   1453     JISX208,
   1454     JISX212,
   1455     GB2312,
   1456     KSC5601,
   1457     HWKANA_7BIT
   1458 };
   1459 
   1460 /*
   1461  * The escape sequences must be in order of the enum constants like JISX201  = 3,
   1462  * not in order of jpCharsetPref[]!
   1463  */
   1464 static const char escSeqChars[][6] ={
   1465     "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
   1466     "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
   1467     "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
   1468     "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
   1469     "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
   1470     "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
   1471     "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
   1472     "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
   1473     "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
   1474 
   1475 };
   1476 static  const int8_t escSeqCharsLen[] ={
   1477     3, /* length of <ESC>(B  ASCII       */
   1478     3, /* length of <ESC>.A  ISO-8859-1  */
   1479     3, /* length of <ESC>.F  ISO-8859-7  */
   1480     3, /* length of <ESC>(J  JISX-201    */
   1481     3, /* length of <ESC>$B  JISX-208    */
   1482     4, /* length of <ESC>$(D JISX-212    */
   1483     3, /* length of <ESC>$A  GB2312      */
   1484     4, /* length of <ESC>$(C KSC5601     */
   1485     3  /* length of <ESC>(I  HWKANA_7BIT */
   1486 };
   1487 
   1488 /*
   1489 * The iteration over various code pages works this way:
   1490 * i)   Get the currentState from myConverterData->currentState
   1491 * ii)  Check if the character is mapped to a valid character in the currentState
   1492 *      Yes ->  a) set the initIterState to currentState
   1493 *       b) remain in this state until an invalid character is found
   1494 *      No  ->  a) go to the next code page and find the character
   1495 * iii) Before moving on to the next state, check whether the current state
   1496 *      is equal to the initIterState
   1497 *      Yes ->  The character cannot be represented in any of the supported encodings:
   1498 *       break and return a U_INVALID_CHAR_FOUND error
   1499 *      No  ->  Continue and find the character in next code page
   1500 *
   1501 *
   1502 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
   1503 */
   1504 
   1505 /* Map 00..7F to Unicode according to JIS X 0201. */
   1506 static inline uint32_t
   1507 jisx201ToU(uint32_t value) {
   1508     if(value < 0x5c) {
   1509         return value;
   1510     } else if(value == 0x5c) {
   1511         return 0xa5;
   1512     } else if(value == 0x7e) {
   1513         return 0x203e;
   1514     } else /* value <= 0x7f */ {
   1515         return value;
   1516     }
   1517 }
   1518 
   1519 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
   1520 static inline uint32_t
   1521 jisx201FromU(uint32_t value) {
   1522     if(value<=0x7f) {
   1523         if(value!=0x5c && value!=0x7e) {
   1524             return value;
   1525         }
   1526     } else if(value==0xa5) {
   1527         return 0x5c;
   1528     } else if(value==0x203e) {
   1529         return 0x7e;
   1530     }
   1531     return 0xfffe;
   1532 }
   1533 
   1534 /*
   1535  * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
   1536  * to JIS X 0208, and convert it to a pair of 21..7E bytes.
   1537  * Return 0 if the byte pair is out of range.
   1538  */
   1539 static inline uint32_t
   1540 _2022FromSJIS(uint32_t value) {
   1541     uint8_t trail;
   1542 
   1543     if(value > 0xEFFC) {
   1544         return 0;  /* beyond JIS X 0208 */
   1545     }
   1546 
   1547     trail = (uint8_t)value;
   1548 
   1549     value &= 0xff00;  /* lead byte */
   1550     if(value <= 0x9f00) {
   1551         value -= 0x7000;
   1552     } else /* 0xe000 <= value <= 0xef00 */ {
   1553         value -= 0xb000;
   1554     }
   1555     value <<= 1;
   1556 
   1557     if(trail <= 0x9e) {
   1558         value -= 0x100;
   1559         if(trail <= 0x7e) {
   1560             value |= trail - 0x1f;
   1561         } else {
   1562             value |= trail - 0x20;
   1563         }
   1564     } else /* trail <= 0xfc */ {
   1565         value |= trail - 0x7e;
   1566     }
   1567     return value;
   1568 }
   1569 
   1570 /*
   1571  * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
   1572  * If either byte is outside 21..7E make sure that the result is not valid
   1573  * for Shift-JIS so that the converter catches it.
   1574  * Some invalid byte values already turn into equally invalid Shift-JIS
   1575  * byte values and need not be tested explicitly.
   1576  */
   1577 static inline void
   1578 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
   1579     if(c1&1) {
   1580         ++c1;
   1581         if(c2 <= 0x5f) {
   1582             c2 += 0x1f;
   1583         } else if(c2 <= 0x7e) {
   1584             c2 += 0x20;
   1585         } else {
   1586             c2 = 0;  /* invalid */
   1587         }
   1588     } else {
   1589         if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
   1590             c2 += 0x7e;
   1591         } else {
   1592             c2 = 0;  /* invalid */
   1593         }
   1594     }
   1595     c1 >>= 1;
   1596     if(c1 <= 0x2f) {
   1597         c1 += 0x70;
   1598     } else if(c1 <= 0x3f) {
   1599         c1 += 0xb0;
   1600     } else {
   1601         c1 = 0;  /* invalid */
   1602     }
   1603     bytes[0] = (char)c1;
   1604     bytes[1] = (char)c2;
   1605 }
   1606 
   1607 /*
   1608  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
   1609  * Katakana.
   1610  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
   1611  * because Shift-JIS roundtrips half-width Katakana to single bytes.
   1612  * These were the only fallbacks in ICU's jisx-208.ucm file.
   1613  */
   1614 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
   1615     0x2123,  /* U+FF61 */
   1616     0x2156,
   1617     0x2157,
   1618     0x2122,
   1619     0x2126,
   1620     0x2572,
   1621     0x2521,
   1622     0x2523,
   1623     0x2525,
   1624     0x2527,
   1625     0x2529,
   1626     0x2563,
   1627     0x2565,
   1628     0x2567,
   1629     0x2543,
   1630     0x213C,  /* U+FF70 */
   1631     0x2522,
   1632     0x2524,
   1633     0x2526,
   1634     0x2528,
   1635     0x252A,
   1636     0x252B,
   1637     0x252D,
   1638     0x252F,
   1639     0x2531,
   1640     0x2533,
   1641     0x2535,
   1642     0x2537,
   1643     0x2539,
   1644     0x253B,
   1645     0x253D,
   1646     0x253F,  /* U+FF80 */
   1647     0x2541,
   1648     0x2544,
   1649     0x2546,
   1650     0x2548,
   1651     0x254A,
   1652     0x254B,
   1653     0x254C,
   1654     0x254D,
   1655     0x254E,
   1656     0x254F,
   1657     0x2552,
   1658     0x2555,
   1659     0x2558,
   1660     0x255B,
   1661     0x255E,
   1662     0x255F,  /* U+FF90 */
   1663     0x2560,
   1664     0x2561,
   1665     0x2562,
   1666     0x2564,
   1667     0x2566,
   1668     0x2568,
   1669     0x2569,
   1670     0x256A,
   1671     0x256B,
   1672     0x256C,
   1673     0x256D,
   1674     0x256F,
   1675     0x2573,
   1676     0x212B,
   1677     0x212C   /* U+FF9F */
   1678 };
   1679 
   1680 static void
   1681 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
   1682     UConverter *cnv = args->converter;
   1683     UConverterDataISO2022 *converterData;
   1684     ISO2022State *pFromU2022State;
   1685     uint8_t *target = (uint8_t *) args->target;
   1686     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
   1687     const UChar* source = args->source;
   1688     const UChar* sourceLimit = args->sourceLimit;
   1689     int32_t* offsets = args->offsets;
   1690     UChar32 sourceChar;
   1691     char buffer[8];
   1692     int32_t len, outLen;
   1693     int8_t choices[10];
   1694     int32_t choiceCount;
   1695     uint32_t targetValue = 0;
   1696     UBool useFallback;
   1697 
   1698     int32_t i;
   1699     int8_t cs, g;
   1700 
   1701     /* set up the state */
   1702     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
   1703     pFromU2022State   = &converterData->fromU2022State;
   1704 
   1705     choiceCount = 0;
   1706 
   1707     /* check if the last codepoint of previous buffer was a lead surrogate*/
   1708     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
   1709         goto getTrail;
   1710     }
   1711 
   1712     while(source < sourceLimit) {
   1713         if(target < targetLimit) {
   1714 
   1715             sourceChar  = *(source++);
   1716             /*check if the char is a First surrogate*/
   1717             if(U16_IS_SURROGATE(sourceChar)) {
   1718                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   1719 getTrail:
   1720                     /*look ahead to find the trail surrogate*/
   1721                     if(source < sourceLimit) {
   1722                         /* test the following code unit */
   1723                         UChar trail=(UChar) *source;
   1724                         if(U16_IS_TRAIL(trail)) {
   1725                             source++;
   1726                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
   1727                             cnv->fromUChar32=0x00;
   1728                             /* convert this supplementary code point */
   1729                             /* exit this condition tree */
   1730                         } else {
   1731                             /* this is an unmatched lead code unit (1st surrogate) */
   1732                             /* callback(illegal) */
   1733                             *err=U_ILLEGAL_CHAR_FOUND;
   1734                             cnv->fromUChar32=sourceChar;
   1735                             break;
   1736                         }
   1737                     } else {
   1738                         /* no more input */
   1739                         cnv->fromUChar32=sourceChar;
   1740                         break;
   1741                     }
   1742                 } else {
   1743                     /* this is an unmatched trail code unit (2nd surrogate) */
   1744                     /* callback(illegal) */
   1745                     *err=U_ILLEGAL_CHAR_FOUND;
   1746                     cnv->fromUChar32=sourceChar;
   1747                     break;
   1748                 }
   1749             }
   1750 
   1751             /* do not convert SO/SI/ESC */
   1752             if(IS_2022_CONTROL(sourceChar)) {
   1753                 /* callback(illegal) */
   1754                 *err=U_ILLEGAL_CHAR_FOUND;
   1755                 cnv->fromUChar32=sourceChar;
   1756                 break;
   1757             }
   1758 
   1759             /* do the conversion */
   1760 
   1761             if(choiceCount == 0) {
   1762                 uint16_t csm;
   1763 
   1764                 /*
   1765                  * The csm variable keeps track of which charsets are allowed
   1766                  * and not used yet while building the choices[].
   1767                  */
   1768                 csm = jpCharsetMasks[converterData->version];
   1769                 choiceCount = 0;
   1770 
   1771                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
   1772                 if(converterData->version == 3 || converterData->version == 4) {
   1773                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
   1774                 }
   1775                 /* Do not try single-byte half-width Katakana for other versions. */
   1776                 csm &= ~CSM(HWKANA_7BIT);
   1777 
   1778                 /* try the current G0 charset */
   1779                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
   1780                 csm &= ~CSM(cs);
   1781 
   1782                 /* try the current G2 charset */
   1783                 if((cs = pFromU2022State->cs[2]) != 0) {
   1784                     choices[choiceCount++] = cs;
   1785                     csm &= ~CSM(cs);
   1786                 }
   1787 
   1788                 /* try all the other possible charsets */
   1789                 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
   1790                     cs = (int8_t)jpCharsetPref[i];
   1791                     if(CSM(cs) & csm) {
   1792                         choices[choiceCount++] = cs;
   1793                         csm &= ~CSM(cs);
   1794                     }
   1795                 }
   1796             }
   1797 
   1798             cs = g = 0;
   1799             /*
   1800              * len==0: no mapping found yet
   1801              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
   1802              * len>0: found a roundtrip result, done
   1803              */
   1804             len = 0;
   1805             /*
   1806              * We will turn off useFallback after finding a fallback,
   1807              * but we still get fallbacks from PUA code points as usual.
   1808              * Therefore, we will also need to check that we don't overwrite
   1809              * an early fallback with a later one.
   1810              */
   1811             useFallback = cnv->useFallback;
   1812 
   1813             for(i = 0; i < choiceCount && len <= 0; ++i) {
   1814                 uint32_t value;
   1815                 int32_t len2;
   1816                 int8_t cs0 = choices[i];
   1817                 switch(cs0) {
   1818                 case ASCII:
   1819                     if(sourceChar <= 0x7f) {
   1820                         targetValue = (uint32_t)sourceChar;
   1821                         len = 1;
   1822                         cs = cs0;
   1823                         g = 0;
   1824                     }
   1825                     break;
   1826                 case ISO8859_1:
   1827                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
   1828                         targetValue = (uint32_t)sourceChar - 0x80;
   1829                         len = 1;
   1830                         cs = cs0;
   1831                         g = 2;
   1832                     }
   1833                     break;
   1834                 case HWKANA_7BIT:
   1835                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
   1836                         if(converterData->version==3) {
   1837                             /* JIS7: use G1 (SO) */
   1838                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
   1839                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
   1840                             len = 1;
   1841                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
   1842                             g = 1;
   1843                         } else if(converterData->version==4) {
   1844                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
   1845                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
   1846                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
   1847                             len = 1;
   1848 
   1849                             cs = pFromU2022State->cs[0];
   1850                             if(IS_JP_DBCS(cs)) {
   1851                                 /* switch from a DBCS charset to JISX201 */
   1852                                 cs = (int8_t)JISX201;
   1853                             }
   1854                             /* else stay in the current G0 charset */
   1855                             g = 0;
   1856                         }
   1857                         /* else do not use HWKANA_7BIT with other versions */
   1858                     }
   1859                     break;
   1860                 case JISX201:
   1861                     /* G0 SBCS */
   1862                     value = jisx201FromU(sourceChar);
   1863                     if(value <= 0x7f) {
   1864                         targetValue = value;
   1865                         len = 1;
   1866                         cs = cs0;
   1867                         g = 0;
   1868                         useFallback = FALSE;
   1869                     }
   1870                     break;
   1871                 case JISX208:
   1872                     /* G0 DBCS from Shift-JIS table */
   1873                     len2 = MBCS_FROM_UCHAR32_ISO2022(
   1874                                 converterData->myConverterArray[cs0],
   1875                                 sourceChar, &value,
   1876                                 useFallback, MBCS_OUTPUT_2);
   1877                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
   1878                         value = _2022FromSJIS(value);
   1879                         if(value != 0) {
   1880                             targetValue = value;
   1881                             len = len2;
   1882                             cs = cs0;
   1883                             g = 0;
   1884                             useFallback = FALSE;
   1885                         }
   1886                     } else if(len == 0 && useFallback &&
   1887                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
   1888                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
   1889                         len = -2;
   1890                         cs = cs0;
   1891                         g = 0;
   1892                         useFallback = FALSE;
   1893                     }
   1894                     break;
   1895                 case ISO8859_7:
   1896                     /* G0 SBCS forced to 7-bit output */
   1897                     len2 = MBCS_SINGLE_FROM_UCHAR32(
   1898                                 converterData->myConverterArray[cs0],
   1899                                 sourceChar, &value,
   1900                                 useFallback);
   1901                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
   1902                         targetValue = value - 0x80;
   1903                         len = len2;
   1904                         cs = cs0;
   1905                         g = 2;
   1906                         useFallback = FALSE;
   1907                     }
   1908                     break;
   1909                 default:
   1910                     /* G0 DBCS */
   1911                     len2 = MBCS_FROM_UCHAR32_ISO2022(
   1912                                 converterData->myConverterArray[cs0],
   1913                                 sourceChar, &value,
   1914                                 useFallback, MBCS_OUTPUT_2);
   1915                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
   1916                         if(cs0 == KSC5601) {
   1917                             /*
   1918                              * Check for valid bytes for the encoding scheme.
   1919                              * This is necessary because the sub-converter (windows-949)
   1920                              * has a broader encoding scheme than is valid for 2022.
   1921                              */
   1922                             value = _2022FromGR94DBCS(value);
   1923                             if(value == 0) {
   1924                                 break;
   1925                             }
   1926                         }
   1927                         targetValue = value;
   1928                         len = len2;
   1929                         cs = cs0;
   1930                         g = 0;
   1931                         useFallback = FALSE;
   1932                     }
   1933                     break;
   1934                 }
   1935             }
   1936 
   1937             if(len != 0) {
   1938                 if(len < 0) {
   1939                     len = -len;  /* fallback */
   1940                 }
   1941                 outLen = 0; /* count output bytes */
   1942 
   1943                 /* write SI if necessary (only for JIS7) */
   1944                 if(pFromU2022State->g == 1 && g == 0) {
   1945                     buffer[outLen++] = UCNV_SI;
   1946                     pFromU2022State->g = 0;
   1947                 }
   1948 
   1949                 /* write the designation sequence if necessary */
   1950                 if(cs != pFromU2022State->cs[g]) {
   1951                     int32_t escLen = escSeqCharsLen[cs];
   1952                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
   1953                     outLen += escLen;
   1954                     pFromU2022State->cs[g] = cs;
   1955 
   1956                     /* invalidate the choices[] */
   1957                     choiceCount = 0;
   1958                 }
   1959 
   1960                 /* write the shift sequence if necessary */
   1961                 if(g != pFromU2022State->g) {
   1962                     switch(g) {
   1963                     /* case 0 handled before writing escapes */
   1964                     case 1:
   1965                         buffer[outLen++] = UCNV_SO;
   1966                         pFromU2022State->g = 1;
   1967                         break;
   1968                     default: /* case 2 */
   1969                         buffer[outLen++] = 0x1b;
   1970                         buffer[outLen++] = 0x4e;
   1971                         break;
   1972                     /* no case 3: no SS3 in ISO-2022-JP-x */
   1973                     }
   1974                 }
   1975 
   1976                 /* write the output bytes */
   1977                 if(len == 1) {
   1978                     buffer[outLen++] = (char)targetValue;
   1979                 } else /* len == 2 */ {
   1980                     buffer[outLen++] = (char)(targetValue >> 8);
   1981                     buffer[outLen++] = (char)targetValue;
   1982                 }
   1983             } else {
   1984                 /*
   1985                  * if we cannot find the character after checking all codepages
   1986                  * then this is an error
   1987                  */
   1988                 *err = U_INVALID_CHAR_FOUND;
   1989                 cnv->fromUChar32=sourceChar;
   1990                 break;
   1991             }
   1992 
   1993             if(sourceChar == CR || sourceChar == LF) {
   1994                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
   1995                 pFromU2022State->cs[2] = 0;
   1996                 choiceCount = 0;
   1997             }
   1998 
   1999             /* output outLen>0 bytes in buffer[] */
   2000             if(outLen == 1) {
   2001                 *target++ = buffer[0];
   2002                 if(offsets) {
   2003                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
   2004                 }
   2005             } else if(outLen == 2 && (target + 2) <= targetLimit) {
   2006                 *target++ = buffer[0];
   2007                 *target++ = buffer[1];
   2008                 if(offsets) {
   2009                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
   2010                     *offsets++ = sourceIndex;
   2011                     *offsets++ = sourceIndex;
   2012                 }
   2013             } else {
   2014                 fromUWriteUInt8(
   2015                     cnv,
   2016                     buffer, outLen,
   2017                     &target, (const char *)targetLimit,
   2018                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
   2019                     err);
   2020                 if(U_FAILURE(*err)) {
   2021                     break;
   2022                 }
   2023             }
   2024         } /* end if(myTargetIndex<myTargetLength) */
   2025         else{
   2026             *err =U_BUFFER_OVERFLOW_ERROR;
   2027             break;
   2028         }
   2029 
   2030     }/* end while(mySourceIndex<mySourceLength) */
   2031 
   2032     /*
   2033      * the end of the input stream and detection of truncated input
   2034      * are handled by the framework, but for ISO-2022-JP conversion
   2035      * we need to be in ASCII mode at the very end
   2036      *
   2037      * conditions:
   2038      *   successful
   2039      *   in SO mode or not in ASCII mode
   2040      *   end of input and no truncated input
   2041      */
   2042     if( U_SUCCESS(*err) &&
   2043         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
   2044         args->flush && source>=sourceLimit && cnv->fromUChar32==0
   2045     ) {
   2046         int32_t sourceIndex;
   2047 
   2048         outLen = 0;
   2049 
   2050         if(pFromU2022State->g != 0) {
   2051             buffer[outLen++] = UCNV_SI;
   2052             pFromU2022State->g = 0;
   2053         }
   2054 
   2055         if(pFromU2022State->cs[0] != ASCII) {
   2056             int32_t escLen = escSeqCharsLen[ASCII];
   2057             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
   2058             outLen += escLen;
   2059             pFromU2022State->cs[0] = (int8_t)ASCII;
   2060         }
   2061 
   2062         /* get the source index of the last input character */
   2063         /*
   2064          * TODO this would be simpler and more reliable if we used a pair
   2065          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   2066          * so that we could simply use the prevSourceIndex here;
   2067          * this code gives an incorrect result for the rare case of an unmatched
   2068          * trail surrogate that is alone in the last buffer of the text stream
   2069          */
   2070         sourceIndex=(int32_t)(source-args->source);
   2071         if(sourceIndex>0) {
   2072             --sourceIndex;
   2073             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   2074                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   2075             ) {
   2076                 --sourceIndex;
   2077             }
   2078         } else {
   2079             sourceIndex=-1;
   2080         }
   2081 
   2082         fromUWriteUInt8(
   2083             cnv,
   2084             buffer, outLen,
   2085             &target, (const char *)targetLimit,
   2086             &offsets, sourceIndex,
   2087             err);
   2088     }
   2089 
   2090     /*save the state and return */
   2091     args->source = source;
   2092     args->target = (char*)target;
   2093 }
   2094 
   2095 /*************** to unicode *******************/
   2096 
   2097 static void
   2098 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   2099                                                UErrorCode* err){
            /*
             * Converts ISO-2022-JP family bytes from args->source into UChars at
             * args->target; when args->offsets is set, one source offset is stored
             * per UChar written.
             *
             * State carried between calls (all kept in the converter):
             *   - myData->key != 0: an escape sequence was only partially read;
             *     processing resumes at the "escape" label.
             *   - toULength == 1: the lead byte of a double-byte character is held
             *     in toUBytes[0]; processing resumes at the "getTrailByte" label.
             *   - pToU2022State->cs[], g and prevG: the designated charsets and the
             *     one currently invoked.
             *   - myData->isEmptySegment: TRUE right after a completed escape
             *     sequence until a character byte is processed.
             *
             * On every return path args->source and args->target are updated.
             * *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full, to
             * U_ILLEGAL_ESCAPE_SEQUENCE for an empty segment in version 0, and is
             * otherwise left to changeState_2022() and toUnicodeCallback().
             */
   2100     char tempBuf[2];
   2101     const char *mySource = (char *) args->source;
   2102     UChar *myTarget = args->target;
   2103     const char *mySourceLimit = args->sourceLimit;
   2104     uint32_t targetUniChar = 0x0000;
   2105     uint32_t mySourceChar = 0x0000;
   2106     uint32_t tmpSourceChar = 0x0000;
   2107     UConverterDataISO2022* myData;
   2108     ISO2022State *pToU2022State;
   2109     StateEnum cs;
   2110 
   2111     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2112     pToU2022State = &myData->toU2022State;
   2113 
   2114     if(myData->key != 0) {
   2115         /* continue with a partial escape sequence */
   2116         goto escape;
   2117     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   2118         /* continue with a partial double-byte character */
   2119         mySourceChar = args->converter->toUBytes[0];
   2120         args->converter->toULength = 0;
   2121         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
   2122         targetUniChar = missingCharMarker;
   2123         goto getTrailByte;
   2124     }
   2125 
   2126     while(mySource < mySourceLimit){
   2127 
   2128         targetUniChar =missingCharMarker;
   2129 
   2130         if(myTarget < args->targetLimit){
   2131 
   2132             mySourceChar= (unsigned char) *mySource++;
   2133 
   2134             switch(mySourceChar) {
   2135             case UCNV_SI:
   2136                 if(myData->version==3) {
   2137                     pToU2022State->g=0;
   2138                     continue;
   2139                 } else {
   2140                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
   2141                     myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
   2142                     break;
   2143                 }
   2144 
   2145             case UCNV_SO:
   2146                 if(myData->version==3) {
   2147                     /* JIS7: switch to G1 half-width Katakana */
   2148                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
   2149                     pToU2022State->g=1;
   2150                     continue;
   2151                 } else {
   2152                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
   2153                     myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
   2154                     break;
   2155                 }
   2156 
   2157             case ESC_2022:
                        /* step back so that changeState_2022() starts at the ESC byte itself */
   2158                 mySource--;
   2159 escape:
   2160                 {
   2161                     const char * mySourceBefore = mySource;
   2162                     int8_t toULengthBefore = args->converter->toULength;
   2163 
   2164                     changeState_2022(args->converter,&(mySource),
   2165                         mySourceLimit, ISO_2022_JP,err);
   2166 
   2167                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
   2168                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
   2169                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   2170                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                                /* length = bytes counted before this call plus the bytes consumed by it */
   2171                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
   2172                     }
   2173                 }
   2174 
   2175                 /* invalid or illegal escape sequence */
   2176                 if(U_FAILURE(*err)){
   2177                     args->target = myTarget;
   2178                     args->source = mySource;
   2179                     myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
   2180                     return;
   2181                 }
   2182                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
   2183                 if(myData->key==0) {
   2184                     myData->isEmptySegment = TRUE;
   2185                 }
   2186                 continue;
   2187 
   2188             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
   2189 
   2190             case CR:
   2191                 /*falls through*/
   2192             case LF:
   2193                 /* automatically reset to single-byte mode */
   2194                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
   2195                     pToU2022State->cs[0] = (int8_t)ASCII;
   2196                 }
   2197                 pToU2022State->cs[2] = 0;
   2198                 pToU2022State->g = 0;
   2199                 /* falls through */
   2200             default:
   2201                 /* convert one or two bytes */
   2202                 myData->isEmptySegment = FALSE;
   2203                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                        /* the uint8_t subtraction wraps, so the test is true exactly for bytes 0xa1..0xdf */
   2204                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
   2205                     !IS_JP_DBCS(cs)
   2206                 ) {
   2207                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
   2208                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
   2209 
   2210                     /* return from a single-shift state to the previous one */
   2211                     if(pToU2022State->g >= 2) {
   2212                         pToU2022State->g=pToU2022State->prevG;
   2213                     }
   2214                 } else switch(cs) {
   2215                 case ASCII:
   2216                     if(mySourceChar <= 0x7f) {
   2217                         targetUniChar = mySourceChar;
   2218                     }
   2219                     break;
   2220                 case ISO8859_1:
   2221                     if(mySourceChar <= 0x7f) {
   2222                         targetUniChar = mySourceChar + 0x80;
   2223                     }
   2224                     /* return from a single-shift state to the previous one */
   2225                     pToU2022State->g=pToU2022State->prevG;
   2226                     break;
   2227                 case ISO8859_7:
   2228                     if(mySourceChar <= 0x7f) {
   2229                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
   2230                         targetUniChar =
   2231                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
   2232                                 myData->myConverterArray[cs],
   2233                                 mySourceChar + 0x80);
   2234                     }
   2235                     /* return from a single-shift state to the previous one */
   2236                     pToU2022State->g=pToU2022State->prevG;
   2237                     break;
   2238                 case JISX201:
   2239                     if(mySourceChar <= 0x7f) {
   2240                         targetUniChar = jisx201ToU(mySourceChar);
   2241                     }
   2242                     break;
   2243                 case HWKANA_7BIT:
   2244                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
   2245                         /* 7-bit halfwidth Katakana */
   2246                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
   2247                     }
   2248                     break;
   2249                 default:
   2250                     /* G0 DBCS */
   2251                     if(mySource < mySourceLimit) {
   2252                         int leadIsOk, trailIsOk;
   2253                         uint8_t trailByte;
   2254 getTrailByte:
                                /* peek only: the trail byte is consumed below once we know it belongs to this character */
   2255                         trailByte = (uint8_t)*mySource;
   2256                         /*
   2257                          * Ticket 5691: consistent illegal sequences:
   2258                          * - We include at least the first byte in the illegal sequence.
   2259                          * - If any of the non-initial bytes could be the start of a character,
   2260                          *   we stop the illegal sequence before the first one of those.
   2261                          *
   2262                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   2263                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   2264                          * Otherwise we convert or report the pair of bytes.
   2265                          */
   2266                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   2267                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   2268                         if (leadIsOk && trailIsOk) {
   2269                             ++mySource;
   2270                             tmpSourceChar = (mySourceChar << 8) | trailByte;
   2271                             if(cs == JISX208) {
   2272                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
   2273                                 mySourceChar = tmpSourceChar;
   2274                             } else {
   2275                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
   2276                                 mySourceChar = tmpSourceChar;
   2277                                 if (cs == KSC5601) {
   2278                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
   2279                                 }
   2280                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
   2281                                 tempBuf[1] = (char)(tmpSourceChar);
   2282                             }
   2283                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
   2284                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   2285                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   2286                             ++mySource;
   2287                             /* add another bit so that the code below writes 2 bytes in case of error */
   2288                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   2289                         }
   2290                     } else {
                                /* the lead byte is the last byte of this buffer: save it so that the next call resumes at getTrailByte */
   2291                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2292                         args->converter->toULength = 1;
   2293                         goto endloop;
   2294                     }
   2295                 }  /* End of inner switch */
   2296                 break;
   2297             }  /* End of outer switch */
                    /* a result below missingCharMarker-1 is written as a single UChar */
   2298             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
   2299                 if(args->offsets){
                            /* offset of the first source byte: mySourceChar > 0xff means two bytes were consumed */
   2300                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2301                 }
   2302                 *(myTarget++)=(UChar)targetUniChar;
   2303             }
   2304             else if(targetUniChar > missingCharMarker){
   2305                 /* disassemble the surrogate pair and write to output*/
   2306                 targetUniChar-=0x0010000;
   2307                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
   2308                 if(args->offsets){
   2309                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2310                 }
   2311                 ++myTarget;
   2312                 if(myTarget< args->targetLimit){
   2313                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   2314                     if(args->offsets){
   2315                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2316                     }
   2317                     ++myTarget;
   2318                 }else{
                            /* no room left for the trail surrogate: append it to the converter's UCharErrorBuffer */
   2319                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
   2320                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   2321                 }
   2322 
   2323             }
   2324             else{
                        /* targetUniChar is missingCharMarker or missingCharMarker-1 here */
   2325                 /* Call the callback function*/
   2326                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   2327                 break;
   2328             }
   2329         }
   2330         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
   2331             *err =U_BUFFER_OVERFLOW_ERROR;
   2332             break;
   2333         }
   2334     }
   2335 endloop:
            /* publish how far source and target were advanced */
   2336     args->target = myTarget;
   2337     args->source = mySource;
   2338 }
   2339 
   2340 
   2341 #if !UCONFIG_ONLY_HTML_CONVERSION
   2342 /***************************************************************
   2343 *   Rules for ISO-2022-KR encoding
   2344 *   i) The KSC5601 designator sequence should appear only once in a file,
   2345 *      at the beginning of a line before any KSC5601 characters. This usually
   2346 *      means that it appears by itself on the first line of the file
   2347 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
   2348 *      and SI to shift into single byte mode
   2349 */
   2350 static void
   2351 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
   2352     /* Delegates to the MBCS subconverter, lending it the pending code point and */
   2353     /* collecting any overflow bytes afterwards; used for version 1 (ibm-25546). */
   2354     UConverter *publicCnv = args->converter;
   2355     UConverter *subCnv = ((UConverterDataISO2022 *)publicCnv->extraInfo)->currentConverter;
   2356 
   2357     subCnv->fromUChar32 = publicCnv->fromUChar32;
   2358     args->converter = subCnv;
   2359     ucnv_MBCSFromUnicodeWithOffsets(args, err);
   2360     publicCnv->fromUChar32 = subCnv->fromUChar32;
   2361 
   2362     if(*err == U_BUFFER_OVERFLOW_ERROR) {
   2363         /* bytes that did not fit into the target move to the public converter */
   2364         if(subCnv->charErrorBufferLength > 0) {
   2365             uprv_memcpy(publicCnv->charErrorBuffer, subCnv->charErrorBuffer,
   2366                         subCnv->charErrorBufferLength);
   2367         }
   2368         publicCnv->charErrorBufferLength = subCnv->charErrorBufferLength;
   2369         subCnv->charErrorBufferLength = 0;
   2370     }
   2371     args->converter = publicCnv;  /* callers must get their own converter back */
   2372 }
   2373 
   2374 static void
   2375 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
            /*
             * Converts UTF-16 from args->source to ISO-2022-KR bytes at args->target,
             * storing one source offset per output byte when args->offsets is set.
             *
             * Single-byte results (<= 0x7f) are written unchanged. Double-byte
             * results must lie in 0xa1a1..0xfefe with a trail byte in 0xa1..0xfe and
             * are written with 0x80 subtracted from each byte. SO is emitted before
             * switching to double-byte output and SI before switching back.
             *
             * State kept in the converter between calls:
             *   - fromUnicodeStatus: nonzero while in double-byte (SO) mode.
             *   - fromUChar32: code point saved when the loop stopped on an error or
             *     on a lead surrogate at the end of the input; a nonzero value is
             *     resumed at the "getTrail" label.
             *
             * Bytes that do not fit into the target are appended to charErrorBuffer
             * and *err becomes U_BUFFER_OVERFLOW_ERROR. Version 1 is delegated to
             * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
             */
   2376 
   2377     const UChar *source = args->source;
   2378     const UChar *sourceLimit = args->sourceLimit;
   2379     unsigned char *target = (unsigned char *) args->target;
   2380     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   2381     int32_t* offsets = args->offsets;
   2382     uint32_t targetByteUnit = 0x0000;
   2383     UChar32 sourceChar = 0x0000;
   2384     UBool isTargetByteDBCS;
   2385     UBool oldIsTargetByteDBCS;
   2386     UConverterDataISO2022 *converterData;
   2387     UConverterSharedData* sharedData;
   2388     UBool useFallback;
   2389     int32_t length =0;
   2390 
   2391     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
   2392     /* if the version is 1 then the user is requesting
   2393      * conversion with ibm-25546 pass the arguments to
   2394      * MBCS converter and return
   2395      */
   2396     if(converterData->version==1){
   2397         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
   2398         return;
   2399     }
   2400 
   2401     /* initialize data */
   2402     sharedData = converterData->currentConverter->sharedData;
   2403     useFallback = args->converter->useFallback;
   2404     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
   2405     oldIsTargetByteDBCS = isTargetByteDBCS;
   2406 
            /* NOTE(review): fromUnicodeStatus was already read above; this second assignment is redundant */
   2407     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
            /* a code point left over from the previous call is resumed as a lead surrogate */
   2408     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
   2409         goto getTrail;
   2410     }
   2411     while(source < sourceLimit){
   2412 
   2413         targetByteUnit = missingCharMarker;
   2414 
   2415         if(target < (unsigned char*) args->targetLimit){
   2416             sourceChar = *source++;
   2417 
   2418             /* do not convert SO/SI/ESC */
   2419             if(IS_2022_CONTROL(sourceChar)) {
   2420                 /* callback(illegal) */
   2421                 *err=U_ILLEGAL_CHAR_FOUND;
   2422                 args->converter->fromUChar32=sourceChar;
   2423                 break;
   2424             }
   2425 
   2426             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
   2427             if(length < 0) {
   2428                 length = -length;  /* fallback */
   2429             }
   2430             /* only DBCS or SBCS characters are expected*/
   2431             /* DB characters with high bit set to 1 are expected */
                    /* the unsigned casts wrap, so each comparison is a single range check */
   2432             if( length > 2 || length==0 ||
   2433                 (length == 1 && targetByteUnit > 0x7f) ||
   2434                 (length == 2 &&
   2435                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
   2436                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
   2437             ) {
   2438                 targetByteUnit=missingCharMarker;
   2439             }
   2440             if (targetByteUnit != missingCharMarker){
   2441 
   2442                 oldIsTargetByteDBCS = isTargetByteDBCS;
   2443                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
   2444                   /* append the shift sequence */
                        /* room for this one byte was checked at the top of the loop iteration */
   2445                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
   2446 
   2447                     if (isTargetByteDBCS)
   2448                         *target++ = UCNV_SO;
   2449                     else
   2450                         *target++ = UCNV_SI;
   2451                     if(offsets)
   2452                         *(offsets++) = (int32_t)(source - args->source-1);
   2453                 }
   2454                 /* write the targetByteUnit to target */
   2455                 if(targetByteUnit <= 0x00FF){
   2456                     if( target < targetLimit){
   2457                         *(target++) = (unsigned char) targetByteUnit;
   2458                         if(offsets){
   2459                             *(offsets++) = (int32_t)(source - args->source-1);
   2460                         }
   2461 
   2462                     }else{
   2463                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
   2464                         *err = U_BUFFER_OVERFLOW_ERROR;
   2465                     }
   2466                 }else{
                            /* double-byte: each byte is written with 0x80 subtracted */
   2467                     if(target < targetLimit){
   2468                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
   2469                         if(offsets){
   2470                             *(offsets++) = (int32_t)(source - args->source-1);
   2471                         }
   2472                         if(target < targetLimit){
   2473                             *(target++) =(unsigned char) (targetByteUnit -0x80);
   2474                             if(offsets){
   2475                                 *(offsets++) = (int32_t)(source - args->source-1);
   2476                             }
   2477                         }else{
   2478                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
   2479                             *err = U_BUFFER_OVERFLOW_ERROR;
   2480                         }
   2481                     }else{
   2482                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
   2483                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
   2484                         *err = U_BUFFER_OVERFLOW_ERROR;
   2485                     }
   2486                 }
   2487 
   2488             }
   2489             else{
   2490                 /* oops.. the code point is unassigned
   2491                  * set the error and reason
   2492                  */
   2493 
   2494                 /*check if the char is a First surrogate*/
   2495                 if(U16_IS_SURROGATE(sourceChar)) {
   2496                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   2497 getTrail:
   2498                         /*look ahead to find the trail surrogate*/
   2499                         if(source <  sourceLimit) {
   2500                             /* test the following code unit */
   2501                             UChar trail=(UChar) *source;
   2502                             if(U16_IS_TRAIL(trail)) {
   2503                                 source++;
   2504                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
   2505                                 *err = U_INVALID_CHAR_FOUND;
   2506                                 /* the assembled supplementary code point is not converted: */
   2507                                 /* it is reported as unassigned through the common exit below */
   2508                             } else {
   2509                                 /* this is an unmatched lead code unit (1st surrogate) */
   2510                                 /* callback(illegal) */
   2511                                 *err=U_ILLEGAL_CHAR_FOUND;
   2512                             }
   2513                         } else {
   2514                             /* no more input */
                                    /* not an error: the lead surrogate is saved in fromUChar32 below for the next call */
   2515                             *err = U_ZERO_ERROR;
   2516                         }
   2517                     } else {
   2518                         /* this is an unmatched trail code unit (2nd surrogate) */
   2519                         /* callback(illegal) */
   2520                         *err=U_ILLEGAL_CHAR_FOUND;
   2521                     }
   2522                 } else {
   2523                     /* callback(unassigned) for a BMP code point */
   2524                     *err = U_INVALID_CHAR_FOUND;
   2525                 }
   2526 
                        /* common exit for all of the cases above: remember the code point and stop */
   2527                 args->converter->fromUChar32=sourceChar;
   2528                 break;
   2529             }
   2530         } /* end if(target < targetLimit) */
   2531         else{
   2532             *err =U_BUFFER_OVERFLOW_ERROR;
   2533             break;
   2534         }
   2535 
   2536     }/* end while(source < sourceLimit) */
   2537 
   2538     /*
   2539      * the end of the input stream and detection of truncated input
   2540      * are handled by the framework, but for ISO-2022-KR conversion
   2541      * we need to be in ASCII mode at the very end
   2542      *
   2543      * conditions:
   2544      *   successful
   2545      *   not in ASCII mode
   2546      *   end of input and no truncated input
   2547      */
   2548     if( U_SUCCESS(*err) &&
   2549         isTargetByteDBCS &&
   2550         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
   2551     ) {
   2552         int32_t sourceIndex;
   2553 
   2554         /* we are switching to ASCII */
   2555         isTargetByteDBCS=FALSE;
   2556 
   2557         /* get the source index of the last input character */
   2558         /*
   2559          * TODO this would be simpler and more reliable if we used a pair
   2560          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   2561          * so that we could simply use the prevSourceIndex here;
   2562          * this code gives an incorrect result for the rare case of an unmatched
   2563          * trail surrogate that is alone in the last buffer of the text stream
   2564          */
   2565         sourceIndex=(int32_t)(source-args->source);
   2566         if(sourceIndex>0) {
   2567             --sourceIndex;
   2568             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   2569                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   2570             ) {
   2571                 --sourceIndex;
   2572             }
   2573         } else {
   2574             sourceIndex=-1;
   2575         }
   2576 
                /* emit the closing shift byte (SHIFT_IN_STR, length 1) */
   2577         fromUWriteUInt8(
   2578             args->converter,
   2579             SHIFT_IN_STR, 1,
   2580             &target, (const char *)targetLimit,
   2581             &offsets, sourceIndex,
   2582             err);
   2583     }
   2584 
   2585     /*save the state and return */
   2586     args->source = source;
   2587     args->target = (char*)target;
   2588     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
   2589 }
   2590 
   2591 /************************ To Unicode ***************************************/
   2592 
   2593 static void
   2594 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
   2595                                                             UErrorCode* err){
   2596     char const* sourceStart;
   2597     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2598 
   2599     UConverterToUnicodeArgs subArgs;
   2600     int32_t minArgsSize;
   2601 
   2602     /* set up the subconverter arguments */
   2603     if(args->size<sizeof(UConverterToUnicodeArgs)) {
   2604         minArgsSize = args->size;
   2605     } else {
   2606         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
   2607     }
   2608 
   2609     uprv_memcpy(&subArgs, args, minArgsSize);
   2610     subArgs.size = (uint16_t)minArgsSize;
   2611     subArgs.converter = myData->currentConverter;
   2612 
   2613     /* remember the original start of the input for offsets */
   2614     sourceStart = args->source;
   2615 
   2616     if(myData->key != 0) {
   2617         /* continue with a partial escape sequence */
   2618         goto escape;
   2619     }
   2620 
   2621     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
   2622         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
   2623         subArgs.source = args->source;
   2624         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
   2625         if(subArgs.source != subArgs.sourceLimit) {
   2626             /*
   2627              * get the current partial byte sequence
   2628              *
   2629              * it needs to be moved between the public and the subconverter
   2630              * so that the conversion framework, which only sees the public
   2631              * converter, can handle truncated and illegal input etc.
   2632              */
   2633             if(args->converter->toULength > 0) {
   2634                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
   2635             }
   2636             subArgs.converter->toULength = args->converter->toULength;
   2637 
   2638             /*
   2639              * Convert up to the end of the input, or to before the next escape character.
   2640              * Does not handle conversion extensions because the preToU[] state etc.
   2641              * is not copied.
   2642              */
   2643             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
   2644 
   2645             if(args->offsets != NULL && sourceStart != args->source) {
   2646                 /* update offsets to base them on the actual start of the input */
   2647                 int32_t *offsets = args->offsets;
   2648                 UChar *target = args->target;
   2649                 int32_t delta = (int32_t)(args->source - sourceStart);
   2650                 while(target < subArgs.target) {
   2651                     if(*offsets >= 0) {
   2652                         *offsets += delta;
   2653                     }
   2654                     ++offsets;
   2655                     ++target;
   2656                 }
   2657             }
   2658             args->source = subArgs.source;
   2659             args->target = subArgs.target;
   2660             args->offsets = subArgs.offsets;
   2661 
   2662             /* copy input/error/overflow buffers */
   2663             if(subArgs.converter->toULength > 0) {
   2664                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
   2665             }
   2666             args->converter->toULength = subArgs.converter->toULength;
   2667 
   2668             if(*err == U_BUFFER_OVERFLOW_ERROR) {
   2669                 if(subArgs.converter->UCharErrorBufferLength > 0) {
   2670                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
   2671                                 subArgs.converter->UCharErrorBufferLength);
   2672                 }
   2673                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
   2674                 subArgs.converter->UCharErrorBufferLength = 0;
   2675             }
   2676         }
   2677 
   2678         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
   2679             return;
   2680         }
   2681 
   2682 escape:
   2683         changeState_2022(args->converter,
   2684                &(args->source),
   2685                args->sourceLimit,
   2686                ISO_2022_KR,
   2687                err);
   2688     }
   2689 }
   2690 
   2691 static void
   2692 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   2693                                                             UErrorCode* err){
   2694     char tempBuf[2];
   2695     const char *mySource = ( char *) args->source;
   2696     UChar *myTarget = args->target;
   2697     const char *mySourceLimit = args->sourceLimit;
   2698     UChar32 targetUniChar = 0x0000;
   2699     UChar mySourceChar = 0x0000;
   2700     UConverterDataISO2022* myData;
   2701     UConverterSharedData* sharedData ;
   2702     UBool useFallback;
   2703 
   2704     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2705     if(myData->version==1){
   2706         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
   2707         return;
   2708     }
   2709 
   2710     /* initialize state */
   2711     sharedData = myData->currentConverter->sharedData;
   2712     useFallback = args->converter->useFallback;
   2713 
   2714     if(myData->key != 0) {
   2715         /* continue with a partial escape sequence */
   2716         goto escape;
   2717     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   2718         /* continue with a partial double-byte character */
   2719         mySourceChar = args->converter->toUBytes[0];
   2720         args->converter->toULength = 0;
   2721         goto getTrailByte;
   2722     }
   2723 
   2724     while(mySource< mySourceLimit){
   2725 
   2726         if(myTarget < args->targetLimit){
   2727 
   2728             mySourceChar= (unsigned char) *mySource++;
   2729 
   2730             if(mySourceChar==UCNV_SI){
   2731                 myData->toU2022State.g = 0;
   2732                 if (myData->isEmptySegment) {
   2733                     myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
   2734                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   2735                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
   2736                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2737                     args->converter->toULength = 1;
   2738                     args->target = myTarget;
   2739                     args->source = mySource;
   2740                     return;
   2741                 }
   2742                 /*consume the source */
   2743                 continue;
   2744             }else if(mySourceChar==UCNV_SO){
   2745                 myData->toU2022State.g = 1;
   2746                 myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
   2747                 /*consume the source */
   2748                 continue;
   2749             }else if(mySourceChar==ESC_2022){
   2750                 mySource--;
   2751 escape:
   2752                 myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
   2753                 changeState_2022(args->converter,&(mySource),
   2754                                 mySourceLimit, ISO_2022_KR, err);
   2755                 if(U_FAILURE(*err)){
   2756                     args->target = myTarget;
   2757                     args->source = mySource;
   2758                     return;
   2759                 }
   2760                 continue;
   2761             }
   2762 
   2763             myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
   2764             if(myData->toU2022State.g == 1) {
   2765                 if(mySource < mySourceLimit) {
   2766                     int leadIsOk, trailIsOk;
   2767                     uint8_t trailByte;
   2768 getTrailByte:
   2769                     targetUniChar = missingCharMarker;
   2770                     trailByte = (uint8_t)*mySource;
   2771                     /*
   2772                      * Ticket 5691: consistent illegal sequences:
   2773                      * - We include at least the first byte in the illegal sequence.
   2774                      * - If any of the non-initial bytes could be the start of a character,
   2775                      *   we stop the illegal sequence before the first one of those.
   2776                      *
   2777                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   2778                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   2779                      * Otherwise we convert or report the pair of bytes.
   2780                      */
   2781                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   2782                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   2783                     if (leadIsOk && trailIsOk) {
   2784                         ++mySource;
   2785                         tempBuf[0] = (char)(mySourceChar + 0x80);
   2786                         tempBuf[1] = (char)(trailByte + 0x80);
   2787                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
   2788                         mySourceChar = (mySourceChar << 8) | trailByte;
   2789                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   2790                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   2791                         ++mySource;
   2792                         /* add another bit so that the code below writes 2 bytes in case of error */
   2793                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   2794                     }
   2795                 } else {
   2796                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2797                     args->converter->toULength = 1;
   2798                     break;
   2799                 }
   2800             }
   2801             else if(mySourceChar <= 0x7f) {
   2802                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
   2803             } else {
   2804                 targetUniChar = 0xffff;
   2805             }
   2806             if(targetUniChar < 0xfffe){
   2807                 if(args->offsets) {
   2808                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2809                 }
   2810                 *(myTarget++)=(UChar)targetUniChar;
   2811             }
   2812             else {
   2813                 /* Call the callback function*/
   2814                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   2815                 break;
   2816             }
   2817         }
   2818         else{
   2819             *err =U_BUFFER_OVERFLOW_ERROR;
   2820             break;
   2821         }
   2822     }
   2823     args->target = myTarget;
   2824     args->source = mySource;
   2825 }
   2826 
   2827 /*************************** END ISO2022-KR *********************************/
   2828 
   2829 /*************************** ISO-2022-CN *********************************
   2830 *
   2831 * Rules for ISO-2022-CN Encoding:
   2832 * i)   The designator sequence must appear once on a line before any instance
   2833 *      of character set it designates.
   2834 * ii)  If two lines contain characters from the same character set, both lines
   2835 *      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the designated character set
   2838 * iv)  All lines start in ASCII and end in ASCII.
   2839 * v)   Four shifting sequences are employed for this purpose:
   2840 *
*      Sequence    ASCII Eq    Charsets
   2842 *      ----------  -------    ---------
   2843 *      SI           <SI>        US-ASCII
   2844 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
   2845 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
   2846 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
   2847 *
   2848 * vi)
   2849 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
   2850 *      SS2designator : ESC "$" "*" finalchar_for_SS2
   2851 *      SS3designator : ESC "$" "+" finalchar_for_SS3
   2852 *
   2853 *      ESC $ ) A       Indicates the bytes following SO are Chinese
   2854 *       characters as defined in GB 2312-80, until
   2855 *       another SOdesignation appears
   2856 *
   2857 *
   2858 *      ESC $ ) E       Indicates the bytes following SO are as defined
   2859 *       in ISO-IR-165 (for details, see section 2.1),
   2860 *       until another SOdesignation appears
   2861 *
   2862 *      ESC $ ) G       Indicates the bytes following SO are as defined
   2863 *       in CNS 11643-plane-1, until another
   2864 *       SOdesignation appears
   2865 *
   2866 *      ESC $ * H       Indicates the two bytes immediately following
   2867 *       SS2 is a Chinese character as defined in CNS
   2868 *       11643-plane-2, until another SS2designation
   2869 *       appears
*       (Meaning <ESC>N must precede every 2 byte
   2871 *        sequence.)
   2872 *
   2873 *      ESC $ + I       Indicates the immediate two bytes following SS3
   2874 *       is a Chinese character as defined in CNS
   2875 *       11643-plane-3, until another SS3designation
   2876 *       appears
*       (Meaning <ESC>O must precede every 2 byte
   2878 *        sequence.)
   2879 *
   2880 *      ESC $ + J       Indicates the immediate two bytes following SS3
   2881 *       is a Chinese character as defined in CNS
   2882 *       11643-plane-4, until another SS3designation
   2883 *       appears
*       (In English: <ESC>O must precede every 2 byte
   2885 *        sequence.)
   2886 *
   2887 *      ESC $ + K       Indicates the immediate two bytes following SS3
   2888 *       is a Chinese character as defined in CNS
   2889 *       11643-plane-5, until another SS3designation
   2890 *       appears
   2891 *
   2892 *      ESC $ + L       Indicates the immediate two bytes following SS3
   2893 *       is a Chinese character as defined in CNS
   2894 *       11643-plane-6, until another SS3designation
   2895 *       appears
   2896 *
   2897 *      ESC $ + M       Indicates the immediate two bytes following SS3
   2898 *       is a Chinese character as defined in CNS
   2899 *       11643-plane-7, until another SS3designation
   2900 *       appears
   2901 *
   2902 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
   2903 *       has its own designation information before any Chinese characters
   2904 *       appear
   2905 *
   2906 */
   2907 
/*
 * Four-byte designator escape sequences of the ISO-2022-CN charsets, written
 * as hex byte values: 0x1B is ESC, 0x24 is '$', 0x29/0x2A/0x2B are ')' '*' '+'
 * (the SO, SS2 and SS3 designators respectively), followed by the final byte.
 * NOTE(review): presumably spelled as hex escapes so that the bytes do not
 * depend on the compiler's execution character set - confirm.
 */
/* The following are defined this way to make the strings truly readonly */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
   2918 
/********************** ISO2022-CN Data **************************/
/*
 * Designator strings used by the ISO-2022-CN fromUnicode code.
 * The order is significant: entries 0..2 are indexed directly by the charset
 * number, and the CNS 11643 planes are indexed as
 * CNS_11643 + (cs - CNS_11643_1), so planes 1..7 must stay contiguous and in
 * ascending order.
 * NOTE(review): this layout implies CNS_11643 == 3 - confirm against the enum.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
};
   2932 
   2933 static void
   2934 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
   2935     UConverter *cnv = args->converter;
   2936     UConverterDataISO2022 *converterData;
   2937     ISO2022State *pFromU2022State;
   2938     uint8_t *target = (uint8_t *) args->target;
   2939     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
   2940     const UChar* source = args->source;
   2941     const UChar* sourceLimit = args->sourceLimit;
   2942     int32_t* offsets = args->offsets;
   2943     UChar32 sourceChar;
   2944     char buffer[8];
   2945     int32_t len;
   2946     int8_t choices[3];
   2947     int32_t choiceCount;
   2948     uint32_t targetValue = 0;
   2949     UBool useFallback;
   2950 
   2951     /* set up the state */
   2952     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
   2953     pFromU2022State   = &converterData->fromU2022State;
   2954 
   2955     choiceCount = 0;
   2956 
   2957     /* check if the last codepoint of previous buffer was a lead surrogate*/
   2958     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
   2959         goto getTrail;
   2960     }
   2961 
   2962     while( source < sourceLimit){
   2963         if(target < targetLimit){
   2964 
   2965             sourceChar  = *(source++);
   2966             /*check if the char is a First surrogate*/
   2967              if(U16_IS_SURROGATE(sourceChar)) {
   2968                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   2969 getTrail:
   2970                     /*look ahead to find the trail surrogate*/
   2971                     if(source < sourceLimit) {
   2972                         /* test the following code unit */
   2973                         UChar trail=(UChar) *source;
   2974                         if(U16_IS_TRAIL(trail)) {
   2975                             source++;
   2976                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
   2977                             cnv->fromUChar32=0x00;
   2978                             /* convert this supplementary code point */
   2979                             /* exit this condition tree */
   2980                         } else {
   2981                             /* this is an unmatched lead code unit (1st surrogate) */
   2982                             /* callback(illegal) */
   2983                             *err=U_ILLEGAL_CHAR_FOUND;
   2984                             cnv->fromUChar32=sourceChar;
   2985                             break;
   2986                         }
   2987                     } else {
   2988                         /* no more input */
   2989                         cnv->fromUChar32=sourceChar;
   2990                         break;
   2991                     }
   2992                 } else {
   2993                     /* this is an unmatched trail code unit (2nd surrogate) */
   2994                     /* callback(illegal) */
   2995                     *err=U_ILLEGAL_CHAR_FOUND;
   2996                     cnv->fromUChar32=sourceChar;
   2997                     break;
   2998                 }
   2999             }
   3000 
   3001             /* do the conversion */
   3002             if(sourceChar <= 0x007f ){
   3003                 /* do not convert SO/SI/ESC */
   3004                 if(IS_2022_CONTROL(sourceChar)) {
   3005                     /* callback(illegal) */
   3006                     *err=U_ILLEGAL_CHAR_FOUND;
   3007                     cnv->fromUChar32=sourceChar;
   3008                     break;
   3009                 }
   3010 
   3011                 /* US-ASCII */
   3012                 if(pFromU2022State->g == 0) {
   3013                     buffer[0] = (char)sourceChar;
   3014                     len = 1;
   3015                 } else {
   3016                     buffer[0] = UCNV_SI;
   3017                     buffer[1] = (char)sourceChar;
   3018                     len = 2;
   3019                     pFromU2022State->g = 0;
   3020                     choiceCount = 0;
   3021                 }
   3022                 if(sourceChar == CR || sourceChar == LF) {
   3023                     /* reset the state at the end of a line */
   3024                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
   3025                     choiceCount = 0;
   3026                 }
   3027             }
   3028             else{
   3029                 /* convert U+0080..U+10ffff */
   3030                 int32_t i;
   3031                 int8_t cs, g;
   3032 
   3033                 if(choiceCount == 0) {
   3034                     /* try the current SO/G1 converter first */
   3035                     choices[0] = pFromU2022State->cs[1];
   3036 
   3037                     /* default to GB2312_1 if none is designated yet */
   3038                     if(choices[0] == 0) {
   3039                         choices[0] = GB2312_1;
   3040                     }
   3041 
   3042                     if(converterData->version == 0) {
   3043                         /* ISO-2022-CN */
   3044 
   3045                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
   3046                         if(choices[0] == GB2312_1) {
   3047                             choices[1] = (int8_t)CNS_11643_1;
   3048                         } else {
   3049                             choices[1] = (int8_t)GB2312_1;
   3050                         }
   3051 
   3052                         choiceCount = 2;
   3053                     } else if (converterData->version == 1) {
   3054                         /* ISO-2022-CN-EXT */
   3055 
   3056                         /* try one of the other converters */
   3057                         switch(choices[0]) {
   3058                         case GB2312_1:
   3059                             choices[1] = (int8_t)CNS_11643_1;
   3060                             choices[2] = (int8_t)ISO_IR_165;
   3061                             break;
   3062                         case ISO_IR_165:
   3063                             choices[1] = (int8_t)GB2312_1;
   3064                             choices[2] = (int8_t)CNS_11643_1;
   3065                             break;
   3066                         default: /* CNS_11643_x */
   3067                             choices[1] = (int8_t)GB2312_1;
   3068                             choices[2] = (int8_t)ISO_IR_165;
   3069                             break;
   3070                         }
   3071 
   3072                         choiceCount = 3;
   3073                     } else {
   3074                         choices[0] = (int8_t)CNS_11643_1;
   3075                         choices[1] = (int8_t)GB2312_1;
   3076                     }
   3077                 }
   3078 
   3079                 cs = g = 0;
   3080                 /*
   3081                  * len==0: no mapping found yet
   3082                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
   3083                  * len>0: found a roundtrip result, done
   3084                  */
   3085                 len = 0;
   3086                 /*
   3087                  * We will turn off useFallback after finding a fallback,
   3088                  * but we still get fallbacks from PUA code points as usual.
   3089                  * Therefore, we will also need to check that we don't overwrite
   3090                  * an early fallback with a later one.
   3091                  */
   3092                 useFallback = cnv->useFallback;
   3093 
   3094                 for(i = 0; i < choiceCount && len <= 0; ++i) {
   3095                     int8_t cs0 = choices[i];
   3096                     if(cs0 > 0) {
   3097                         uint32_t value;
   3098                         int32_t len2;
   3099                         if(cs0 >= CNS_11643_0) {
   3100                             len2 = MBCS_FROM_UCHAR32_ISO2022(
   3101                                         converterData->myConverterArray[CNS_11643],
   3102                                         sourceChar,
   3103                                         &value,
   3104                                         useFallback,
   3105                                         MBCS_OUTPUT_3);
   3106                             if(len2 == 3 || (len2 == -3 && len == 0)) {
   3107                                 targetValue = value;
   3108                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
   3109                                 if(len2 >= 0) {
   3110                                     len = 2;
   3111                                 } else {
   3112                                     len = -2;
   3113                                     useFallback = FALSE;
   3114                                 }
   3115                                 if(cs == CNS_11643_1) {
   3116                                     g = 1;
   3117                                 } else if(cs == CNS_11643_2) {
   3118                                     g = 2;
   3119                                 } else /* plane 3..7 */ if(converterData->version == 1) {
   3120                                     g = 3;
   3121                                 } else {
   3122                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
   3123                                     len = 0;
   3124                                 }
   3125                             }
   3126                         } else {
   3127                             /* GB2312_1 or ISO-IR-165 */
   3128                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
   3129                             len2 = MBCS_FROM_UCHAR32_ISO2022(
   3130                                         converterData->myConverterArray[cs0],
   3131                                         sourceChar,
   3132                                         &value,
   3133                                         useFallback,
   3134                                         MBCS_OUTPUT_2);
   3135                             if(len2 == 2 || (len2 == -2 && len == 0)) {
   3136                                 targetValue = value;
   3137                                 len = len2;
   3138                                 cs = cs0;
   3139                                 g = 1;
   3140                                 useFallback = FALSE;
   3141                             }
   3142                         }
   3143                     }
   3144                 }
   3145 
   3146                 if(len != 0) {
   3147                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
   3148 
   3149                     /* write the designation sequence if necessary */
   3150                     if(cs != pFromU2022State->cs[g]) {
   3151                         if(cs < CNS_11643) {
   3152                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
   3153                         } else {
   3154                             U_ASSERT(cs >= CNS_11643_1);
   3155                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
   3156                         }
   3157                         len = 4;
   3158                         pFromU2022State->cs[g] = cs;
   3159                         if(g == 1) {
   3160                             /* changing the SO/G1 charset invalidates the choices[] */
   3161                             choiceCount = 0;
   3162                         }
   3163                     }
   3164 
   3165                     /* write the shift sequence if necessary */
   3166                     if(g != pFromU2022State->g) {
   3167                         switch(g) {
   3168                         case 1:
   3169                             buffer[len++] = UCNV_SO;
   3170 
   3171                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
   3172                             pFromU2022State->g = 1;
   3173                             break;
   3174                         case 2:
   3175                             buffer[len++] = 0x1b;
   3176                             buffer[len++] = 0x4e;
   3177                             break;
   3178                         default: /* case 3 */
   3179                             buffer[len++] = 0x1b;
   3180                             buffer[len++] = 0x4f;
   3181                             break;
   3182                         }
   3183                     }
   3184 
   3185                     /* write the two output bytes */
   3186                     buffer[len++] = (char)(targetValue >> 8);
   3187                     buffer[len++] = (char)targetValue;
   3188                 } else {
   3189                     /* if we cannot find the character after checking all codepages
   3190                      * then this is an error
   3191                      */
   3192                     *err = U_INVALID_CHAR_FOUND;
   3193                     cnv->fromUChar32=sourceChar;
   3194                     break;
   3195                 }
   3196             }
   3197 
   3198             /* output len>0 bytes in buffer[] */
   3199             if(len == 1) {
   3200                 *target++ = buffer[0];
   3201                 if(offsets) {
   3202                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
   3203                 }
   3204             } else if(len == 2 && (target + 2) <= targetLimit) {
   3205                 *target++ = buffer[0];
   3206                 *target++ = buffer[1];
   3207                 if(offsets) {
   3208                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
   3209                     *offsets++ = sourceIndex;
   3210                     *offsets++ = sourceIndex;
   3211                 }
   3212             } else {
   3213                 fromUWriteUInt8(
   3214                     cnv,
   3215                     buffer, len,
   3216                     &target, (const char *)targetLimit,
   3217                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
   3218                     err);
   3219                 if(U_FAILURE(*err)) {
   3220                     break;
   3221                 }
   3222             }
   3223         } /* end if(myTargetIndex<myTargetLength) */
   3224         else{
   3225             *err =U_BUFFER_OVERFLOW_ERROR;
   3226             break;
   3227         }
   3228 
   3229     }/* end while(mySourceIndex<mySourceLength) */
   3230 
   3231     /*
   3232      * the end of the input stream and detection of truncated input
   3233      * are handled by the framework, but for ISO-2022-CN conversion
   3234      * we need to be in ASCII mode at the very end
   3235      *
   3236      * conditions:
   3237      *   successful
   3238      *   not in ASCII mode
   3239      *   end of input and no truncated input
   3240      */
   3241     if( U_SUCCESS(*err) &&
   3242         pFromU2022State->g!=0 &&
   3243         args->flush && source>=sourceLimit && cnv->fromUChar32==0
   3244     ) {
   3245         int32_t sourceIndex;
   3246 
   3247         /* we are switching to ASCII */
   3248         pFromU2022State->g=0;
   3249 
   3250         /* get the source index of the last input character */
   3251         /*
   3252          * TODO this would be simpler and more reliable if we used a pair
   3253          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   3254          * so that we could simply use the prevSourceIndex here;
   3255          * this code gives an incorrect result for the rare case of an unmatched
   3256          * trail surrogate that is alone in the last buffer of the text stream
   3257          */
   3258         sourceIndex=(int32_t)(source-args->source);
   3259         if(sourceIndex>0) {
   3260             --sourceIndex;
   3261             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   3262                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   3263             ) {
   3264                 --sourceIndex;
   3265             }
   3266         } else {
   3267             sourceIndex=-1;
   3268         }
   3269 
   3270         fromUWriteUInt8(
   3271             cnv,
   3272             SHIFT_IN_STR, 1,
   3273             &target, (const char *)targetLimit,
   3274             &offsets, sourceIndex,
   3275             err);
   3276     }
   3277 
   3278     /*save the state and return */
   3279     args->source = source;
   3280     args->target = (char*)target;
   3281 }
   3282 
   3283 
        /*
         * toUnicode implementation for ISO-2022-CN (installed in _ISO2022CNImpl).
         *
         * Reads bytes from args->source..args->sourceLimit and writes UTF-16 code
         * units to args->target..args->targetLimit. When args->offsets is not NULL,
         * a source offset is recorded for every code unit stored in the target.
         *
         * Byte handling, as implemented below:
         *   SI        selects G0 (g=0); reported as an illegal escape sequence
         *             when no character was converted since the preceding SO
         *             (isEmptySegment is still set)
         *   SO        selects G1 if a G1 charset has been designated (cs[1]!=0),
         *             otherwise the byte ends up in the error callback
         *   ESC       is handed to changeState_2022()
         *   CR, LF    reset the whole toUnicode ISO2022State, then convert normally
         *   other     one byte in G0 (values <=0x7f map to themselves),
         *             two bytes in any other G set
         *
         * State that survives between calls: a partial escape sequence
         * (myData->key!=0) and the lead byte of a split double-byte character
         * (converter->toULength==1 with the byte in toUBytes[0]).
         *
         * On every exit path args->source and args->target are updated; failures
         * are reported through *err.
         */
   3284 static void
   3285 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   3286                                                UErrorCode* err){
   3287     char tempBuf[3];
   3288     const char *mySource = (char *) args->source;
   3289     UChar *myTarget = args->target;
   3290     const char *mySourceLimit = args->sourceLimit;
   3291     uint32_t targetUniChar = 0x0000;
   3292     uint32_t mySourceChar = 0x0000;
   3293     UConverterDataISO2022* myData;
   3294     ISO2022State *pToU2022State;
   3295 
   3296     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   3297     pToU2022State = &myData->toU2022State;
   3298 
   3299     if(myData->key != 0) {
   3300         /* continue with a partial escape sequence */
   3301         goto escape;
   3302     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   3303         /* continue with a partial double-byte character */
   3304         mySourceChar = args->converter->toUBytes[0];
   3305         args->converter->toULength = 0;
   3306         targetUniChar = missingCharMarker;
   3307         goto getTrailByte;
   3308     }
   3309 
   3310     while(mySource < mySourceLimit){
   3311 
   3312         targetUniChar =missingCharMarker;
   3313 
   3314         if(myTarget < args->targetLimit){
   3315 
   3316             mySourceChar= (unsigned char) *mySource++;
   3317 
   3318             switch(mySourceChar){
   3319             case UCNV_SI:
                        /* SI: back to G0; g==0 selects the single-byte branch of the default case */
   3320                 pToU2022State->g=0;
   3321                 if (myData->isEmptySegment) {
   3322                     myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
   3323                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   3324                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
   3325                     args->converter->toUBytes[0] = mySourceChar;
   3326                     args->converter->toULength = 1;
   3327                     args->target = myTarget;
   3328                     args->source = mySource;
   3329                     return;
   3330                 }
   3331                 continue;
   3332 
   3333             case UCNV_SO:
   3334                 if(pToU2022State->cs[1] != 0) {
   3335                     pToU2022State->g=1;
   3336                     myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
   3337                     continue;
   3338                 } else {
   3339                     /* illegal to have SO before a matching designator */
   3340                     myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
   3341                     break;
   3342                 }
   3343 
   3344             case ESC_2022:
                        /* un-consume the ESC so that changeState_2022() starts at the ESC byte */
   3345                 mySource--;
   3346 escape:
   3347                 {
                            /* remember where this call started so that the bytes consumed can be added to toULength below */
   3348                     const char * mySourceBefore = mySource;
   3349                     int8_t toULengthBefore = args->converter->toULength;
   3350 
   3351                     changeState_2022(args->converter,&(mySource),
   3352                         mySourceLimit, ISO_2022_CN,err);
   3353 
   3354                     /* After SO there must be at least one character before a designator (designator error handled separately) */
   3355                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
   3356                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   3357                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
   3358                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
   3359                     }
   3360                 }
   3361 
   3362                 /* invalid or illegal escape sequence */
   3363                 if(U_FAILURE(*err)){
   3364                     args->target = myTarget;
   3365                     args->source = mySource;
   3366                     myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
   3367                     return;
   3368                 }
   3369                 continue;
   3370 
   3371             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
   3372 
   3373             case CR:
   3374                 /*falls through*/
   3375             case LF:
   3376                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
   3377                 /* falls through */
   3378             default:
   3379                 /* convert one or two bytes */
   3380                 myData->isEmptySegment = FALSE;
   3381                 if(pToU2022State->g != 0) {
   3382                     if(mySource < mySourceLimit) {
   3383                         UConverterSharedData *cnv;
   3384                         StateEnum tempState;
   3385                         int32_t tempBufLen;
   3386                         int leadIsOk, trailIsOk;
   3387                         uint8_t trailByte;
   3388 getTrailByte:
   3389                         trailByte = (uint8_t)*mySource;
   3390                         /*
   3391                          * Ticket 5691: consistent illegal sequences:
   3392                          * - We include at least the first byte in the illegal sequence.
   3393                          * - If any of the non-initial bytes could be the start of a character,
   3394                          *   we stop the illegal sequence before the first one of those.
   3395                          *
   3396                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   3397                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   3398                          * Otherwise we convert or report the pair of bytes.
   3399                          */
                                /*
                                 * Single-comparison range checks: the uint8_t cast makes
                                 * values below 0x21 wrap around to large numbers, so each
                                 * test is true exactly for bytes 0x21..0x7e.
                                 */
   3400                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   3401                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   3402                         if (leadIsOk && trailIsOk) {
   3403                             ++mySource;
   3404                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
   3405                             if(tempState >= CNS_11643_0) {
                                        /*
                                         * All CNS 11643 states use the one CNS_11643 table; it is
                                         * addressed with 3 bytes, the first being 0x80 plus the
                                         * state's distance from CNS_11643_0.
                                         */
   3406                                 cnv = myData->myConverterArray[CNS_11643];
   3407                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
   3408                                 tempBuf[1] = (char) (mySourceChar);
   3409                                 tempBuf[2] = (char) trailByte;
   3410                                 tempBufLen = 3;
   3411 
   3412                             }else{
   3413                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
   3414                                 cnv = myData->myConverterArray[tempState];
   3415                                 tempBuf[0] = (char) (mySourceChar);
   3416                                 tempBuf[1] = (char) trailByte;
   3417                                 tempBufLen = 2;
   3418                             }
   3419                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                    /* fold both bytes into mySourceChar; a value >0xff marks "two bytes consumed" for the offset code below */
   3420                             mySourceChar = (mySourceChar << 8) | trailByte;
   3421                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   3422                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   3423                             ++mySource;
   3424                             /* add another bit so that the code below writes 2 bytes in case of error */
   3425                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   3426                         }
   3427                         if(pToU2022State->g>=2) {
   3428                             /* return from a single-shift state to the previous one */
   3429                             pToU2022State->g=pToU2022State->prevG;
   3430                         }
   3431                     } else {
                                /* input ended after the lead byte: stash it so that the next call resumes at getTrailByte */
   3432                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   3433                         args->converter->toULength = 1;
   3434                         goto endloop;
   3435                     }
   3436                 }
   3437                 else{
   3438                     if(mySourceChar <= 0x7f) {
   3439                         targetUniChar = (UChar) mySourceChar;
   3440                     }
   3441                 }
   3442                 break;
   3443             }
                    /* result below 0xfffe: a single UTF-16 code unit that is stored directly */
   3444             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
   3445                 if(args->offsets){
   3446                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3447                 }
   3448                 *(myTarget++)=(UChar)targetUniChar;
   3449             }
   3450             else if(targetUniChar > missingCharMarker){
   3451                 /* disassemble the surrogate pair and write to output*/
   3452                 targetUniChar-=0x0010000;
   3453                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
   3454                 if(args->offsets){
   3455                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3456                 }
   3457                 ++myTarget;
   3458                 if(myTarget< args->targetLimit){
   3459                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   3460                     if(args->offsets){
   3461                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3462                     }
   3463                     ++myTarget;
   3464                 }else{
                            /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer (no offset is recorded for it) */
   3465                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
   3466                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   3467                 }
   3468 
   3469             }
   3470             else{
   3471                 /* Call the callback function*/
                        /* reached for results 0xfffe/0xffff, including bytes for which no branch above produced a character; conversion stops here */
   3472                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   3473                 break;
   3474             }
   3475         }
   3476         else{
   3477             *err =U_BUFFER_OVERFLOW_ERROR;
   3478             break;
   3479         }
   3480     }
   3481 endloop:
   3482     args->target = myTarget;
   3483     args->source = mySource;
   3484 }
   3485 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
   3486 
        /*
         * Writes the converter's substitution character for all ISO 2022 variants
         * (installed in every UConverterImpl table of this file).
         *
         * Dispatches on myConverterData->locale[0]:
         *   'j'  emit SI if currently in G1, emit ESC ( B unless G0 is already
         *        ASCII or JIS X 0201, then the first subchar byte
         *   'c'  emit SI if not in G0, then the first subchar byte
         *   'k'  version 0: emit SI or SO as needed for a 1- or 2-byte subchar,
         *        then the subchar; other versions: delegate to the current
         *        sub-converter via ucnv_cbFromUWriteSub() and return
         *   other: nothing is put into the buffer, so zero bytes are written
         *
         * The fromUnicode state (g, cs[0], fromUnicodeStatus) is updated to match
         * the shift/escape bytes that are emitted.
         *
         * args         fromUnicode arguments; args->converter is temporarily
         *              swapped in the KR delegation path and restored afterwards
         * offsetIndex  passed on to ucnv_cbFromUWriteBytes()
         * err          in/out error code
         */
   3487 static void
   3488 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
   3489     UConverter *cnv = args->converter;
   3490     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
   3491     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
   3492     char *p, *subchar;
   3493     char buffer[8];
   3494     int32_t length;
   3495 
   3496     subchar=(char *)cnv->subChars;
   3497     length=cnv->subCharLen; /* assume length==1 for most variants */
   3498 
            /* p is the write cursor into buffer[]; (p - buffer) bytes are written at the end */
   3499     p = buffer;
   3500     switch(myConverterData->locale[0]){
   3501     case 'j':
   3502         {
   3503             int8_t cs;
   3504 
   3505             if(pFromU2022State->g == 1) {
   3506                 /* JIS7: switch from G1 to G0 */
   3507                 pFromU2022State->g = 0;
   3508                 *p++ = UCNV_SI;
   3509             }
   3510 
   3511             cs = pFromU2022State->cs[0];
   3512             if(cs != ASCII && cs != JISX201) {
   3513                 /* not in ASCII or JIS X 0201: switch to ASCII */
   3514                 pFromU2022State->cs[0] = (int8_t)ASCII;
                        /* the three bytes are ESC ( B */
   3515                 *p++ = '\x1b';
   3516                 *p++ = '\x28';
   3517                 *p++ = '\x42';
   3518             }
   3519 
   3520             *p++ = subchar[0];
   3521             break;
   3522         }
   3523     case 'c':
   3524         if(pFromU2022State->g != 0) {
   3525             /* not in ASCII mode: switch to ASCII */
   3526             pFromU2022State->g = 0;
   3527             *p++ = UCNV_SI;
   3528         }
   3529         *p++ = subchar[0];
   3530         break;
   3531     case 'k':
   3532         if(myConverterData->version == 0) {
   3533             if(length == 1) {
   3534                 if((UBool)args->converter->fromUnicodeStatus) {
   3535                     /* in DBCS mode: switch to SBCS */
   3536                     args->converter->fromUnicodeStatus = 0;
   3537                     *p++ = UCNV_SI;
   3538                 }
   3539                 *p++ = subchar[0];
   3540             } else /* length == 2*/ {
   3541                 if(!(UBool)args->converter->fromUnicodeStatus) {
   3542                     /* in SBCS mode: switch to DBCS */
   3543                     args->converter->fromUnicodeStatus = 1;
   3544                     *p++ = UCNV_SO;
   3545                 }
   3546                 *p++ = subchar[0];
   3547                 *p++ = subchar[1];
   3548             }
   3549             break;
   3550         } else {
   3551             /* save the subconverter's substitution string */
   3552             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
   3553             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
   3554 
   3555             /* set our substitution string into the subconverter */
   3556             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
   3557             myConverterData->currentConverter->subCharLen = (int8_t)length;
   3558 
   3559             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
   3560             args->converter = myConverterData->currentConverter;
   3561             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
                    /* NOTE(review): this path passes 0, not offsetIndex, as the offset index -- confirm that this is intended */
   3562             ucnv_cbFromUWriteSub(args, 0, err);
   3563             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
   3564             args->converter = cnv;
   3565 
   3566             /* restore the subconverter's substitution string */
   3567             myConverterData->currentConverter->subChars = currentSubChars;
   3568             myConverterData->currentConverter->subCharLen = currentSubCharLen;
   3569 
                    /* on overflow, move the bytes buffered in the sub-converter's charErrorBuffer over to this converter and empty the sub-converter's buffer */
   3570             if(*err == U_BUFFER_OVERFLOW_ERROR) {
   3571                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
   3572                     uprv_memcpy(
   3573                         cnv->charErrorBuffer,
   3574                         myConverterData->currentConverter->charErrorBuffer,
   3575                         myConverterData->currentConverter->charErrorBufferLength);
   3576                 }
   3577                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
   3578                 myConverterData->currentConverter->charErrorBufferLength = 0;
   3579             }
                    /* everything was written by the sub-converter; skip the common write below */
   3580             return;
   3581         }
   3582     default:
   3583         /* not expected */
   3584         break;
   3585     }
            /* common exit for all paths that assembled their output in buffer[] */
   3586     ucnv_cbFromUWriteBytes(args,
   3587                            buffer, (int32_t)(p - buffer),
   3588                            offsetIndex, err);
   3589 }
   3590 
   3591 /*
   3592  * Structure for cloning an ISO 2022 converter into a single memory block.
   3593  * ucnv_safeClone() of the converter will align the entire cloneStruct,
   3594  * and then ucnv_safeClone() of the sub-converter may additionally align
   3595  * currentConverter inside the cloneStruct, for which we need the deadSpace
   3596  * after currentConverter.
   3597  * This is because UAlignedMemory may be larger than the actually
   3598  * necessary alignment size for the platform.
   3599  * The other cloneStruct fields will not be moved around,
   3600  * and are aligned properly with cloneStruct's alignment.
   3601  */
   3602 struct cloneStruct
   3603 {
   3604     UConverter cnv;                 /* the cloned ISO 2022 converter; _ISO_2022_SafeClone() returns its address */
   3605     UConverter currentConverter;    /* storage into which the current sub-converter is cloned */
   3606     UAlignedMemory deadSpace;       /* padding that the sub-converter clone may consume for alignment */
   3607     UConverterDataISO2022 mydata;   /* by-value copy of the original's extraInfo; cnv.extraInfo points here */
   3608 };
   3609 
   3610 
        /*
         * Clones an ISO 2022 converter into caller-provided memory that is used
         * as a struct cloneStruct.
         *
         * If *pBufferSize is 0 this is a preflight request: the required size is
         * stored in *pBufferSize and NULL is returned without touching
         * stackBuffer. Otherwise stackBuffer is used directly; no size check is
         * done here (NOTE(review): presumably ucnv_safeClone() has verified the
         * buffer size before calling -- confirm).
         *
         * The converter's private data is copied by value, the current
         * sub-converter (if any) is cloned into the embedded currentConverter
         * storage, and the entries of myConverterArray[] are shared with the
         * original, each getting its reference count incremented.
         *
         * Returns the address of the clone's UConverter, or NULL for a preflight
         * request or when cloning the sub-converter failed (*status is then set).
         */
   3611 static UConverter *
   3612 _ISO_2022_SafeClone(
   3613             const UConverter *cnv,
   3614             void *stackBuffer,
   3615             int32_t *pBufferSize,
   3616             UErrorCode *status)
   3617 {
   3618     struct cloneStruct * localClone;
   3619     UConverterDataISO2022 *cnvData;
   3620     int32_t i, size;
   3621 
   3622     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
   3623         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
   3624         return NULL;
   3625     }
   3626 
   3627     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
   3628     localClone = (struct cloneStruct *)stackBuffer;
   3629 
   3630     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
   3631 
            /* the private data lives inside the clone block, hence isExtraLocal */
   3632     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
   3633     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
   3634     localClone->cnv.isExtraLocal = TRUE;
   3635 
   3636     /* share the subconverters */
   3637 
   3638     if(cnvData->currentConverter != NULL) {
   3639         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
   3640         localClone->mydata.currentConverter =
   3641             ucnv_safeClone(cnvData->currentConverter,
   3642                             &localClone->currentConverter,
   3643                             &size, status);
   3644         if(U_FAILURE(*status)) {
   3645             return NULL;
   3646         }
   3647     }
   3648 
            /* the clone's mydata holds the same myConverterArray[] pointers, so each one gains a reference */
   3649     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
   3650         if(cnvData->myConverterArray[i] != NULL) {
   3651             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
   3652         }
   3653     }
   3654 
   3655     return &localClone->cnv;
   3656 }
   3657 
        /*
         * Adds the Unicode code points handled by this ISO 2022 converter to the
         * set behind sa.
         *
         * Steps, as implemented below:
         *  1. add the ranges that are hardcoded per cnvData->locale[0]
         *     ('j'; 'c'/'z' and 'k' only without UCONFIG_ONLY_HTML_CONVERSION);
         *     for 'k' the current sub-converter's getUnicodeSet is called instead
         *  2. for every non-NULL entry of myConverterArray[], add that table's
         *     set, filtered where a variant uses only part of the table
         *  3. remove SO, SI, ESC and the C1 range 0x80..0x9f again
         *
         * With U_ENABLE_GENERIC_ISO_2022, the generic converter (_ISO2022Data)
         * simply gets all code points except surrogates.
         *
         * which is passed through to the sub-converters; the value
         * UCNV_ROUNDTRIP_AND_FALLBACK_SET additionally forces half-width Katakana
         * into the set for every JP version.
         * Returns immediately if *pErrorCode already indicates a failure.
         */
   3658 static void
   3659 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
   3660                     const USetAdder *sa,
   3661                     UConverterUnicodeSet which,
   3662                     UErrorCode *pErrorCode)
   3663 {
   3664     int32_t i;
   3665     UConverterDataISO2022* cnvData;
   3666 
   3667     if (U_FAILURE(*pErrorCode)) {
   3668         return;
   3669     }
   3670 #ifdef U_ENABLE_GENERIC_ISO_2022
   3671     if (cnv->sharedData == &_ISO2022Data) {
   3672         /* We use UTF-8 in this case */
   3673         sa->addRange(sa->set, 0, 0xd7FF);
   3674         sa->addRange(sa->set, 0xE000, 0x10FFFF);
   3675         return;
   3676     }
   3677 #endif
   3678 
   3679     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
   3680 
   3681     /* start with the code points that are algorithmically round-tripped (added directly to the caller's set) */
   3682     switch(cnvData->locale[0]){
   3683     case 'j':
   3684         /* include JIS X 0201 which is hardcoded */
   3685         sa->add(sa->set, 0xa5);
   3686         sa->add(sa->set, 0x203e);
   3687         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
   3688             /* include Latin-1 for some variants of JP */
   3689             sa->addRange(sa->set, 0, 0xff);
   3690         } else {
   3691             /* include ASCII for JP */
   3692             sa->addRange(sa->set, 0, 0x7f);
   3693         }
   3694         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
   3695             /*
   3696              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
   3697              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
   3698              * use half-width Katakana.
   3699              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
   3700              * half-width Katakana via the ESC ( I sequence.
   3701              * However, we only emit (fromUnicode) half-width Katakana according to the
   3702              * definition of each variant.
   3703              *
   3704              * When including fallbacks,
   3705              * we need to include half-width Katakana Unicode code points for all JP variants because
   3706              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
   3707              */
   3708             /* include half-width Katakana for JP */
   3709             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
   3710         }
   3711         break;
   3712 #if !UCONFIG_ONLY_HTML_CONVERSION
   3713     case 'c':
   3714     case 'z':
   3715         /* include ASCII for CN */
   3716         sa->addRange(sa->set, 0, 0x7f);
   3717         break;
   3718     case 'k':
   3719         /* there is only one converter for KR, and it is not in the myConverterArray[] */
   3720         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
   3721                 cnvData->currentConverter, sa, which, pErrorCode);
   3722         /* the loop over myConverterArray[] will simply not find another converter */
   3723         break;
   3724 #endif
   3725     default:
   3726         break;
   3727     }
   3728 
   3729 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
   3730             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
   3731                 cnvData->version==0 && i==CNS_11643
   3732             ) {
   3733                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
   3734                 ucnv_MBCSGetUnicodeSetForBytes(
   3735                         cnvData->myConverterArray[i],
   3736                         sa, UCNV_ROUNDTRIP_SET,
   3737                         0, 0x81, 0x82,
   3738                         pErrorCode);
   3739             }
   3740 #endif
   3741 
            /* add the set of each loaded sub-converter table, choosing a filter per table and variant */
   3742     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
   3743         UConverterSetFilter filter;
   3744         if(cnvData->myConverterArray[i]!=NULL) {
   3745             if(cnvData->locale[0]=='j' && i==JISX208) {
   3746                 /*
   3747                  * Only add code points that map to Shift-JIS codes
   3748                  * corresponding to JIS X 0208.
   3749                  */
   3750                 filter=UCNV_SET_FILTER_SJIS;
   3751 #if !UCONFIG_ONLY_HTML_CONVERSION
   3752             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
   3753                        cnvData->version==0 && i==CNS_11643) {
   3754                 /*
   3755                  * Version-specific for CN:
   3756                  * CN version 0 does not map CNS planes 3..7 although
   3757                  * they are all available in the CNS conversion table;
   3758                  * CN version 1 (-EXT) does map them all.
   3759                  * The two versions create different Unicode sets.
   3760                  */
   3761                 filter=UCNV_SET_FILTER_2022_CN;
   3762             } else if(i==KSC5601) {
   3763                 /*
   3764                  * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
   3765                  * are broader than GR94.
   3766                  */
   3767                 filter=UCNV_SET_FILTER_GR94DBCS;
   3768 #endif
   3769             } else {
   3770                 filter=UCNV_SET_FILTER_NONE;
   3771             }
   3772             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
   3773         }
   3774     }
   3775 
   3776     /*
   3777      * ISO 2022 converters must not convert SO/SI/ESC despite what
   3778      * sub-converters do by themselves.
   3779      * Remove these characters from the set.
   3780      */
   3781     sa->remove(sa->set, 0x0e);
   3782     sa->remove(sa->set, 0x0f);
   3783     sa->remove(sa->set, 0x1b);
   3784 
   3785     /* ISO 2022 converters do not convert C1 controls either */
   3786     sa->removeRange(sa->set, 0x80, 0x9f);
   3787 }
   3788 
        /*
         * Function table for the generic "ISO_2022" converter.
         * Entries are positional; the slot order is given by the UConverterImpl
         * declaration, which is outside this file section.
         * The four conversion function slots are populated only when
         * U_ENABLE_GENERIC_ISO_2022 is defined and are NULL otherwise.
         * _ISO2022Open/_ISO2022Close/_ISO2022Reset, _ISO2022getName,
         * _ISO_2022_WriteSub, _ISO_2022_SafeClone and _ISO_2022_GetUnicodeSet are
         * the same functions that the JP, KR and CN tables use.
         */
   3789 static const UConverterImpl _ISO2022Impl={
   3790     UCNV_ISO_2022,
   3791 
   3792     NULL,
   3793     NULL,
   3794 
   3795     _ISO2022Open,
   3796     _ISO2022Close,
   3797     _ISO2022Reset,
   3798 
   3799 #ifdef U_ENABLE_GENERIC_ISO_2022
   3800     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
   3801     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
   3802     ucnv_fromUnicode_UTF8,
   3803     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   3804 #else
   3805     NULL,
   3806     NULL,
   3807     NULL,
   3808     NULL,
   3809 #endif
   3810     NULL,
   3811 
   3812     NULL,
   3813     _ISO2022getName,
   3814     _ISO_2022_WriteSub,
   3815     _ISO_2022_SafeClone,
   3816     _ISO_2022_GetUnicodeSet,
   3817 
   3818     NULL,
   3819     NULL
   3820 };
   3821 static const UConverterStaticData _ISO2022StaticData={
   3822     sizeof(UConverterStaticData), /* size of this structure */
   3823     "ISO_2022", /* name of this converter variant */
   3824     2022, /* NOTE(review): presumably the codepage number -- confirm against UConverterStaticData */
   3825     UCNV_IBM,
   3826     UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022Impl */
   3827     1, /* NOTE(review): presumably the minimum bytes per character, given the maximum on the next line -- confirm */
   3828     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
   3829     { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */
   3830     1, /* NOTE(review): presumably the number of substitution bytes in use -- confirm */
   3831     FALSE,
   3832     FALSE,
   3833     0,
   3834     0,
   3835     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3836 };
   3837 const UConverterSharedData _ISO2022Data={ /* shared data of the generic converter; _ISO_2022_GetUnicodeSet() compares sharedData against its address */
   3838     sizeof(UConverterSharedData), /* size of this structure */
   3839     ~((uint32_t) 0), /* all bits set; NOTE(review): presumably marks statically allocated data that is never released -- confirm */
   3840     NULL,
   3841     NULL,
   3842     &_ISO2022StaticData, /* static description of the generic converter */
   3843     FALSE,
   3844     &_ISO2022Impl, /* function table of the generic converter */
   3845     0, UCNV_MBCS_TABLE_INITIALIZER
   3846 };
   3847 
   3848 /*************JP****************/
        /*
         * Function table for ISO-2022-JP.
         * Entries are positional; the slot order is given by the UConverterImpl
         * declaration, which is outside this file section.
         * The JP toUnicode and fromUnicode functions each appear twice in a row:
         * the same offsets-aware implementation fills both adjacent slots
         * (NOTE(review): presumably the plain and the with-offsets slot -- confirm).
         */
   3849 static const UConverterImpl _ISO2022JPImpl={
   3850     UCNV_ISO_2022,
   3851 
   3852     NULL,
   3853     NULL,
   3854 
   3855     _ISO2022Open,
   3856     _ISO2022Close,
   3857     _ISO2022Reset,
   3858 
   3859     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3860     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3861     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3862     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3863     NULL,
   3864 
   3865     NULL,
   3866     _ISO2022getName,
   3867     _ISO_2022_WriteSub,
   3868     _ISO_2022_SafeClone,
   3869     _ISO_2022_GetUnicodeSet,
   3870 
   3871     NULL,
   3872     NULL
   3873 };
   3874 static const UConverterStaticData _ISO2022JPStaticData={
   3875     sizeof(UConverterStaticData), /* size of this structure */
   3876     "ISO_2022_JP", /* name of this converter variant */
   3877     0, /* NOTE(review): presumably the codepage number (none assigned) -- confirm against UConverterStaticData */
   3878     UCNV_IBM,
   3879     UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022JPImpl */
   3880     1, /* NOTE(review): presumably the minimum bytes per character, given the maximum on the next line -- confirm */
   3881     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
   3882     { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */
   3883     1, /* NOTE(review): presumably the number of substitution bytes in use -- confirm */
   3884     FALSE,
   3885     FALSE,
   3886     0,
   3887     0,
   3888     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3889 };
   3890 
   3891 namespace {
   3892 
        /*
         * Shared data object for ISO-2022-JP: ties _ISO2022JPStaticData to the
         * _ISO2022JPImpl function table. Defined inside an anonymous namespace.
         * NOTE(review): the all-ones second entry presumably marks statically
         * allocated data that is never released -- confirm against
         * UConverterSharedData.
         */
   3893 const UConverterSharedData _ISO2022JPData={
   3894     sizeof(UConverterSharedData),
   3895     ~((uint32_t) 0),
   3896     NULL,
   3897     NULL,
   3898     &_ISO2022JPStaticData,
   3899     FALSE,
   3900     &_ISO2022JPImpl,
   3901     0, UCNV_MBCS_TABLE_INITIALIZER
   3902 };
   3903 
   3904 }  // namespace
   3905 
   3906 #if !UCONFIG_ONLY_HTML_CONVERSION
   3907 /************* KR ***************/
        /*
         * Function table for ISO-2022-KR (compiled only without
         * UCONFIG_ONLY_HTML_CONVERSION).
         * Entries are positional; the slot order is given by the UConverterImpl
         * declaration, which is outside this file section.
         * The KR toUnicode and fromUnicode functions each appear twice in a row:
         * the same offsets-aware implementation fills both adjacent slots
         * (NOTE(review): presumably the plain and the with-offsets slot -- confirm).
         */
   3908 static const UConverterImpl _ISO2022KRImpl={
   3909     UCNV_ISO_2022,
   3910 
   3911     NULL,
   3912     NULL,
   3913 
   3914     _ISO2022Open,
   3915     _ISO2022Close,
   3916     _ISO2022Reset,
   3917 
   3918     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3919     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3920     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3921     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3922     NULL,
   3923 
   3924     NULL,
   3925     _ISO2022getName,
   3926     _ISO_2022_WriteSub,
   3927     _ISO_2022_SafeClone,
   3928     _ISO_2022_GetUnicodeSet,
   3929 
   3930     NULL,
   3931     NULL
   3932 };
   3933 static const UConverterStaticData _ISO2022KRStaticData={
   3934     sizeof(UConverterStaticData), /* size of this structure */
   3935     "ISO_2022_KR", /* name of this converter variant */
   3936     0, /* NOTE(review): presumably the codepage number (none assigned) -- confirm against UConverterStaticData */
   3937     UCNV_IBM,
   3938     UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022KRImpl */
   3939     1, /* NOTE(review): presumably the minimum bytes per character, given the maximum on the next line -- confirm */
   3940     3, /* max 3 bytes per UChar: SO+DBCS */
   3941     { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */
   3942     1, /* NOTE(review): presumably the number of substitution bytes in use -- confirm */
   3943     FALSE,
   3944     FALSE,
   3945     0,
   3946     0,
   3947     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3948 };
   3949 
   3950 namespace {
   3951 
        /*
         * Shared data object for ISO-2022-KR: ties _ISO2022KRStaticData to the
         * _ISO2022KRImpl function table. Defined inside an anonymous namespace.
         * NOTE(review): the all-ones second entry presumably marks statically
         * allocated data that is never released -- confirm against
         * UConverterSharedData.
         */
   3952 const UConverterSharedData _ISO2022KRData={
   3953     sizeof(UConverterSharedData),
   3954     ~((uint32_t) 0),
   3955     NULL,
   3956     NULL,
   3957     &_ISO2022KRStaticData,
   3958     FALSE,
   3959     &_ISO2022KRImpl,
   3960     0, UCNV_MBCS_TABLE_INITIALIZER
   3961 };
   3962 
   3963 }  // namespace
   3964 
   3965 /*************** CN ***************/
        /*
         * Function table for ISO-2022-CN (compiled only without
         * UCONFIG_ONLY_HTML_CONVERSION).
         * Entries are positional; the slot order is given by the UConverterImpl
         * declaration, which is outside this file section.
         * The CN toUnicode and fromUnicode functions each appear twice in a row:
         * the same offsets-aware implementation fills both adjacent slots
         * (NOTE(review): presumably the plain and the with-offsets slot -- confirm).
         */
   3966 static const UConverterImpl _ISO2022CNImpl={
   3967 
   3968     UCNV_ISO_2022,
   3969 
   3970     NULL,
   3971     NULL,
   3972 
   3973     _ISO2022Open,
   3974     _ISO2022Close,
   3975     _ISO2022Reset,
   3976 
   3977     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3978     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3979     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3980     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3981     NULL,
   3982 
   3983     NULL,
   3984     _ISO2022getName,
   3985     _ISO_2022_WriteSub,
   3986     _ISO_2022_SafeClone,
   3987     _ISO_2022_GetUnicodeSet,
   3988 
   3989     NULL,
   3990     NULL
   3991 };
   3992 static const UConverterStaticData _ISO2022CNStaticData={
   3993     sizeof(UConverterStaticData), /* size of this structure */
   3994     "ISO_2022_CN", /* name of this converter variant */
   3995     0, /* NOTE(review): presumably the codepage number (none assigned) -- confirm against UConverterStaticData */
   3996     UCNV_IBM,
   3997     UCNV_ISO_2022, /* same type constant as the first entry of _ISO2022CNImpl */
   3998     1, /* NOTE(review): presumably the minimum bytes per character, given the maximum on the next line -- confirm */
   3999     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
   4000     { 0x1a, 0, 0, 0 }, /* NOTE(review): presumably the default substitution byte(s) -- confirm */
   4001     1, /* NOTE(review): presumably the number of substitution bytes in use -- confirm */
   4002     FALSE,
   4003     FALSE,
   4004     0,
   4005     0,
   4006     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   4007 };
   4008 
   4009 namespace {
   4010 
        /*
         * Shared data object for ISO-2022-CN: ties _ISO2022CNStaticData to the
         * _ISO2022CNImpl function table. Defined inside an anonymous namespace.
         * NOTE(review): the all-ones second entry presumably marks statically
         * allocated data that is never released -- confirm against
         * UConverterSharedData.
         */
   4011 const UConverterSharedData _ISO2022CNData={
   4012     sizeof(UConverterSharedData),
   4013     ~((uint32_t) 0),
   4014     NULL,
   4015     NULL,
   4016     &_ISO2022CNStaticData,
   4017     FALSE,
   4018     &_ISO2022CNImpl,
   4019     0, UCNV_MBCS_TABLE_INITIALIZER
   4020 };
   4021 
   4022 }  // namespace
   4023 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
   4024 
   4025 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   4026