Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2000-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv2022.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000feb03
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Change history:
     17 *
     18 *   06/29/2000  helena  Major rewrite of the callback APIs.
     19 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
     20 *                       Changed implementation of toUnicode
     21 *                       function
     22 *   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
     24 *                       ucnvebdc.c
     25 *   09/20/2000  Ram     Added support for ISO-2022-CN
     26 *                       Added implementations for getNextUChar()
     27 *                       for specific 2022 country variants.
     28 *   10/31/2000  Ram     Implemented offsets logic functions
     29 */
     30 
     31 #include "unicode/utypes.h"
     32 
     33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     34 
     35 #include "unicode/ucnv.h"
     36 #include "unicode/uset.h"
     37 #include "unicode/ucnv_err.h"
     38 #include "unicode/ucnv_cb.h"
     39 #include "unicode/utf16.h"
     40 #include "ucnv_imp.h"
     41 #include "ucnv_bld.h"
     42 #include "ucnv_cnv.h"
     43 #include "ucnvmbcs.h"
     44 #include "cstring.h"
     45 #include "cmemory.h"
     46 #include "uassert.h"
     47 
     48 #ifdef U_ENABLE_GENERIC_ISO_2022
     49 /*
     50  * I am disabling the generic ISO-2022 converter after proposing to do so on
     51  * the icu mailing list two days ago.
     52  *
     53  * Reasons:
     54  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
     55  *    its designation sequences, single shifts with return to the previous state,
     56  *    switch-with-no-return to UTF-16BE or similar, etc.
     57  *    This is unlike the language-specific variants like ISO-2022-JP which
     58  *    require a much smaller repertoire of ISO-2022 features.
     59  *    These variants continue to be supported.
     60  * 2. I believe that no one is really using the generic ISO-2022 converter
     61  *    but rather always one of the language-specific variants.
     62  *    Note that ICU's generic ISO-2022 converter has always output one escape
     63  *    sequence followed by UTF-8 for the whole stream.
     64  * 3. Switching between subcharsets is extremely slow, because each time
     65  *    the previous converter is closed and a new one opened,
     66  *    without any kind of caching, least-recently-used list, etc.
     67  * 4. The code is currently buggy, and given the above it does not seem
     68  *    reasonable to spend the time on maintenance.
     69  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
     70  *    This means, for example, that when ISO-8859-7 is designated, the following
     71  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
     72  *    The ICU ISO-2022 converter does not handle this - and has no information
     73  *    about which subconverter would have to be shifted vs. which is designed
     74  *    for 7-bit ISO-2022.
     75  *
     76  * Markus Scherer 2003-dec-03
     77  */
     78 #endif
     79 
     80 #if !UCONFIG_ONLY_HTML_CONVERSION
     81 static const char SHIFT_IN_STR[]  = "\x0F";
     82 // static const char SHIFT_OUT_STR[] = "\x0E";
     83 #endif
     84 
     85 #define CR      0x0D
     86 #define LF      0x0A
     87 #define H_TAB   0x09
     88 #define V_TAB   0x0B
     89 #define SPACE   0x20
     90 
     91 enum {
     92     HWKANA_START=0xff61,
     93     HWKANA_END=0xff9f
     94 };
     95 
     96 /*
     97  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
     98  * as bytes 21..7E. (Subtract 0x80.)
     99  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
    100  * as bytes 20..7F. (Subtract 0x80.)
    101  * Do not encode C1 control codes with native bytes 80..9F
    102  * as bytes 00..1F (C0 control codes).
    103  */
    104 enum {
    105     GR94_START=0xa1,
    106     GR94_END=0xfe,
    107     GR96_START=0xa0,
    108     GR96_END=0xff
    109 };
    110 
    111 /*
    112  * ISO 2022 control codes must not be converted from Unicode
    113  * because they would mess up the byte stream.
    114  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
    115  * corresponding to SO, SI, and ESC.
    116  */
    117 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
    118 
    119 /* for ISO-2022-JP and -CN implementations */
    120 typedef enum  {
    121         /* shared values */
    122         INVALID_STATE=-1,
    123         ASCII = 0,
    124 
    125         SS2_STATE=0x10,
    126         SS3_STATE,
    127 
    128         /* JP */
    129         ISO8859_1 = 1 ,
    130         ISO8859_7 = 2 ,
    131         JISX201  = 3,
    132         JISX208 = 4,
    133         JISX212 = 5,
    134         GB2312  =6,
    135         KSC5601 =7,
    136         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
    137 
    138         /* CN */
    139         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    140         GB2312_1=1,
    141         ISO_IR_165=2,
    142         CNS_11643=3,
    143 
    144         /*
    145          * these are used in StateEnum and ISO2022State variables,
    146          * but CNS_11643 must be used to index into myConverterArray[]
    147          */
    148         CNS_11643_0=0x20,
    149         CNS_11643_1,
    150         CNS_11643_2,
    151         CNS_11643_3,
    152         CNS_11643_4,
    153         CNS_11643_5,
    154         CNS_11643_6,
    155         CNS_11643_7
    156 } StateEnum;
    157 
    158 /* is the StateEnum charset value for a DBCS charset? */
    159 #if UCONFIG_ONLY_HTML_CONVERSION
    160 #define IS_JP_DBCS(cs) (JISX208==(cs))
    161 #else
    162 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
    163 #endif
    164 
    165 #define CSM(cs) ((uint16_t)1<<(cs))
    166 
    167 /*
    168  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
    169  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
    170  *
    171  * Note: The converter uses some leniency:
    172  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
    173  *   all versions, not just JIS7 and JIS8.
    174  * - ICU does not distinguish between different versions of JIS X 0208.
    175  */
    176 #if UCONFIG_ONLY_HTML_CONVERSION
    177 enum { MAX_JA_VERSION=0 };
    178 #else
    179 enum { MAX_JA_VERSION=4 };
    180 #endif
    181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    182     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    183 #if !UCONFIG_ONLY_HTML_CONVERSION
    184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    185     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    186     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    187     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
    188 #endif
    189 };
    190 
    191 typedef enum {
    192         ASCII1=0,
    193         LATIN1,
    194         SBCS,
    195         DBCS,
    196         MBCS,
    197         HWKANA
    198 }Cnv2022Type;
    199 
    200 typedef struct ISO2022State {
    201     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    202     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    203     int8_t prevG;       /* g before single shift (SS2 or SS3) */
    204 } ISO2022State;
    205 
    206 #define UCNV_OPTIONS_VERSION_MASK 0xf
    207 #define UCNV_2022_MAX_CONVERTERS 10
    208 
    209 typedef struct{
    210     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    211     UConverter *currentConverter;
    212     Cnv2022Type currentType;
    213     ISO2022State toU2022State, fromU2022State;
    214     uint32_t key;
    215     uint32_t version;
    216 #ifdef U_ENABLE_GENERIC_ISO_2022
    217     UBool isFirstBuffer;
    218 #endif
    219     UBool isEmptySegment;
    220     char name[30];
    221     char locale[3];
    222 }UConverterDataISO2022;
    223 
    224 /* Protos */
    225 /* ISO-2022 ----------------------------------------------------------------- */
    226 
    227 /*Forward declaration */
    228 U_CFUNC void U_CALLCONV
    229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
    230                       UErrorCode * err);
    231 U_CFUNC void U_CALLCONV
    232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
    233                                     UErrorCode * err);
    234 
    235 #define ESC_2022 0x1B /*ESC*/
    236 
    237 typedef enum
    238 {
    239         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    240         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    241         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    242         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
    243 } UCNV_TableStates_2022;
    244 
    245 /*
    246 * The way these state transition arrays work is:
    247 * ex : ESC$B is the sequence for JISX208
    248 *      a) First Iteration: char is ESC
    249 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
    250 *             int x = normalize_esq_chars_2022[27] which is equal to 1
    251 *         ii) Search for this value in escSeqStateTable_Key_2022[]
    252 *             value of x is stored at escSeqStateTable_Key_2022[0]
    253 *        iii) Save this index as offset
    254 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
    255 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
    256 *     b) Switch on this state and continue to next char
    257 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
    258 *             which is normalize_esq_chars_2022[36] == 4
    259 *         ii) x is currently 1(from above)
    260 *               x<<=5 -- x is now 32
    261 *               x+=normalize_esq_chars_2022[36]
    262 *               now x is 36
    263 *        iii) Search for this value in escSeqStateTable_Key_2022[]
    264 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
    265 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
    266 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
    267 *     c) Switch on this state and continue to next char
    268 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
    269 *        ii) x is currently 36 (from above)
    270 *            x<<=5 -- x is now 1152
    271 *            x+=normalize_esq_chars_2022[66]
    272 *            now x is 1161
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
    278 */
    279 
    280 
    281 /*Below are the 3 arrays depicting a state transition table*/
    282 static const int8_t normalize_esq_chars_2022[256] = {
    283 /*       0      1       2       3       4      5       6        7       8       9           */
    284 
    285          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
    288         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
    289         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
    290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    291         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
    292         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
    293         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
    294         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    296         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    298         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    299         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    300         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    301         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    302         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    303         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    304         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    305         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    306         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    307         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    308         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    309         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
    310         ,0     ,0      ,0      ,0      ,0      ,0
    311 };
    312 
    313 #ifdef U_ENABLE_GENERIC_ISO_2022
    314 /*
    315  * When the generic ISO-2022 converter is completely removed, not just disabled
    316  * per #ifdef, then the following state table and the associated tables that are
    317  * dimensioned with MAX_STATES_2022 should be trimmed.
    318  *
    319  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
    320  * the associated escape sequences starting with ESC ( B should be removed.
    321  * This includes the ones with key values 1097 and all of the ones above 1000000.
    322  *
    323  * For the latter, the tables can simply be truncated.
    324  * For the former, since the tables must be kept parallel, it is probably best
    325  * to simply duplicate an adjacent table cell, parallel in all tables.
    326  *
    327  * It may make sense to restructure the tables, especially by using small search
    328  * tables for the variants instead of indexing them parallel to the table here.
    329  */
    330 #endif
    331 
    332 #define MAX_STATES_2022 74
    333 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    334 /*   0           1           2           3           4           5           6           7           8           9           */
    335 
    336      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    337     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    338     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    339     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    340     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    341     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    342     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    343     ,35947631   ,35947635   ,35947636   ,35947638
    344 };
    345 
    346 #ifdef U_ENABLE_GENERIC_ISO_2022
    347 
    348 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    349  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
    350 
    351      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    352     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    353     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    354     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    355     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    356     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    357     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    358     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
    359 };
    360 
    361 #endif
    362 
    363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
    364 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
    365      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    366     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    367     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    368     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    369     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    371     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    372     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    373 };
    374 
    375 /* Type def for refactoring changeState_2022 code*/
    376 typedef enum{
    377 #ifdef U_ENABLE_GENERIC_ISO_2022
    378     ISO_2022=0,
    379 #endif
    380     ISO_2022_JP=1,
    381 #if !UCONFIG_ONLY_HTML_CONVERSION
    382     ISO_2022_KR=2,
    383     ISO_2022_CN=3
    384 #endif
    385 } Variant2022;
    386 
    387 /*********** ISO 2022 Converter Protos ***********/
    388 static void U_CALLCONV
    389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
    390 
    391 static void U_CALLCONV
    392  _ISO2022Close(UConverter *converter);
    393 
    394 static void U_CALLCONV
    395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
    396 
    397 U_CDECL_BEGIN
    398 static const char * U_CALLCONV
    399 _ISO2022getName(const UConverter* cnv);
    400 U_CDECL_END
    401 
    402 static void  U_CALLCONV
    403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
    404 
    405 U_CDECL_BEGIN
    406 static UConverter * U_CALLCONV
    407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
    408 
    409 U_CDECL_END
    410 
    411 #ifdef U_ENABLE_GENERIC_ISO_2022
    412 static void U_CALLCONV
    413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
    414 #endif
    415 
    416 namespace {
    417 
    418 /*const UConverterSharedData _ISO2022Data;*/
    419 extern const UConverterSharedData _ISO2022JPData;
    420 
    421 #if !UCONFIG_ONLY_HTML_CONVERSION
    422 extern const UConverterSharedData _ISO2022KRData;
    423 extern const UConverterSharedData _ISO2022CNData;
    424 #endif
    425 
    426 }  // namespace
    427 
    428 /*************** Converter implementations ******************/
    429 
    430 /* The purpose of this function is to get around gcc compiler warnings. */
    431 static inline void
    432 fromUWriteUInt8(UConverter *cnv,
    433                  const char *bytes, int32_t length,
    434                  uint8_t **target, const char *targetLimit,
    435                  int32_t **offsets,
    436                  int32_t sourceIndex,
    437                  UErrorCode *pErrorCode)
    438 {
    439     char *targetChars = (char *)*target;
    440     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
    441                          offsets, sourceIndex, pErrorCode);
    442     *target = (uint8_t*)targetChars;
    443 
    444 }
    445 
    446 static inline void
    447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
    448     if(myConverterData->version == 1) {
    449         UConverter *cnv = myConverterData->currentConverter;
    450 
    451         cnv->toUnicodeStatus=0;     /* offset */
    452         cnv->mode=0;                /* state */
    453         cnv->toULength=0;           /* byteIndex */
    454     }
    455 }
    456 
    457 static inline void
    458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
    459    /* in ISO-2022-KR the designator sequence appears only once
    460     * in a file so we append it only once
    461     */
    462     if( converter->charErrorBufferLength==0){
    463 
    464         converter->charErrorBufferLength = 4;
    465         converter->charErrorBuffer[0] = 0x1b;
    466         converter->charErrorBuffer[1] = 0x24;
    467         converter->charErrorBuffer[2] = 0x29;
    468         converter->charErrorBuffer[3] = 0x43;
    469     }
    470     if(myConverterData->version == 1) {
    471         UConverter *cnv = myConverterData->currentConverter;
    472 
    473         cnv->fromUChar32=0;
    474         cnv->fromUnicodeStatus=1;   /* prevLength */
    475     }
    476 }
    477 
    478 static void U_CALLCONV
    479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
    480 
    481     char myLocale[6]={' ',' ',' ',' ',' ',' '};
    482 
    483     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
    484     if(cnv->extraInfo != NULL) {
    485         UConverterNamePieces stackPieces;
    486         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
    487         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
    488         uint32_t version;
    489 
    490         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
    491 
    492         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
    493         myConverterData->currentType = ASCII1;
    494         cnv->fromUnicodeStatus =FALSE;
    495         if(pArgs->locale){
    496             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
    497         }
    498         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
    499         myConverterData->version = version;
    500         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
    501             (myLocale[2]=='_' || myLocale[2]=='\0'))
    502         {
    503             /* open the required converters and cache them */
    504             if(version>MAX_JA_VERSION) {
    505                 // ICU 55 fails to open a converter for an unsupported version.
    506                 // Previously, it fell back to version 0, but that would yield
    507                 // unexpected behavior.
    508                 *errorCode = U_MISSING_RESOURCE_ERROR;
    509                 return;
    510             }
    511             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
    512                 myConverterData->myConverterArray[ISO8859_7] =
    513                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
    514             }
    515             myConverterData->myConverterArray[JISX208] =
    516                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
    517             if(jpCharsetMasks[version]&CSM(JISX212)) {
    518                 myConverterData->myConverterArray[JISX212] =
    519                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
    520             }
    521             if(jpCharsetMasks[version]&CSM(GB2312)) {
    522                 myConverterData->myConverterArray[GB2312] =
    523                     /* BEGIN android-changed */
    524                     ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
    525                     /* END android-changed */
    526             }
    527             if(jpCharsetMasks[version]&CSM(KSC5601)) {
    528                 myConverterData->myConverterArray[KSC5601] =
    529                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
    530             }
    531 
    532             /* set the function pointers to appropriate funtions */
    533             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
    534             uprv_strcpy(myConverterData->locale,"ja");
    535 
    536             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
    537             size_t len = uprv_strlen(myConverterData->name);
    538             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
    539             myConverterData->name[len+1]='\0';
    540         }
    541 #if !UCONFIG_ONLY_HTML_CONVERSION
    542         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
    543             (myLocale[2]=='_' || myLocale[2]=='\0'))
    544         {
    545             if(version>1) {
    546                 // ICU 55 fails to open a converter for an unsupported version.
    547                 // Previously, it fell back to version 0, but that would yield
    548                 // unexpected behavior.
    549                 *errorCode = U_MISSING_RESOURCE_ERROR;
    550                 return;
    551             }
    552             const char *cnvName;
    553             if(version==1) {
    554                 cnvName="icu-internal-25546";
    555             } else {
    556                 /* BEGIN android-changed */
    557                 cnvName="ksc_5601";
    558                 /* END android-changed */
    559                 myConverterData->version=version=0;
    560             }
    561             if(pArgs->onlyTestIsLoadable) {
    562                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
    563                 uprv_free(cnv->extraInfo);
    564                 cnv->extraInfo=NULL;
    565                 return;
    566             } else {
    567                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
    568                 if (U_FAILURE(*errorCode)) {
    569                     _ISO2022Close(cnv);
    570                     return;
    571                 }
    572 
    573                 if(version==1) {
    574                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
    575                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
    576                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
    577                 }else{
    578                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
    579                 }
    580 
    581                 /* initialize the state variables */
    582                 setInitialStateToUnicodeKR(cnv, myConverterData);
    583                 setInitialStateFromUnicodeKR(cnv, myConverterData);
    584 
    585                 /* set the function pointers to appropriate funtions */
    586                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
    587                 uprv_strcpy(myConverterData->locale,"ko");
    588             }
    589         }
    590         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
    591             (myLocale[2]=='_' || myLocale[2]=='\0'))
    592         {
    593             if(version>2) {
    594                 // ICU 55 fails to open a converter for an unsupported version.
    595                 // Previously, it fell back to version 0, but that would yield
    596                 // unexpected behavior.
    597                 *errorCode = U_MISSING_RESOURCE_ERROR;
    598                 return;
    599             }
    600 
    601             /* open the required converters and cache them */
    602             /* BEGIN android-changed */
    603             myConverterData->myConverterArray[GB2312_1] =
    604                 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
    605             if(version==1) {
    606                 myConverterData->myConverterArray[ISO_IR_165] =
    607                     ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
    608             }
    609             myConverterData->myConverterArray[CNS_11643] =
    610                 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
    611             /* END android-changed */
    612 
    613 
    614             /* set the function pointers to appropriate funtions */
    615             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
    616             uprv_strcpy(myConverterData->locale,"cn");
    617 
    618             if (version==0){
    619                 myConverterData->version = 0;
    620                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
    621             }else if (version==1){
    622                 myConverterData->version = 1;
    623                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
    624             }else {
    625                 myConverterData->version = 2;
    626                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
    627             }
    628         }
    629 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
    630         else{
    631 #ifdef U_ENABLE_GENERIC_ISO_2022
    632             myConverterData->isFirstBuffer = TRUE;
    633 
    634             /* append the UTF-8 escape sequence */
    635             cnv->charErrorBufferLength = 3;
    636             cnv->charErrorBuffer[0] = 0x1b;
    637             cnv->charErrorBuffer[1] = 0x25;
    638             cnv->charErrorBuffer[2] = 0x42;
    639 
    640             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
    641             /* initialize the state variables */
    642             uprv_strcpy(myConverterData->name,"ISO_2022");
    643 #else
    644             *errorCode = U_MISSING_RESOURCE_ERROR;
    645             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
    646             // data loading error code.
    647             return;
    648 #endif
    649         }
    650 
    651         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
    652 
    653         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
    654             _ISO2022Close(cnv);
    655         }
    656     } else {
    657         *errorCode = U_MEMORY_ALLOCATION_ERROR;
    658     }
    659 }
    660 
    661 
    662 static void U_CALLCONV
    663 _ISO2022Close(UConverter *converter) {
    664     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
    665     UConverterSharedData **array = myData->myConverterArray;
    666     int32_t i;
    667 
    668     if (converter->extraInfo != NULL) {
    669         /*close the array of converter pointers and free the memory*/
    670         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
    671             if(array[i]!=NULL) {
    672                 ucnv_unloadSharedDataIfReady(array[i]);
    673             }
    674         }
    675 
    676         ucnv_close(myData->currentConverter);
    677 
    678         if(!converter->isExtraLocal){
    679             uprv_free (converter->extraInfo);
    680             converter->extraInfo = NULL;
    681         }
    682     }
    683 }
    684 
    685 static void U_CALLCONV
    686 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
    687     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
    688     if(choice<=UCNV_RESET_TO_UNICODE) {
    689         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
    690         myConverterData->key = 0;
    691         myConverterData->isEmptySegment = FALSE;
    692     }
    693     if(choice!=UCNV_RESET_TO_UNICODE) {
    694         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
    695     }
    696 #ifdef U_ENABLE_GENERIC_ISO_2022
    697     if(myConverterData->locale[0] == 0){
    698         if(choice<=UCNV_RESET_TO_UNICODE) {
    699             myConverterData->isFirstBuffer = TRUE;
    700             myConverterData->key = 0;
    701             if (converter->mode == UCNV_SO){
    702                 ucnv_close (myConverterData->currentConverter);
    703                 myConverterData->currentConverter=NULL;
    704             }
    705             converter->mode = UCNV_SI;
    706         }
    707         if(choice!=UCNV_RESET_TO_UNICODE) {
    708             /* re-append UTF-8 escape sequence */
    709             converter->charErrorBufferLength = 3;
    710             converter->charErrorBuffer[0] = 0x1b;
    711             converter->charErrorBuffer[1] = 0x28;
    712             converter->charErrorBuffer[2] = 0x42;
    713         }
    714     }
    715     else
    716 #endif
    717     {
    718         /* reset the state variables */
    719         if(myConverterData->locale[0] == 'k'){
    720             if(choice<=UCNV_RESET_TO_UNICODE) {
    721                 setInitialStateToUnicodeKR(converter, myConverterData);
    722             }
    723             if(choice!=UCNV_RESET_TO_UNICODE) {
    724                 setInitialStateFromUnicodeKR(converter, myConverterData);
    725             }
    726         }
    727     }
    728 }
    729 
    730 U_CDECL_BEGIN
    731 
    732 static const char * U_CALLCONV
    733 _ISO2022getName(const UConverter* cnv){
    734     if(cnv->extraInfo){
    735         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
    736         return myData->name;
    737     }
    738     return NULL;
    739 }
    740 
    741 U_CDECL_END
    742 
    743 
    744 /*************** to unicode *******************/
    745 /****************************************************************************
    746  * Recognized escape sequences are
    747  * <ESC>(B  ASCII
    748  * <ESC>.A  ISO-8859-1
    749  * <ESC>.F  ISO-8859-7
    750  * <ESC>(J  JISX-201
    751  * <ESC>(I  JISX-201
    752  * <ESC>$B  JISX-208
    753  * <ESC>$@  JISX-208
    754  * <ESC>$(D JISX-212
    755  * <ESC>$A  GB2312
    756  * <ESC>$(C KSC5601
    757  */
/*
 * Lookup table used by changeState_2022() for the ISO_2022_JP variant:
 * it is indexed with the offset that getKey_2022() reports for a completed
 * escape sequence, and the entry is cast to StateEnum to select the action
 * (INVALID_STATE entries are reported as unsupported escape sequences).
 * Each row below holds ten consecutive offsets; the column heading is the
 * units digit of the offset.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
    769 
    770 #if !UCONFIG_ONLY_HTML_CONVERSION
    771 /*************** to unicode *******************/
/*
 * Lookup table used by changeState_2022() for the ISO_2022_CN variant:
 * it is indexed with the offset that getKey_2022() reports for a completed
 * escape sequence, and the entry is cast to StateEnum to select the action
 * (INVALID_STATE entries are reported as unsupported escape sequences).
 * Each row below holds ten consecutive offsets; the column heading is the
 * units digit of the offset.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
    783 #endif
    784 
    785 
    786 static UCNV_TableStates_2022
    787 getKey_2022(char c,int32_t* key,int32_t* offset){
    788     int32_t togo;
    789     int32_t low = 0;
    790     int32_t hi = MAX_STATES_2022;
    791     int32_t oldmid=0;
    792 
    793     togo = normalize_esq_chars_2022[(uint8_t)c];
    794     if(togo == 0) {
    795         /* not a valid character anywhere in an escape sequence */
    796         *key = 0;
    797         *offset = 0;
    798         return INVALID_2022;
    799     }
    800     togo = (*key << 5) + togo;
    801 
    802     while (hi != low)  /*binary search*/{
    803 
    804         int32_t mid = (hi+low) >> 1; /*Finds median*/
    805 
    806         if (mid == oldmid)
    807             break;
    808 
    809         if (escSeqStateTable_Key_2022[mid] > togo){
    810             hi = mid;
    811         }
    812         else if (escSeqStateTable_Key_2022[mid] < togo){
    813             low = mid;
    814         }
    815         else /*we found it*/{
    816             *key = togo;
    817             *offset = mid;
    818             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
    819         }
    820         oldmid = mid;
    821 
    822     }
    823 
    824     *key = 0;
    825     *offset = 0;
    826     return INVALID_2022;
    827 }
    828 
/*
 * Runs the escape-sequence state machine to determine the
 * escape sequence - codepage correspondence.
 *
 * Bytes are consumed from *source (up to sourceLimit), appended to
 * _this->toUBytes[] and fed one at a time to getKey_2022(), continuing from
 * the key saved in the converter data by a previous call.
 *
 * Outcomes:
 * - sequence still incomplete at sourceLimit: returns with the key saved
 *   (non-zero) and *err untouched;
 * - complete and accepted for the variant "var": the toU state is updated
 *   and _this->toULength is reset to 0;
 * - U_ILLEGAL_ESCAPE_SEQUENCE: all bytes after the first are backed out
 *   (from *source, or via preToU when they came from an earlier buffer);
 * - U_UNSUPPORTED_ESCAPE_SEQUENCE: toUCallbackReason is set to UCNV_UNASSIGNED.
 *
 * @param _this       converter whose extraInfo holds the UConverterDataISO2022
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the input
 * @param var         ISO-2022 variant selecting how a completed sequence is interpreted
 * @param err         receives the error code
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* remember how many bytes were already buffered before this call; needed for backing out */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* save the key so that an incomplete sequence can be continued by the next call */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        /* offset is the table index of the completed sequence, as reported by getKey_2022() */
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /* NOTE(review): myUConverter is not declared anywhere in this function - confirm before enabling U_ENABLE_GENERIC_ISO_2022 */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current g (only when it is below 2) in prevG before switching to G2 */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* the charset must be enabled in the mask for this converter version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected by version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    U_FALLTHROUGH;
                case GB2312_1:
                    U_FALLTHROUGH;
                case CNS_11643_1:
                    /* these three charsets are recorded in cs[1] */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* plane 2 is recorded in cs[2], the slot checked by SS2 above */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif  // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely; nothing is left pending in toUBytes[] */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            /* only the first byte stays in toUBytes[] as the reported illegal sequence */
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}
   1060 
   1061 #if !UCONFIG_ONLY_HTML_CONVERSION
   1062 /*Checks the characters of the buffer against valid 2022 escape sequences
   1063 *if the match we return a pointer to the initial start of the sequence otherwise
   1064 *we return sourceLimit
   1065 */
   1066 /*for 2022 looks ahead in the stream
   1067  *to determine the longest possible convertible
   1068  *data stream
   1069  */
   1070 static inline const char*
   1071 getEndOfBuffer_2022(const char** source,
   1072                    const char* sourceLimit,
   1073                    UBool /*flush*/){
   1074 
   1075     const char* mySource = *source;
   1076 
   1077 #ifdef U_ENABLE_GENERIC_ISO_2022
   1078     if (*source >= sourceLimit)
   1079         return sourceLimit;
   1080 
   1081     do{
   1082 
   1083         if (*mySource == ESC_2022){
   1084             int8_t i;
   1085             int32_t key = 0;
   1086             int32_t offset;
   1087             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
   1088 
   1089             /* Kludge: I could not
   1090             * figure out the reason for validating an escape sequence
   1091             * twice - once here and once in changeState_2022().
   1092             * is it possible to have an ESC character in a ISO2022
   1093             * byte stream which is valid in a code page? Is it legal?
   1094             */
   1095             for (i=0;
   1096             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
   1097             i++) {
   1098                 value =  getKey_2022(*(mySource+i), &key, &offset);
   1099             }
   1100             if (value > 0 || *mySource==ESC_2022)
   1101                 return mySource;
   1102 
   1103             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
   1104                 return sourceLimit;
   1105         }
   1106     }while (++mySource < sourceLimit);
   1107 
   1108     return sourceLimit;
   1109 #else
   1110     while(mySource < sourceLimit && *mySource != ESC_2022) {
   1111         ++mySource;
   1112     }
   1113     return mySource;
   1114 #endif
   1115 }
   1116 #endif
   1117 
   1118 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
   1119  * any future change in _MBCSFromUChar32() function should be reflected here.
   1120  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
   1121  */
   1122 static inline int32_t
   1123 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
   1124                                          UChar32 c,
   1125                                          uint32_t* value,
   1126                                          UBool useFallback,
   1127                                          int outputType)
   1128 {
   1129     const int32_t *cx;
   1130     const uint16_t *table;
   1131     uint32_t stage2Entry;
   1132     uint32_t myValue;
   1133     int32_t length;
   1134     const uint8_t *p;
   1135     /*
   1136      * TODO(markus): Use and require new, faster MBCS conversion table structures.
   1137      * Use internal version of ucnv_open() that verifies that the new structures are available,
   1138      * else U_INTERNAL_PROGRAM_ERROR.
   1139      */
   1140     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   1141     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   1142         table=sharedData->mbcs.fromUnicodeTable;
   1143         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   1144         /* get the bytes and the length for the output */
   1145         if(outputType==MBCS_OUTPUT_2){
   1146             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   1147             if(myValue<=0xff) {
   1148                 length=1;
   1149             } else {
   1150                 length=2;
   1151             }
   1152         } else /* outputType==MBCS_OUTPUT_3 */ {
   1153             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   1154             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   1155             if(myValue<=0xff) {
   1156                 length=1;
   1157             } else if(myValue<=0xffff) {
   1158                 length=2;
   1159             } else {
   1160                 length=3;
   1161             }
   1162         }
   1163         /* is this code point assigned, or do we use fallbacks? */
   1164         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
   1165             /* assigned */
   1166             *value=myValue;
   1167             return length;
   1168         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
   1169             /*
   1170              * We allow a 0 byte output if the "assigned" bit is set for this entry.
   1171              * There is no way with this data structure for fallback output
   1172              * to be a zero byte.
   1173              */
   1174             *value=myValue;
   1175             return -length;
   1176         }
   1177     }
   1178 
   1179     cx=sharedData->mbcs.extIndexes;
   1180     if(cx!=NULL) {
   1181         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
   1182     }
   1183 
   1184     /* unassigned */
   1185     return 0;
   1186 }
   1187 
   1188 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
   1189  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
   1190  * @param retval pointer to output byte
   1191  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
   1192  */
   1193 static inline int32_t
   1194 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
   1195                                        UChar32 c,
   1196                                        uint32_t* retval,
   1197                                        UBool useFallback)
   1198 {
   1199     const uint16_t *table;
   1200     int32_t value;
   1201     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   1202     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   1203         return 0;
   1204     }
   1205     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   1206     table=sharedData->mbcs.fromUnicodeTable;
   1207     /* get the byte for the output */
   1208     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   1209     /* is this code point assigned, or do we use fallbacks? */
   1210     *retval=(uint32_t)(value&0xff);
   1211     if(value>=0xf00) {
   1212         return 1;  /* roundtrip */
   1213     } else if(useFallback ? value>=0x800 : value>=0xc00) {
   1214         return -1;  /* fallback taken */
   1215     } else {
   1216         return 0;  /* no mapping */
   1217     }
   1218 }
   1219 
   1220 /*
   1221  * Check that the result is a 2-byte value with each byte in the range A1..FE
   1222  * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
   1223  * to move it to the ISO 2022 range 21..7E.
   1224  * Return 0 if out of range.
   1225  */
   1226 static inline uint32_t
   1227 _2022FromGR94DBCS(uint32_t value) {
   1228     if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
   1229         (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
   1230     ) {
   1231         return value - 0x8080;  /* shift down to 21..7e byte range */
   1232     } else {
   1233         return 0;  /* not valid for ISO 2022 */
   1234     }
   1235 }
   1236 
   1237 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
   1238 /*
   1239  * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
   1240  * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
   1241  * unchanged.
   1242  */
   1243 static inline uint32_t
   1244 _2022ToGR94DBCS(uint32_t value) {
   1245     uint32_t returnValue = value + 0x8080;
   1246     if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
   1247         (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
   1248         return returnValue;
   1249     } else {
   1250         return value;
   1251     }
   1252 }
   1253 #endif
   1254 
   1255 #ifdef U_ENABLE_GENERIC_ISO_2022
   1256 
   1257 /**********************************************************************************
   1258 *  ISO-2022 Converter
   1259 *
   1260 *
   1261 */
   1262 
   1263 static void U_CALLCONV
   1264 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
   1265                                                            UErrorCode* err){
   1266     const char* mySourceLimit, *realSourceLimit;
   1267     const char* sourceStart;
   1268     const UChar* myTargetStart;
   1269     UConverter* saveThis;
   1270     UConverterDataISO2022* myData;
   1271     int8_t length;
   1272 
   1273     saveThis = args->converter;
   1274     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
   1275 
   1276     realSourceLimit = args->sourceLimit;
   1277     while (args->source < realSourceLimit) {
   1278         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
   1279             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
   1280             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
   1281 
   1282             if(args->source < mySourceLimit) {
   1283                 if(myData->currentConverter==NULL) {
   1284                     myData->currentConverter = ucnv_open("ASCII",err);
   1285                     if(U_FAILURE(*err)){
   1286                         return;
   1287                     }
   1288 
   1289                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
   1290                     saveThis->mode = UCNV_SO;
   1291                 }
   1292 
   1293                 /* convert to before the ESC or until the end of the buffer */
   1294                 myData->isFirstBuffer=FALSE;
   1295                 sourceStart = args->source;
   1296                 myTargetStart = args->target;
   1297                 args->converter = myData->currentConverter;
   1298                 ucnv_toUnicode(args->converter,
   1299                     &args->target,
   1300                     args->targetLimit,
   1301                     &args->source,
   1302                     mySourceLimit,
   1303                     args->offsets,
   1304                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
   1305                     err);
   1306                 args->converter = saveThis;
   1307 
   1308                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
   1309                     /* move the overflow buffer */
   1310                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
   1311                     myData->currentConverter->UCharErrorBufferLength = 0;
   1312                     if(length > 0) {
   1313                         uprv_memcpy(saveThis->UCharErrorBuffer,
   1314                                     myData->currentConverter->UCharErrorBuffer,
   1315                                     length*U_SIZEOF_UCHAR);
   1316                     }
   1317                     return;
   1318                 }
   1319 
   1320                 /*
   1321                  * At least one of:
   1322                  * -Error while converting
   1323                  * -Done with entire buffer
   1324                  * -Need to write offsets or update the current offset
   1325                  *  (leave that up to the code in ucnv.c)
   1326                  *
   1327                  * or else we just stopped at an ESC byte and continue with changeState_2022()
   1328                  */
   1329                 if (U_FAILURE(*err) ||
   1330                     (args->source == realSourceLimit) ||
   1331                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
   1332                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
   1333                 ) {
   1334                     /* copy partial or error input for truncated detection and error handling */
   1335                     if(U_FAILURE(*err)) {
   1336                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
   1337                         if(length > 0) {
   1338                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
   1339                         }
   1340                     } else {
   1341                         length = saveThis->toULength = myData->currentConverter->toULength;
   1342                         if(length > 0) {
   1343                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
   1344                             if(args->source < mySourceLimit) {
   1345                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
   1346                             }
   1347                         }
   1348                     }
   1349                     return;
   1350                 }
   1351             }
   1352         }
   1353 
   1354         sourceStart = args->source;
   1355         changeState_2022(args->converter,
   1356                &(args->source),
   1357                realSourceLimit,
   1358                ISO_2022,
   1359                err);
   1360         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
   1361             /* let the ucnv.c code update its current offset */
   1362             return;
   1363         }
   1364     }
   1365 }
   1366 
   1367 #endif
   1368 
   1369 /*
   1370  * To Unicode Callback helper function
   1371  */
   1372 static void
   1373 toUnicodeCallback(UConverter *cnv,
   1374                   const uint32_t sourceChar, const uint32_t targetUniChar,
   1375                   UErrorCode* err){
   1376     if(sourceChar>0xff){
   1377         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
   1378         cnv->toUBytes[1] = (uint8_t)sourceChar;
   1379         cnv->toULength = 2;
   1380     }
   1381     else{
   1382         cnv->toUBytes[0] =(char) sourceChar;
   1383         cnv->toULength = 1;
   1384     }
   1385 
   1386     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
   1387         *err = U_INVALID_CHAR_FOUND;
   1388     }
   1389     else{
   1390         *err = U_ILLEGAL_CHAR_FOUND;
   1391     }
   1392 }
   1393 
   1394 /**************************************ISO-2022-JP*************************************************/
   1395 
   1396 /************************************** IMPORTANT **************************************************
   1397 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
   1398 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
   1399 * The converter iterates over each Unicode codepoint
   1400 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
   1401 * processed one char at a time it would make sense to reduce the extra processing a canned converter
   1402 * would do as far as possible.
   1403 *
   1404 * If the implementation of these macros or structure of sharedData struct change in the future, make
   1405 * sure that ISO-2022 is also changed.
   1406 ***************************************************************************************************
   1407 */
   1408 
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1, ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208, JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : implemented algorithmically as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
   1435 
   1436 /* preference order of JP charsets */
   1437 static const StateEnum jpCharsetPref[]={
   1438     ASCII,
   1439     JISX201,
   1440     ISO8859_1,
   1441     JISX208,
   1442     ISO8859_7,
   1443     JISX212,
   1444     GB2312,
   1445     KSC5601,
   1446     HWKANA_7BIT
   1447 };
   1448 
   1449 /*
   1450  * The escape sequences must be in order of the enum constants like JISX201  = 3,
   1451  * not in order of jpCharsetPref[]!
   1452  */
   1453 static const char escSeqChars[][6] ={
   1454     "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
   1455     "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
   1456     "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
   1457     "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
   1458     "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
   1459     "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
   1460     "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
   1461     "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
   1462     "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
   1463 
   1464 };
   1465 static  const int8_t escSeqCharsLen[] ={
   1466     3, /* length of <ESC>(B  ASCII       */
   1467     3, /* length of <ESC>.A  ISO-8859-1  */
   1468     3, /* length of <ESC>.F  ISO-8859-7  */
   1469     3, /* length of <ESC>(J  JISX-201    */
   1470     3, /* length of <ESC>$B  JISX-208    */
   1471     4, /* length of <ESC>$(D JISX-212    */
   1472     3, /* length of <ESC>$A  GB2312      */
   1473     4, /* length of <ESC>$(C KSC5601     */
   1474     3  /* length of <ESC>(I  HWKANA_7BIT */
   1475 };
   1476 
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIteration state
*      Yes ->  A character that cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
   1493 
   1494 /* Map 00..7F to Unicode according to JIS X 0201. */
   1495 static inline uint32_t
   1496 jisx201ToU(uint32_t value) {
   1497     if(value < 0x5c) {
   1498         return value;
   1499     } else if(value == 0x5c) {
   1500         return 0xa5;
   1501     } else if(value == 0x7e) {
   1502         return 0x203e;
   1503     } else /* value <= 0x7f */ {
   1504         return value;
   1505     }
   1506 }
   1507 
   1508 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
   1509 static inline uint32_t
   1510 jisx201FromU(uint32_t value) {
   1511     if(value<=0x7f) {
   1512         if(value!=0x5c && value!=0x7e) {
   1513             return value;
   1514         }
   1515     } else if(value==0xa5) {
   1516         return 0x5c;
   1517     } else if(value==0x203e) {
   1518         return 0x7e;
   1519     }
   1520     return 0xfffe;
   1521 }
   1522 
   1523 /*
   1524  * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
   1525  * to JIS X 0208, and convert it to a pair of 21..7E bytes.
   1526  * Return 0 if the byte pair is out of range.
   1527  */
   1528 static inline uint32_t
   1529 _2022FromSJIS(uint32_t value) {
   1530     uint8_t trail;
   1531 
   1532     if(value > 0xEFFC) {
   1533         return 0;  /* beyond JIS X 0208 */
   1534     }
   1535 
   1536     trail = (uint8_t)value;
   1537 
   1538     value &= 0xff00;  /* lead byte */
   1539     if(value <= 0x9f00) {
   1540         value -= 0x7000;
   1541     } else /* 0xe000 <= value <= 0xef00 */ {
   1542         value -= 0xb000;
   1543     }
   1544     value <<= 1;
   1545 
   1546     if(trail <= 0x9e) {
   1547         value -= 0x100;
   1548         if(trail <= 0x7e) {
   1549             value |= trail - 0x1f;
   1550         } else {
   1551             value |= trail - 0x20;
   1552         }
   1553     } else /* trail <= 0xfc */ {
   1554         value |= trail - 0x7e;
   1555     }
   1556     return value;
   1557 }
   1558 
   1559 /*
   1560  * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
   1561  * If either byte is outside 21..7E make sure that the result is not valid
   1562  * for Shift-JIS so that the converter catches it.
   1563  * Some invalid byte values already turn into equally invalid Shift-JIS
   1564  * byte values and need not be tested explicitly.
   1565  */
   1566 static inline void
   1567 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
   1568     if(c1&1) {
   1569         ++c1;
   1570         if(c2 <= 0x5f) {
   1571             c2 += 0x1f;
   1572         } else if(c2 <= 0x7e) {
   1573             c2 += 0x20;
   1574         } else {
   1575             c2 = 0;  /* invalid */
   1576         }
   1577     } else {
   1578         if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
   1579             c2 += 0x7e;
   1580         } else {
   1581             c2 = 0;  /* invalid */
   1582         }
   1583     }
   1584     c1 >>= 1;
   1585     if(c1 <= 0x2f) {
   1586         c1 += 0x70;
   1587     } else if(c1 <= 0x3f) {
   1588         c1 += 0xb0;
   1589     } else {
   1590         c1 = 0;  /* invalid */
   1591     }
   1592     bytes[0] = (char)c1;
   1593     bytes[1] = (char)c2;
   1594 }
   1595 
   1596 /*
   1597  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
   1598  * Katakana.
   1599  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
   1600  * because Shift-JIS roundtrips half-width Katakana to single bytes.
   1601  * These were the only fallbacks in ICU's jisx-208.ucm file.
   1602  */
   1603 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
   1604     0x2123,  /* U+FF61 */
   1605     0x2156,
   1606     0x2157,
   1607     0x2122,
   1608     0x2126,
   1609     0x2572,
   1610     0x2521,
   1611     0x2523,
   1612     0x2525,
   1613     0x2527,
   1614     0x2529,
   1615     0x2563,
   1616     0x2565,
   1617     0x2567,
   1618     0x2543,
   1619     0x213C,  /* U+FF70 */
   1620     0x2522,
   1621     0x2524,
   1622     0x2526,
   1623     0x2528,
   1624     0x252A,
   1625     0x252B,
   1626     0x252D,
   1627     0x252F,
   1628     0x2531,
   1629     0x2533,
   1630     0x2535,
   1631     0x2537,
   1632     0x2539,
   1633     0x253B,
   1634     0x253D,
   1635     0x253F,  /* U+FF80 */
   1636     0x2541,
   1637     0x2544,
   1638     0x2546,
   1639     0x2548,
   1640     0x254A,
   1641     0x254B,
   1642     0x254C,
   1643     0x254D,
   1644     0x254E,
   1645     0x254F,
   1646     0x2552,
   1647     0x2555,
   1648     0x2558,
   1649     0x255B,
   1650     0x255E,
   1651     0x255F,  /* U+FF90 */
   1652     0x2560,
   1653     0x2561,
   1654     0x2562,
   1655     0x2564,
   1656     0x2566,
   1657     0x2568,
   1658     0x2569,
   1659     0x256A,
   1660     0x256B,
   1661     0x256C,
   1662     0x256D,
   1663     0x256F,
   1664     0x2573,
   1665     0x212B,
   1666     0x212C   /* U+FF9F */
   1667 };
   1668 
   1669 static void U_CALLCONV
   1670 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
   1671     UConverter *cnv = args->converter;
   1672     UConverterDataISO2022 *converterData;
   1673     ISO2022State *pFromU2022State;
   1674     uint8_t *target = (uint8_t *) args->target;
   1675     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
   1676     const UChar* source = args->source;
   1677     const UChar* sourceLimit = args->sourceLimit;
   1678     int32_t* offsets = args->offsets;
   1679     UChar32 sourceChar;
   1680     char buffer[8];
   1681     int32_t len, outLen;
   1682     int8_t choices[10];
   1683     int32_t choiceCount;
   1684     uint32_t targetValue = 0;
   1685     UBool useFallback;
   1686 
   1687     int32_t i;
   1688     int8_t cs, g;
   1689 
   1690     /* set up the state */
   1691     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
   1692     pFromU2022State   = &converterData->fromU2022State;
   1693 
   1694     choiceCount = 0;
   1695 
   1696     /* check if the last codepoint of previous buffer was a lead surrogate*/
   1697     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
   1698         goto getTrail;
   1699     }
   1700 
   1701     while(source < sourceLimit) {
   1702         if(target < targetLimit) {
   1703 
   1704             sourceChar  = *(source++);
   1705             /*check if the char is a First surrogate*/
   1706             if(U16_IS_SURROGATE(sourceChar)) {
   1707                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   1708 getTrail:
   1709                     /*look ahead to find the trail surrogate*/
   1710                     if(source < sourceLimit) {
   1711                         /* test the following code unit */
   1712                         UChar trail=(UChar) *source;
   1713                         if(U16_IS_TRAIL(trail)) {
   1714                             source++;
   1715                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
   1716                             cnv->fromUChar32=0x00;
   1717                             /* convert this supplementary code point */
   1718                             /* exit this condition tree */
   1719                         } else {
   1720                             /* this is an unmatched lead code unit (1st surrogate) */
   1721                             /* callback(illegal) */
   1722                             *err=U_ILLEGAL_CHAR_FOUND;
   1723                             cnv->fromUChar32=sourceChar;
   1724                             break;
   1725                         }
   1726                     } else {
   1727                         /* no more input */
   1728                         cnv->fromUChar32=sourceChar;
   1729                         break;
   1730                     }
   1731                 } else {
   1732                     /* this is an unmatched trail code unit (2nd surrogate) */
   1733                     /* callback(illegal) */
   1734                     *err=U_ILLEGAL_CHAR_FOUND;
   1735                     cnv->fromUChar32=sourceChar;
   1736                     break;
   1737                 }
   1738             }
   1739 
   1740             /* do not convert SO/SI/ESC */
   1741             if(IS_2022_CONTROL(sourceChar)) {
   1742                 /* callback(illegal) */
   1743                 *err=U_ILLEGAL_CHAR_FOUND;
   1744                 cnv->fromUChar32=sourceChar;
   1745                 break;
   1746             }
   1747 
   1748             /* do the conversion */
   1749 
   1750             if(choiceCount == 0) {
   1751                 uint16_t csm;
   1752 
   1753                 /*
   1754                  * The csm variable keeps track of which charsets are allowed
   1755                  * and not used yet while building the choices[].
   1756                  */
   1757                 csm = jpCharsetMasks[converterData->version];
   1758                 choiceCount = 0;
   1759 
   1760                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
   1761                 if(converterData->version == 3 || converterData->version == 4) {
   1762                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
   1763                 }
   1764                 /* Do not try single-byte half-width Katakana for other versions. */
   1765                 csm &= ~CSM(HWKANA_7BIT);
   1766 
   1767                 /* try the current G0 charset */
   1768                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
   1769                 csm &= ~CSM(cs);
   1770 
   1771                 /* try the current G2 charset */
   1772                 if((cs = pFromU2022State->cs[2]) != 0) {
   1773                     choices[choiceCount++] = cs;
   1774                     csm &= ~CSM(cs);
   1775                 }
   1776 
   1777                 /* try all the other possible charsets */
   1778                 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
   1779                     cs = (int8_t)jpCharsetPref[i];
   1780                     if(CSM(cs) & csm) {
   1781                         choices[choiceCount++] = cs;
   1782                         csm &= ~CSM(cs);
   1783                     }
   1784                 }
   1785             }
   1786 
   1787             cs = g = 0;
   1788             /*
   1789              * len==0: no mapping found yet
   1790              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
   1791              * len>0: found a roundtrip result, done
   1792              */
   1793             len = 0;
   1794             /*
   1795              * We will turn off useFallback after finding a fallback,
   1796              * but we still get fallbacks from PUA code points as usual.
   1797              * Therefore, we will also need to check that we don't overwrite
   1798              * an early fallback with a later one.
   1799              */
   1800             useFallback = cnv->useFallback;
   1801 
   1802             for(i = 0; i < choiceCount && len <= 0; ++i) {
   1803                 uint32_t value;
   1804                 int32_t len2;
   1805                 int8_t cs0 = choices[i];
   1806                 switch(cs0) {
   1807                 case ASCII:
   1808                     if(sourceChar <= 0x7f) {
   1809                         targetValue = (uint32_t)sourceChar;
   1810                         len = 1;
   1811                         cs = cs0;
   1812                         g = 0;
   1813                     }
   1814                     break;
   1815                 case ISO8859_1:
   1816                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
   1817                         targetValue = (uint32_t)sourceChar - 0x80;
   1818                         len = 1;
   1819                         cs = cs0;
   1820                         g = 2;
   1821                     }
   1822                     break;
   1823                 case HWKANA_7BIT:
   1824                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
   1825                         if(converterData->version==3) {
   1826                             /* JIS7: use G1 (SO) */
   1827                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
   1828                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
   1829                             len = 1;
   1830                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
   1831                             g = 1;
   1832                         } else if(converterData->version==4) {
   1833                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
   1834                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
   1835                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
   1836                             len = 1;
   1837 
   1838                             cs = pFromU2022State->cs[0];
   1839                             if(IS_JP_DBCS(cs)) {
   1840                                 /* switch from a DBCS charset to JISX201 */
   1841                                 cs = (int8_t)JISX201;
   1842                             }
   1843                             /* else stay in the current G0 charset */
   1844                             g = 0;
   1845                         }
   1846                         /* else do not use HWKANA_7BIT with other versions */
   1847                     }
   1848                     break;
   1849                 case JISX201:
   1850                     /* G0 SBCS */
   1851                     value = jisx201FromU(sourceChar);
   1852                     if(value <= 0x7f) {
   1853                         targetValue = value;
   1854                         len = 1;
   1855                         cs = cs0;
   1856                         g = 0;
   1857                         useFallback = FALSE;
   1858                     }
   1859                     break;
   1860                 case JISX208:
   1861                     /* G0 DBCS from Shift-JIS table */
   1862                     len2 = MBCS_FROM_UCHAR32_ISO2022(
   1863                                 converterData->myConverterArray[cs0],
   1864                                 sourceChar, &value,
   1865                                 useFallback, MBCS_OUTPUT_2);
   1866                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
   1867                         value = _2022FromSJIS(value);
   1868                         if(value != 0) {
   1869                             targetValue = value;
   1870                             len = len2;
   1871                             cs = cs0;
   1872                             g = 0;
   1873                             useFallback = FALSE;
   1874                         }
   1875                     } else if(len == 0 && useFallback &&
   1876                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
   1877                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
   1878                         len = -2;
   1879                         cs = cs0;
   1880                         g = 0;
   1881                         useFallback = FALSE;
   1882                     }
   1883                     break;
   1884                 case ISO8859_7:
   1885                     /* G0 SBCS forced to 7-bit output */
   1886                     len2 = MBCS_SINGLE_FROM_UCHAR32(
   1887                                 converterData->myConverterArray[cs0],
   1888                                 sourceChar, &value,
   1889                                 useFallback);
   1890                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
   1891                         targetValue = value - 0x80;
   1892                         len = len2;
   1893                         cs = cs0;
   1894                         g = 2;
   1895                         useFallback = FALSE;
   1896                     }
   1897                     break;
   1898                 default:
   1899                     /* G0 DBCS */
   1900                     len2 = MBCS_FROM_UCHAR32_ISO2022(
   1901                                 converterData->myConverterArray[cs0],
   1902                                 sourceChar, &value,
   1903                                 useFallback, MBCS_OUTPUT_2);
   1904                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
   1905                         if(cs0 == KSC5601) {
   1906                             /*
   1907                              * Check for valid bytes for the encoding scheme.
   1908                              * This is necessary because the sub-converter (windows-949)
   1909                              * has a broader encoding scheme than is valid for 2022.
   1910                              */
   1911                             value = _2022FromGR94DBCS(value);
   1912                             if(value == 0) {
   1913                                 break;
   1914                             }
   1915                         }
   1916                         targetValue = value;
   1917                         len = len2;
   1918                         cs = cs0;
   1919                         g = 0;
   1920                         useFallback = FALSE;
   1921                     }
   1922                     break;
   1923                 }
   1924             }
   1925 
   1926             if(len != 0) {
   1927                 if(len < 0) {
   1928                     len = -len;  /* fallback */
   1929                 }
   1930                 outLen = 0; /* count output bytes */
   1931 
   1932                 /* write SI if necessary (only for JIS7) */
   1933                 if(pFromU2022State->g == 1 && g == 0) {
   1934                     buffer[outLen++] = UCNV_SI;
   1935                     pFromU2022State->g = 0;
   1936                 }
   1937 
   1938                 /* write the designation sequence if necessary */
   1939                 if(cs != pFromU2022State->cs[g]) {
   1940                     int32_t escLen = escSeqCharsLen[cs];
   1941                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
   1942                     outLen += escLen;
   1943                     pFromU2022State->cs[g] = cs;
   1944 
   1945                     /* invalidate the choices[] */
   1946                     choiceCount = 0;
   1947                 }
   1948 
   1949                 /* write the shift sequence if necessary */
   1950                 if(g != pFromU2022State->g) {
   1951                     switch(g) {
   1952                     /* case 0 handled before writing escapes */
   1953                     case 1:
   1954                         buffer[outLen++] = UCNV_SO;
   1955                         pFromU2022State->g = 1;
   1956                         break;
   1957                     default: /* case 2 */
   1958                         buffer[outLen++] = 0x1b;
   1959                         buffer[outLen++] = 0x4e;
   1960                         break;
   1961                     /* no case 3: no SS3 in ISO-2022-JP-x */
   1962                     }
   1963                 }
   1964 
   1965                 /* write the output bytes */
   1966                 if(len == 1) {
   1967                     buffer[outLen++] = (char)targetValue;
   1968                 } else /* len == 2 */ {
   1969                     buffer[outLen++] = (char)(targetValue >> 8);
   1970                     buffer[outLen++] = (char)targetValue;
   1971                 }
   1972             } else {
   1973                 /*
   1974                  * if we cannot find the character after checking all codepages
   1975                  * then this is an error
   1976                  */
   1977                 *err = U_INVALID_CHAR_FOUND;
   1978                 cnv->fromUChar32=sourceChar;
   1979                 break;
   1980             }
   1981 
   1982             if(sourceChar == CR || sourceChar == LF) {
   1983                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
   1984                 pFromU2022State->cs[2] = 0;
   1985                 choiceCount = 0;
   1986             }
   1987 
   1988             /* output outLen>0 bytes in buffer[] */
   1989             if(outLen == 1) {
   1990                 *target++ = buffer[0];
   1991                 if(offsets) {
   1992                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
   1993                 }
   1994             } else if(outLen == 2 && (target + 2) <= targetLimit) {
   1995                 *target++ = buffer[0];
   1996                 *target++ = buffer[1];
   1997                 if(offsets) {
   1998                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
   1999                     *offsets++ = sourceIndex;
   2000                     *offsets++ = sourceIndex;
   2001                 }
   2002             } else {
   2003                 fromUWriteUInt8(
   2004                     cnv,
   2005                     buffer, outLen,
   2006                     &target, (const char *)targetLimit,
   2007                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
   2008                     err);
   2009                 if(U_FAILURE(*err)) {
   2010                     break;
   2011                 }
   2012             }
   2013         } /* end if(myTargetIndex<myTargetLength) */
   2014         else{
   2015             *err =U_BUFFER_OVERFLOW_ERROR;
   2016             break;
   2017         }
   2018 
   2019     }/* end while(mySourceIndex<mySourceLength) */
   2020 
   2021     /*
   2022      * the end of the input stream and detection of truncated input
   2023      * are handled by the framework, but for ISO-2022-JP conversion
   2024      * we need to be in ASCII mode at the very end
   2025      *
   2026      * conditions:
   2027      *   successful
   2028      *   in SO mode or not in ASCII mode
   2029      *   end of input and no truncated input
   2030      */
   2031     if( U_SUCCESS(*err) &&
   2032         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
   2033         args->flush && source>=sourceLimit && cnv->fromUChar32==0
   2034     ) {
   2035         int32_t sourceIndex;
   2036 
   2037         outLen = 0;
   2038 
   2039         if(pFromU2022State->g != 0) {
   2040             buffer[outLen++] = UCNV_SI;
   2041             pFromU2022State->g = 0;
   2042         }
   2043 
   2044         if(pFromU2022State->cs[0] != ASCII) {
   2045             int32_t escLen = escSeqCharsLen[ASCII];
   2046             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
   2047             outLen += escLen;
   2048             pFromU2022State->cs[0] = (int8_t)ASCII;
   2049         }
   2050 
   2051         /* get the source index of the last input character */
   2052         /*
   2053          * TODO this would be simpler and more reliable if we used a pair
   2054          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   2055          * so that we could simply use the prevSourceIndex here;
   2056          * this code gives an incorrect result for the rare case of an unmatched
   2057          * trail surrogate that is alone in the last buffer of the text stream
   2058          */
   2059         sourceIndex=(int32_t)(source-args->source);
   2060         if(sourceIndex>0) {
   2061             --sourceIndex;
   2062             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   2063                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   2064             ) {
   2065                 --sourceIndex;
   2066             }
   2067         } else {
   2068             sourceIndex=-1;
   2069         }
   2070 
   2071         fromUWriteUInt8(
   2072             cnv,
   2073             buffer, outLen,
   2074             &target, (const char *)targetLimit,
   2075             &offsets, sourceIndex,
   2076             err);
   2077     }
   2078 
   2079     /*save the state and return */
   2080     args->source = source;
   2081     args->target = (char*)target;
   2082 }
   2083 
   2084 /*************** to unicode *******************/
   2085 
   2086 static void U_CALLCONV
   2087 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   2088                                                UErrorCode* err){
            /*
             * Converts ISO-2022-JP-family input bytes (args->source) to UTF-16
             * (args->target), filling args->offsets when it is not NULL.
             *
             * The decoder state lives in myData->toU2022State: cs[] holds the charset
             * selected for each G set and g is the index of the active one.
             * A partial escape sequence (myData->key != 0) or a buffered DBCS lead
             * byte (toULength == 1) left over from a previous call is resumed first.
             *
             * On return args->source and args->target are advanced past what was
             * consumed/produced. *err is set to U_BUFFER_OVERFLOW_ERROR when the
             * target is full while input remains, to U_ILLEGAL_ESCAPE_SEQUENCE for
             * an empty segment in version 0, or to whatever changeState_2022() or
             * toUnicodeCallback() store in it.
             */
   2089     char tempBuf[2];
   2090     const char *mySource = (char *) args->source;
   2091     UChar *myTarget = args->target;
   2092     const char *mySourceLimit = args->sourceLimit;
   2093     uint32_t targetUniChar = 0x0000;
   2094     uint32_t mySourceChar = 0x0000;
   2095     uint32_t tmpSourceChar = 0x0000;
   2096     UConverterDataISO2022* myData;
   2097     ISO2022State *pToU2022State;
   2098     StateEnum cs;
   2099 
   2100     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2101     pToU2022State = &myData->toU2022State;
   2102 
   2103     if(myData->key != 0) {
   2104         /* continue with a partial escape sequence */
   2105         goto escape;
   2106     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   2107         /* continue with a partial double-byte character */
                /* the lead byte was stored in toUBytes[0] when the previous call ran out of input */
   2108         mySourceChar = args->converter->toUBytes[0];
   2109         args->converter->toULength = 0;
   2110         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
   2111         targetUniChar = missingCharMarker;
   2112         goto getTrailByte;
   2113     }
   2114 
   2115     while(mySource < mySourceLimit){
   2116 
   2117         targetUniChar =missingCharMarker;
   2118 
   2119         if(myTarget < args->targetLimit){
   2120 
   2121             mySourceChar= (unsigned char) *mySource++;
   2122 
   2123             switch(mySourceChar) {
   2124             case UCNV_SI:
   2125                 if(myData->version==3) {
                            /* shift in: make G0 the active set again */
   2126                     pToU2022State->g=0;
   2127                     continue;
   2128                 } else {
   2129                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
   2130                     myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
   2131                     break;
   2132                 }
   2133 
   2134             case UCNV_SO:
   2135                 if(myData->version==3) {
   2136                     /* JIS7: switch to G1 half-width Katakana */
   2137                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
   2138                     pToU2022State->g=1;
   2139                     continue;
   2140                 } else {
   2141                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
   2142                     myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
   2143                     break;
   2144                 }
   2145 
   2146             case ESC_2022:
                        /* step back so that mySource points at the ESC byte again */
   2147                 mySource--;
   2148 escape:
   2149                 {
                            /*
                             * remember the position and the number of already buffered bytes so
                             * that an empty-segment error can report every byte of the sequence
                             */
   2150                     const char * mySourceBefore = mySource;
   2151                     int8_t toULengthBefore = args->converter->toULength;
   2152 
   2153                     changeState_2022(args->converter,&(mySource),
   2154                         mySourceLimit, ISO_2022_JP,err);
   2155 
   2156                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
   2157                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
   2158                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   2159                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
   2160                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
   2161                     }
   2162                 }
   2163 
   2164                 /* invalid or illegal escape sequence */
   2165                 if(U_FAILURE(*err)){
   2166                     args->target = myTarget;
   2167                     args->source = mySource;
   2168                     myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
   2169                     return;
   2170                 }
   2171                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
   2172                 if(myData->key==0) {
   2173                     myData->isEmptySegment = TRUE;
   2174                 }
   2175                 continue;
   2176 
   2177             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
   2178 
   2179             case CR:
   2180             case LF:
   2181                 /* automatically reset to single-byte mode */
   2182                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
   2183                     pToU2022State->cs[0] = (int8_t)ASCII;
   2184                 }
   2185                 pToU2022State->cs[2] = 0;
   2186                 pToU2022State->g = 0;
                        /* the CR/LF byte itself is converted by the default case below */
   2187                 U_FALLTHROUGH;
   2188             default:
   2189                 /* convert one or two bytes */
   2190                 myData->isEmptySegment = FALSE;
   2191                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                        /* (uint8_t)(c - 0xa1) <= (0xdf - 0xa1) is a one-comparison test for 0xa1 <= c <= 0xdf */
   2192                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
   2193                     !IS_JP_DBCS(cs)
   2194                 ) {
   2195                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
   2196                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
   2197 
   2198                     /* return from a single-shift state to the previous one */
   2199                     if(pToU2022State->g >= 2) {
   2200                         pToU2022State->g=pToU2022State->prevG;
   2201                     }
   2202                 } else switch(cs) {
   2203                 case ASCII:
   2204                     if(mySourceChar <= 0x7f) {
   2205                         targetUniChar = mySourceChar;
   2206                     }
   2207                     break;
   2208                 case ISO8859_1:
   2209                     if(mySourceChar <= 0x7f) {
                                /* the 7-bit byte stands for the code point 0x80 higher */
   2210                         targetUniChar = mySourceChar + 0x80;
   2211                     }
   2212                     /* return from a single-shift state to the previous one */
   2213                     pToU2022State->g=pToU2022State->prevG;
   2214                     break;
   2215                 case ISO8859_7:
   2216                     if(mySourceChar <= 0x7f) {
   2217                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
   2218                         targetUniChar =
   2219                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
   2220                                 myData->myConverterArray[cs],
   2221                                 mySourceChar + 0x80);
   2222                     }
   2223                     /* return from a single-shift state to the previous one */
   2224                     pToU2022State->g=pToU2022State->prevG;
   2225                     break;
   2226                 case JISX201:
   2227                     if(mySourceChar <= 0x7f) {
   2228                         targetUniChar = jisx201ToU(mySourceChar);
   2229                     }
   2230                     break;
   2231                 case HWKANA_7BIT:
   2232                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
   2233                         /* 7-bit halfwidth Katakana */
   2234                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
   2235                     }
   2236                     break;
   2237                 default:
   2238                     /* G0 DBCS */
   2239                     if(mySource < mySourceLimit) {
   2240                         int leadIsOk, trailIsOk;
   2241                         uint8_t trailByte;
   2242 getTrailByte:
                                /* peek at the trail byte; it is consumed only in the branches below */
   2243                         trailByte = (uint8_t)*mySource;
   2244                         /*
   2245                          * Ticket 5691: consistent illegal sequences:
   2246                          * - We include at least the first byte in the illegal sequence.
   2247                          * - If any of the non-initial bytes could be the start of a character,
   2248                          *   we stop the illegal sequence before the first one of those.
   2249                          *
   2250                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   2251                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   2252                          * Otherwise we convert or report the pair of bytes.
   2253                          */
   2254                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   2255                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   2256                         if (leadIsOk && trailIsOk) {
   2257                             ++mySource;
   2258                             tmpSourceChar = (mySourceChar << 8) | trailByte;
   2259                             if(cs == JISX208) {
                                        /* tempBuf receives the byte pair produced by _2022ToSJIS() for the table lookup */
   2260                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
   2261                                 mySourceChar = tmpSourceChar;
   2262                             } else {
   2263                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
   2264                                 mySourceChar = tmpSourceChar;
   2265                                 if (cs == KSC5601) {
   2266                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
   2267                                 }
   2268                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
   2269                                 tempBuf[1] = (char)(tmpSourceChar);
   2270                             }
   2271                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
   2272                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   2273                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   2274                             ++mySource;
   2275                             /* add another bit so that the code below writes 2 bytes in case of error */
   2276                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   2277                         }
   2278                     } else {
                                /*
                                 * the input ends after the lead byte: buffer it so that the next
                                 * call can resume at getTrailByte (see the toULength == 1 check above)
                                 */
   2279                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2280                         args->converter->toULength = 1;
   2281                         goto endloop;
   2282                     }
   2283                 }  /* End of inner switch */
   2284                 break;
   2285             }  /* End of outer switch */
                    /*
                     * Dispatch on the conversion result: values below 0xfffe are written as a
                     * single UChar, values above missingCharMarker are supplementary code
                     * points, and everything else is handed to toUnicodeCallback().
                     */
   2286             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
   2287                 if(args->offsets){
                            /* offset of the character's first byte: 1 or 2 bytes before mySource */
   2288                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2289                 }
   2290                 *(myTarget++)=(UChar)targetUniChar;
   2291             }
   2292             else if(targetUniChar > missingCharMarker){
   2293                 /* disassemble the surrogate pair and write to output*/
   2294                 targetUniChar-=0x0010000;
   2295                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
   2296                 if(args->offsets){
   2297                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2298                 }
   2299                 ++myTarget;
   2300                 if(myTarget< args->targetLimit){
   2301                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   2302                     if(args->offsets){
   2303                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2304                     }
   2305                     ++myTarget;
   2306                 }else{
                            /*
                             * no room for the trail surrogate: store it in the converter's
                             * UChar overflow buffer.
                             * NOTE(review): *err is not set on this path; presumably the overflow
                             * is reported by the next loop iteration or by the caller - confirm.
                             */
   2307                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
   2308                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   2309                 }
   2310 
   2311             }
   2312             else{
   2313                 /* Call the callback function*/
   2314                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   2315                 break;
   2316             }
   2317         }
   2318         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
   2319             *err =U_BUFFER_OVERFLOW_ERROR;
   2320             break;
   2321         }
   2322     }
   2323 endloop:
            /* report how far the source and target pointers advanced */
   2324     args->target = myTarget;
   2325     args->source = mySource;
   2326 }
   2327 
   2328 
   2329 #if !UCONFIG_ONLY_HTML_CONVERSION
   2330 /***************************************************************
   2331 *   Rules for ISO-2022-KR encoding
   2332 *   i) The KSC5601 designator sequence should appear only once in a file,
   2333 *      at the beginning of a line before any KSC5601 characters. This usually
   2334 *      means that it appears by itself on the first line of the file
   2335 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
   2336 *      and SI to shift into single byte mode
   2337 */
   2338 static void U_CALLCONV
   2339 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
   2340     /* Delegates the conversion to the MBCS sub-converter kept in the ISO-2022 data. */
   2341     UConverter *publicCnv = args->converter;
   2342     UConverter *subCnv = ((UConverterDataISO2022 *)publicCnv->extraInfo)->currentConverter;
   2343 
   2344     /* hand the pending code point to the sub-converter, convert, then take it back */
   2345     subCnv->fromUChar32 = publicCnv->fromUChar32;
   2346     args->converter = subCnv;
   2347     ucnv_MBCSFromUnicodeWithOffsets(args, err);
   2348     publicCnv->fromUChar32 = subCnv->fromUChar32;
   2349 
   2350     /* on overflow, move the bytes that did not fit over to the public converter */
   2351     if(*err == U_BUFFER_OVERFLOW_ERROR) {
   2352         if(subCnv->charErrorBufferLength > 0) {
   2353             uprv_memcpy(publicCnv->charErrorBuffer, subCnv->charErrorBuffer, subCnv->charErrorBufferLength);
   2354         }
   2355         publicCnv->charErrorBufferLength = subCnv->charErrorBufferLength;
   2356         subCnv->charErrorBufferLength = 0;
   2357     }
   2358 
   2359     args->converter = publicCnv;  /* callers must see the public converter again */
   2360 }
   2361 
   2362 static void U_CALLCONV
   2363 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
            /*
             * Converts UTF-16 (args->source) to ISO-2022-KR bytes (args->target),
             * filling args->offsets when it is not NULL.
             *
             * Version 1 is delegated entirely to the _IBM variant. Otherwise each code
             * point is looked up with MBCS_FROM_UCHAR32_ISO2022(); an SO or SI byte is
             * written whenever the output switches between double-byte and single-byte
             * mode. The current mode is kept in converter->fromUnicodeStatus across
             * calls, and a pending lead surrogate in converter->fromUChar32.
             *
             * *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full,
             * U_ILLEGAL_CHAR_FOUND for SO/SI/ESC input and unpaired surrogates, and
             * U_INVALID_CHAR_FOUND for code points that have no usable mapping.
             */
   2364 
   2365     const UChar *source = args->source;
   2366     const UChar *sourceLimit = args->sourceLimit;
   2367     unsigned char *target = (unsigned char *) args->target;
   2368     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
   2369     int32_t* offsets = args->offsets;
   2370     uint32_t targetByteUnit = 0x0000;
   2371     UChar32 sourceChar = 0x0000;
   2372     UBool isTargetByteDBCS;
   2373     UBool oldIsTargetByteDBCS;
   2374     UConverterDataISO2022 *converterData;
   2375     UConverterSharedData* sharedData;
   2376     UBool useFallback;
   2377     int32_t length =0;
   2378 
   2379     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
   2380     /* if the version is 1 then the user is requesting
   2381      * conversion with ibm-25546 pass the arguments to
   2382      * MBCS converter and return
   2383      */
   2384     if(converterData->version==1){
   2385         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
   2386         return;
   2387     }
   2388 
   2389     /* initialize data */
   2390     sharedData = converterData->currentConverter->sharedData;
   2391     useFallback = args->converter->useFallback;
   2392     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
   2393     oldIsTargetByteDBCS = isTargetByteDBCS;
   2394 
            /* NOTE(review): fromUnicodeStatus was already read a few lines above; this second read is redundant */
   2395     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
            /* a lead surrogate left over from the previous call: go straight to the trail lookup */
   2396     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
   2397         goto getTrail;
   2398     }
   2399     while(source < sourceLimit){
   2400 
   2401         targetByteUnit = missingCharMarker;
   2402 
   2403         if(target < (unsigned char*) args->targetLimit){
   2404             sourceChar = *source++;
   2405 
   2406             /* do not convert SO/SI/ESC */
   2407             if(IS_2022_CONTROL(sourceChar)) {
   2408                 /* callback(illegal) */
   2409                 *err=U_ILLEGAL_CHAR_FOUND;
   2410                 args->converter->fromUChar32=sourceChar;
   2411                 break;
   2412             }
   2413 
   2414             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
   2415             if(length < 0) {
   2416                 length = -length;  /* fallback */
   2417             }
   2418             /* only DBCS or SBCS characters are expected*/
   2419             /* DB characters with high bit set to 1 are expected */
                    /* accepted results: one byte <= 0x7f, or two bytes that are each in 0xa1..0xfe */
   2420             if( length > 2 || length==0 ||
   2421                 (length == 1 && targetByteUnit > 0x7f) ||
   2422                 (length == 2 &&
   2423                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
   2424                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
   2425             ) {
   2426                 targetByteUnit=missingCharMarker;
   2427             }
   2428             if (targetByteUnit != missingCharMarker){
   2429 
   2430                 oldIsTargetByteDBCS = isTargetByteDBCS;
   2431                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
   2432                   /* append the shift sequence */
   2433                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
   2434 
                            /* the enclosing target < targetLimit check guarantees room for this one byte */
   2435                     if (isTargetByteDBCS)
   2436                         *target++ = UCNV_SO;
   2437                     else
   2438                         *target++ = UCNV_SI;
   2439                     if(offsets)
   2440                         *(offsets++) = (int32_t)(source - args->source-1);
   2441                 }
   2442                 /* write the targetUniChar  to target */
                        /* bytes that do not fit go to charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR */
   2443                 if(targetByteUnit <= 0x00FF){
   2444                     if( target < targetLimit){
   2445                         *(target++) = (unsigned char) targetByteUnit;
   2446                         if(offsets){
   2447                             *(offsets++) = (int32_t)(source - args->source-1);
   2448                         }
   2449 
   2450                     }else{
   2451                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
   2452                         *err = U_BUFFER_OVERFLOW_ERROR;
   2453                     }
   2454                 }else{
                            /* each byte was validated as 0xa1..0xfe above; subtracting 0x80 yields 0x21..0x7e */
   2455                     if(target < targetLimit){
   2456                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
   2457                         if(offsets){
   2458                             *(offsets++) = (int32_t)(source - args->source-1);
   2459                         }
   2460                         if(target < targetLimit){
   2461                             *(target++) =(unsigned char) (targetByteUnit -0x80);
   2462                             if(offsets){
   2463                                 *(offsets++) = (int32_t)(source - args->source-1);
   2464                             }
   2465                         }else{
   2466                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
   2467                             *err = U_BUFFER_OVERFLOW_ERROR;
   2468                         }
   2469                     }else{
   2470                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
   2471                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
   2472                         *err = U_BUFFER_OVERFLOW_ERROR;
   2473                     }
   2474                 }
   2475 
   2476             }
   2477             else{
   2478                 /* oops.. the code point is unassigned
   2479                  * set the error and reason
   2480                  */
   2481 
   2482                 /*check if the char is a First surrogate*/
   2483                 if(U16_IS_SURROGATE(sourceChar)) {
   2484                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   2485 getTrail:
   2486                         /*look ahead to find the trail surrogate*/
   2487                         if(source <  sourceLimit) {
   2488                             /* test the following code unit */
   2489                             UChar trail=(UChar) *source;
   2490                             if(U16_IS_TRAIL(trail)) {
   2491                                 source++;
   2492                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                        /* the assembled supplementary code point is always reported as unassigned */
   2493                                 *err = U_INVALID_CHAR_FOUND;
   2494                                 /* convert this surrogate code point */
   2495                                 /* exit this condition tree */
   2496                             } else {
   2497                                 /* this is an unmatched lead code unit (1st surrogate) */
   2498                                 /* callback(illegal) */
   2499                                 *err=U_ILLEGAL_CHAR_FOUND;
   2500                             }
   2501                         } else {
   2502                             /* no more input */
                                    /* not an error: the lead surrogate is kept in fromUChar32 below for the next call */
   2503                             *err = U_ZERO_ERROR;
   2504                         }
   2505                     } else {
   2506                         /* this is an unmatched trail code unit (2nd surrogate) */
   2507                         /* callback(illegal) */
   2508                         *err=U_ILLEGAL_CHAR_FOUND;
   2509                     }
   2510                 } else {
   2511                     /* callback(unassigned) for a BMP code point */
   2512                     *err = U_INVALID_CHAR_FOUND;
   2513                 }
   2514 
   2515                 args->converter->fromUChar32=sourceChar;
   2516                 break;
   2517             }
   2518         } /* end if(target < targetLimit) */
   2519         else{
   2520             *err =U_BUFFER_OVERFLOW_ERROR;
   2521             break;
   2522         }
   2523 
   2524     }/* end while(source < sourceLimit) */
   2525 
   2526     /*
   2527      * the end of the input stream and detection of truncated input
   2528      * are handled by the framework, but for ISO-2022-KR conversion
   2529      * we need to be in ASCII mode at the very end
   2530      *
   2531      * conditions:
   2532      *   successful
   2533      *   not in ASCII mode
   2534      *   end of input and no truncated input
   2535      */
   2536     if( U_SUCCESS(*err) &&
   2537         isTargetByteDBCS &&
   2538         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
   2539     ) {
   2540         int32_t sourceIndex;
   2541 
   2542         /* we are switching to ASCII */
   2543         isTargetByteDBCS=FALSE;
   2544 
   2545         /* get the source index of the last input character */
   2546         /*
   2547          * TODO this would be simpler and more reliable if we used a pair
   2548          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   2549          * so that we could simply use the prevSourceIndex here;
   2550          * this code gives an incorrect result for the rare case of an unmatched
   2551          * trail surrogate that is alone in the last buffer of the text stream
   2552          */
   2553         sourceIndex=(int32_t)(source-args->source);
   2554         if(sourceIndex>0) {
   2555             --sourceIndex;
                    /* step back one more unit when the last code unit is a trail surrogate (see the TODO above) */
   2556             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   2557                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   2558             ) {
   2559                 --sourceIndex;
   2560             }
   2561         } else {
                    /* nothing was consumed from this buffer, so there is no source index to report */
   2562             sourceIndex=-1;
   2563         }
   2564 
                /* write the single shift-in byte; fromUWriteUInt8() receives the target limit and err */
   2565         fromUWriteUInt8(
   2566             args->converter,
   2567             SHIFT_IN_STR, 1,
   2568             &target, (const char *)targetLimit,
   2569             &offsets, sourceIndex,
   2570             err);
   2571     }
   2572 
   2573     /*save the state and return */
   2574     args->source = source;
   2575     args->target = (char*)target;
   2576     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
   2577 }
   2578 
   2579 /************************ To Unicode ***************************************/
   2580 
   2581 static void U_CALLCONV
   2582 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
   2583                                                             UErrorCode* err){
            /*
             * Converts ISO-2022-KR input to UTF-16 by handing each run of bytes between
             * escape sequences to the MBCS sub-converter (myData->currentConverter) and
             * processing the escape sequences themselves with changeState_2022().
             * Presumably the toUnicode counterpart of the fromUnicode _IBM variant used
             * for version 1 - confirm at the call site.
             *
             * Partial input bytes and the UChar overflow buffer are copied between the
             * public converter and the sub-converter around each sub-conversion, and
             * offsets written by the sub-converter are rebased onto the original start
             * of the input.
             */
   2584     char const* sourceStart;
   2585     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2586 
   2587     UConverterToUnicodeArgs subArgs;
   2588     int32_t minArgsSize;
   2589 
   2590     /* set up the subconverter arguments */
            /* copy no more than the smaller of the caller's struct size and this build's struct size */
   2591     if(args->size<sizeof(UConverterToUnicodeArgs)) {
   2592         minArgsSize = args->size;
   2593     } else {
   2594         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
   2595     }
   2596 
   2597     uprv_memcpy(&subArgs, args, minArgsSize);
   2598     subArgs.size = (uint16_t)minArgsSize;
   2599     subArgs.converter = myData->currentConverter;
   2600 
   2601     /* remember the original start of the input for offsets */
   2602     sourceStart = args->source;
   2603 
   2604     if(myData->key != 0) {
   2605         /* continue with a partial escape sequence */
   2606         goto escape;
   2607     }
   2608 
   2609     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
   2610         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
   2611         subArgs.source = args->source;
   2612         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
                /* an empty run (source == sourceLimit) skips the sub-conversion entirely */
   2613         if(subArgs.source != subArgs.sourceLimit) {
   2614             /*
   2615              * get the current partial byte sequence
   2616              *
   2617              * it needs to be moved between the public and the subconverter
   2618              * so that the conversion framework, which only sees the public
   2619              * converter, can handle truncated and illegal input etc.
   2620              */
   2621             if(args->converter->toULength > 0) {
   2622                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
   2623             }
   2624             subArgs.converter->toULength = args->converter->toULength;
   2625 
   2626             /*
   2627              * Convert up to the end of the input, or to before the next escape character.
   2628              * Does not handle conversion extensions because the preToU[] state etc.
   2629              * is not copied.
   2630              */
   2631             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
   2632 
   2633             if(args->offsets != NULL && sourceStart != args->source) {
   2634                 /* update offsets to base them on the actual start of the input */
   2635                 int32_t *offsets = args->offsets;
   2636                 UChar *target = args->target;
   2637                 int32_t delta = (int32_t)(args->source - sourceStart);
                        /* walk only the UChars written by this sub-conversion; negative offsets stay unchanged */
   2638                 while(target < subArgs.target) {
   2639                     if(*offsets >= 0) {
   2640                         *offsets += delta;
   2641                     }
   2642                     ++offsets;
   2643                     ++target;
   2644                 }
   2645             }
                    /* publish the sub-converter's progress through the public arguments */
   2646             args->source = subArgs.source;
   2647             args->target = subArgs.target;
   2648             args->offsets = subArgs.offsets;
   2649 
   2650             /* copy input/error/overflow buffers */
   2651             if(subArgs.converter->toULength > 0) {
   2652                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
   2653             }
   2654             args->converter->toULength = subArgs.converter->toULength;
   2655 
   2656             if(*err == U_BUFFER_OVERFLOW_ERROR) {
   2657                 if(subArgs.converter->UCharErrorBufferLength > 0) {
   2658                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
   2659                                 subArgs.converter->UCharErrorBufferLength);
   2660                 }
   2661                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
   2662                 subArgs.converter->UCharErrorBufferLength = 0;
   2663             }
   2664         }
   2665 
                /* stop on any error, or when the whole input has been consumed */
   2666         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
   2667             return;
   2668         }
   2669 
   2670 escape:
                /* input remains; pass the current position to the escape-sequence handler (also the resume point for a partial escape) */
   2671         changeState_2022(args->converter,
   2672                &(args->source),
   2673                args->sourceLimit,
   2674                ISO_2022_KR,
   2675                err);
   2676     }
   2677 }
   2678 
   2679 static void U_CALLCONV
   2680 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   2681                                                             UErrorCode* err){
            /*
             * ISO-2022-KR toUnicode conversion with optional offsets.
             *
             * args: source/sourceLimit delimit the input bytes, target/targetLimit the
             *       UTF-16 output. When args->offsets is non-NULL, each output unit gets
             *       the index, relative to args->source on entry, of the first byte of
             *       the sequence it was converted from.
             * err:  set to U_BUFFER_OVERFLOW_ERROR when the target is full while input
             *       remains, to U_ILLEGAL_ESCAPE_SEQUENCE for an SO immediately followed
             *       by SI (empty segment), or to whatever changeState_2022() or
             *       toUnicodeCallback() report.
             *
             * Converters with version==1 are delegated to the _IBM variant. Otherwise
             * the bytes are decoded here: SI/SO switch toU2022State.g between 0 and 1,
             * ESC is passed to changeState_2022(), and data bytes are looked up through
             * the shared data of myData->currentConverter. A pending escape sequence
             * (myData->key != 0) or a pending lead byte (toULength == 1) from an earlier
             * call is resumed first. args->source and args->target are written back on
             * every exit path of this non-IBM code.
             */
   2682     char tempBuf[2];
   2683     const char *mySource = ( char *) args->source;
   2684     UChar *myTarget = args->target;
   2685     const char *mySourceLimit = args->sourceLimit;
   2686     UChar32 targetUniChar = 0x0000;
            /*
             * Must be wider than 16 bits: the illegal-pair path below sets bit 16
             * (0x10000) as a "two bytes" marker. In a 16-bit UChar that bit was
             * truncated, so a 0x00 lead byte left a value <= 0xff and the marker
             * was lost.
             */
   2687     uint32_t mySourceChar = 0x0000;
   2688     UConverterDataISO2022* myData;
   2689     UConverterSharedData* sharedData ;
   2690     UBool useFallback;
   2691 
   2692     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   2693     if(myData->version==1){
   2694         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
   2695         return;
   2696     }
   2697 
   2698     /* initialize state */
   2699     sharedData = myData->currentConverter->sharedData;
   2700     useFallback = args->converter->useFallback;
   2701 
   2702     if(myData->key != 0) {
   2703         /* continue with a partial escape sequence */
   2704         goto escape;
   2705     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   2706         /* continue with a partial double-byte character */
   2707         mySourceChar = args->converter->toUBytes[0];
   2708         args->converter->toULength = 0;
   2709         goto getTrailByte;
   2710     }
   2711 
   2712     while(mySource< mySourceLimit){
   2713 
   2714         if(myTarget < args->targetLimit){
   2715 
   2716             mySourceChar= (unsigned char) *mySource++;
   2717 
   2718             if(mySourceChar==UCNV_SI){
   2719                 myData->toU2022State.g = 0;
   2720                 if (myData->isEmptySegment) {
   2721                     myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
   2722                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   2723                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
   2724                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2725                     args->converter->toULength = 1;
   2726                     args->target = myTarget;
   2727                     args->source = mySource;
   2728                     return;
   2729                 }
   2730                 /*consume the source */
   2731                 continue;
   2732             }else if(mySourceChar==UCNV_SO){
   2733                 myData->toU2022State.g = 1;
   2734                 myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
   2735                 /*consume the source */
   2736                 continue;
   2737             }else if(mySourceChar==ESC_2022){
                        /* back up so that changeState_2022() sees the ESC byte itself */
   2738                 mySource--;
   2739 escape:
   2740                 myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
   2741                 changeState_2022(args->converter,&(mySource),
   2742                                 mySourceLimit, ISO_2022_KR, err);
   2743                 if(U_FAILURE(*err)){
   2744                     args->target = myTarget;
   2745                     args->source = mySource;
   2746                     return;
   2747                 }
   2748                 continue;
   2749             }
   2750 
   2751             myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
   2752             if(myData->toU2022State.g == 1) {
                        /* SO mode: mySourceChar is the lead byte of a double-byte character */
   2753                 if(mySource < mySourceLimit) {
   2754                     int leadIsOk, trailIsOk;
   2755                     uint8_t trailByte;
   2756 getTrailByte:
   2757                     targetUniChar = missingCharMarker;
   2758                     trailByte = (uint8_t)*mySource;
   2759                     /*
   2760                      * Ticket 5691: consistent illegal sequences:
   2761                      * - We include at least the first byte in the illegal sequence.
   2762                      * - If any of the non-initial bytes could be the start of a character,
   2763                      *   we stop the illegal sequence before the first one of those.
   2764                      *
   2765                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   2766                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   2767                      * Otherwise we convert or report the pair of bytes.
   2768                      */
   2769                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   2770                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   2771                     if (leadIsOk && trailIsOk) {
   2772                         ++mySource;
                                /* the lookup table is addressed with the high bit set on both bytes */
   2773                         tempBuf[0] = (char)(mySourceChar + 0x80);
   2774                         tempBuf[1] = (char)(trailByte + 0x80);
   2775                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
   2776                         mySourceChar = (mySourceChar << 8) | trailByte;
   2777                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   2778                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   2779                         ++mySource;
   2780                         /* add another bit so that the code below writes 2 bytes in case of error */
   2781                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   2782                     }
   2783                 } else {
                            /* lead byte at the end of the input: save it for the next call */
   2784                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   2785                     args->converter->toULength = 1;
   2786                     break;
   2787                 }
   2788             }
   2789             else if(mySourceChar <= 0x7f) {
                        /* SI mode: single byte, looked up directly from the source buffer */
   2790                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
   2791             } else {
   2792                 targetUniChar = 0xffff;
   2793             }
   2794             if(targetUniChar < 0xfffe){
   2795                 if(args->offsets) {
                            /* the character started 1 byte back (single byte) or 2 bytes back (pair) */
   2796                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   2797                 }
   2798                 *(myTarget++)=(UChar)targetUniChar;
   2799             }
   2800             else {
   2801                 /* Call the callback function*/
   2802                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   2803                 break;
   2804             }
   2805         }
   2806         else{
   2807             *err =U_BUFFER_OVERFLOW_ERROR;
   2808             break;
   2809         }
   2810     }
   2811     args->target = myTarget;
   2812     args->source = mySource;
   2813 }
   2814 
   2815 /*************************** END ISO2022-KR *********************************/
   2816 
   2817 /*************************** ISO-2022-CN *********************************
   2818 *
   2819 * Rules for ISO-2022-CN Encoding:
   2820 * i)   The designator sequence must appear once on a line before any instance
   2821 *      of character set it designates.
   2822 * ii)  If two lines contain characters from the same character set, both lines
   2823 *      must include the designator sequence.
   2824 * iii) Once the designator sequence is known, a shifting sequence has to be found
   2825 *      to invoke the designated character set
   2826 * iv)  All lines start in ASCII and end in ASCII.
   2827 * v)   Four shifting sequences are employed for this purpose:
   2828 *
   2829 *      Sequence    ASCII Eq    Charsets
   2830 *      ----------  -------    ---------
   2831 *      SI           <SI>        US-ASCII
   2832 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
   2833 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
   2834 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
   2835 *
   2836 * vi)
   2837 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
   2838 *      SS2designator : ESC "$" "*" finalchar_for_SS2
   2839 *      SS3designator : ESC "$" "+" finalchar_for_SS3
   2840 *
   2841 *      ESC $ ) A       Indicates the bytes following SO are Chinese
   2842 *       characters as defined in GB 2312-80, until
   2843 *       another SOdesignation appears
   2844 *
   2845 *
   2846 *      ESC $ ) E       Indicates the bytes following SO are as defined
   2847 *       in ISO-IR-165 (for details, see section 2.1),
   2848 *       until another SOdesignation appears
   2849 *
   2850 *      ESC $ ) G       Indicates the bytes following SO are as defined
   2851 *       in CNS 11643-plane-1, until another
   2852 *       SOdesignation appears
   2853 *
   2854 *      ESC $ * H       Indicates the two bytes immediately following
   2855 *       SS2 is a Chinese character as defined in CNS
   2856 *       11643-plane-2, until another SS2designation
   2857 *       appears
   2858 *       (Meaning <ESC>N must precede every 2 byte
   2859 *        sequence.)
   2860 *
   2861 *      ESC $ + I       Indicates the immediate two bytes following SS3
   2862 *       is a Chinese character as defined in CNS
   2863 *       11643-plane-3, until another SS3designation
   2864 *       appears
   2865 *       (Meaning <ESC>O must precede every 2 byte
   2866 *        sequence.)
   2867 *
   2868 *      ESC $ + J       Indicates the immediate two bytes following SS3
   2869 *       is a Chinese character as defined in CNS
   2870 *       11643-plane-4, until another SS3designation
   2871 *       appears
   2872 *       (In English: <ESC>O must precede every 2 byte
   2873 *        sequence.)
   2874 *
   2875 *      ESC $ + K       Indicates the immediate two bytes following SS3
   2876 *       is a Chinese character as defined in CNS
   2877 *       11643-plane-5, until another SS3designation
   2878 *       appears
   2879 *
   2880 *      ESC $ + L       Indicates the immediate two bytes following SS3
   2881 *       is a Chinese character as defined in CNS
   2882 *       11643-plane-6, until another SS3designation
   2883 *       appears
   2884 *
   2885 *      ESC $ + M       Indicates the immediate two bytes following SS3
   2886 *       is a Chinese character as defined in CNS
   2887 *       11643-plane-7, until another SS3designation
   2888 *       appears
   2889 *
   2890 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
   2891 *       has its own designation information before any Chinese characters
   2892 *       appear
   2893 *
   2894 */
   2895 
   2896 /* The following are defined this way to make the strings truly readonly */
   2897 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
   2898 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
   2899 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
   2900 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
   2901 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
   2902 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
   2903 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
   2904 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
   2905 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
   2906 
   2907 /********************** ISO2022-CN Data **************************/
   2908 static const char* const escSeqCharsCN[10] ={
            /*
             * Byte sequences emitted by the ISO-2022-CN encoder, one per array
             * position. The fromUnicode function copies 4 bytes from the entry it
             * selects when it writes a designation sequence.
             */
   2909         SHIFT_IN_STR,                   /* 0 ASCII */
   2910         GB_2312_80_STR,                 /* 1 GB2312_1 */
   2911         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
   2912         CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643-1992 plane 1 */
   2913         CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643-1992 plane 2 */
   2914         CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643-1992 plane 3 */
   2915         CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643-1992 plane 4 */
   2916         CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643-1992 plane 5 */
   2917         CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643-1992 plane 6 */
   2918         CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643-1992 plane 7 */
   2919 };
   2920 
   2921 static void U_CALLCONV
   2922 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
            /*
             * ISO-2022-CN family fromUnicode conversion with optional offsets.
             *
             * args: source/sourceLimit delimit the UTF-16 input, target/targetLimit the
             *       byte output. When args->offsets is non-NULL, each output byte gets
             *       the index, relative to args->source on entry, of the code point that
             *       produced it.
             * err:  set to U_ILLEGAL_CHAR_FOUND for unpaired surrogates and for
             *       SO/SI/ESC in the input, to U_INVALID_CHAR_FOUND when no charset maps
             *       the code point, and to U_BUFFER_OVERFLOW_ERROR when the target is
             *       full. The offending code point is left in cnv->fromUChar32.
             *
             * ASCII is written directly, preceded by SI if the output is currently
             * shifted; CR and LF reset the whole fromUnicode state. Other code points
             * are tried against an ordered list of charsets in choices[]. A designation
             * escape is written when the charset for the chosen G-set changes, then a
             * shift (SO, ESC N or ESC O) when needed, then the two data bytes. At the
             * end of a flushed stream an SI is appended if the output is still shifted.
             */
   2923     UConverter *cnv = args->converter;
   2924     UConverterDataISO2022 *converterData;
   2925     ISO2022State *pFromU2022State;
   2926     uint8_t *target = (uint8_t *) args->target;
   2927     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
   2928     const UChar* source = args->source;
   2929     const UChar* sourceLimit = args->sourceLimit;
   2930     int32_t* offsets = args->offsets;
   2931     UChar32 sourceChar;
   2932     char buffer[8];
   2933     int32_t len;
   2934     int8_t choices[3];
   2935     int32_t choiceCount;
   2936     uint32_t targetValue = 0;
   2937     UBool useFallback;
   2938 
   2939     /* set up the state */
   2940     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
   2941     pFromU2022State   = &converterData->fromU2022State;
   2942 
            /* 0 means "choices[] must be (re)built before the next non-ASCII lookup" */
   2943     choiceCount = 0;
   2944 
   2945     /* check if the last codepoint of previous buffer was a lead surrogate*/
   2946     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
   2947         goto getTrail;
   2948     }
   2949 
   2950     while( source < sourceLimit){
   2951         if(target < targetLimit){
   2952 
   2953             sourceChar  = *(source++);
   2954             /*check if the char is a First surrogate*/
   2955              if(U16_IS_SURROGATE(sourceChar)) {
   2956                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
   2957 getTrail:
   2958                     /*look ahead to find the trail surrogate*/
   2959                     if(source < sourceLimit) {
   2960                         /* test the following code unit */
   2961                         UChar trail=(UChar) *source;
   2962                         if(U16_IS_TRAIL(trail)) {
   2963                             source++;
   2964                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
   2965                             cnv->fromUChar32=0x00;
   2966                             /* convert this supplementary code point */
   2967                             /* exit this condition tree */
   2968                         } else {
   2969                             /* this is an unmatched lead code unit (1st surrogate) */
   2970                             /* callback(illegal) */
   2971                             *err=U_ILLEGAL_CHAR_FOUND;
   2972                             cnv->fromUChar32=sourceChar;
   2973                             break;
   2974                         }
   2975                     } else {
   2976                         /* no more input */
   2977                         cnv->fromUChar32=sourceChar;
   2978                         break;
   2979                     }
   2980                 } else {
   2981                     /* this is an unmatched trail code unit (2nd surrogate) */
   2982                     /* callback(illegal) */
   2983                     *err=U_ILLEGAL_CHAR_FOUND;
   2984                     cnv->fromUChar32=sourceChar;
   2985                     break;
   2986                 }
   2987             }
   2988 
   2989             /* do the conversion */
   2990             if(sourceChar <= 0x007f ){
   2991                 /* do not convert SO/SI/ESC */
   2992                 if(IS_2022_CONTROL(sourceChar)) {
   2993                     /* callback(illegal) */
   2994                     *err=U_ILLEGAL_CHAR_FOUND;
   2995                     cnv->fromUChar32=sourceChar;
   2996                     break;
   2997                 }
   2998 
   2999                 /* US-ASCII */
   3000                 if(pFromU2022State->g == 0) {
   3001                     buffer[0] = (char)sourceChar;
   3002                     len = 1;
   3003                 } else {
                            /* currently shifted out: return to ASCII with SI first */
   3004                     buffer[0] = UCNV_SI;
   3005                     buffer[1] = (char)sourceChar;
   3006                     len = 2;
   3007                     pFromU2022State->g = 0;
   3008                     choiceCount = 0;
   3009                 }
   3010                 if(sourceChar == CR || sourceChar == LF) {
   3011                     /* reset the state at the end of a line */
   3012                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
   3013                     choiceCount = 0;
   3014                 }
   3015             }
   3016             else{
   3017                 /* convert U+0080..U+10ffff */
   3018                 int32_t i;
   3019                 int8_t cs, g;
   3020 
   3021                 if(choiceCount == 0) {
   3022                     /* try the current SO/G1 converter first */
   3023                     choices[0] = pFromU2022State->cs[1];
   3024 
   3025                     /* default to GB2312_1 if none is designated yet */
   3026                     if(choices[0] == 0) {
   3027                         choices[0] = GB2312_1;
   3028                     }
   3029 
   3030                     if(converterData->version == 0) {
   3031                         /* ISO-2022-CN */
   3032 
   3033                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
   3034                         if(choices[0] == GB2312_1) {
   3035                             choices[1] = (int8_t)CNS_11643_1;
   3036                         } else {
   3037                             choices[1] = (int8_t)GB2312_1;
   3038                         }
   3039 
   3040                         choiceCount = 2;
   3041                     } else if (converterData->version == 1) {
   3042                         /* ISO-2022-CN-EXT */
   3043 
   3044                         /* try one of the other converters */
   3045                         switch(choices[0]) {
   3046                         case GB2312_1:
   3047                             choices[1] = (int8_t)CNS_11643_1;
   3048                             choices[2] = (int8_t)ISO_IR_165;
   3049                             break;
   3050                         case ISO_IR_165:
   3051                             choices[1] = (int8_t)GB2312_1;
   3052                             choices[2] = (int8_t)CNS_11643_1;
   3053                             break;
   3054                         default: /* CNS_11643_x */
   3055                             choices[1] = (int8_t)GB2312_1;
   3056                             choices[2] = (int8_t)ISO_IR_165;
   3057                             break;
   3058                         }
   3059 
   3060                         choiceCount = 3;
   3061                     } else {
                                /* any other version: prefer CNS 11643, then GB 2312 */
   3062                         choices[0] = (int8_t)CNS_11643_1;
   3063                         choices[1] = (int8_t)GB2312_1;
                                /*
                                 * Without a count the two choices above were never consulted:
                                 * the lookup loop ran zero times and every non-ASCII code
                                 * point ended in U_INVALID_CHAR_FOUND.
                                 */
                                choiceCount = 2;
   3064                     }
   3065                 }
   3066 
   3067                 cs = g = 0;
   3068                 /*
   3069                  * len==0: no mapping found yet
   3070                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
   3071                  * len>0: found a roundtrip result, done
   3072                  */
   3073                 len = 0;
   3074                 /*
   3075                  * We will turn off useFallback after finding a fallback,
   3076                  * but we still get fallbacks from PUA code points as usual.
   3077                  * Therefore, we will also need to check that we don't overwrite
   3078                  * an early fallback with a later one.
   3079                  */
   3080                 useFallback = cnv->useFallback;
   3081 
   3082                 for(i = 0; i < choiceCount && len <= 0; ++i) {
   3083                     int8_t cs0 = choices[i];
   3084                     if(cs0 > 0) {
   3085                         uint32_t value;
   3086                         int32_t len2;
   3087                         if(cs0 >= CNS_11643_0) {
                                    /* one 3-byte lookup covers all CNS planes; the top byte selects the plane */
   3088                             len2 = MBCS_FROM_UCHAR32_ISO2022(
   3089                                         converterData->myConverterArray[CNS_11643],
   3090                                         sourceChar,
   3091                                         &value,
   3092                                         useFallback,
   3093                                         MBCS_OUTPUT_3);
   3094                             if(len2 == 3 || (len2 == -3 && len == 0)) {
   3095                                 targetValue = value;
   3096                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
   3097                                 if(len2 >= 0) {
   3098                                     len = 2;
   3099                                 } else {
   3100                                     len = -2;
   3101                                     useFallback = FALSE;
   3102                                 }
   3103                                 if(cs == CNS_11643_1) {
   3104                                     g = 1;
   3105                                 } else if(cs == CNS_11643_2) {
   3106                                     g = 2;
   3107                                 } else /* plane 3..7 */ if(converterData->version == 1) {
   3108                                     g = 3;
   3109                                 } else {
   3110                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
   3111                                     len = 0;
   3112                                 }
   3113                             }
   3114                         } else {
   3115                             /* GB2312_1 or ISO-IR-165 */
   3116                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
   3117                             len2 = MBCS_FROM_UCHAR32_ISO2022(
   3118                                         converterData->myConverterArray[cs0],
   3119                                         sourceChar,
   3120                                         &value,
   3121                                         useFallback,
   3122                                         MBCS_OUTPUT_2);
   3123                             if(len2 == 2 || (len2 == -2 && len == 0)) {
   3124                                 targetValue = value;
   3125                                 len = len2;
   3126                                 cs = cs0;
   3127                                 g = 1;
   3128                                 useFallback = FALSE;
   3129                             }
   3130                         }
   3131                     }
   3132                 }
   3133 
   3134                 if(len != 0) {
   3135                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
   3136 
   3137                     /* write the designation sequence if necessary */
   3138                     if(cs != pFromU2022State->cs[g]) {
   3139                         if(cs < CNS_11643) {
   3140                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
   3141                         } else {
   3142                             U_ASSERT(cs >= CNS_11643_1);
   3143                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
   3144                         }
   3145                         len = 4;
   3146                         pFromU2022State->cs[g] = cs;
   3147                         if(g == 1) {
   3148                             /* changing the SO/G1 charset invalidates the choices[] */
   3149                             choiceCount = 0;
   3150                         }
   3151                     }
   3152 
   3153                     /* write the shift sequence if necessary */
   3154                     if(g != pFromU2022State->g) {
   3155                         switch(g) {
   3156                         case 1:
   3157                             buffer[len++] = UCNV_SO;
   3158 
   3159                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
   3160                             pFromU2022State->g = 1;
   3161                             break;
   3162                         case 2:
                                    /* SS2: ESC N */
   3163                             buffer[len++] = 0x1b;
   3164                             buffer[len++] = 0x4e;
   3165                             break;
   3166                         default: /* case 3 */
                                    /* SS3: ESC O */
   3167                             buffer[len++] = 0x1b;
   3168                             buffer[len++] = 0x4f;
   3169                             break;
   3170                         }
   3171                     }
   3172 
   3173                     /* write the two output bytes */
   3174                     buffer[len++] = (char)(targetValue >> 8);
   3175                     buffer[len++] = (char)targetValue;
   3176                 } else {
   3177                     /* if we cannot find the character after checking all codepages
   3178                      * then this is an error
   3179                      */
   3180                     *err = U_INVALID_CHAR_FOUND;
   3181                     cnv->fromUChar32=sourceChar;
   3182                     break;
   3183                 }
   3184             }
   3185 
   3186             /* output len>0 bytes in buffer[] */
   3187             if(len == 1) {
   3188                 *target++ = buffer[0];
   3189                 if(offsets) {
   3190                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
   3191                 }
   3192             } else if(len == 2 && (target + 2) <= targetLimit) {
   3193                 *target++ = buffer[0];
   3194                 *target++ = buffer[1];
   3195                 if(offsets) {
   3196                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
   3197                     *offsets++ = sourceIndex;
   3198                     *offsets++ = sourceIndex;
   3199                 }
   3200             } else {
                        /* longer output, or output that may not fit: use the shared writer */
   3201                 fromUWriteUInt8(
   3202                     cnv,
   3203                     buffer, len,
   3204                     &target, (const char *)targetLimit,
   3205                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
   3206                     err);
   3207                 if(U_FAILURE(*err)) {
   3208                     break;
   3209                 }
   3210             }
   3211         } /* end if(myTargetIndex<myTargetLength) */
   3212         else{
   3213             *err =U_BUFFER_OVERFLOW_ERROR;
   3214             break;
   3215         }
   3216 
   3217     }/* end while(mySourceIndex<mySourceLength) */
   3218 
   3219     /*
   3220      * the end of the input stream and detection of truncated input
   3221      * are handled by the framework, but for ISO-2022-CN conversion
   3222      * we need to be in ASCII mode at the very end
   3223      *
   3224      * conditions:
   3225      *   successful
   3226      *   not in ASCII mode
   3227      *   end of input and no truncated input
   3228      */
   3229     if( U_SUCCESS(*err) &&
   3230         pFromU2022State->g!=0 &&
   3231         args->flush && source>=sourceLimit && cnv->fromUChar32==0
   3232     ) {
   3233         int32_t sourceIndex;
   3234 
   3235         /* we are switching to ASCII */
   3236         pFromU2022State->g=0;
   3237 
   3238         /* get the source index of the last input character */
   3239         /*
   3240          * TODO this would be simpler and more reliable if we used a pair
   3241          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
   3242          * so that we could simply use the prevSourceIndex here;
   3243          * this code gives an incorrect result for the rare case of an unmatched
   3244          * trail surrogate that is alone in the last buffer of the text stream
   3245          */
   3246         sourceIndex=(int32_t)(source-args->source);
   3247         if(sourceIndex>0) {
   3248             --sourceIndex;
   3249             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
   3250                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
   3251             ) {
   3252                 --sourceIndex;
   3253             }
   3254         } else {
   3255             sourceIndex=-1;
   3256         }
   3257 
   3258         fromUWriteUInt8(
   3259             cnv,
   3260             SHIFT_IN_STR, 1,
   3261             &target, (const char *)targetLimit,
   3262             &offsets, sourceIndex,
   3263             err);
   3264     }
   3265 
   3266     /*save the state and return */
   3267     args->source = source;
   3268     args->target = (char*)target;
   3269 }
   3270 
   3271 
   3272 static void U_CALLCONV
   3273 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
   3274                                                UErrorCode* err){
            /*
             * Converts ISO-2022-CN bytes in [args->source, args->sourceLimit) to
             * UTF-16 in args->target, recording one source offset per output UChar
             * when args->offsets is non-NULL.
             *
             * State carried between calls (all read or written in this function):
             * - myData->key != 0: an escape sequence was cut off; resume at "escape".
             * - converter->toULength == 1: a double-byte lead byte is parked in
             *   toUBytes[0]; resume at "getTrailByte".
             * - myData->isEmptySegment: SO was seen and no character has followed yet.
             *
             * Every exit path stores the positions reached back into args->source
             * and args->target. *err becomes U_BUFFER_OVERFLOW_ERROR when source
             * bytes remain but the target is full, U_ILLEGAL_ESCAPE_SEQUENCE for an
             * empty SO segment, or whatever changeState_2022()/toUnicodeCallback()
             * (both defined outside this chunk) may set.
             */
   3275     char tempBuf[3]; /* sub-converter input: [CNS plane byte,] lead, trail */
   3276     const char *mySource = (char *) args->source;
   3277     UChar *myTarget = args->target;
   3278     const char *mySourceLimit = args->sourceLimit;
   3279     uint32_t targetUniChar = 0x0000;
   3280     uint32_t mySourceChar = 0x0000;
   3281     UConverterDataISO2022* myData;
   3282     ISO2022State *pToU2022State;
   3283 
   3284     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
   3285     pToU2022State = &myData->toU2022State;
   3286 
   3287     if(myData->key != 0) {
   3288         /* continue with a partial escape sequence */
   3289         goto escape;
   3290     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
   3291         /* continue with a partial double-byte character */
                /* (taken only if there is a byte to read and room to write) */
   3292         mySourceChar = args->converter->toUBytes[0];
   3293         args->converter->toULength = 0;
   3294         targetUniChar = missingCharMarker;
   3295         goto getTrailByte;
   3296     }
   3297 
   3298     while(mySource < mySourceLimit){
   3299 
                /* assume "unmappable" until a branch below assigns a code point */
   3300         targetUniChar =missingCharMarker;
   3301 
   3302         if(myTarget < args->targetLimit){
   3303 
   3304             mySourceChar= (unsigned char) *mySource++;
   3305 
   3306             switch(mySourceChar){
   3307             case UCNV_SI:
   3308                 pToU2022State->g=0;
                        /*
                         * SI directly after SO (no character in between) is reported as
                         * an illegal escape sequence; the SI byte itself is handed back
                         * through toUBytes/toULength.
                         */
   3309                 if (myData->isEmptySegment) {
   3310                     myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
   3311                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   3312                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
   3313                     args->converter->toUBytes[0] = mySourceChar;
   3314                     args->converter->toULength = 1;
   3315                     args->target = myTarget;
   3316                     args->source = mySource;
   3317                     return;
   3318                 }
   3319                 continue; /* SI itself produces no output */
   3320 
   3321             case UCNV_SO:
   3322                 if(pToU2022State->cs[1] != 0) {
   3323                     pToU2022State->g=1;
   3324                     myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
   3325                     continue;
   3326                 } else {
   3327                     /* illegal to have SO before a matching designator */
   3328                     myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                            /* leaves the switch with targetUniChar==missingCharMarker, i.e. the callback branch below */
   3329                     break;
   3330                 }
   3331 
   3332             case ESC_2022:
                        /* un-read the ESC; presumably changeState_2022() expects to start on it -- confirm there */
   3333                 mySource--;
   3334 escape:
   3335                 {
                            /* remember the starting point so the number of bytes consumed here can be computed */
   3336                     const char * mySourceBefore = mySource;
   3337                     int8_t toULengthBefore = args->converter->toULength;
   3338 
   3339                     changeState_2022(args->converter,&(mySource),
   3340                         mySourceLimit, ISO_2022_CN,err);
   3341 
   3342                     /* After SO there must be at least one character before a designator (designator error handled separately) */
   3343                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
   3344                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
   3345                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                                /* length before this call plus the bytes consumed by this call */
   3346                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
   3347                     }
   3348                 }
   3349 
   3350                 /* invalid or illegal escape sequence */
   3351                 if(U_FAILURE(*err)){
   3352                     args->target = myTarget;
   3353                     args->source = mySource;
   3354                     myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
   3355                     return;
   3356                 }
   3357                 continue;
   3358 
   3359             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
   3360 
   3361             case CR:
   3362             case LF:
                        /* end of line: zero the whole ISO2022State (designations and shift state), then emit the byte */
   3363                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
   3364                 U_FALLTHROUGH;
   3365             default:
   3366                 /* convert one or two bytes */
   3367                 myData->isEmptySegment = FALSE;
                        /* g != 0: a shifted (double-byte) set is active */
   3368                 if(pToU2022State->g != 0) {
   3369                     if(mySource < mySourceLimit) {
   3370                         UConverterSharedData *cnv;
   3371                         StateEnum tempState;
   3372                         int32_t tempBufLen;
   3373                         int leadIsOk, trailIsOk;
   3374                         uint8_t trailByte;
   3375 getTrailByte:
   3376                         trailByte = (uint8_t)*mySource;
   3377                         /*
   3378                          * Ticket 5691: consistent illegal sequences:
   3379                          * - We include at least the first byte in the illegal sequence.
   3380                          * - If any of the non-initial bytes could be the start of a character,
   3381                          *   we stop the illegal sequence before the first one of those.
   3382                          *
   3383                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
   3384                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
   3385                          * Otherwise we convert or report the pair of bytes.
   3386                          */
                                /* unsigned wrap-around range test: true iff the byte is in 0x21..0x7e */
   3387                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
   3388                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
   3389                         if (leadIsOk && trailIsOk) {
   3390                             ++mySource;
   3391                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
   3392                             if(tempState >= CNS_11643_0) {
                                        /* CNS planes use the single CNS_11643 entry; the plane is passed as a leading byte 0x80+plane index */
   3393                                 cnv = myData->myConverterArray[CNS_11643];
   3394                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
   3395                                 tempBuf[1] = (char) (mySourceChar);
   3396                                 tempBuf[2] = (char) trailByte;
   3397                                 tempBufLen = 3;
   3398 
   3399                             }else{
   3400                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
   3401                                 cnv = myData->myConverterArray[tempState];
   3402                                 tempBuf[0] = (char) (mySourceChar);
   3403                                 tempBuf[1] = (char) trailByte;
   3404                                 tempBufLen = 2;
   3405                             }
   3406                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                    /* keep both bytes: used by the offset computation and passed to the callback below */
   3407                             mySourceChar = (mySourceChar << 8) | trailByte;
   3408                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
   3409                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
   3410                             ++mySource;
   3411                             /* add another bit so that the code below writes 2 bytes in case of error */
   3412                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
   3413                         }
   3414                         if(pToU2022State->g>=2) {
   3415                             /* return from a single-shift state to the previous one */
   3416                             pToU2022State->g=pToU2022State->prevG;
   3417                         }
   3418                     } else {
                                /* input ends after the lead byte: park it; the next call resumes at getTrailByte */
   3419                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
   3420                         args->converter->toULength = 1;
   3421                         goto endloop;
   3422                     }
   3423                 }
   3424                 else{
                            /* g == 0 (single-byte): only 0x00..0x7f map directly; other bytes keep missingCharMarker */
   3425                     if(mySourceChar <= 0x7f) {
   3426                         targetUniChar = (UChar) mySourceChar;
   3427                     }
   3428                 }
   3429                 break;
   3430             }
                    /* BMP code point: write one UChar */
   3431             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
   3432                 if(args->offsets){
                            /* offset of the character's first source byte: 1 back for single-byte, 2 back for double-byte */
   3433                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3434                 }
   3435                 *(myTarget++)=(UChar)targetUniChar;
   3436             }
   3437             else if(targetUniChar > missingCharMarker){
   3438                 /* disassemble the surrogate pair and write to output*/
   3439                 targetUniChar-=0x0010000;
   3440                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
   3441                 if(args->offsets){
   3442                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3443                 }
   3444                 ++myTarget;
   3445                 if(myTarget< args->targetLimit){
   3446                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   3447                     if(args->offsets){
   3448                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
   3449                     }
   3450                     ++myTarget;
   3451                 }else{
                            /* target full after the lead surrogate: park the trail surrogate in UCharErrorBuffer */
   3452                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
   3453                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
   3454                 }
   3455 
   3456             }
   3457             else{
   3458                 /* Call the callback function*/
                        /* reached when targetUniChar is 0xfffe or missingCharMarker; conversion stops after the callback */
   3459                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
   3460                 break;
   3461             }
   3462         }
   3463         else{
                    /* source bytes remain but the target has no room */
   3464             *err =U_BUFFER_OVERFLOW_ERROR;
   3465             break;
   3466         }
   3467     }
   3468 endloop:
            /* publish the positions reached */
   3469     args->target = myTarget;
   3470     args->source = mySource;
   3471 }
   3472 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
   3473 
   3474 static void U_CALLCONV
   3475 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
            /*
             * Writes the converter's substitution character for the ISO-2022 variant
             * selected by myConverterData->locale[0] ('j', 'c' or 'k').
             *
             * For 'j', 'c' and 'k' version 0 the bytes are assembled in a local
             * buffer: first whatever shift/escape is needed to reach a state in which
             * the substitution bytes are valid (updating the fromUnicode state to
             * match), then the substitution bytes themselves. The buffer is then
             * emitted with ucnv_cbFromUWriteBytes() using offsetIndex.
             *
             * For 'k' with version != 0 the work is delegated to the current
             * sub-converter and this function returns without using the buffer.
             *
             * For any other locale nothing is assembled, so zero bytes are passed on.
             */
   3476     UConverter *cnv = args->converter;
   3477     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
   3478     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
   3479     char *p, *subchar;
   3480     char buffer[8]; /* longest sequence built below: SI + 3-byte escape + 1 subchar byte = 5 bytes */
   3481     int32_t length;
   3482 
   3483     subchar=(char *)cnv->subChars;
   3484     length=cnv->subCharLen; /* assume length==1 for most variants */
   3485 
   3486     p = buffer;
   3487     switch(myConverterData->locale[0]){
   3488     case 'j':
   3489         {
   3490             int8_t cs;
   3491 
   3492             if(pFromU2022State->g == 1) {
   3493                 /* JIS7: switch from G1 to G0 */
   3494                 pFromU2022State->g = 0;
   3495                 *p++ = UCNV_SI;
   3496             }
   3497 
   3498             cs = pFromU2022State->cs[0];
   3499             if(cs != ASCII && cs != JISX201) {
   3500                 /* not in ASCII or JIS X 0201: switch to ASCII */
   3501                 pFromU2022State->cs[0] = (int8_t)ASCII;
                        /* the three bytes below are ESC ( B */
   3502                 *p++ = '\x1b';
   3503                 *p++ = '\x28';
   3504                 *p++ = '\x42';
   3505             }
   3506 
                    /* only the first substitution byte is written, regardless of length */
   3507             *p++ = subchar[0];
   3508             break;
   3509         }
   3510     case 'c':
   3511         if(pFromU2022State->g != 0) {
   3512             /* not in ASCII mode: switch to ASCII */
   3513             pFromU2022State->g = 0;
   3514             *p++ = UCNV_SI;
   3515         }
                /* only the first substitution byte is written, regardless of length */
   3516         *p++ = subchar[0];
   3517         break;
   3518     case 'k':
   3519         if(myConverterData->version == 0) {
                    /* fromUnicodeStatus serves as the mode flag here: 0 = SBCS, nonzero = DBCS */
   3520             if(length == 1) {
   3521                 if((UBool)args->converter->fromUnicodeStatus) {
   3522                     /* in DBCS mode: switch to SBCS */
   3523                     args->converter->fromUnicodeStatus = 0;
   3524                     *p++ = UCNV_SI;
   3525                 }
   3526                 *p++ = subchar[0];
   3527             } else /* length == 2*/ {
   3528                 if(!(UBool)args->converter->fromUnicodeStatus) {
   3529                     /* in SBCS mode: switch to DBCS */
   3530                     args->converter->fromUnicodeStatus = 1;
   3531                     *p++ = UCNV_SO;
   3532                 }
   3533                 *p++ = subchar[0];
   3534                 *p++ = subchar[1];
   3535             }
   3536             break;
   3537         } else {
   3538             /* save the subconverter's substitution string */
   3539             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
   3540             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
   3541 
   3542             /* set our substitution string into the subconverter */
   3543             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
   3544             myConverterData->currentConverter->subCharLen = (int8_t)length;
   3545 
   3546             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
   3547             args->converter = myConverterData->currentConverter;
   3548             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
                    /* NOTE(review): passes 0 here rather than offsetIndex -- confirm this is intended */
   3549             ucnv_cbFromUWriteSub(args, 0, err);
   3550             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
   3551             args->converter = cnv;
   3552 
   3553             /* restore the subconverter's substitution string */
   3554             myConverterData->currentConverter->subChars = currentSubChars;
   3555             myConverterData->currentConverter->subCharLen = currentSubCharLen;
   3556 
                    /* on overflow, move the bytes the subconverter could not write into the outer converter's error buffer */
   3557             if(*err == U_BUFFER_OVERFLOW_ERROR) {
   3558                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
   3559                     uprv_memcpy(
   3560                         cnv->charErrorBuffer,
   3561                         myConverterData->currentConverter->charErrorBuffer,
   3562                         myConverterData->currentConverter->charErrorBufferLength);
   3563                 }
   3564                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
   3565                 myConverterData->currentConverter->charErrorBufferLength = 0;
   3566             }
                    /* output was already written by the subconverter; skip the buffer write at the bottom */
   3567             return;
   3568         }
   3569     default:
   3570         /* not expected */
                /* p still equals buffer here, so the call below is given a length of 0 */
   3571         break;
   3572     }
   3573     ucnv_cbFromUWriteBytes(args,
   3574                            buffer, (int32_t)(p - buffer),
   3575                            offsetIndex, err);
   3576 }
   3577 
   3578 /*
   3579  * Structure for cloning an ISO 2022 converter into a single memory block.
   3580  * ucnv_safeClone() of the converter will align the entire cloneStruct,
   3581  * and then ucnv_safeClone() of the sub-converter may additionally align
   3582  * currentConverter inside the cloneStruct, for which we need the deadSpace
   3583  * after currentConverter.
   3584  * This is because UAlignedMemory may be larger than the actually
   3585  * necessary alignment size for the platform.
   3586  * The other cloneStruct fields will not be moved around,
   3587  * and are aligned properly with cloneStruct's alignment.
   3588  */
   3589 struct cloneStruct
   3590 {
   3591     UConverter cnv;              /* the cloned ISO 2022 converter; _ISO_2022_SafeClone() returns its address */
   3592     UConverter currentConverter; /* storage handed to ucnv_safeClone() for the current sub-converter */
   3593     UAlignedMemory deadSpace;    /* padding after currentConverter, counted in the size given to ucnv_safeClone() */
   3594     UConverterDataISO2022 mydata; /* copy of the source converter's extraInfo; cnv.extraInfo points here */
   3595 };
   3596 
   3597 
   3598 U_CDECL_BEGIN
   3599 
   3600 static UConverter * U_CALLCONV
   3601 _ISO_2022_SafeClone(
   3602             const UConverter *cnv,
   3603             void *stackBuffer,
   3604             int32_t *pBufferSize,
   3605             UErrorCode *status)
   3606 {
            /*
             * Completes a clone of an ISO 2022 converter inside stackBuffer, which is
             * treated as a struct cloneStruct.
             *
             * cnv         - converter being cloned; its extraInfo is read.
             * stackBuffer - memory for the clone (see struct cloneStruct).
             * pBufferSize - if *pBufferSize is 0 this is a preflight: the required
             *               size is stored and NULL is returned.
             * status      - receives any failure from cloning the sub-converter.
             *
             * Returns the address of the clone's UConverter, or NULL for a preflight
             * request or when cloning the current sub-converter failed.
             *
             * NOTE(review): a nonzero *pBufferSize is not compared against
             * sizeof(struct cloneStruct) here; presumably the caller guarantees it.
             */
   3607     struct cloneStruct * localClone;
   3608     UConverterDataISO2022 *cnvData;
   3609     int32_t i, size;
   3610 
   3611     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
   3612         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
   3613         return NULL;
   3614     }
   3615 
   3616     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
   3617     localClone = (struct cloneStruct *)stackBuffer;
   3618 
   3619     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
   3620 
            /* copy the ISO 2022 state into the clone and make the clone own it locally */
   3621     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
   3622     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
   3623     localClone->cnv.isExtraLocal = TRUE;
   3624 
   3625     /* clone the current sub-converter into the embedded storage; the array entries are shared further below */
   3626 
   3627     if(cnvData->currentConverter != NULL) {
   3628         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
   3629         localClone->mydata.currentConverter =
   3630             ucnv_safeClone(cnvData->currentConverter,
   3631                             &localClone->currentConverter,
   3632                             &size, status);
   3633         if(U_FAILURE(*status)) {
   3634             return NULL;
   3635         }
   3636     }
   3637 
            /* the myConverterArray[] pointers were copied by the memcpy above; take one more reference on each */
   3638     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
   3639         if(cnvData->myConverterArray[i] != NULL) {
   3640             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
   3641         }
   3642     }
   3643 
   3644     return &localClone->cnv;
   3645 }
   3646 
   3647 U_CDECL_END
   3648 
   3649 static void U_CALLCONV
   3650 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
   3651                     const USetAdder *sa,
   3652                     UConverterUnicodeSet which,
   3653                     UErrorCode *pErrorCode)
   3654 {
            /*
             * Adds to sa->set the Unicode code points this ISO 2022 converter handles.
             *
             * cnv        - the converter; its extraInfo supplies locale, version and
             *              the sub-converter array.
             * sa         - set plus add/addRange/remove/removeRange callbacks.
             * which      - set selector; forwarded to the sub-converters, and
             *              UCNV_ROUNDTRIP_AND_FALLBACK_SET also widens the JP set.
             * pErrorCode - nothing is done if it already holds a failure.
             *
             * Order of work: (1) code points handled directly per locale,
             * (2) filtered contributions from every non-NULL myConverterArray[]
             * entry, (3) removal of SO/SI/ESC and the C1 range 0x80..0x9f.
             */
   3655     int32_t i;
   3656     UConverterDataISO2022* cnvData;
   3657 
   3658     if (U_FAILURE(*pErrorCode)) {
   3659         return;
   3660     }
   3661 #ifdef U_ENABLE_GENERIC_ISO_2022
   3662     if (cnv->sharedData == &_ISO2022Data) {
   3663         /* We use UTF-8 in this case */
                /* i.e. every code point except the surrogates D800..DFFF */
   3664         sa->addRange(sa->set, 0, 0xd7FF);
   3665         sa->addRange(sa->set, 0xE000, 0x10FFFF);
   3666         return;
   3667     }
   3668 #endif
   3669 
   3670     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
   3671 
   3672     /* start with the code points that are added here directly, without consulting myConverterArray[] */
   3673     switch(cnvData->locale[0]){
   3674     case 'j':
   3675         /* include JIS X 0201 which is hardcoded */
   3676         sa->add(sa->set, 0xa5);
   3677         sa->add(sa->set, 0x203e);
   3678         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
   3679             /* include Latin-1 for some variants of JP */
   3680             sa->addRange(sa->set, 0, 0xff);
   3681         } else {
   3682             /* include ASCII for JP */
   3683             sa->addRange(sa->set, 0, 0x7f);
   3684         }
   3685         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
   3686             /*
   3687              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
   3688              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
   3689              * use half-width Katakana.
   3690              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
   3691              * half-width Katakana via the ESC ( I sequence.
   3692              * However, we only emit (fromUnicode) half-width Katakana according to the
   3693              * definition of each variant.
   3694              *
   3695              * When including fallbacks,
   3696              * we need to include half-width Katakana Unicode code points for all JP variants because
   3697              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
   3698              */
   3699             /* include half-width Katakana for JP */
   3700             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
   3701         }
   3702         break;
   3703 #if !UCONFIG_ONLY_HTML_CONVERSION
   3704     case 'c':
   3705     case 'z':
   3706         /* include ASCII for CN */
   3707         sa->addRange(sa->set, 0, 0x7f);
   3708         break;
   3709     case 'k':
   3710         /* there is only one converter for KR, and it is not in the myConverterArray[] */
   3711         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
   3712                 cnvData->currentConverter, sa, which, pErrorCode);
   3713         /* the loop over myConverterArray[] will simply not find another converter */
   3714         break;
   3715 #endif
   3716     default:
   3717         break;
   3718     }
   3719 
            /* The #if 0 section below is disabled code kept for reference; note that it reads i outside of any loop. */
   3720 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
   3721             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
   3722                 cnvData->version==0 && i==CNS_11643
   3723             ) {
   3724                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
   3725                 ucnv_MBCSGetUnicodeSetForBytes(
   3726                         cnvData->myConverterArray[i],
   3727                         sa, UCNV_ROUNDTRIP_SET,
   3728                         0, 0x81, 0x82,
   3729                         pErrorCode);
   3730             }
   3731 #endif
   3732 
            /* add what each loaded sub-converter table maps, narrowed by a per-table filter */
   3733     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
   3734         UConverterSetFilter filter;
   3735         if(cnvData->myConverterArray[i]!=NULL) {
   3736             if(cnvData->locale[0]=='j' && i==JISX208) {
   3737                 /*
   3738                  * Only add code points that map to Shift-JIS codes
   3739                  * corresponding to JIS X 0208.
   3740                  */
   3741                 filter=UCNV_SET_FILTER_SJIS;
   3742 #if !UCONFIG_ONLY_HTML_CONVERSION
   3743             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
   3744                        cnvData->version==0 && i==CNS_11643) {
   3745                 /*
   3746                  * Version-specific for CN:
   3747                  * CN version 0 does not map CNS planes 3..7 although
   3748                  * they are all available in the CNS conversion table;
   3749                  * CN version 1 (-EXT) does map them all.
   3750                  * The two versions create different Unicode sets.
   3751                  */
   3752                 filter=UCNV_SET_FILTER_2022_CN;
   3753             } else if(i==KSC5601) {
   3754                 /*
   3755                  * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
   3756                  * are broader than GR94.
   3757                  */
   3758                 filter=UCNV_SET_FILTER_GR94DBCS;
   3759 #endif
   3760             } else {
   3761                 filter=UCNV_SET_FILTER_NONE;
   3762             }
   3763             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
   3764         }
   3765     }
   3766 
   3767     /*
   3768      * ISO 2022 converters must not convert SO/SI/ESC despite what
   3769      * sub-converters do by themselves.
   3770      * Remove these characters from the set.
   3771      */
   3772     sa->remove(sa->set, 0x0e);
   3773     sa->remove(sa->set, 0x0f);
   3774     sa->remove(sa->set, 0x1b);
   3775 
   3776     /* ISO 2022 converters do not convert C1 controls either */
   3777     sa->removeRange(sa->set, 0x80, 0x9f);
   3778 }
   3779 
   3780 static const UConverterImpl _ISO2022Impl={
            /*
             * Implementation table for the generic "ISO_2022" converter.
             * This is a positional initializer: the meaning of each slot is fixed by
             * the UConverterImpl declaration, which is outside this chunk. The slot
             * labels below are inferred from the names of the functions placed in
             * them -- TODO confirm against that declaration.
             */
   3781     UCNV_ISO_2022,
   3782 
   3783     NULL,
   3784     NULL,
   3785 
   3786     _ISO2022Open,
   3787     _ISO2022Close,
   3788     _ISO2022Reset,
   3789 
            /* conversion functions exist only when the generic ISO 2022 support is compiled in; they are NULL otherwise */
   3790 #ifdef U_ENABLE_GENERIC_ISO_2022
   3791     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
   3792     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
   3793     ucnv_fromUnicode_UTF8,
   3794     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   3795 #else
   3796     NULL,
   3797     NULL,
   3798     NULL,
   3799     NULL,
   3800 #endif
   3801     NULL,
   3802 
   3803     NULL,
   3804     _ISO2022getName,
   3805     _ISO_2022_WriteSub,      /* presumably the write-substitution slot */
   3806     _ISO_2022_SafeClone,     /* presumably the safe-clone slot */
   3807     _ISO_2022_GetUnicodeSet, /* presumably the get-Unicode-set slot */
   3808 
   3809     NULL,
   3810     NULL
   3811 };
   3812 static const UConverterStaticData _ISO2022StaticData={
            /*
             * Static description of the generic "ISO_2022" converter.
             * Positional initializer: field meanings come from the
             * UConverterStaticData declaration, which is outside this chunk; the
             * hedged labels below are inferred from the values -- TODO confirm.
             */
   3813     sizeof(UConverterStaticData),
   3814     "ISO_2022",
   3815     2022,
   3816     UCNV_IBM,
   3817     UCNV_ISO_2022,
   3818     1,
   3819     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
   3820     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) */
   3821     1,                 /* presumably the substitution length */
   3822     FALSE,
   3823     FALSE,
   3824     0,
   3825     0,
   3826     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3827 };
   3828 const UConverterSharedData _ISO2022Data=
                /* Ties the generic static data to its implementation table. Not static and not in an anonymous namespace, unlike the JP/KR/CN shared data below. */
   3829         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
   3830 
   3831 /*************JP****************/
   3832 static const UConverterImpl _ISO2022JPImpl={
            /*
             * Implementation table for ISO-2022-JP.
             * Positional initializer; slot meanings are fixed by the UConverterImpl
             * declaration outside this chunk. Each of the JP toUnicode and
             * fromUnicode functions fills two consecutive slots -- presumably the
             * plain and the with-offsets variants; TODO confirm.
             */
   3833     UCNV_ISO_2022,
   3834 
   3835     NULL,
   3836     NULL,
   3837 
   3838     _ISO2022Open,
   3839     _ISO2022Close,
   3840     _ISO2022Reset,
   3841 
   3842     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3843     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3844     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3845     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
   3846     NULL,
   3847 
   3848     NULL,
   3849     _ISO2022getName,
   3850     _ISO_2022_WriteSub,      /* shared by all ISO 2022 variants in this file */
   3851     _ISO_2022_SafeClone,     /* shared by all ISO 2022 variants in this file */
   3852     _ISO_2022_GetUnicodeSet, /* shared by all ISO 2022 variants in this file */
   3853 
   3854     NULL,
   3855     NULL
   3856 };
   3857 static const UConverterStaticData _ISO2022JPStaticData={
            /*
             * Static description of the ISO-2022-JP converter.
             * Positional initializer: field meanings come from the
             * UConverterStaticData declaration outside this chunk; hedged labels
             * below are inferred from the values -- TODO confirm.
             */
   3858     sizeof(UConverterStaticData),
   3859     "ISO_2022_JP",
   3860     0,
   3861     UCNV_IBM,
   3862     UCNV_ISO_2022,
   3863     1,
   3864     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
   3865     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) */
   3866     1,                 /* presumably the substitution length */
   3867     FALSE,
   3868     FALSE,
   3869     0,
   3870     0,
   3871     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3872 };
   3873 
   3874 namespace {
   3875 
        /* Shared data for ISO-2022-JP: pairs the JP static data with the JP implementation table. The anonymous namespace keeps it local to this file. */
   3876 const UConverterSharedData _ISO2022JPData=
   3877         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
   3878 
   3879 }  // namespace
   3880 
   3881 #if !UCONFIG_ONLY_HTML_CONVERSION
   3882 /************* KR ***************/
   3883 static const UConverterImpl _ISO2022KRImpl={
            /*
             * Implementation table for ISO-2022-KR.
             * Positional initializer; slot meanings are fixed by the UConverterImpl
             * declaration outside this chunk. Each of the KR toUnicode and
             * fromUnicode functions fills two consecutive slots -- presumably the
             * plain and the with-offsets variants; TODO confirm.
             */
   3884     UCNV_ISO_2022,
   3885 
   3886     NULL,
   3887     NULL,
   3888 
   3889     _ISO2022Open,
   3890     _ISO2022Close,
   3891     _ISO2022Reset,
   3892 
   3893     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3894     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3895     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3896     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
   3897     NULL,
   3898 
   3899     NULL,
   3900     _ISO2022getName,
   3901     _ISO_2022_WriteSub,      /* shared by all ISO 2022 variants in this file */
   3902     _ISO_2022_SafeClone,     /* shared by all ISO 2022 variants in this file */
   3903     _ISO_2022_GetUnicodeSet, /* shared by all ISO 2022 variants in this file */
   3904 
   3905     NULL,
   3906     NULL
   3907 };
   3908 static const UConverterStaticData _ISO2022KRStaticData={
            /*
             * Static description of the ISO-2022-KR converter.
             * Positional initializer: field meanings come from the
             * UConverterStaticData declaration outside this chunk; hedged labels
             * below are inferred from the values -- TODO confirm.
             */
   3909     sizeof(UConverterStaticData),
   3910     "ISO_2022_KR",
   3911     0,
   3912     UCNV_IBM,
   3913     UCNV_ISO_2022,
   3914     1,
   3915     8, /* max 8 bytes per UChar */
   3916     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) */
   3917     1,                 /* presumably the substitution length */
   3918     FALSE,
   3919     FALSE,
   3920     0,
   3921     0,
   3922     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3923 };
   3924 
   3925 namespace {
   3926 
        /* Shared data for ISO-2022-KR: pairs the KR static data with the KR implementation table. The anonymous namespace keeps it local to this file. */
   3927 const UConverterSharedData _ISO2022KRData=
   3928         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
   3929 
   3930 }  // namespace
   3931 
   3932 /*************** CN ***************/
   3933 static const UConverterImpl _ISO2022CNImpl={
            /*
             * Implementation table for ISO-2022-CN.
             * Positional initializer; slot meanings are fixed by the UConverterImpl
             * declaration outside this chunk. Each of the CN toUnicode and
             * fromUnicode functions fills two consecutive slots -- presumably the
             * plain and the with-offsets variants; TODO confirm.
             */
   3934 
   3935     UCNV_ISO_2022,
   3936 
   3937     NULL,
   3938     NULL,
   3939 
   3940     _ISO2022Open,
   3941     _ISO2022Close,
   3942     _ISO2022Reset,
   3943 
   3944     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3945     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3946     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3947     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
   3948     NULL,
   3949 
   3950     NULL,
   3951     _ISO2022getName,
   3952     _ISO_2022_WriteSub,      /* shared by all ISO 2022 variants in this file */
   3953     _ISO_2022_SafeClone,     /* shared by all ISO 2022 variants in this file */
   3954     _ISO_2022_GetUnicodeSet, /* shared by all ISO 2022 variants in this file */
   3955 
   3956     NULL,
   3957     NULL
   3958 };
   3959 static const UConverterStaticData _ISO2022CNStaticData={
            /*
             * Static description of the ISO-2022-CN converter.
             * Positional initializer: field meanings come from the
             * UConverterStaticData declaration outside this chunk; hedged labels
             * below are inferred from the values -- TODO confirm.
             */
   3960     sizeof(UConverterStaticData),
   3961     "ISO_2022_CN",
   3962     0,
   3963     UCNV_IBM,
   3964     UCNV_ISO_2022,
   3965     1,
   3966     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
   3967     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) */
   3968     1,                 /* presumably the substitution length */
   3969     FALSE,
   3970     FALSE,
   3971     0,
   3972     0,
   3973     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   3974 };
   3975 
   3976 namespace {
   3977 
        /* Shared data for ISO-2022-CN: pairs the CN static data with the CN implementation table. The anonymous namespace keeps it local to this file. */
   3978 const UConverterSharedData _ISO2022CNData=
   3979         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
   3980 
   3981 }  // namespace
   3982 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
   3983 
   3984 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   3985