/*
**********************************************************************
*   Copyright (C) 2000-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv2022.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000feb03
*   created by: Markus W. Scherer
*
*   Change history:
*
*   06/29/2000  helena  Major rewrite of the callback APIs.
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
*                       Changed implementation of toUnicode
*                       function
*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
*                       ucnvebdc.c
*   09/20/2000  Ram     Added support for ISO-2022-CN
*                       Added implementations for getNextUChar()
*                       for specific 2022 country variants.
*   10/31/2000  Ram     Implemented offsets logic functions
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "ucnv_imp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#include "cmemory.h"

/*
 * Element count of an array, cast to int32_t.
 * Only meaningful for true arrays: sizeof on a pointer would give the
 * pointer size, not the array size.
 */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * I am disabling the generic ISO-2022 converter after proposing to do so on
 * the icu mailing list two days ago.
 *
 * Reasons:
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
 *    its designation sequences, single shifts with return to the previous state,
 *    switch-with-no-return to UTF-16BE or similar, etc.
 *    This is unlike the language-specific variants like ISO-2022-JP which
 *    require a much smaller repertoire of ISO-2022 features.
 *    These variants continue to be supported.
 * 2. I believe that no one is really using the generic ISO-2022 converter
 *    but rather always one of the language-specific variants.
 *    Note that ICU's generic ISO-2022 converter has always output one escape
 *    sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 *    the previous converter is closed and a new one opened,
 *    without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 *    reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 *    This means, for example, that when ISO-8859-7 is designated, the following
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 *    The ICU ISO-2022 converter does not handle this - and has no information
 *    about which subconverter would have to be shifted vs. which is designed
 *    for 7-bit ISO-2022.
 *
 *    Markus Scherer 2003-dec-03
 */
#endif

/* The shift controls as NUL-terminated one-byte strings: SI is 0x0F, SO is 0x0E. */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the C0 control characters and of SPACE referred to by name in this file. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/*
 * Inclusive bounds of a Unicode code point range; judging by the names and by
 * the HWKANA_7BIT charset below, this is the halfwidth Katakana range.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 * The (c)<0x20 test comes first so that the shift count is always below 32.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * for ISO-2022-JP and -CN implementations
 *
 * Note that the JP and CN groups deliberately reuse the small values 1..3
 * (for example ISO8859_1 and GB2312_1 are both 1): the meaning of a value
 * depends on which variant the converter was opened as.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/*
 * Charset mask: a uint16_t with only bit number cs set.
 * Used below with the JP charset values 0..8 to build and test jpCharsetMasks[].
 */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
 */
enum { MAX_JA_VERSION=4 };
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};

/* Kind of charset; stored in UConverterDataISO2022.currentType. */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;

/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The converter version is taken from the low four bits of the open options. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter data, allocated by _ISO2022Open() and stored in UConverter.extraInfo. */
typedef struct{
    /* shared data of the loaded sub-charsets, indexed by StateEnum value; unused slots are NULL */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* wrapped converter; opened by _ISO2022Open() for the ko locale, NULL otherwise after open */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    ISO2022State toU2022State, fromU2022State;
    /* accumulated key of a partially read escape sequence, see getKey_2022(); 0 when none is pending */
    uint32_t key;
    /* converter version from the open options, normalized per locale by _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;
    /* converter name as returned by _ISO2022getName() */
    char name[30];
    /* "ja", "ko" or "cn" as set by _ISO2022Open(); stays empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

/* Result of feeding one more byte of an escape sequence to getKey_2022(). */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
 * The way these state transition arrays work is:
 * ex : ESC$B is the sequence for JISX208
 *      a) First Iteration: char is ESC
 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[0]
 *        iii) Save this index as offset
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      b) Switch on this state and continue to next char
 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 *             which is normalize_esq_chars_2022[36] == 4
 *         ii) x is currently 1(from above)
 *               x<<=5 -- x is now 32
 *               x+=normalize_esq_chars_2022[36]
 *               now x is 36
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      c) Switch on this state and continue to next char
 *          i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 *         ii) x is currently 36 (from above)
 *               x<<=5 -- x is now 1152
 *               x+=normalize_esq_chars_2022[66]
 *               now x is 1161
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 *             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *          v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
 */


/*Below are the 3 arrays depicting a state transition table*/
/*
 * Indexed by byte value. A 0 entry means that the byte cannot occur anywhere
 * in a recognized escape sequence; the nonzero entries (1..29) are the 5-bit
 * digits from which getKey_2022() builds its search key.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29     ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
 * For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

/* Number of entries in each of the parallel escape sequence tables below. */
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape sequence prefixes, in ascending order because
 * getKey_2022() does a binary search on this table. The index at which a key
 * is found is the "offset" used for the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Parallel to escSeqStateTable_Key_2022[]: name of the converter that the
 * generic converter opens for the escape sequence, or NULL if there is none.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"             ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif

/*
 * Parallel to escSeqStateTable_Key_2022[]: the UCNV_TableStates_2022 value
 * that getKey_2022() returns when the key at the same index is matched.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022         ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022         ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022     ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022         ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022         ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022     ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};


/* Type def for refactoring changeState_2022 code*/
/* Selects which variant's escape sequence handling changeState_2022() applies. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;

/*********** ISO 2022 Converter Protos ***********/
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

static void
 _ISO2022Close(UConverter *converter);

static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/*const UConverterSharedData _ISO2022Data;*/
/* Tentative definitions; _ISO2022Open() stores the address of one of these in cnv->sharedData. */
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;

/*************** Converter implementations ******************/

/* The purpose of this function is to get around gcc compiler warnings.
*/ 404 static U_INLINE void 405 fromUWriteUInt8(UConverter *cnv, 406 const char *bytes, int32_t length, 407 uint8_t **target, const char *targetLimit, 408 int32_t **offsets, 409 int32_t sourceIndex, 410 UErrorCode *pErrorCode) 411 { 412 char *targetChars = (char *)*target; 413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 414 offsets, sourceIndex, pErrorCode); 415 *target = (uint8_t*)targetChars; 416 417 } 418 419 static U_INLINE void 420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){ 421 if(myConverterData->version == 1) { 422 UConverter *cnv = myConverterData->currentConverter; 423 424 cnv->toUnicodeStatus=0; /* offset */ 425 cnv->mode=0; /* state */ 426 cnv->toULength=0; /* byteIndex */ 427 } 428 } 429 430 static U_INLINE void 431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 432 /* in ISO-2022-KR the designator sequence appears only once 433 * in a file so we append it only once 434 */ 435 if( converter->charErrorBufferLength==0){ 436 437 converter->charErrorBufferLength = 4; 438 converter->charErrorBuffer[0] = 0x1b; 439 converter->charErrorBuffer[1] = 0x24; 440 converter->charErrorBuffer[2] = 0x29; 441 converter->charErrorBuffer[3] = 0x43; 442 } 443 if(myConverterData->version == 1) { 444 UConverter *cnv = myConverterData->currentConverter; 445 446 cnv->fromUChar32=0; 447 cnv->fromUnicodeStatus=1; /* prevLength */ 448 } 449 } 450 451 static void 452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 453 454 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 455 456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 457 if(cnv->extraInfo != NULL) { 458 UConverterNamePieces stackPieces; 459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; 460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 461 uint32_t version; 462 463 stackArgs.onlyTestIsLoadable = 
pArgs->onlyTestIsLoadable; 464 465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 466 myConverterData->currentType = ASCII1; 467 cnv->fromUnicodeStatus =FALSE; 468 if(pArgs->locale){ 469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 470 } 471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 472 myConverterData->version = version; 473 474 /* BEGIN android-changed */ 475 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 476 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 477 if((myLocale[0]=='j' && 478 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 479 myLocale[1]=='s') && 480 (myLocale[2]=='_' || myLocale[2]=='\0'))) 481 { 482 size_t len=0; 483 /* open the required converters and cache them */ 484 if(version>MAX_JA_VERSION) { 485 /* prevent indexing beyond jpCharsetMasks[] */ 486 myConverterData->version = version = 0; 487 } 488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 489 myConverterData->myConverterArray[ISO8859_7] = 490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 491 } 492 if (myLocale[1]=='k') { /* Use KDDI's version. */ 493 myConverterData->myConverterArray[JISX208] = 494 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 495 } else if (myLocale[1]=='s') { /* Use SoftBank's version. 
*/ 496 myConverterData->myConverterArray[JISX208] = 497 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 498 } else { 499 myConverterData->myConverterArray[JISX208] = 500 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 501 } 502 /* END android-changed */ 503 504 if(jpCharsetMasks[version]&CSM(JISX212)) { 505 myConverterData->myConverterArray[JISX212] = 506 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 507 } 508 if(jpCharsetMasks[version]&CSM(GB2312)) { 509 myConverterData->myConverterArray[GB2312] = 510 /* BEGIN android-changed */ 511 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 512 /* END android-changed */ 513 } 514 if(jpCharsetMasks[version]&CSM(KSC5601)) { 515 myConverterData->myConverterArray[KSC5601] = 516 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 517 } 518 519 /* set the function pointers to appropriate funtions */ 520 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 521 uprv_strcpy(myConverterData->locale,"ja"); 522 523 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 524 len = uprv_strlen(myConverterData->name); 525 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 526 myConverterData->name[len+1]='\0'; 527 } 528 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 529 (myLocale[2]=='_' || myLocale[2]=='\0')) 530 { 531 const char *cnvName; 532 if(version==1) { 533 cnvName="icu-internal-25546"; 534 } else { 535 /* BEGIN android-changed */ 536 cnvName="ksc_5601"; 537 /* END android-changed */ 538 myConverterData->version=version=0; 539 } 540 if(pArgs->onlyTestIsLoadable) { 541 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 542 uprv_free(cnv->extraInfo); 543 cnv->extraInfo=NULL; 544 return; 545 } else { 546 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 547 if (U_FAILURE(*errorCode)) { 548 
_ISO2022Close(cnv); 549 return; 550 } 551 552 if(version==1) { 553 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 554 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 555 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 556 }else{ 557 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 558 } 559 560 /* initialize the state variables */ 561 setInitialStateToUnicodeKR(cnv, myConverterData); 562 setInitialStateFromUnicodeKR(cnv, myConverterData); 563 564 /* set the function pointers to appropriate funtions */ 565 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 566 uprv_strcpy(myConverterData->locale,"ko"); 567 } 568 } 569 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 570 (myLocale[2]=='_' || myLocale[2]=='\0')) 571 { 572 573 /* open the required converters and cache them */ 574 /* BEGIN android-changed */ 575 myConverterData->myConverterArray[GB2312_1] = 576 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 577 if(version==1) { 578 myConverterData->myConverterArray[ISO_IR_165] = 579 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 580 } 581 myConverterData->myConverterArray[CNS_11643] = 582 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 583 /* END android-changed */ 584 585 586 /* set the function pointers to appropriate funtions */ 587 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 588 uprv_strcpy(myConverterData->locale,"cn"); 589 590 if (version==0){ 591 myConverterData->version = 0; 592 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 593 }else if (version==1){ 594 myConverterData->version = 1; 595 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 596 }else { 597 myConverterData->version = 2; 598 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 599 } 600 } 601 
else{ 602 #ifdef U_ENABLE_GENERIC_ISO_2022 603 myConverterData->isFirstBuffer = TRUE; 604 605 /* append the UTF-8 escape sequence */ 606 cnv->charErrorBufferLength = 3; 607 cnv->charErrorBuffer[0] = 0x1b; 608 cnv->charErrorBuffer[1] = 0x25; 609 cnv->charErrorBuffer[2] = 0x42; 610 611 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 612 /* initialize the state variables */ 613 uprv_strcpy(myConverterData->name,"ISO_2022"); 614 #else 615 *errorCode = U_UNSUPPORTED_ERROR; 616 return; 617 #endif 618 } 619 620 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 621 622 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 623 _ISO2022Close(cnv); 624 } 625 } else { 626 *errorCode = U_MEMORY_ALLOCATION_ERROR; 627 } 628 } 629 630 631 static void 632 _ISO2022Close(UConverter *converter) { 633 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 634 UConverterSharedData **array = myData->myConverterArray; 635 int32_t i; 636 637 if (converter->extraInfo != NULL) { 638 /*close the array of converter pointers and free the memory*/ 639 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 640 if(array[i]!=NULL) { 641 ucnv_unloadSharedDataIfReady(array[i]); 642 } 643 } 644 645 ucnv_close(myData->currentConverter); 646 647 if(!converter->isExtraLocal){ 648 uprv_free (converter->extraInfo); 649 converter->extraInfo = NULL; 650 } 651 } 652 } 653 654 static void 655 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 656 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 657 if(choice<=UCNV_RESET_TO_UNICODE) { 658 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 659 myConverterData->key = 0; 660 myConverterData->isEmptySegment = FALSE; 661 } 662 if(choice!=UCNV_RESET_TO_UNICODE) { 663 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 664 } 665 #ifdef U_ENABLE_GENERIC_ISO_2022 666 if(myConverterData->locale[0] == 0){ 667 
if(choice<=UCNV_RESET_TO_UNICODE) { 668 myConverterData->isFirstBuffer = TRUE; 669 myConverterData->key = 0; 670 if (converter->mode == UCNV_SO){ 671 ucnv_close (myConverterData->currentConverter); 672 myConverterData->currentConverter=NULL; 673 } 674 converter->mode = UCNV_SI; 675 } 676 if(choice!=UCNV_RESET_TO_UNICODE) { 677 /* re-append UTF-8 escape sequence */ 678 converter->charErrorBufferLength = 3; 679 converter->charErrorBuffer[0] = 0x1b; 680 converter->charErrorBuffer[1] = 0x28; 681 converter->charErrorBuffer[2] = 0x42; 682 } 683 } 684 else 685 #endif 686 { 687 /* reset the state variables */ 688 if(myConverterData->locale[0] == 'k'){ 689 if(choice<=UCNV_RESET_TO_UNICODE) { 690 setInitialStateToUnicodeKR(converter, myConverterData); 691 } 692 if(choice!=UCNV_RESET_TO_UNICODE) { 693 setInitialStateFromUnicodeKR(converter, myConverterData); 694 } 695 } 696 } 697 } 698 699 static const char* 700 _ISO2022getName(const UConverter* cnv){ 701 if(cnv->extraInfo){ 702 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 703 return myData->name; 704 } 705 return NULL; 706 } 707 708 709 /*************** to unicode *******************/ 710 /**************************************************************************** 711 * Recognized escape sequences are 712 * <ESC>(B ASCII 713 * <ESC>.A ISO-8859-1 714 * <ESC>.F ISO-8859-7 715 * <ESC>(J JISX-201 716 * <ESC>(I JISX-201 717 * <ESC>$B JISX-208 718 * <ESC>$@ JISX-208 719 * <ESC>$(D JISX-212 720 * <ESC>$A GB2312 721 * <ESC>$(C KSC5601 722 */ 723 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 724 /* 0 1 2 3 4 5 6 7 8 9 */ 725 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 726 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 727 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 728 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 729 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 730 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 731 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 732 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 733 }; 734 735 /*************** to unicode *******************/ 736 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 737 /* 0 1 2 3 4 5 6 7 8 9 */ 738 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 739 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 740 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 741 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 742 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 743 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 744 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 745 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 746 }; 747 748 749 static 
UCNV_TableStates_2022 750 getKey_2022(char c,int32_t* key,int32_t* offset){ 751 int32_t togo; 752 int32_t low = 0; 753 int32_t hi = MAX_STATES_2022; 754 int32_t oldmid=0; 755 756 togo = normalize_esq_chars_2022[(uint8_t)c]; 757 if(togo == 0) { 758 /* not a valid character anywhere in an escape sequence */ 759 *key = 0; 760 *offset = 0; 761 return INVALID_2022; 762 } 763 togo = (*key << 5) + togo; 764 765 while (hi != low) /*binary search*/{ 766 767 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 768 769 if (mid == oldmid) 770 break; 771 772 if (escSeqStateTable_Key_2022[mid] > togo){ 773 hi = mid; 774 } 775 else if (escSeqStateTable_Key_2022[mid] < togo){ 776 low = mid; 777 } 778 else /*we found it*/{ 779 *key = togo; 780 *offset = mid; 781 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 782 } 783 oldmid = mid; 784 785 } 786 787 *key = 0; 788 *offset = 0; 789 return INVALID_2022; 790 } 791 792 /*runs through a state machine to determine the escape sequence - codepage correspondance 793 */ 794 static void 795 changeState_2022(UConverter* _this, 796 const char** source, 797 const char* sourceLimit, 798 Variant2022 var, 799 UErrorCode* err){ 800 UCNV_TableStates_2022 value; 801 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 802 uint32_t key = myData2022->key; 803 int32_t offset = 0; 804 int8_t initialToULength = _this->toULength; 805 char c; 806 807 value = VALID_NON_TERMINAL_2022; 808 while (*source < sourceLimit) { 809 c = *(*source)++; 810 _this->toUBytes[_this->toULength++]=(uint8_t)c; 811 value = getKey_2022(c,(int32_t *) &key, &offset); 812 813 switch (value){ 814 815 case VALID_NON_TERMINAL_2022 : 816 /* continue with the loop */ 817 break; 818 819 case VALID_TERMINAL_2022: 820 key = 0; 821 goto DONE; 822 823 case INVALID_2022: 824 goto DONE; 825 826 case VALID_MAYBE_TERMINAL_2022: 827 #ifdef U_ENABLE_GENERIC_ISO_2022 828 /* ESC ( B is ambiguous only for ISO_2022 itself */ 829 if(var == ISO_2022) { 830 /* 
discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 831 _this->toULength = 0; 832 833 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 834 835 /* continue with the loop */ 836 value = VALID_NON_TERMINAL_2022; 837 break; 838 } else 839 #endif 840 { 841 /* not ISO_2022 itself, finish here */ 842 value = VALID_TERMINAL_2022; 843 key = 0; 844 goto DONE; 845 } 846 } 847 } 848 849 DONE: 850 myData2022->key = key; 851 852 if (value == VALID_NON_TERMINAL_2022) { 853 /* indicate that the escape sequence is incomplete: key!=0 */ 854 return; 855 } else if (value == INVALID_2022 ) { 856 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 857 } else /* value == VALID_TERMINAL_2022 */ { 858 switch(var){ 859 #ifdef U_ENABLE_GENERIC_ISO_2022 860 case ISO_2022: 861 { 862 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 863 if(chosenConverterName == NULL) { 864 /* SS2 or SS3 */ 865 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 866 _this->toUCallbackReason = UCNV_UNASSIGNED; 867 return; 868 } 869 870 _this->mode = UCNV_SI; 871 ucnv_close(myData2022->currentConverter); 872 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 873 if(U_SUCCESS(*err)) { 874 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 875 _this->mode = UCNV_SO; 876 } 877 break; 878 } 879 #endif 880 case ISO_2022_JP: 881 { 882 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 883 switch(tempState) { 884 case INVALID_STATE: 885 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 886 break; 887 case SS2_STATE: 888 if(myData2022->toU2022State.cs[2]!=0) { 889 if(myData2022->toU2022State.g<2) { 890 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 891 } 892 myData2022->toU2022State.g=2; 893 } else { 894 /* illegal to have SS2 before a matching designator */ 895 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 896 } 897 break; 898 /* case SS3_STATE: not used in ISO-2022-JP-x */ 899 case 
ISO8859_1: 900 case ISO8859_7: 901 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 902 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 903 } else { 904 /* G2 charset for SS2 */ 905 myData2022->toU2022State.cs[2]=(int8_t)tempState; 906 } 907 break; 908 default: 909 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 910 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 911 } else { 912 /* G0 charset */ 913 myData2022->toU2022State.cs[0]=(int8_t)tempState; 914 } 915 break; 916 } 917 } 918 break; 919 case ISO_2022_CN: 920 { 921 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 922 switch(tempState) { 923 case INVALID_STATE: 924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 925 break; 926 case SS2_STATE: 927 if(myData2022->toU2022State.cs[2]!=0) { 928 if(myData2022->toU2022State.g<2) { 929 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 930 } 931 myData2022->toU2022State.g=2; 932 } else { 933 /* illegal to have SS2 before a matching designator */ 934 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 935 } 936 break; 937 case SS3_STATE: 938 if(myData2022->toU2022State.cs[3]!=0) { 939 if(myData2022->toU2022State.g<2) { 940 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 941 } 942 myData2022->toU2022State.g=3; 943 } else { 944 /* illegal to have SS3 before a matching designator */ 945 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 946 } 947 break; 948 case ISO_IR_165: 949 if(myData2022->version==0) { 950 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 951 break; 952 } 953 /*fall through*/ 954 case GB2312_1: 955 /*fall through*/ 956 case CNS_11643_1: 957 myData2022->toU2022State.cs[1]=(int8_t)tempState; 958 break; 959 case CNS_11643_2: 960 myData2022->toU2022State.cs[2]=(int8_t)tempState; 961 break; 962 default: 963 /* other CNS 11643 planes */ 964 if(myData2022->version==0) { 965 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 966 } else { 967 myData2022->toU2022State.cs[3]=(int8_t)tempState; 968 } 969 break; 970 } 971 } 972 break; 973 case ISO_2022_KR: 974 if(offset==0x30){ 975 /* 
nothing to be done, just accept this one escape sequence */ 976 } else { 977 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 978 } 979 break; 980 981 default: 982 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 983 break; 984 } 985 } 986 if(U_SUCCESS(*err)) { 987 _this->toULength = 0; 988 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 989 if(_this->toULength>1) { 990 /* 991 * Ticket 5691: consistent illegal sequences: 992 * - We include at least the first byte (ESC) in the illegal sequence. 993 * - If any of the non-initial bytes could be the start of a character, 994 * we stop the illegal sequence before the first one of those. 995 * In escape sequences, all following bytes are "printable", that is, 996 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 997 * they are valid single/lead bytes. 998 * For simplicity, we always only report the initial ESC byte as the 999 * illegal sequence and back out all other bytes we looked at. 1000 */ 1001 /* Back out some bytes. */ 1002 int8_t backOutDistance=_this->toULength-1; 1003 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1004 if(backOutDistance<=bytesFromThisBuffer) { 1005 /* same as initialToULength<=1 */ 1006 *source-=backOutDistance; 1007 } else { 1008 /* Back out bytes from the previous buffer: Need to replay them. */ 1009 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1010 /* same as -(initialToULength-1) */ 1011 /* preToULength is negative! 
*/ 1012 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1013 *source-=bytesFromThisBuffer; 1014 } 1015 _this->toULength=1; 1016 } 1017 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1018 _this->toUCallbackReason = UCNV_UNASSIGNED; 1019 } 1020 } 1021 1022 /*Checks the characters of the buffer against valid 2022 escape sequences 1023 *if the match we return a pointer to the initial start of the sequence otherwise 1024 *we return sourceLimit 1025 */ 1026 /*for 2022 looks ahead in the stream 1027 *to determine the longest possible convertible 1028 *data stream 1029 */ 1030 static U_INLINE const char* 1031 getEndOfBuffer_2022(const char** source, 1032 const char* sourceLimit, 1033 UBool flush){ 1034 1035 const char* mySource = *source; 1036 1037 #ifdef U_ENABLE_GENERIC_ISO_2022 1038 if (*source >= sourceLimit) 1039 return sourceLimit; 1040 1041 do{ 1042 1043 if (*mySource == ESC_2022){ 1044 int8_t i; 1045 int32_t key = 0; 1046 int32_t offset; 1047 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1048 1049 /* Kludge: I could not 1050 * figure out the reason for validating an escape sequence 1051 * twice - once here and once in changeState_2022(). 1052 * is it possible to have an ESC character in a ISO2022 1053 * byte stream which is valid in a code page? Is it legal? 
1054 */ 1055 for (i=0; 1056 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1057 i++) { 1058 value = getKey_2022(*(mySource+i), &key, &offset); 1059 } 1060 if (value > 0 || *mySource==ESC_2022) 1061 return mySource; 1062 1063 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1064 return sourceLimit; 1065 } 1066 }while (++mySource < sourceLimit); 1067 1068 return sourceLimit; 1069 #else 1070 while(mySource < sourceLimit && *mySource != ESC_2022) { 1071 ++mySource; 1072 } 1073 return mySource; 1074 #endif 1075 } 1076 1077 1078 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1079 * any future change in _MBCSFromUChar32() function should be reflected here. 1080 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1081 */ 1082 static U_INLINE int32_t 1083 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1084 UChar32 c, 1085 uint32_t* value, 1086 UBool useFallback, 1087 int outputType) 1088 { 1089 const int32_t *cx; 1090 const uint16_t *table; 1091 uint32_t stage2Entry; 1092 uint32_t myValue; 1093 int32_t length; 1094 const uint8_t *p; 1095 /* 1096 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1097 * Use internal version of ucnv_open() that verifies that the new structures are available, 1098 * else U_INTERNAL_PROGRAM_ERROR. 
1099 */ 1100 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1101 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1102 table=sharedData->mbcs.fromUnicodeTable; 1103 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1104 /* get the bytes and the length for the output */ 1105 if(outputType==MBCS_OUTPUT_2){ 1106 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1107 if(myValue<=0xff) { 1108 length=1; 1109 } else { 1110 length=2; 1111 } 1112 } else /* outputType==MBCS_OUTPUT_3 */ { 1113 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1114 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1115 if(myValue<=0xff) { 1116 length=1; 1117 } else if(myValue<=0xffff) { 1118 length=2; 1119 } else { 1120 length=3; 1121 } 1122 } 1123 /* is this code point assigned, or do we use fallbacks? */ 1124 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1125 /* assigned */ 1126 *value=myValue; 1127 return length; 1128 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1129 /* 1130 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1131 * There is no way with this data structure for fallback output 1132 * to be a zero byte. 1133 */ 1134 *value=myValue; 1135 return -length; 1136 } 1137 } 1138 1139 cx=sharedData->mbcs.extIndexes; 1140 if(cx!=NULL) { 1141 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1142 } 1143 1144 /* unassigned */ 1145 return 0; 1146 } 1147 1148 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1149 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1150 * @param retval pointer to output byte 1151 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1152 */ 1153 static U_INLINE int32_t 1154 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1155 UChar32 c, 1156 uint32_t* retval, 1157 UBool useFallback) 1158 { 1159 const uint16_t *table; 1160 int32_t value; 1161 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1162 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1163 return 0; 1164 } 1165 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1166 table=sharedData->mbcs.fromUnicodeTable; 1167 /* get the byte for the output */ 1168 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1169 /* is this code point assigned, or do we use fallbacks? */ 1170 *retval=(uint32_t)(value&0xff); 1171 if(value>=0xf00) { 1172 return 1; /* roundtrip */ 1173 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1174 return -1; /* fallback taken */ 1175 } else { 1176 return 0; /* no mapping */ 1177 } 1178 } 1179 1180 /* 1181 * Check that the result is a 2-byte value with each byte in the range A1..FE 1182 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1183 * to move it to the ISO 2022 range 21..7E. 1184 * Return 0 if out of range. 1185 */ 1186 static U_INLINE uint32_t 1187 _2022FromGR94DBCS(uint32_t value) { 1188 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1189 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1190 ) { 1191 return value - 0x8080; /* shift down to 21..7e byte range */ 1192 } else { 1193 return 0; /* not valid for ISO 2022 */ 1194 } 1195 } 1196 1197 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1198 /* 1199 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1200 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point 1201 * unchanged. 1202 */ 1203 static U_INLINE uint32_t 1204 _2022ToGR94DBCS(uint32_t value) { 1205 uint32_t returnValue = value + 0x8080; 1206 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && 1207 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { 1208 return returnValue; 1209 } else { 1210 return value; 1211 } 1212 } 1213 #endif 1214 1215 #ifdef U_ENABLE_GENERIC_ISO_2022 1216 1217 /********************************************************************************** 1218 * ISO-2022 Converter 1219 * 1220 * 1221 */ 1222 1223 static void 1224 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 1225 UErrorCode* err){ 1226 const char* mySourceLimit, *realSourceLimit; 1227 const char* sourceStart; 1228 const UChar* myTargetStart; 1229 UConverter* saveThis; 1230 UConverterDataISO2022* myData; 1231 int8_t length; 1232 1233 saveThis = args->converter; 1234 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); 1235 1236 realSourceLimit = args->sourceLimit; 1237 while (args->source < realSourceLimit) { 1238 if(myData->key == 0) { /* are we in the middle of an escape sequence? 
*/ 1239 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1240 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); 1241 1242 if(args->source < mySourceLimit) { 1243 if(myData->currentConverter==NULL) { 1244 myData->currentConverter = ucnv_open("ASCII",err); 1245 if(U_FAILURE(*err)){ 1246 return; 1247 } 1248 1249 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 1250 saveThis->mode = UCNV_SO; 1251 } 1252 1253 /* convert to before the ESC or until the end of the buffer */ 1254 myData->isFirstBuffer=FALSE; 1255 sourceStart = args->source; 1256 myTargetStart = args->target; 1257 args->converter = myData->currentConverter; 1258 ucnv_toUnicode(args->converter, 1259 &args->target, 1260 args->targetLimit, 1261 &args->source, 1262 mySourceLimit, 1263 args->offsets, 1264 (UBool)(args->flush && mySourceLimit == realSourceLimit), 1265 err); 1266 args->converter = saveThis; 1267 1268 if (*err == U_BUFFER_OVERFLOW_ERROR) { 1269 /* move the overflow buffer */ 1270 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; 1271 myData->currentConverter->UCharErrorBufferLength = 0; 1272 if(length > 0) { 1273 uprv_memcpy(saveThis->UCharErrorBuffer, 1274 myData->currentConverter->UCharErrorBuffer, 1275 length*U_SIZEOF_UCHAR); 1276 } 1277 return; 1278 } 1279 1280 /* 1281 * At least one of: 1282 * -Error while converting 1283 * -Done with entire buffer 1284 * -Need to write offsets or update the current offset 1285 * (leave that up to the code in ucnv.c) 1286 * 1287 * or else we just stopped at an ESC byte and continue with changeState_2022() 1288 */ 1289 if (U_FAILURE(*err) || 1290 (args->source == realSourceLimit) || 1291 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || 1292 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) 1293 ) { 1294 /* copy partial or error input for truncated detection and error 
handling */ 1295 if(U_FAILURE(*err)) { 1296 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; 1297 if(length > 0) { 1298 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); 1299 } 1300 } else { 1301 length = saveThis->toULength = myData->currentConverter->toULength; 1302 if(length > 0) { 1303 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); 1304 if(args->source < mySourceLimit) { 1305 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ 1306 } 1307 } 1308 } 1309 return; 1310 } 1311 } 1312 } 1313 1314 sourceStart = args->source; 1315 changeState_2022(args->converter, 1316 &(args->source), 1317 realSourceLimit, 1318 ISO_2022, 1319 err); 1320 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { 1321 /* let the ucnv.c code update its current offset */ 1322 return; 1323 } 1324 } 1325 } 1326 1327 #endif 1328 1329 /* 1330 * To Unicode Callback helper function 1331 */ 1332 static void 1333 toUnicodeCallback(UConverter *cnv, 1334 const uint32_t sourceChar, const uint32_t targetUniChar, 1335 UErrorCode* err){ 1336 if(sourceChar>0xff){ 1337 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); 1338 cnv->toUBytes[1] = (uint8_t)sourceChar; 1339 cnv->toULength = 2; 1340 } 1341 else{ 1342 cnv->toUBytes[0] =(char) sourceChar; 1343 cnv->toULength = 1; 1344 } 1345 1346 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ 1347 *err = U_INVALID_CHAR_FOUND; 1348 } 1349 else{ 1350 *err = U_ILLEGAL_CHAR_FOUND; 1351 } 1352 } 1353 1354 /**************************************ISO-2022-JP*************************************************/ 1355 1356 /************************************** IMPORTANT ************************************************** 1357 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and 1358 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). 
* The converter iterates over each Unicode code point
* to obtain the equivalent code points from the supported codepages. Since the source buffer is
* processed one char at a time, it makes sense to avoid as much as possible of the extra
* processing that a canned converter would do.
*
* If the implementation of these macros or the structure of the sharedData struct changes in the
* future, make sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/

/*
 * Preference order of JP charsets:
 * the order in which charsets that are allowed but not currently designated
 * are tried when converting from Unicode.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * Designation escape sequences, indexed by the charset's enum constant.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Lengths of the sequences in escSeqChars[], with the same index;
 * the two tables must be kept in sync by hand.
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this
state until an invalid character is found 1443 * No -> a) go to the next code page and find the character 1444 * iii) Before changing the state increment the current state check if the current state 1445 * is equal to the intitIteration state 1446 * Yes -> A character that cannot be represented in any of the supported encodings 1447 * break and return a U_INVALID_CHARACTER error 1448 * No -> Continue and find the character in next code page 1449 * 1450 * 1451 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages 1452 */ 1453 1454 /* Map 00..7F to Unicode according to JIS X 0201. */ 1455 static U_INLINE uint32_t 1456 jisx201ToU(uint32_t value) { 1457 if(value < 0x5c) { 1458 return value; 1459 } else if(value == 0x5c) { 1460 return 0xa5; 1461 } else if(value == 0x7e) { 1462 return 0x203e; 1463 } else /* value <= 0x7f */ { 1464 return value; 1465 } 1466 } 1467 1468 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ 1469 static U_INLINE uint32_t 1470 jisx201FromU(uint32_t value) { 1471 if(value<=0x7f) { 1472 if(value!=0x5c && value!=0x7e) { 1473 return value; 1474 } 1475 } else if(value==0xa5) { 1476 return 0x5c; 1477 } else if(value==0x203e) { 1478 return 0x7e; 1479 } 1480 return 0xfffe; 1481 } 1482 1483 /* 1484 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding 1485 * to JIS X 0208, and convert it to a pair of 21..7E bytes. 1486 * Return 0 if the byte pair is out of range. 
1487 */ 1488 static U_INLINE uint32_t 1489 _2022FromSJIS(uint32_t value) { 1490 uint8_t trail; 1491 1492 if(value > 0xEFFC) { 1493 return 0; /* beyond JIS X 0208 */ 1494 } 1495 1496 trail = (uint8_t)value; 1497 1498 value &= 0xff00; /* lead byte */ 1499 if(value <= 0x9f00) { 1500 value -= 0x7000; 1501 } else /* 0xe000 <= value <= 0xef00 */ { 1502 value -= 0xb000; 1503 } 1504 value <<= 1; 1505 1506 if(trail <= 0x9e) { 1507 value -= 0x100; 1508 if(trail <= 0x7e) { 1509 value |= trail - 0x1f; 1510 } else { 1511 value |= trail - 0x20; 1512 } 1513 } else /* trail <= 0xfc */ { 1514 value |= trail - 0x7e; 1515 } 1516 return value; 1517 } 1518 1519 /* 1520 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1521 * If either byte is outside 21..7E make sure that the result is not valid 1522 * for Shift-JIS so that the converter catches it. 1523 * Some invalid byte values already turn into equally invalid Shift-JIS 1524 * byte values and need not be tested explicitly. 1525 */ 1526 static U_INLINE void 1527 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1528 if(c1&1) { 1529 ++c1; 1530 if(c2 <= 0x5f) { 1531 c2 += 0x1f; 1532 } else if(c2 <= 0x7e) { 1533 c2 += 0x20; 1534 } else { 1535 c2 = 0; /* invalid */ 1536 } 1537 } else { 1538 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { 1539 c2 += 0x7e; 1540 } else { 1541 c2 = 0; /* invalid */ 1542 } 1543 } 1544 c1 >>= 1; 1545 if(c1 <= 0x2f) { 1546 c1 += 0x70; 1547 } else if(c1 <= 0x3f) { 1548 c1 += 0xb0; 1549 } else { 1550 c1 = 0; /* invalid */ 1551 } 1552 bytes[0] = (char)c1; 1553 bytes[1] = (char)c2; 1554 } 1555 1556 /* 1557 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1558 * Katakana. 1559 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1560 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1561 * These were the only fallbacks in ICU's jisx-208.ucm file. 
1562 */ 1563 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1564 0x2123, /* U+FF61 */ 1565 0x2156, 1566 0x2157, 1567 0x2122, 1568 0x2126, 1569 0x2572, 1570 0x2521, 1571 0x2523, 1572 0x2525, 1573 0x2527, 1574 0x2529, 1575 0x2563, 1576 0x2565, 1577 0x2567, 1578 0x2543, 1579 0x213C, /* U+FF70 */ 1580 0x2522, 1581 0x2524, 1582 0x2526, 1583 0x2528, 1584 0x252A, 1585 0x252B, 1586 0x252D, 1587 0x252F, 1588 0x2531, 1589 0x2533, 1590 0x2535, 1591 0x2537, 1592 0x2539, 1593 0x253B, 1594 0x253D, 1595 0x253F, /* U+FF80 */ 1596 0x2541, 1597 0x2544, 1598 0x2546, 1599 0x2548, 1600 0x254A, 1601 0x254B, 1602 0x254C, 1603 0x254D, 1604 0x254E, 1605 0x254F, 1606 0x2552, 1607 0x2555, 1608 0x2558, 1609 0x255B, 1610 0x255E, 1611 0x255F, /* U+FF90 */ 1612 0x2560, 1613 0x2561, 1614 0x2562, 1615 0x2564, 1616 0x2566, 1617 0x2568, 1618 0x2569, 1619 0x256A, 1620 0x256B, 1621 0x256C, 1622 0x256D, 1623 0x256F, 1624 0x2573, 1625 0x212B, 1626 0x212C /* U+FF9F */ 1627 }; 1628 1629 static void 1630 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1631 UConverter *cnv = args->converter; 1632 UConverterDataISO2022 *converterData; 1633 ISO2022State *pFromU2022State; 1634 uint8_t *target = (uint8_t *) args->target; 1635 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1636 const UChar* source = args->source; 1637 const UChar* sourceLimit = args->sourceLimit; 1638 int32_t* offsets = args->offsets; 1639 UChar32 sourceChar; 1640 char buffer[8]; 1641 int32_t len, outLen; 1642 int8_t choices[10]; 1643 int32_t choiceCount; 1644 uint32_t targetValue = 0; 1645 UBool useFallback; 1646 1647 int32_t i; 1648 int8_t cs, g; 1649 1650 /* set up the state */ 1651 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1652 pFromU2022State = &converterData->fromU2022State; 1653 1654 choiceCount = 0; 1655 1656 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1657 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1658 goto getTrail; 1659 } 1660 1661 while(source < sourceLimit) { 1662 if(target < targetLimit) { 1663 1664 sourceChar = *(source++); 1665 /*check if the char is a First surrogate*/ 1666 if(UTF_IS_SURROGATE(sourceChar)) { 1667 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 1668 getTrail: 1669 /*look ahead to find the trail surrogate*/ 1670 if(source < sourceLimit) { 1671 /* test the following code unit */ 1672 UChar trail=(UChar) *source; 1673 if(UTF_IS_SECOND_SURROGATE(trail)) { 1674 source++; 1675 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 1676 cnv->fromUChar32=0x00; 1677 /* convert this supplementary code point */ 1678 /* exit this condition tree */ 1679 } else { 1680 /* this is an unmatched lead code unit (1st surrogate) */ 1681 /* callback(illegal) */ 1682 *err=U_ILLEGAL_CHAR_FOUND; 1683 cnv->fromUChar32=sourceChar; 1684 break; 1685 } 1686 } else { 1687 /* no more input */ 1688 cnv->fromUChar32=sourceChar; 1689 break; 1690 } 1691 } else { 1692 /* this is an unmatched trail code unit (2nd surrogate) */ 1693 /* callback(illegal) */ 1694 *err=U_ILLEGAL_CHAR_FOUND; 1695 cnv->fromUChar32=sourceChar; 1696 break; 1697 } 1698 } 1699 1700 /* do not convert SO/SI/ESC */ 1701 if(IS_2022_CONTROL(sourceChar)) { 1702 /* callback(illegal) */ 1703 *err=U_ILLEGAL_CHAR_FOUND; 1704 cnv->fromUChar32=sourceChar; 1705 break; 1706 } 1707 1708 /* do the conversion */ 1709 1710 if(choiceCount == 0) { 1711 uint16_t csm; 1712 1713 /* 1714 * The csm variable keeps track of which charsets are allowed 1715 * and not used yet while building the choices[]. 1716 */ 1717 csm = jpCharsetMasks[converterData->version]; 1718 choiceCount = 0; 1719 1720 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1721 if(converterData->version == 3 || converterData->version == 4) { 1722 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1723 } 1724 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1725 csm &= ~CSM(HWKANA_7BIT); 1726 1727 /* try the current G0 charset */ 1728 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1729 csm &= ~CSM(cs); 1730 1731 /* try the current G2 charset */ 1732 if((cs = pFromU2022State->cs[2]) != 0) { 1733 choices[choiceCount++] = cs; 1734 csm &= ~CSM(cs); 1735 } 1736 1737 /* try all the other possible charsets */ 1738 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1739 cs = (int8_t)jpCharsetPref[i]; 1740 if(CSM(cs) & csm) { 1741 choices[choiceCount++] = cs; 1742 csm &= ~CSM(cs); 1743 } 1744 } 1745 } 1746 1747 cs = g = 0; 1748 /* 1749 * len==0: no mapping found yet 1750 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1751 * len>0: found a roundtrip result, done 1752 */ 1753 len = 0; 1754 /* 1755 * We will turn off useFallback after finding a fallback, 1756 * but we still get fallbacks from PUA code points as usual. 1757 * Therefore, we will also need to check that we don't overwrite 1758 * an early fallback with a later one. 1759 */ 1760 useFallback = cnv->useFallback; 1761 1762 for(i = 0; i < choiceCount && len <= 0; ++i) { 1763 uint32_t value; 1764 int32_t len2; 1765 int8_t cs0 = choices[i]; 1766 switch(cs0) { 1767 case ASCII: 1768 if(sourceChar <= 0x7f) { 1769 targetValue = (uint32_t)sourceChar; 1770 len = 1; 1771 cs = cs0; 1772 g = 0; 1773 } 1774 break; 1775 case ISO8859_1: 1776 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1777 targetValue = (uint32_t)sourceChar - 0x80; 1778 len = 1; 1779 cs = cs0; 1780 g = 2; 1781 } 1782 break; 1783 case HWKANA_7BIT: 1784 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1785 if(converterData->version==3) { 1786 /* JIS7: use G1 (SO) */ 1787 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1788 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1789 len = 1; 1790 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1791 g = 1; 1792 } else if(converterData->version==4) { 1793 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1794 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1795 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1796 len = 1; 1797 1798 cs = pFromU2022State->cs[0]; 1799 if(IS_JP_DBCS(cs)) { 1800 /* switch from a DBCS charset to JISX201 */ 1801 cs = (int8_t)JISX201; 1802 } 1803 /* else stay in the current G0 charset */ 1804 g = 0; 1805 } 1806 /* else do not use HWKANA_7BIT with other versions */ 1807 } 1808 break; 1809 case JISX201: 1810 /* G0 SBCS */ 1811 value = jisx201FromU(sourceChar); 1812 if(value <= 0x7f) { 1813 targetValue = value; 1814 len = 1; 1815 cs = cs0; 1816 g = 0; 1817 useFallback = FALSE; 1818 } 1819 break; 1820 case JISX208: 1821 /* G0 DBCS from Shift-JIS table */ 1822 len2 = MBCS_FROM_UCHAR32_ISO2022( 1823 converterData->myConverterArray[cs0], 1824 sourceChar, &value, 1825 useFallback, MBCS_OUTPUT_2); 1826 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1827 value = _2022FromSJIS(value); 1828 if(value != 0) { 1829 targetValue = value; 1830 len = len2; 1831 cs = cs0; 1832 g = 0; 1833 useFallback = FALSE; 1834 } 1835 } else if(len == 0 && useFallback && 1836 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1837 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1838 len = -2; 1839 cs = cs0; 1840 g = 0; 1841 useFallback = FALSE; 1842 } 1843 break; 1844 case ISO8859_7: 1845 /* G0 SBCS forced to 7-bit output */ 1846 len2 = MBCS_SINGLE_FROM_UCHAR32( 1847 converterData->myConverterArray[cs0], 1848 sourceChar, &value, 1849 useFallback); 1850 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1851 targetValue = value - 0x80; 1852 len = len2; 1853 cs = 
cs0; 1854 g = 2; 1855 useFallback = FALSE; 1856 } 1857 break; 1858 default: 1859 /* G0 DBCS */ 1860 len2 = MBCS_FROM_UCHAR32_ISO2022( 1861 converterData->myConverterArray[cs0], 1862 sourceChar, &value, 1863 useFallback, MBCS_OUTPUT_2); 1864 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1865 if(cs0 == KSC5601) { 1866 /* 1867 * Check for valid bytes for the encoding scheme. 1868 * This is necessary because the sub-converter (windows-949) 1869 * has a broader encoding scheme than is valid for 2022. 1870 */ 1871 value = _2022FromGR94DBCS(value); 1872 if(value == 0) { 1873 break; 1874 } 1875 } 1876 targetValue = value; 1877 len = len2; 1878 cs = cs0; 1879 g = 0; 1880 useFallback = FALSE; 1881 } 1882 break; 1883 } 1884 } 1885 1886 if(len != 0) { 1887 if(len < 0) { 1888 len = -len; /* fallback */ 1889 } 1890 outLen = 0; /* count output bytes */ 1891 1892 /* write SI if necessary (only for JIS7) */ 1893 if(pFromU2022State->g == 1 && g == 0) { 1894 buffer[outLen++] = UCNV_SI; 1895 pFromU2022State->g = 0; 1896 } 1897 1898 /* write the designation sequence if necessary */ 1899 if(cs != pFromU2022State->cs[g]) { 1900 int32_t escLen = escSeqCharsLen[cs]; 1901 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1902 outLen += escLen; 1903 pFromU2022State->cs[g] = cs; 1904 1905 /* invalidate the choices[] */ 1906 choiceCount = 0; 1907 } 1908 1909 /* write the shift sequence if necessary */ 1910 if(g != pFromU2022State->g) { 1911 switch(g) { 1912 /* case 0 handled before writing escapes */ 1913 case 1: 1914 buffer[outLen++] = UCNV_SO; 1915 pFromU2022State->g = 1; 1916 break; 1917 default: /* case 2 */ 1918 buffer[outLen++] = 0x1b; 1919 buffer[outLen++] = 0x4e; 1920 break; 1921 /* no case 3: no SS3 in ISO-2022-JP-x */ 1922 } 1923 } 1924 1925 /* write the output bytes */ 1926 if(len == 1) { 1927 buffer[outLen++] = (char)targetValue; 1928 } else /* len == 2 */ { 1929 buffer[outLen++] = (char)(targetValue >> 8); 1930 buffer[outLen++] = 
(char)targetValue; 1931 } 1932 } else { 1933 /* 1934 * if we cannot find the character after checking all codepages 1935 * then this is an error 1936 */ 1937 *err = U_INVALID_CHAR_FOUND; 1938 cnv->fromUChar32=sourceChar; 1939 break; 1940 } 1941 1942 if(sourceChar == CR || sourceChar == LF) { 1943 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1944 pFromU2022State->cs[2] = 0; 1945 choiceCount = 0; 1946 } 1947 1948 /* output outLen>0 bytes in buffer[] */ 1949 if(outLen == 1) { 1950 *target++ = buffer[0]; 1951 if(offsets) { 1952 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1953 } 1954 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1955 *target++ = buffer[0]; 1956 *target++ = buffer[1]; 1957 if(offsets) { 1958 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1959 *offsets++ = sourceIndex; 1960 *offsets++ = sourceIndex; 1961 } 1962 } else { 1963 fromUWriteUInt8( 1964 cnv, 1965 buffer, outLen, 1966 &target, (const char *)targetLimit, 1967 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1968 err); 1969 if(U_FAILURE(*err)) { 1970 break; 1971 } 1972 } 1973 } /* end if(myTargetIndex<myTargetLength) */ 1974 else{ 1975 *err =U_BUFFER_OVERFLOW_ERROR; 1976 break; 1977 } 1978 1979 }/* end while(mySourceIndex<mySourceLength) */ 1980 1981 /* 1982 * the end of the input stream and detection of truncated input 1983 * are handled by the framework, but for ISO-2022-JP conversion 1984 * we need to be in ASCII mode at the very end 1985 * 1986 * conditions: 1987 * successful 1988 * in SO mode or not in ASCII mode 1989 * end of input and no truncated input 1990 */ 1991 if( U_SUCCESS(*err) && 1992 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 1993 args->flush && source>=sourceLimit && cnv->fromUChar32==0 1994 ) { 1995 int32_t sourceIndex; 1996 1997 outLen = 0; 1998 1999 if(pFromU2022State->g != 0) { 2000 buffer[outLen++] = UCNV_SI; 
/*
 * Converts ISO-2022-JP family bytes in [args->source, args->sourceLimit) to
 * UTF-16 code units in args->target, and records one source offset per
 * written UChar when args->offsets is non-NULL.
 *
 * Converter state is kept in the UConverterDataISO2022 hanging off
 * args->converter->extraInfo:
 *   - toU2022State.cs[]  : charset currently designated to each G set
 *   - toU2022State.g     : G set currently invoked (prevG: the one to return to
 *                          after a single shift)
 *   - key                : non-zero while an escape sequence is only partially read
 *   - isEmptySegment     : TRUE right after an escape sequence completed and
 *                          before any character was converted
 *
 * The function can be re-entered in the middle of an escape sequence
 * (key != 0) or in the middle of a double-byte character (toULength == 1);
 * both cases jump straight to labels inside the main loop.
 *
 * On return args->source and args->target are advanced past what was consumed
 * and produced; *err is set for overflow, illegal escape sequences, and
 * whatever toUnicodeCallback() reports for unconvertible input.
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /*
         * continue with a partial double-byte character:
         * the lead byte was stashed in toUBytes[0] by the "no trail byte
         * available" branch of the DBCS case below
         */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* default result: "no mapping"; only overwritten by a successful lookup */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    /* shift back to G0 and read the next byte */
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* leaves targetUniChar==missingCharMarker, so the callback branch below runs */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                /* un-consume the ESC so that the pointer handed to changeState_2022() points at it */
                mySource--;
escape:
                {
                    /* remembered so that the full length of the escape sequence can be reported on error */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                /* the uint8_t cast turns "0xa1 <= byte <= 0xdf" into a single unsigned comparison */
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        /* 7-bit byte stands for the corresponding upper-half code point */
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek only; the trail byte is consumed below once we know it belongs to this character */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                /* tempBuf receives the bytes used for the table lookup below */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        /* otherwise: only the lead byte is reported; the trail byte is left unconsumed */
                    } else {
                        /* input ends after the lead byte: save it and resume at getTrailByte on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar; the offset points at the first byte of the 1- or 2-byte input */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* publish how far we got; also reached directly when input ends inside a double-byte character */
    args->target = myTarget;
    args->source = mySource;
}
myConverterData->currentConverter->charErrorBufferLength = 0; 2318 } 2319 args->converter=saveConv; 2320 } 2321 2322 static void 2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2324 2325 const UChar *source = args->source; 2326 const UChar *sourceLimit = args->sourceLimit; 2327 unsigned char *target = (unsigned char *) args->target; 2328 unsigned char *targetLimit = (unsigned char *) args->targetLimit; 2329 int32_t* offsets = args->offsets; 2330 uint32_t targetByteUnit = 0x0000; 2331 UChar32 sourceChar = 0x0000; 2332 UBool isTargetByteDBCS; 2333 UBool oldIsTargetByteDBCS; 2334 UConverterDataISO2022 *converterData; 2335 UConverterSharedData* sharedData; 2336 UBool useFallback; 2337 int32_t length =0; 2338 2339 converterData=(UConverterDataISO2022*)args->converter->extraInfo; 2340 /* if the version is 1 then the user is requesting 2341 * conversion with ibm-25546 pass the arguments to 2342 * MBCS converter and return 2343 */ 2344 if(converterData->version==1){ 2345 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2346 return; 2347 } 2348 2349 /* initialize data */ 2350 sharedData = converterData->currentConverter->sharedData; 2351 useFallback = args->converter->useFallback; 2352 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 2353 oldIsTargetByteDBCS = isTargetByteDBCS; 2354 2355 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; 2356 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 2357 goto getTrail; 2358 } 2359 while(source < sourceLimit){ 2360 2361 targetByteUnit = missingCharMarker; 2362 2363 if(target < (unsigned char*) args->targetLimit){ 2364 sourceChar = *source++; 2365 2366 /* do not convert SO/SI/ESC */ 2367 if(IS_2022_CONTROL(sourceChar)) { 2368 /* callback(illegal) */ 2369 *err=U_ILLEGAL_CHAR_FOUND; 2370 args->converter->fromUChar32=sourceChar; 2371 break; 2372 } 2373 2374 length = 
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 2375 if(length < 0) { 2376 length = -length; /* fallback */ 2377 } 2378 /* only DBCS or SBCS characters are expected*/ 2379 /* DB characters with high bit set to 1 are expected */ 2380 if( length > 2 || length==0 || 2381 (length == 1 && targetByteUnit > 0x7f) || 2382 (length == 2 && 2383 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 2384 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 2385 ) { 2386 targetByteUnit=missingCharMarker; 2387 } 2388 if (targetByteUnit != missingCharMarker){ 2389 2390 oldIsTargetByteDBCS = isTargetByteDBCS; 2391 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 2392 /* append the shift sequence */ 2393 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 2394 2395 if (isTargetByteDBCS) 2396 *target++ = UCNV_SO; 2397 else 2398 *target++ = UCNV_SI; 2399 if(offsets) 2400 *(offsets++) = (int32_t)(source - args->source-1); 2401 } 2402 /* write the targetUniChar to target */ 2403 if(targetByteUnit <= 0x00FF){ 2404 if( target < targetLimit){ 2405 *(target++) = (unsigned char) targetByteUnit; 2406 if(offsets){ 2407 *(offsets++) = (int32_t)(source - args->source-1); 2408 } 2409 2410 }else{ 2411 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 2412 *err = U_BUFFER_OVERFLOW_ERROR; 2413 } 2414 }else{ 2415 if(target < targetLimit){ 2416 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 2417 if(offsets){ 2418 *(offsets++) = (int32_t)(source - args->source-1); 2419 } 2420 if(target < targetLimit){ 2421 *(target++) =(unsigned char) (targetByteUnit -0x80); 2422 if(offsets){ 2423 *(offsets++) = (int32_t)(source - args->source-1); 2424 } 2425 }else{ 2426 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 2427 *err = U_BUFFER_OVERFLOW_ERROR; 2428 } 2429 }else{ 2430 
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 2431 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 2432 *err = U_BUFFER_OVERFLOW_ERROR; 2433 } 2434 } 2435 2436 } 2437 else{ 2438 /* oops.. the code point is unassingned 2439 * set the error and reason 2440 */ 2441 2442 /*check if the char is a First surrogate*/ 2443 if(UTF_IS_SURROGATE(sourceChar)) { 2444 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 2445 getTrail: 2446 /*look ahead to find the trail surrogate*/ 2447 if(source < sourceLimit) { 2448 /* test the following code unit */ 2449 UChar trail=(UChar) *source; 2450 if(UTF_IS_SECOND_SURROGATE(trail)) { 2451 source++; 2452 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2453 *err = U_INVALID_CHAR_FOUND; 2454 /* convert this surrogate code point */ 2455 /* exit this condition tree */ 2456 } else { 2457 /* this is an unmatched lead code unit (1st surrogate) */ 2458 /* callback(illegal) */ 2459 *err=U_ILLEGAL_CHAR_FOUND; 2460 } 2461 } else { 2462 /* no more input */ 2463 *err = U_ZERO_ERROR; 2464 } 2465 } else { 2466 /* this is an unmatched trail code unit (2nd surrogate) */ 2467 /* callback(illegal) */ 2468 *err=U_ILLEGAL_CHAR_FOUND; 2469 } 2470 } else { 2471 /* callback(unassigned) for a BMP code point */ 2472 *err = U_INVALID_CHAR_FOUND; 2473 } 2474 2475 args->converter->fromUChar32=sourceChar; 2476 break; 2477 } 2478 } /* end if(myTargetIndex<myTargetLength) */ 2479 else{ 2480 *err =U_BUFFER_OVERFLOW_ERROR; 2481 break; 2482 } 2483 2484 }/* end while(mySourceIndex<mySourceLength) */ 2485 2486 /* 2487 * the end of the input stream and detection of truncated input 2488 * are handled by the framework, but for ISO-2022-KR conversion 2489 * we need to be in ASCII mode at the very end 2490 * 2491 * conditions: 2492 * successful 2493 * not in ASCII mode 2494 * end of input and no truncated input 2495 */ 2496 if( U_SUCCESS(*err) 
&& 2497 isTargetByteDBCS && 2498 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2499 ) { 2500 int32_t sourceIndex; 2501 2502 /* we are switching to ASCII */ 2503 isTargetByteDBCS=FALSE; 2504 2505 /* get the source index of the last input character */ 2506 /* 2507 * TODO this would be simpler and more reliable if we used a pair 2508 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2509 * so that we could simply use the prevSourceIndex here; 2510 * this code gives an incorrect result for the rare case of an unmatched 2511 * trail surrogate that is alone in the last buffer of the text stream 2512 */ 2513 sourceIndex=(int32_t)(source-args->source); 2514 if(sourceIndex>0) { 2515 --sourceIndex; 2516 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2517 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2518 ) { 2519 --sourceIndex; 2520 } 2521 } else { 2522 sourceIndex=-1; 2523 } 2524 2525 fromUWriteUInt8( 2526 args->converter, 2527 SHIFT_IN_STR, 1, 2528 &target, (const char *)targetLimit, 2529 &offsets, sourceIndex, 2530 err); 2531 } 2532 2533 /*save the state and return */ 2534 args->source = source; 2535 args->target = (char*)target; 2536 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2537 } 2538 2539 /************************ To Unicode ***************************************/ 2540 2541 static void 2542 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2543 UErrorCode* err){ 2544 char const* sourceStart; 2545 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2546 2547 UConverterToUnicodeArgs subArgs; 2548 int32_t minArgsSize; 2549 2550 /* set up the subconverter arguments */ 2551 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2552 minArgsSize = args->size; 2553 } else { 2554 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2555 } 2556 2557 uprv_memcpy(&subArgs, args, minArgsSize); 2558 subArgs.size = (uint16_t)minArgsSize; 2559 
subArgs.converter = myData->currentConverter; 2560 2561 /* remember the original start of the input for offsets */ 2562 sourceStart = args->source; 2563 2564 if(myData->key != 0) { 2565 /* continue with a partial escape sequence */ 2566 goto escape; 2567 } 2568 2569 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2570 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2571 subArgs.source = args->source; 2572 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2573 if(subArgs.source != subArgs.sourceLimit) { 2574 /* 2575 * get the current partial byte sequence 2576 * 2577 * it needs to be moved between the public and the subconverter 2578 * so that the conversion framework, which only sees the public 2579 * converter, can handle truncated and illegal input etc. 2580 */ 2581 if(args->converter->toULength > 0) { 2582 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2583 } 2584 subArgs.converter->toULength = args->converter->toULength; 2585 2586 /* 2587 * Convert up to the end of the input, or to before the next escape character. 2588 * Does not handle conversion extensions because the preToU[] state etc. 2589 * is not copied. 
2590 */ 2591 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2592 2593 if(args->offsets != NULL && sourceStart != args->source) { 2594 /* update offsets to base them on the actual start of the input */ 2595 int32_t *offsets = args->offsets; 2596 UChar *target = args->target; 2597 int32_t delta = (int32_t)(args->source - sourceStart); 2598 while(target < subArgs.target) { 2599 if(*offsets >= 0) { 2600 *offsets += delta; 2601 } 2602 ++offsets; 2603 ++target; 2604 } 2605 } 2606 args->source = subArgs.source; 2607 args->target = subArgs.target; 2608 args->offsets = subArgs.offsets; 2609 2610 /* copy input/error/overflow buffers */ 2611 if(subArgs.converter->toULength > 0) { 2612 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2613 } 2614 args->converter->toULength = subArgs.converter->toULength; 2615 2616 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2617 if(subArgs.converter->UCharErrorBufferLength > 0) { 2618 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2619 subArgs.converter->UCharErrorBufferLength); 2620 } 2621 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2622 subArgs.converter->UCharErrorBufferLength = 0; 2623 } 2624 } 2625 2626 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2627 return; 2628 } 2629 2630 escape: 2631 changeState_2022(args->converter, 2632 &(args->source), 2633 args->sourceLimit, 2634 ISO_2022_KR, 2635 err); 2636 } 2637 } 2638 2639 static void 2640 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2641 UErrorCode* err){ 2642 char tempBuf[2]; 2643 const char *mySource = ( char *) args->source; 2644 UChar *myTarget = args->target; 2645 const char *mySourceLimit = args->sourceLimit; 2646 UChar32 targetUniChar = 0x0000; 2647 UChar mySourceChar = 0x0000; 2648 UConverterDataISO2022* myData; 2649 UConverterSharedData* sharedData ; 2650 UBool useFallback; 2651 2652 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2653 if(myData->version==1){ 2654 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2655 return; 2656 } 2657 2658 /* initialize state */ 2659 sharedData = myData->currentConverter->sharedData; 2660 useFallback = args->converter->useFallback; 2661 2662 if(myData->key != 0) { 2663 /* continue with a partial escape sequence */ 2664 goto escape; 2665 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2666 /* continue with a partial double-byte character */ 2667 mySourceChar = args->converter->toUBytes[0]; 2668 args->converter->toULength = 0; 2669 goto getTrailByte; 2670 } 2671 2672 while(mySource< mySourceLimit){ 2673 2674 if(myTarget < args->targetLimit){ 2675 2676 mySourceChar= (unsigned char) *mySource++; 2677 2678 if(mySourceChar==UCNV_SI){ 2679 myData->toU2022State.g = 0; 2680 if (myData->isEmptySegment) { 2681 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2682 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2683 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2684 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2685 args->converter->toULength = 1; 2686 args->target = myTarget; 2687 args->source = mySource; 2688 return; 2689 } 2690 /*consume the source */ 2691 continue; 2692 }else if(mySourceChar==UCNV_SO){ 2693 myData->toU2022State.g = 1; 2694 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2695 /*consume the source */ 2696 continue; 2697 }else if(mySourceChar==ESC_2022){ 2698 mySource--; 2699 escape: 2700 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2701 changeState_2022(args->converter,&(mySource), 2702 mySourceLimit, ISO_2022_KR, err); 2703 if(U_FAILURE(*err)){ 2704 args->target = myTarget; 2705 args->source = mySource; 2706 return; 2707 } 2708 continue; 2709 } 2710 2711 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2712 if(myData->toU2022State.g == 1) { 2713 if(mySource < mySourceLimit) { 2714 int leadIsOk, trailIsOk; 2715 uint8_t trailByte; 2716 getTrailByte: 2717 targetUniChar = missingCharMarker; 2718 trailByte = (uint8_t)*mySource; 2719 /* 2720 * Ticket 5691: consistent illegal sequences: 2721 * - We include at least the first byte in the illegal sequence. 2722 * - If any of the non-initial bytes could be the start of a character, 2723 * we stop the illegal sequence before the first one of those. 2724 * 2725 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2726 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2727 * Otherwise we convert or report the pair of bytes. 2728 */ 2729 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2730 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2731 if (leadIsOk && trailIsOk) { 2732 ++mySource; 2733 tempBuf[0] = (char)(mySourceChar + 0x80); 2734 tempBuf[1] = (char)(trailByte + 0x80); 2735 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2736 mySourceChar = (mySourceChar << 8) | trailByte; 2737 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2738 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2739 ++mySource; 2740 /* add another bit so that the code below writes 2 bytes in case of error */ 2741 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2742 } 2743 } else { 2744 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2745 args->converter->toULength = 1; 2746 break; 2747 } 2748 } 2749 else if(mySourceChar <= 0x7f) { 2750 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2751 } else { 2752 targetUniChar = 0xffff; 2753 } 2754 if(targetUniChar < 0xfffe){ 2755 if(args->offsets) { 2756 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2757 } 2758 *(myTarget++)=(UChar)targetUniChar; 2759 } 2760 else { 2761 /* Call the callback function*/ 2762 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2763 break; 2764 } 2765 } 2766 else{ 2767 *err =U_BUFFER_OVERFLOW_ERROR; 2768 break; 2769 } 2770 } 2771 args->target = myTarget; 2772 args->source = mySource; 2773 } 2774 2775 /*************************** END ISO2022-KR *********************************/ 2776 2777 /*************************** ISO-2022-CN ********************************* 2778 * 2779 * Rules for ISO-2022-CN Encoding: 2780 * i) The designator sequence must appear once on a line before any instance 2781 * of character set it designates. 2782 * ii) If two lines contain characters from the same character set, both lines 2783 * must include the designator sequence. 2784 * iii) Once the designator sequence is known, a shifting sequence has to be found 2785 * to invoke the shifting 2786 * iv) All lines start in ASCII and end in ASCII. 2787 * v) Four shifting sequences are employed for this purpose: 2788 * 2789 * Sequcence ASCII Eq Charsets 2790 * ---------- ------- --------- 2791 * SI <SI> US-ASCII 2792 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2793 * SS2 <ESC>N CNS-11643-1992 Plane 2 2794 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2795 * 2796 * vi) 2797 * SOdesignator : ESC "$" ")" finalchar_for_SO 2798 * SS2designator : ESC "$" "*" finalchar_for_SS2 2799 * SS3designator : ESC "$" "+" finalchar_for_SS3 2800 * 2801 * ESC $ ) A Indicates the bytes following SO are Chinese 2802 * characters as defined in GB 2312-80, until 2803 * another SOdesignation appears 2804 * 2805 * 2806 * ESC $ ) E Indicates the bytes following SO are as defined 2807 * in ISO-IR-165 (for details, see section 2.1), 2808 * until another SOdesignation appears 2809 * 2810 * ESC $ ) G Indicates the bytes following SO are as defined 2811 * in CNS 11643-plane-1, until another 2812 * SOdesignation appears 2813 * 2814 * ESC $ * H Indicates the two 
bytes immediately following 2815 * SS2 is a Chinese character as defined in CNS 2816 * 11643-plane-2, until another SS2designation 2817 * appears 2818 * (Meaning <ESC>N must precede every 2 byte 2819 * sequence.) 2820 * 2821 * ESC $ + I Indicates the immediate two bytes following SS3 2822 * is a Chinese character as defined in CNS 2823 * 11643-plane-3, until another SS3designation 2824 * appears 2825 * (Meaning <ESC>O must precede every 2 byte 2826 * sequence.) 2827 * 2828 * ESC $ + J Indicates the immediate two bytes following SS3 2829 * is a Chinese character as defined in CNS 2830 * 11643-plane-4, until another SS3designation 2831 * appears 2832 * (In English: <ESC>O must precede every 2 byte 2833 * sequence.) 2834 * 2835 * ESC $ + K Indicates the immediate two bytes following SS3 2836 * is a Chinese character as defined in CNS 2837 * 11643-plane-5, until another SS3designation 2838 * appears 2839 * 2840 * ESC $ + L Indicates the immediate two bytes following SS3 2841 * is a Chinese character as defined in CNS 2842 * 11643-plane-6, until another SS3designation 2843 * appears 2844 * 2845 * ESC $ + M Indicates the immediate two bytes following SS3 2846 * is a Chinese character as defined in CNS 2847 * 11643-plane-7, until another SS3designation 2848 * appears 2849 * 2850 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2851 * has its own designation information before any Chinese characters 2852 * appear 2853 * 2854 */ 2855 2856 /* The following are defined this way to make the strings truly readonly */ 2857 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2858 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2859 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2860 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2861 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2862 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2863 static const char
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2864 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2865 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2866 2867 /********************** ISO2022-CN Data **************************/ 2868 static const char* const escSeqCharsCN[10] ={ 2869 SHIFT_IN_STR, /* ASCII */ 2870 GB_2312_80_STR, 2871 ISO_IR_165_STR, 2872 CNS_11643_1992_Plane_1_STR, 2873 CNS_11643_1992_Plane_2_STR, 2874 CNS_11643_1992_Plane_3_STR, 2875 CNS_11643_1992_Plane_4_STR, 2876 CNS_11643_1992_Plane_5_STR, 2877 CNS_11643_1992_Plane_6_STR, 2878 CNS_11643_1992_Plane_7_STR 2879 }; 2880 2881 static void 2882 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2883 UConverter *cnv = args->converter; 2884 UConverterDataISO2022 *converterData; 2885 ISO2022State *pFromU2022State; 2886 uint8_t *target = (uint8_t *) args->target; 2887 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2888 const UChar* source = args->source; 2889 const UChar* sourceLimit = args->sourceLimit; 2890 int32_t* offsets = args->offsets; 2891 UChar32 sourceChar; 2892 char buffer[8]; 2893 int32_t len; 2894 int8_t choices[3]; 2895 int32_t choiceCount; 2896 uint32_t targetValue = 0; 2897 UBool useFallback; 2898 2899 /* set up the state */ 2900 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2901 pFromU2022State = &converterData->fromU2022State; 2902 2903 choiceCount = 0; 2904 2905 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2906 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2907 goto getTrail; 2908 } 2909 2910 while( source < sourceLimit){ 2911 if(target < targetLimit){ 2912 2913 sourceChar = *(source++); 2914 /*check if the char is a First surrogate*/ 2915 if(UTF_IS_SURROGATE(sourceChar)) { 2916 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 2917 getTrail: 2918 /*look ahead to find the trail surrogate*/ 2919 if(source < sourceLimit) { 
2920 /* test the following code unit */ 2921 UChar trail=(UChar) *source; 2922 if(UTF_IS_SECOND_SURROGATE(trail)) { 2923 source++; 2924 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2925 cnv->fromUChar32=0x00; 2926 /* convert this supplementary code point */ 2927 /* exit this condition tree */ 2928 } else { 2929 /* this is an unmatched lead code unit (1st surrogate) */ 2930 /* callback(illegal) */ 2931 *err=U_ILLEGAL_CHAR_FOUND; 2932 cnv->fromUChar32=sourceChar; 2933 break; 2934 } 2935 } else { 2936 /* no more input */ 2937 cnv->fromUChar32=sourceChar; 2938 break; 2939 } 2940 } else { 2941 /* this is an unmatched trail code unit (2nd surrogate) */ 2942 /* callback(illegal) */ 2943 *err=U_ILLEGAL_CHAR_FOUND; 2944 cnv->fromUChar32=sourceChar; 2945 break; 2946 } 2947 } 2948 2949 /* do the conversion */ 2950 if(sourceChar <= 0x007f ){ 2951 /* do not convert SO/SI/ESC */ 2952 if(IS_2022_CONTROL(sourceChar)) { 2953 /* callback(illegal) */ 2954 *err=U_ILLEGAL_CHAR_FOUND; 2955 cnv->fromUChar32=sourceChar; 2956 break; 2957 } 2958 2959 /* US-ASCII */ 2960 if(pFromU2022State->g == 0) { 2961 buffer[0] = (char)sourceChar; 2962 len = 1; 2963 } else { 2964 buffer[0] = UCNV_SI; 2965 buffer[1] = (char)sourceChar; 2966 len = 2; 2967 pFromU2022State->g = 0; 2968 choiceCount = 0; 2969 } 2970 if(sourceChar == CR || sourceChar == LF) { 2971 /* reset the state at the end of a line */ 2972 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2973 choiceCount = 0; 2974 } 2975 } 2976 else{ 2977 /* convert U+0080..U+10ffff */ 2978 int32_t i; 2979 int8_t cs, g; 2980 2981 if(choiceCount == 0) { 2982 /* try the current SO/G1 converter first */ 2983 choices[0] = pFromU2022State->cs[1]; 2984 2985 /* default to GB2312_1 if none is designated yet */ 2986 if(choices[0] == 0) { 2987 choices[0] = GB2312_1; 2988 } 2989 2990 if(converterData->version == 0) { 2991 /* ISO-2022-CN */ 2992 2993 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 2994 if(choices[0] == 
GB2312_1) { 2995 choices[1] = (int8_t)CNS_11643_1; 2996 } else { 2997 choices[1] = (int8_t)GB2312_1; 2998 } 2999 3000 choiceCount = 2; 3001 } else if (converterData->version == 1) { 3002 /* ISO-2022-CN-EXT */ 3003 3004 /* try one of the other converters */ 3005 switch(choices[0]) { 3006 case GB2312_1: 3007 choices[1] = (int8_t)CNS_11643_1; 3008 choices[2] = (int8_t)ISO_IR_165; 3009 break; 3010 case ISO_IR_165: 3011 choices[1] = (int8_t)GB2312_1; 3012 choices[2] = (int8_t)CNS_11643_1; 3013 break; 3014 default: /* CNS_11643_x */ 3015 choices[1] = (int8_t)GB2312_1; 3016 choices[2] = (int8_t)ISO_IR_165; 3017 break; 3018 } 3019 3020 choiceCount = 3; 3021 } else { 3022 choices[0] = (int8_t)CNS_11643_1; 3023 choices[1] = (int8_t)GB2312_1; 3024 } 3025 } 3026 3027 cs = g = 0; 3028 /* 3029 * len==0: no mapping found yet 3030 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3031 * len>0: found a roundtrip result, done 3032 */ 3033 len = 0; 3034 /* 3035 * We will turn off useFallback after finding a fallback, 3036 * but we still get fallbacks from PUA code points as usual. 3037 * Therefore, we will also need to check that we don't overwrite 3038 * an early fallback with a later one. 
3039 */ 3040 useFallback = cnv->useFallback; 3041 3042 for(i = 0; i < choiceCount && len <= 0; ++i) { 3043 int8_t cs0 = choices[i]; 3044 if(cs0 > 0) { 3045 uint32_t value; 3046 int32_t len2; 3047 if(cs0 >= CNS_11643_0) { 3048 len2 = MBCS_FROM_UCHAR32_ISO2022( 3049 converterData->myConverterArray[CNS_11643], 3050 sourceChar, 3051 &value, 3052 useFallback, 3053 MBCS_OUTPUT_3); 3054 if(len2 == 3 || (len2 == -3 && len == 0)) { 3055 targetValue = value; 3056 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3057 if(len2 >= 0) { 3058 len = 2; 3059 } else { 3060 len = -2; 3061 useFallback = FALSE; 3062 } 3063 if(cs == CNS_11643_1) { 3064 g = 1; 3065 } else if(cs == CNS_11643_2) { 3066 g = 2; 3067 } else /* plane 3..7 */ if(converterData->version == 1) { 3068 g = 3; 3069 } else { 3070 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3071 len = 0; 3072 } 3073 } 3074 } else { 3075 /* GB2312_1 or ISO-IR-165 */ 3076 len2 = MBCS_FROM_UCHAR32_ISO2022( 3077 converterData->myConverterArray[cs0], 3078 sourceChar, 3079 &value, 3080 useFallback, 3081 MBCS_OUTPUT_2); 3082 if(len2 == 2 || (len2 == -2 && len == 0)) { 3083 targetValue = value; 3084 len = len2; 3085 cs = cs0; 3086 g = 1; 3087 useFallback = FALSE; 3088 } 3089 } 3090 } 3091 } 3092 3093 if(len != 0) { 3094 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3095 3096 /* write the designation sequence if necessary */ 3097 if(cs != pFromU2022State->cs[g]) { 3098 if(cs < CNS_11643) { 3099 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3100 } else { 3101 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3102 } 3103 len = 4; 3104 pFromU2022State->cs[g] = cs; 3105 if(g == 1) { 3106 /* changing the SO/G1 charset invalidates the choices[] */ 3107 choiceCount = 0; 3108 } 3109 } 3110 3111 /* write the shift sequence if necessary */ 3112 if(g != pFromU2022State->g) { 3113 switch(g) { 3114 case 1: 3115 buffer[len++] = UCNV_SO; 3116 3117 /* set the new state only if it is the locking shift 
SO/G1, not for SS2 or SS3 */ 3118 pFromU2022State->g = 1; 3119 break; 3120 case 2: 3121 buffer[len++] = 0x1b; 3122 buffer[len++] = 0x4e; 3123 break; 3124 default: /* case 3 */ 3125 buffer[len++] = 0x1b; 3126 buffer[len++] = 0x4f; 3127 break; 3128 } 3129 } 3130 3131 /* write the two output bytes */ 3132 buffer[len++] = (char)(targetValue >> 8); 3133 buffer[len++] = (char)targetValue; 3134 } else { 3135 /* if we cannot find the character after checking all codepages 3136 * then this is an error 3137 */ 3138 *err = U_INVALID_CHAR_FOUND; 3139 cnv->fromUChar32=sourceChar; 3140 break; 3141 } 3142 } 3143 3144 /* output len>0 bytes in buffer[] */ 3145 if(len == 1) { 3146 *target++ = buffer[0]; 3147 if(offsets) { 3148 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3149 } 3150 } else if(len == 2 && (target + 2) <= targetLimit) { 3151 *target++ = buffer[0]; 3152 *target++ = buffer[1]; 3153 if(offsets) { 3154 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3155 *offsets++ = sourceIndex; 3156 *offsets++ = sourceIndex; 3157 } 3158 } else { 3159 fromUWriteUInt8( 3160 cnv, 3161 buffer, len, 3162 &target, (const char *)targetLimit, 3163 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3164 err); 3165 if(U_FAILURE(*err)) { 3166 break; 3167 } 3168 } 3169 } /* end if(myTargetIndex<myTargetLength) */ 3170 else{ 3171 *err =U_BUFFER_OVERFLOW_ERROR; 3172 break; 3173 } 3174 3175 }/* end while(mySourceIndex<mySourceLength) */ 3176 3177 /* 3178 * the end of the input stream and detection of truncated input 3179 * are handled by the framework, but for ISO-2022-CN conversion 3180 * we need to be in ASCII mode at the very end 3181 * 3182 * conditions: 3183 * successful 3184 * not in ASCII mode 3185 * end of input and no truncated input 3186 */ 3187 if( U_SUCCESS(*err) && 3188 pFromU2022State->g!=0 && 3189 args->flush && source>=sourceLimit && cnv->fromUChar32==0 3190 ) { 3191 int32_t sourceIndex; 3192 3193 /* 
we are switching to ASCII */ 3194 pFromU2022State->g=0; 3195 3196 /* get the source index of the last input character */ 3197 /* 3198 * TODO this would be simpler and more reliable if we used a pair 3199 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3200 * so that we could simply use the prevSourceIndex here; 3201 * this code gives an incorrect result for the rare case of an unmatched 3202 * trail surrogate that is alone in the last buffer of the text stream 3203 */ 3204 sourceIndex=(int32_t)(source-args->source); 3205 if(sourceIndex>0) { 3206 --sourceIndex; 3207 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3208 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3209 ) { 3210 --sourceIndex; 3211 } 3212 } else { 3213 sourceIndex=-1; 3214 } 3215 3216 fromUWriteUInt8( 3217 cnv, 3218 SHIFT_IN_STR, 1, 3219 &target, (const char *)targetLimit, 3220 &offsets, sourceIndex, 3221 err); 3222 } 3223 3224 /*save the state and return */ 3225 args->source = source; 3226 args->target = (char*)target; 3227 } 3228 3229 3230 static void 3231 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3232 UErrorCode* err){ 3233 char tempBuf[3]; 3234 const char *mySource = (char *) args->source; 3235 UChar *myTarget = args->target; 3236 const char *mySourceLimit = args->sourceLimit; 3237 uint32_t targetUniChar = 0x0000; 3238 uint32_t mySourceChar = 0x0000; 3239 UConverterDataISO2022* myData; 3240 ISO2022State *pToU2022State; 3241 3242 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3243 pToU2022State = &myData->toU2022State; 3244 3245 if(myData->key != 0) { 3246 /* continue with a partial escape sequence */ 3247 goto escape; 3248 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3249 /* continue with a partial double-byte character */ 3250 mySourceChar = args->converter->toUBytes[0]; 3251 args->converter->toULength = 0; 3252 targetUniChar = missingCharMarker; 3253 goto getTrailByte; 
3254 } 3255 3256 while(mySource < mySourceLimit){ 3257 3258 targetUniChar =missingCharMarker; 3259 3260 if(myTarget < args->targetLimit){ 3261 3262 mySourceChar= (unsigned char) *mySource++; 3263 3264 switch(mySourceChar){ 3265 case UCNV_SI: 3266 pToU2022State->g=0; 3267 if (myData->isEmptySegment) { 3268 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3269 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3270 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3271 args->converter->toUBytes[0] = mySourceChar; 3272 args->converter->toULength = 1; 3273 args->target = myTarget; 3274 args->source = mySource; 3275 return; 3276 } 3277 continue; 3278 3279 case UCNV_SO: 3280 if(pToU2022State->cs[1] != 0) { 3281 pToU2022State->g=1; 3282 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3283 continue; 3284 } else { 3285 /* illegal to have SO before a matching designator */ 3286 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3287 break; 3288 } 3289 3290 case ESC_2022: 3291 mySource--; 3292 escape: 3293 { 3294 const char * mySourceBefore = mySource; 3295 int8_t toULengthBefore = args->converter->toULength; 3296 3297 changeState_2022(args->converter,&(mySource), 3298 mySourceLimit, ISO_2022_CN,err); 3299 3300 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3301 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3302 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3303 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3304 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3305 } 3306 } 3307 3308 /* invalid or illegal escape sequence */ 3309 if(U_FAILURE(*err)){ 3310 args->target = myTarget; 3311 args->source = mySource; 3312 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 3313 return; 3314 } 3315 continue; 3316 3317 /* ISO-2022-CN does 
not use single-byte (C1) SS2 and SS3 */ 3318 3319 case CR: 3320 /*falls through*/ 3321 case LF: 3322 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3323 /* falls through */ 3324 default: 3325 /* convert one or two bytes */ 3326 myData->isEmptySegment = FALSE; 3327 if(pToU2022State->g != 0) { 3328 if(mySource < mySourceLimit) { 3329 UConverterSharedData *cnv; 3330 StateEnum tempState; 3331 int32_t tempBufLen; 3332 int leadIsOk, trailIsOk; 3333 uint8_t trailByte; 3334 getTrailByte: 3335 trailByte = (uint8_t)*mySource; 3336 /* 3337 * Ticket 5691: consistent illegal sequences: 3338 * - We include at least the first byte in the illegal sequence. 3339 * - If any of the non-initial bytes could be the start of a character, 3340 * we stop the illegal sequence before the first one of those. 3341 * 3342 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3343 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3344 * Otherwise we convert or report the pair of bytes. 
3345 */ 3346 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3347 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3348 if (leadIsOk && trailIsOk) { 3349 ++mySource; 3350 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3351 if(tempState >= CNS_11643_0) { 3352 cnv = myData->myConverterArray[CNS_11643]; 3353 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3354 tempBuf[1] = (char) (mySourceChar); 3355 tempBuf[2] = (char) trailByte; 3356 tempBufLen = 3; 3357 3358 }else{ 3359 cnv = myData->myConverterArray[tempState]; 3360 tempBuf[0] = (char) (mySourceChar); 3361 tempBuf[1] = (char) trailByte; 3362 tempBufLen = 2; 3363 } 3364 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3365 mySourceChar = (mySourceChar << 8) | trailByte; 3366 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3367 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3368 ++mySource; 3369 /* add another bit so that the code below writes 2 bytes in case of error */ 3370 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3371 } 3372 if(pToU2022State->g>=2) { 3373 /* return from a single-shift state to the previous one */ 3374 pToU2022State->g=pToU2022State->prevG; 3375 } 3376 } else { 3377 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3378 args->converter->toULength = 1; 3379 goto endloop; 3380 } 3381 } 3382 else{ 3383 if(mySourceChar <= 0x7f) { 3384 targetUniChar = (UChar) mySourceChar; 3385 } 3386 } 3387 break; 3388 } 3389 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3390 if(args->offsets){ 3391 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3392 } 3393 *(myTarget++)=(UChar)targetUniChar; 3394 } 3395 else if(targetUniChar > missingCharMarker){ 3396 /* disassemble the surrogate pair and write to output*/ 3397 targetUniChar-=0x0010000; 3398 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3399 if(args->offsets){ 3400 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3401 } 3402 ++myTarget; 3403 if(myTarget< args->targetLimit){ 3404 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3405 if(args->offsets){ 3406 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3407 } 3408 ++myTarget; 3409 }else{ 3410 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3411 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3412 } 3413 3414 } 3415 else{ 3416 /* Call the callback function*/ 3417 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3418 break; 3419 } 3420 } 3421 else{ 3422 *err =U_BUFFER_OVERFLOW_ERROR; 3423 break; 3424 } 3425 } 3426 endloop: 3427 args->target = myTarget; 3428 args->source = mySource; 3429 } 3430 3431 static void 3432 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3433 UConverter *cnv = args->converter; 3434 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3435 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3436 char *p, *subchar; 3437 char buffer[8]; 3438 int32_t length; 3439 3440 subchar=(char *)cnv->subChars; 3441 length=cnv->subCharLen; /* assume length==1 for most variants */ 3442 3443 p = buffer; 3444 switch(myConverterData->locale[0]){ 3445 case 'j': 3446 { 3447 int8_t cs; 3448 3449 if(pFromU2022State->g == 1) { 3450 /* JIS7: switch from G1 to G0 */ 3451 pFromU2022State->g = 0; 3452 *p++ = UCNV_SI; 3453 } 3454 3455 cs = pFromU2022State->cs[0]; 3456 if(cs != ASCII && cs != JISX201) { 3457 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3458 pFromU2022State->cs[0] = (int8_t)ASCII; 3459 *p++ = '\x1b'; 3460 *p++ = '\x28'; 3461 *p++ = '\x42'; 3462 } 3463 3464 *p++ = subchar[0]; 3465 break; 3466 } 3467 case 'c': 3468 if(pFromU2022State->g != 0) { 3469 /* not in ASCII mode: switch to ASCII */ 3470 pFromU2022State->g = 0; 3471 *p++ = UCNV_SI; 3472 } 3473 *p++ = subchar[0]; 3474 break; 3475 case 'k': 3476 if(myConverterData->version == 0) { 3477 if(length == 1) { 3478 if((UBool)args->converter->fromUnicodeStatus) { 3479 /* in DBCS mode: switch to SBCS */ 3480 args->converter->fromUnicodeStatus = 0; 3481 *p++ = UCNV_SI; 3482 } 3483 *p++ = subchar[0]; 3484 } else /* length == 2*/ { 3485 if(!(UBool)args->converter->fromUnicodeStatus) { 3486 /* in SBCS mode: switch to DBCS */ 3487 args->converter->fromUnicodeStatus = 1; 3488 *p++ = UCNV_SO; 3489 } 3490 *p++ = subchar[0]; 3491 *p++ = subchar[1]; 3492 } 3493 break; 3494 } else { 3495 /* save the subconverter's substitution string */ 3496 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3497 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3498 3499 /* set our substitution string into the subconverter */ 3500 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3501 myConverterData->currentConverter->subCharLen = (int8_t)length; 3502 3503 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3504 args->converter = myConverterData->currentConverter; 3505 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3506 ucnv_cbFromUWriteSub(args, 0, err); 3507 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3508 args->converter = cnv; 3509 3510 /* restore the subconverter's substitution string */ 3511 myConverterData->currentConverter->subChars = currentSubChars; 3512 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3513 3514 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3515 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3516 uprv_memcpy( 3517 cnv->charErrorBuffer, 3518 myConverterData->currentConverter->charErrorBuffer, 3519 myConverterData->currentConverter->charErrorBufferLength); 3520 } 3521 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3522 myConverterData->currentConverter->charErrorBufferLength = 0; 3523 } 3524 return; 3525 } 3526 default: 3527 /* not expected */ 3528 break; 3529 } 3530 ucnv_cbFromUWriteBytes(args, 3531 buffer, (int32_t)(p - buffer), 3532 offsetIndex, err); 3533 } 3534 3535 /* 3536 * Structure for cloning an ISO 2022 converter into a single memory block. 3537 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3538 * and then ucnv_safeClone() of the sub-converter may additionally align 3539 * currentConverter inside the cloneStruct, for which we need the deadSpace 3540 * after currentConverter. 3541 * This is because UAlignedMemory may be larger than the actually 3542 * necessary alignment size for the platform. 3543 * The other cloneStruct fields will not be moved around, 3544 * and are aligned properly with cloneStruct's alignment. 
3545 */ 3546 struct cloneStruct 3547 { 3548 UConverter cnv; 3549 UConverter currentConverter; 3550 UAlignedMemory deadSpace; 3551 UConverterDataISO2022 mydata; 3552 }; 3553 3554 3555 static UConverter * 3556 _ISO_2022_SafeClone( 3557 const UConverter *cnv, 3558 void *stackBuffer, 3559 int32_t *pBufferSize, 3560 UErrorCode *status) 3561 { 3562 struct cloneStruct * localClone; 3563 UConverterDataISO2022 *cnvData; 3564 int32_t i, size; 3565 3566 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3567 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3568 return NULL; 3569 } 3570 3571 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3572 localClone = (struct cloneStruct *)stackBuffer; 3573 3574 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3575 3576 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3577 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3578 localClone->cnv.isExtraLocal = TRUE; 3579 3580 /* share the subconverters */ 3581 3582 if(cnvData->currentConverter != NULL) { 3583 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3584 localClone->mydata.currentConverter = 3585 ucnv_safeClone(cnvData->currentConverter, 3586 &localClone->currentConverter, 3587 &size, status); 3588 if(U_FAILURE(*status)) { 3589 return NULL; 3590 } 3591 } 3592 3593 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3594 if(cnvData->myConverterArray[i] != NULL) { 3595 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3596 } 3597 } 3598 3599 return &localClone->cnv; 3600 } 3601 3602 static void 3603 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3604 const USetAdder *sa, 3605 UConverterUnicodeSet which, 3606 UErrorCode *pErrorCode) 3607 { 3608 int32_t i; 3609 UConverterDataISO2022* cnvData; 3610 3611 if (U_FAILURE(*pErrorCode)) { 3612 return; 3613 } 3614 #ifdef U_ENABLE_GENERIC_ISO_2022 3615 if (cnv->sharedData == &_ISO2022Data) { 
3616 /* We use UTF-8 in this case */ 3617 sa->addRange(sa->set, 0, 0xd7FF); 3618 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3619 return; 3620 } 3621 #endif 3622 3623 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3624 3625 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3626 switch(cnvData->locale[0]){ 3627 case 'j': 3628 /* include JIS X 0201 which is hardcoded */ 3629 sa->add(sa->set, 0xa5); 3630 sa->add(sa->set, 0x203e); 3631 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3632 /* include Latin-1 for some variants of JP */ 3633 sa->addRange(sa->set, 0, 0xff); 3634 } else { 3635 /* include ASCII for JP */ 3636 sa->addRange(sa->set, 0, 0x7f); 3637 } 3638 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3639 /* 3640 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3641 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3642 * use half-width Katakana. 3643 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3644 * half-width Katakana via the ESC ( I sequence. 3645 * However, we only emit (fromUnicode) half-width Katakana according to the 3646 * definition of each variant. 3647 * 3648 * When including fallbacks, 3649 * we need to include half-width Katakana Unicode code points for all JP variants because 3650 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3651 */ 3652 /* include half-width Katakana for JP */ 3653 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3654 } 3655 break; 3656 case 'c': 3657 case 'z': 3658 /* include ASCII for CN */ 3659 sa->addRange(sa->set, 0, 0x7f); 3660 break; 3661 case 'k': 3662 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3663 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3664 cnvData->currentConverter, sa, which, pErrorCode); 3665 /* the loop over myConverterArray[] will simply not find another converter */ 3666 break; 3667 default: 3668 break; 3669 } 3670 3671 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3672 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3673 cnvData->version==0 && i==CNS_11643 3674 ) { 3675 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3676 ucnv_MBCSGetUnicodeSetForBytes( 3677 cnvData->myConverterArray[i], 3678 sa, UCNV_ROUNDTRIP_SET, 3679 0, 0x81, 0x82, 3680 pErrorCode); 3681 } 3682 #endif 3683 3684 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3685 UConverterSetFilter filter; 3686 if(cnvData->myConverterArray[i]!=NULL) { 3687 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3688 cnvData->version==0 && i==CNS_11643 3689 ) { 3690 /* 3691 * Version-specific for CN: 3692 * CN version 0 does not map CNS planes 3..7 although 3693 * they are all available in the CNS conversion table; 3694 * CN version 1 (-EXT) does map them all. 3695 * The two versions create different Unicode sets. 3696 */ 3697 filter=UCNV_SET_FILTER_2022_CN; 3698 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3699 /* 3700 * Only add code points that map to Shift-JIS codes 3701 * corresponding to JIS X 0208. 
3702 */ 3703 filter=UCNV_SET_FILTER_SJIS; 3704 } else if(i==KSC5601) { 3705 /* 3706 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3707 * are broader than GR94. 3708 */ 3709 filter=UCNV_SET_FILTER_GR94DBCS; 3710 } else { 3711 filter=UCNV_SET_FILTER_NONE; 3712 } 3713 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3714 } 3715 } 3716 3717 /* 3718 * ISO 2022 converters must not convert SO/SI/ESC despite what 3719 * sub-converters do by themselves. 3720 * Remove these characters from the set. 3721 */ 3722 sa->remove(sa->set, 0x0e); 3723 sa->remove(sa->set, 0x0f); 3724 sa->remove(sa->set, 0x1b); 3725 3726 /* ISO 2022 converters do not convert C1 controls either */ 3727 sa->removeRange(sa->set, 0x80, 0x9f); 3728 } 3729 3730 static const UConverterImpl _ISO2022Impl={ 3731 UCNV_ISO_2022, 3732 3733 NULL, 3734 NULL, 3735 3736 _ISO2022Open, 3737 _ISO2022Close, 3738 _ISO2022Reset, 3739 3740 #ifdef U_ENABLE_GENERIC_ISO_2022 3741 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3742 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3743 ucnv_fromUnicode_UTF8, 3744 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3745 #else 3746 NULL, 3747 NULL, 3748 NULL, 3749 NULL, 3750 #endif 3751 NULL, 3752 3753 NULL, 3754 _ISO2022getName, 3755 _ISO_2022_WriteSub, 3756 _ISO_2022_SafeClone, 3757 _ISO_2022_GetUnicodeSet 3758 }; 3759 static const UConverterStaticData _ISO2022StaticData={ 3760 sizeof(UConverterStaticData), 3761 "ISO_2022", 3762 2022, 3763 UCNV_IBM, 3764 UCNV_ISO_2022, 3765 1, 3766 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3767 { 0x1a, 0, 0, 0 }, 3768 1, 3769 FALSE, 3770 FALSE, 3771 0, 3772 0, 3773 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3774 }; 3775 const UConverterSharedData _ISO2022Data={ 3776 sizeof(UConverterSharedData), 3777 ~((uint32_t) 0), 3778 NULL, 3779 NULL, 3780 &_ISO2022StaticData, 3781 FALSE, 3782 &_ISO2022Impl, 3783 0 3784 }; 3785 3786 
/*************JP****************/ 3787 static const UConverterImpl _ISO2022JPImpl={ 3788 UCNV_ISO_2022, 3789 3790 NULL, 3791 NULL, 3792 3793 _ISO2022Open, 3794 _ISO2022Close, 3795 _ISO2022Reset, 3796 3797 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3798 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3799 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3800 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3801 NULL, 3802 3803 NULL, 3804 _ISO2022getName, 3805 _ISO_2022_WriteSub, 3806 _ISO_2022_SafeClone, 3807 _ISO_2022_GetUnicodeSet 3808 }; 3809 static const UConverterStaticData _ISO2022JPStaticData={ 3810 sizeof(UConverterStaticData), 3811 "ISO_2022_JP", 3812 0, 3813 UCNV_IBM, 3814 UCNV_ISO_2022, 3815 1, 3816 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3817 { 0x1a, 0, 0, 0 }, 3818 1, 3819 FALSE, 3820 FALSE, 3821 0, 3822 0, 3823 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3824 }; 3825 static const UConverterSharedData _ISO2022JPData={ 3826 sizeof(UConverterSharedData), 3827 ~((uint32_t) 0), 3828 NULL, 3829 NULL, 3830 &_ISO2022JPStaticData, 3831 FALSE, 3832 &_ISO2022JPImpl, 3833 0 3834 }; 3835 3836 /************* KR ***************/ 3837 static const UConverterImpl _ISO2022KRImpl={ 3838 UCNV_ISO_2022, 3839 3840 NULL, 3841 NULL, 3842 3843 _ISO2022Open, 3844 _ISO2022Close, 3845 _ISO2022Reset, 3846 3847 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3848 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3849 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3850 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3851 NULL, 3852 3853 NULL, 3854 _ISO2022getName, 3855 _ISO_2022_WriteSub, 3856 _ISO_2022_SafeClone, 3857 _ISO_2022_GetUnicodeSet 3858 }; 3859 static const UConverterStaticData _ISO2022KRStaticData={ 3860 sizeof(UConverterStaticData), 3861 "ISO_2022_KR", 3862 0, 3863 UCNV_IBM, 3864 UCNV_ISO_2022, 3865 1, 3866 3, /* max 3 bytes per UChar: SO+DBCS */ 3867 { 0x1a, 0, 0, 0 }, 3868 1, 3869 FALSE, 3870 FALSE, 3871 0, 3872 0, 3873 { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3874 }; 3875 static const UConverterSharedData _ISO2022KRData={ 3876 sizeof(UConverterSharedData), 3877 ~((uint32_t) 0), 3878 NULL, 3879 NULL, 3880 &_ISO2022KRStaticData, 3881 FALSE, 3882 &_ISO2022KRImpl, 3883 0 3884 }; 3885 3886 /*************** CN ***************/ 3887 static const UConverterImpl _ISO2022CNImpl={ 3888 3889 UCNV_ISO_2022, 3890 3891 NULL, 3892 NULL, 3893 3894 _ISO2022Open, 3895 _ISO2022Close, 3896 _ISO2022Reset, 3897 3898 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3899 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3900 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3901 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3902 NULL, 3903 3904 NULL, 3905 _ISO2022getName, 3906 _ISO_2022_WriteSub, 3907 _ISO_2022_SafeClone, 3908 _ISO_2022_GetUnicodeSet 3909 }; 3910 static const UConverterStaticData _ISO2022CNStaticData={ 3911 sizeof(UConverterStaticData), 3912 "ISO_2022_CN", 3913 0, 3914 UCNV_IBM, 3915 UCNV_ISO_2022, 3916 1, 3917 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3918 { 0x1a, 0, 0, 0 }, 3919 1, 3920 FALSE, 3921 FALSE, 3922 0, 3923 0, 3924 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3925 }; 3926 static const UConverterSharedData _ISO2022CNData={ 3927 sizeof(UConverterSharedData), 3928 ~((uint32_t) 0), 3929 NULL, 3930 NULL, 3931 &_ISO2022CNStaticData, 3932 FALSE, 3933 &_ISO2022CNImpl, 3934 0 3935 }; 3936 3937 3938 3939 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3940