1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "ucnv_imp.h" 38 #include "ucnv_bld.h" 39 #include "ucnv_cnv.h" 40 #include "ucnvmbcs.h" 41 #include "cstring.h" 42 #include "cmemory.h" 43 44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 45 46 #ifdef U_ENABLE_GENERIC_ISO_2022 47 /* 48 * I am disabling the generic ISO-2022 converter after proposing to do so on 49 * the icu mailing list two days ago. 50 * 51 * Reasons: 52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 53 * its designation sequences, single shifts with return to the previous state, 54 * switch-with-no-return to UTF-16BE or similar, etc. 55 * This is unlike the language-specific variants like ISO-2022-JP which 56 * require a much smaller repertoire of ISO-2022 features. 57 * These variants continue to be supported. 58 * 2. 
I believe that no one is really using the generic ISO-2022 converter 59 * but rather always one of the language-specific variants. 60 * Note that ICU's generic ISO-2022 converter has always output one escape 61 * sequence followed by UTF-8 for the whole stream. 62 * 3. Switching between subcharsets is extremely slow, because each time 63 * the previous converter is closed and a new one opened, 64 * without any kind of caching, least-recently-used list, etc. 65 * 4. The code is currently buggy, and given the above it does not seem 66 * reasonable to spend the time on maintenance. 67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 68 * This means, for example, that when ISO-8859-7 is designated, the following 69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 70 * The ICU ISO-2022 converter does not handle this - and has no information 71 * about which subconverter would have to be shifted vs. which is designed 72 * for 7-bit ISO-2022. 73 * 74 * Markus Scherer 2003-dec-03 75 */ 76 #endif 77 78 static const char SHIFT_IN_STR[] = "\x0F"; 79 static const char SHIFT_OUT_STR[] = "\x0E"; 80 81 #define CR 0x0D 82 #define LF 0x0A 83 #define H_TAB 0x09 84 #define V_TAB 0x0B 85 #define SPACE 0x20 86 87 enum { 88 HWKANA_START=0xff61, 89 HWKANA_END=0xff9f 90 }; 91 92 /* 93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 94 * as bytes 21..7E. (Subtract 0x80.) 95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 96 * as bytes 20..7F. (Subtract 0x80.) 97 * Do not encode C1 control codes with native bytes 80..9F 98 * as bytes 00..1F (C0 control codes). 99 */ 100 enum { 101 GR94_START=0xa1, 102 GR94_END=0xfe, 103 GR96_START=0xa0, 104 GR96_END=0xff 105 }; 106 107 /* 108 * ISO 2022 control codes must not be converted from Unicode 109 * because they would mess up the byte stream. 
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 111 * corresponding to SO, SI, and ESC. 112 */ 113 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 114 115 /* for ISO-2022-JP and -CN implementations */ 116 typedef enum { 117 /* shared values */ 118 INVALID_STATE=-1, 119 ASCII = 0, 120 121 SS2_STATE=0x10, 122 SS3_STATE, 123 124 /* JP */ 125 ISO8859_1 = 1 , 126 ISO8859_7 = 2 , 127 JISX201 = 3, 128 JISX208 = 4, 129 JISX212 = 5, 130 GB2312 =6, 131 KSC5601 =7, 132 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 133 134 /* CN */ 135 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 136 GB2312_1=1, 137 ISO_IR_165=2, 138 CNS_11643=3, 139 140 /* 141 * these are used in StateEnum and ISO2022State variables, 142 * but CNS_11643 must be used to index into myConverterArray[] 143 */ 144 CNS_11643_0=0x20, 145 CNS_11643_1, 146 CNS_11643_2, 147 CNS_11643_3, 148 CNS_11643_4, 149 CNS_11643_5, 150 CNS_11643_6, 151 CNS_11643_7 152 } StateEnum; 153 154 /* is the StateEnum charset value for a DBCS charset? */ 155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 156 157 #define CSM(cs) ((uint16_t)1<<(cs)) 158 159 /* 160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 162 * 163 * Note: The converter uses some leniency: 164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 165 * all versions, not just JIS7 and JIS8. 166 * - ICU does not distinguish between different versions of JIS X 0208. 
167 */ 168 enum { MAX_JA_VERSION=4 }; 169 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 175 }; 176 177 typedef enum { 178 ASCII1=0, 179 LATIN1, 180 SBCS, 181 DBCS, 182 MBCS, 183 HWKANA 184 }Cnv2022Type; 185 186 typedef struct ISO2022State { 187 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 188 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 189 int8_t prevG; /* g before single shift (SS2 or SS3) */ 190 } ISO2022State; 191 192 #define UCNV_OPTIONS_VERSION_MASK 0xf 193 #define UCNV_2022_MAX_CONVERTERS 10 194 195 typedef struct{ 196 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 197 UConverter *currentConverter; 198 Cnv2022Type currentType; 199 ISO2022State toU2022State, fromU2022State; 200 uint32_t key; 201 uint32_t version; 202 #ifdef U_ENABLE_GENERIC_ISO_2022 203 UBool isFirstBuffer; 204 #endif 205 UBool isEmptySegment; 206 char name[30]; 207 char locale[3]; 208 }UConverterDataISO2022; 209 210 /* Protos */ 211 /* ISO-2022 ----------------------------------------------------------------- */ 212 213 /*Forward declaration */ 214 U_CFUNC void 215 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 216 UErrorCode * err); 217 U_CFUNC void 218 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 219 UErrorCode * err); 220 221 #define ESC_2022 0x1B /*ESC*/ 222 223 typedef enum 224 { 225 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 226 VALID_NON_TERMINAL_2022 = 0, 
/*so far corresponds to a valid iso 2022 escape sequence*/ 227 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 228 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 229 } UCNV_TableStates_2022; 230 231 /* 232 * The way these state transition arrays work is: 233 * ex : ESC$B is the sequence for JISX208 234 * a) First Iteration: char is ESC 235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 236 * int x = normalize_esq_chars_2022[27] which is equal to 1 237 * ii) Search for this value in escSeqStateTable_Key_2022[] 238 * value of x is stored at escSeqStateTable_Key_2022[0] 239 * iii) Save this index as offset 240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 242 * b) Switch on this state and continue to next char 243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 244 * which is normalize_esq_chars_2022[36] == 4 245 * ii) x is currently 1(from above) 246 * x<<=5 -- x is now 32 247 * x+=normalize_esq_chars_2022[36] 248 * now x is 36 249 * iii) Search for this value in escSeqStateTable_Key_2022[] 250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 253 * c) Switch on this state and continue to next char 254 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 255 * ii) x is currently 36 (from above) 256 * x<<=5 -- x is now 1152 257 * x+=normalize_esq_chars_2022[66] 258 * now x is 1161 259 * iii) Search for this value in escSeqStateTable_Key_2022[] 260 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 261 * iv) Get state of this sequence from
escSeqStateTable_Value_2022[24] 262 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 263 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 264 */ 265 266 267 /*Below are the 3 arrays depicting a state transition table*/ 268 static const int8_t normalize_esq_chars_2022[256] = { 269 /* 0 1 2 3 4 5 6 7 8 9 */ 270 271 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 273 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 274 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 275 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 276 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 277 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 278 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 279 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 280 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 297 }; 298 299 #ifdef U_ENABLE_GENERIC_ISO_2022 300 /* 301 * When the generic ISO-2022 converter is completely removed, not just disabled 302 * per #ifdef, then the following state table and the associated tables that are 303 * dimensioned with MAX_STATES_2022 should be trimmed. 304 * 305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 306 * the associated escape sequences starting with ESC ( B should be removed. 307 * This includes the ones with key values 1097 and all of the ones above 1000000. 308 * 309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best 311 * to simply duplicate an adjacent table cell, parallel in all tables. 312 * 313 * It may make sense to restructure the tables, especially by using small search 314 * tables for the variants instead of indexing them parallel to the table here. 315 */ 316 #endif 317 318 #define MAX_STATES_2022 74 319 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 320 /* 0 1 2 3 4 5 6 7 8 9 */ 321 322 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 323 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 324 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 325 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 326 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 327 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 328 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 329 ,35947631 ,35947635 ,35947636 ,35947638 330 }; 331 332 #ifdef U_ENABLE_GENERIC_ISO_2022 333 334 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 335 /* 0 1 2 3 4 5 6 7 8 9 */ 336 337 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 338 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 339 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 340 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 341 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 342 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 343 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL 
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 344 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 345 }; 346 347 #endif 348 349 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 350 /* 0 1 2 3 4 5 6 7 8 9 */ 351 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 352 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 353 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 359 }; 360 361 362 /* Type def for refactoring changeState_2022 code*/ 363 typedef enum{ 364 #ifdef 
U_ENABLE_GENERIC_ISO_2022 365 ISO_2022=0, 366 #endif 367 ISO_2022_JP=1, 368 ISO_2022_KR=2, 369 ISO_2022_CN=3 370 } Variant2022; 371 372 /*********** ISO 2022 Converter Protos ***********/ 373 static void 374 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 375 376 static void 377 _ISO2022Close(UConverter *converter); 378 379 static void 380 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 381 382 static const char* 383 _ISO2022getName(const UConverter* cnv); 384 385 static void 386 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 387 388 static UConverter * 389 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 390 391 #ifdef U_ENABLE_GENERIC_ISO_2022 392 static void 393 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 394 #endif 395 396 /*const UConverterSharedData _ISO2022Data;*/ 397 static const UConverterSharedData _ISO2022JPData; 398 static const UConverterSharedData _ISO2022KRData; 399 static const UConverterSharedData _ISO2022CNData; 400 401 /*************** Converter implementations ******************/ 402 403 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 404 static U_INLINE void 405 fromUWriteUInt8(UConverter *cnv, 406 const char *bytes, int32_t length, 407 uint8_t **target, const char *targetLimit, 408 int32_t **offsets, 409 int32_t sourceIndex, 410 UErrorCode *pErrorCode) 411 { 412 char *targetChars = (char *)*target; 413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 414 offsets, sourceIndex, pErrorCode); 415 *target = (uint8_t*)targetChars; 416 417 } 418 419 static U_INLINE void 420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){ 421 if(myConverterData->version == 1) { 422 UConverter *cnv = myConverterData->currentConverter; 423 424 cnv->toUnicodeStatus=0; /* offset */ 425 cnv->mode=0; /* state */ 426 cnv->toULength=0; /* byteIndex */ 427 } 428 } 429 430 static U_INLINE void 431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 432 /* in ISO-2022-KR the designator sequence appears only once 433 * in a file so we append it only once 434 */ 435 if( converter->charErrorBufferLength==0){ 436 437 converter->charErrorBufferLength = 4; 438 converter->charErrorBuffer[0] = 0x1b; 439 converter->charErrorBuffer[1] = 0x24; 440 converter->charErrorBuffer[2] = 0x29; 441 converter->charErrorBuffer[3] = 0x43; 442 } 443 if(myConverterData->version == 1) { 444 UConverter *cnv = myConverterData->currentConverter; 445 446 cnv->fromUChar32=0; 447 cnv->fromUnicodeStatus=1; /* prevLength */ 448 } 449 } 450 451 static void 452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 453 454 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 455 456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 457 if(cnv->extraInfo != NULL) { 458 UConverterNamePieces stackPieces; 459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; 460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 461 uint32_t version; 462 463 stackArgs.onlyTestIsLoadable = 
pArgs->onlyTestIsLoadable; 464 465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 466 myConverterData->currentType = ASCII1; 467 cnv->fromUnicodeStatus =FALSE; 468 if(pArgs->locale){ 469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 470 } 471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 472 myConverterData->version = version; 473 474 /* BEGIN android-changed */ 475 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 476 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 477 if((myLocale[0]=='j' && 478 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 479 myLocale[1]=='s') && 480 (myLocale[2]=='_' || myLocale[2]=='\0'))) 481 { 482 size_t len=0; 483 /* open the required converters and cache them */ 484 if(version>MAX_JA_VERSION) { 485 /* prevent indexing beyond jpCharsetMasks[] */ 486 myConverterData->version = version = 0; 487 } 488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 489 myConverterData->myConverterArray[ISO8859_7] = 490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 491 } 492 if (myLocale[1]=='k') { /* Use KDDI's version. */ 493 myConverterData->myConverterArray[JISX208] = 494 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 495 } else if (myLocale[1]=='s') { /* Use SoftBank's version. 
*/ 496 myConverterData->myConverterArray[JISX208] = 497 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 498 } else { 499 myConverterData->myConverterArray[JISX208] = 500 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 501 } 502 /* END android-changed */ 503 504 if(jpCharsetMasks[version]&CSM(JISX212)) { 505 myConverterData->myConverterArray[JISX212] = 506 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 507 } 508 if(jpCharsetMasks[version]&CSM(GB2312)) { 509 myConverterData->myConverterArray[GB2312] = 510 /* BEGIN android-changed */ 511 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 512 /* END android-changed */ 513 } 514 if(jpCharsetMasks[version]&CSM(KSC5601)) { 515 myConverterData->myConverterArray[KSC5601] = 516 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 517 } 518 519 /* set the function pointers to appropriate funtions */ 520 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 521 uprv_strcpy(myConverterData->locale,"ja"); 522 523 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 524 len = uprv_strlen(myConverterData->name); 525 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 526 myConverterData->name[len+1]='\0'; 527 } 528 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 529 (myLocale[2]=='_' || myLocale[2]=='\0')) 530 { 531 const char *cnvName; 532 if(version==1) { 533 cnvName="icu-internal-25546"; 534 } else { 535 /* BEGIN android-changed */ 536 cnvName="ksc_5601"; 537 /* END android-changed */ 538 myConverterData->version=version=0; 539 } 540 if(pArgs->onlyTestIsLoadable) { 541 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 542 uprv_free(cnv->extraInfo); 543 cnv->extraInfo=NULL; 544 return; 545 } else { 546 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 547 if (U_FAILURE(*errorCode)) { 548 
_ISO2022Close(cnv); 549 return; 550 } 551 552 if(version==1) { 553 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 554 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 555 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 556 }else{ 557 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 558 } 559 560 /* initialize the state variables */ 561 setInitialStateToUnicodeKR(cnv, myConverterData); 562 setInitialStateFromUnicodeKR(cnv, myConverterData); 563 564 /* set the function pointers to appropriate funtions */ 565 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 566 uprv_strcpy(myConverterData->locale,"ko"); 567 } 568 } 569 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 570 (myLocale[2]=='_' || myLocale[2]=='\0')) 571 { 572 573 /* open the required converters and cache them */ 574 /* BEGIN android-changed */ 575 myConverterData->myConverterArray[GB2312_1] = 576 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 577 if(version==1) { 578 myConverterData->myConverterArray[ISO_IR_165] = 579 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 580 } 581 myConverterData->myConverterArray[CNS_11643] = 582 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 583 /* END android-changed */ 584 585 586 /* set the function pointers to appropriate funtions */ 587 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 588 uprv_strcpy(myConverterData->locale,"cn"); 589 590 if (version==1){ 591 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 592 }else{ 593 myConverterData->version = 0; 594 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 595 } 596 } 597 else{ 598 #ifdef U_ENABLE_GENERIC_ISO_2022 599 myConverterData->isFirstBuffer = TRUE; 600 601 /* append the UTF-8 escape sequence */ 602 cnv->charErrorBufferLength = 3; 603 
cnv->charErrorBuffer[0] = 0x1b; 604 cnv->charErrorBuffer[1] = 0x25; 605 cnv->charErrorBuffer[2] = 0x42; 606 607 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 608 /* initialize the state variables */ 609 uprv_strcpy(myConverterData->name,"ISO_2022"); 610 #else 611 *errorCode = U_UNSUPPORTED_ERROR; 612 return; 613 #endif 614 } 615 616 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 617 618 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 619 _ISO2022Close(cnv); 620 } 621 } else { 622 *errorCode = U_MEMORY_ALLOCATION_ERROR; 623 } 624 } 625 626 627 static void 628 _ISO2022Close(UConverter *converter) { 629 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 630 UConverterSharedData **array = myData->myConverterArray; 631 int32_t i; 632 633 if (converter->extraInfo != NULL) { 634 /*close the array of converter pointers and free the memory*/ 635 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 636 if(array[i]!=NULL) { 637 ucnv_unloadSharedDataIfReady(array[i]); 638 } 639 } 640 641 ucnv_close(myData->currentConverter); 642 643 if(!converter->isExtraLocal){ 644 uprv_free (converter->extraInfo); 645 converter->extraInfo = NULL; 646 } 647 } 648 } 649 650 static void 651 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 652 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 653 if(choice<=UCNV_RESET_TO_UNICODE) { 654 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 655 myConverterData->key = 0; 656 myConverterData->isEmptySegment = FALSE; 657 } 658 if(choice!=UCNV_RESET_TO_UNICODE) { 659 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 660 } 661 #ifdef U_ENABLE_GENERIC_ISO_2022 662 if(myConverterData->locale[0] == 0){ 663 if(choice<=UCNV_RESET_TO_UNICODE) { 664 myConverterData->isFirstBuffer = TRUE; 665 myConverterData->key = 0; 666 if (converter->mode == UCNV_SO){ 667 ucnv_close (myConverterData->currentConverter); 
668 myConverterData->currentConverter=NULL; 669 } 670 converter->mode = UCNV_SI; 671 } 672 if(choice!=UCNV_RESET_TO_UNICODE) { 673 /* re-append UTF-8 escape sequence */ 674 converter->charErrorBufferLength = 3; 675 converter->charErrorBuffer[0] = 0x1b; 676 converter->charErrorBuffer[1] = 0x28; 677 converter->charErrorBuffer[2] = 0x42; 678 } 679 } 680 else 681 #endif 682 { 683 /* reset the state variables */ 684 if(myConverterData->locale[0] == 'k'){ 685 if(choice<=UCNV_RESET_TO_UNICODE) { 686 setInitialStateToUnicodeKR(converter, myConverterData); 687 } 688 if(choice!=UCNV_RESET_TO_UNICODE) { 689 setInitialStateFromUnicodeKR(converter, myConverterData); 690 } 691 } 692 } 693 } 694 695 static const char* 696 _ISO2022getName(const UConverter* cnv){ 697 if(cnv->extraInfo){ 698 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 699 return myData->name; 700 } 701 return NULL; 702 } 703 704 705 /*************** to unicode *******************/ 706 /**************************************************************************** 707 * Recognized escape sequences are 708 * <ESC>(B ASCII 709 * <ESC>.A ISO-8859-1 710 * <ESC>.F ISO-8859-7 711 * <ESC>(J JISX-201 712 * <ESC>(I JISX-201 713 * <ESC>$B JISX-208 714 * <ESC>$@ JISX-208 715 * <ESC>$(D JISX-212 716 * <ESC>$A GB2312 717 * <ESC>$(C KSC5601 718 */ 719 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 720 /* 0 1 2 3 4 5 6 7 8 9 */ 721 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 722 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 723 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 724 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 725 ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 726 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 727 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 728 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 729 }; 730 731 /*************** to unicode *******************/ 732 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 733 /* 0 1 2 3 4 5 6 7 8 9 */ 734 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 735 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 736 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 737 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 738 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 739 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 740 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 741 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 742 }; 743 744 745 static UCNV_TableStates_2022 746 getKey_2022(char c,int32_t* key,int32_t* offset){ 747 int32_t togo; 748 int32_t low = 0; 749 int32_t hi = MAX_STATES_2022; 750 int32_t oldmid=0; 751 752 togo = 
normalize_esq_chars_2022[(uint8_t)c]; 753 if(togo == 0) { 754 /* not a valid character anywhere in an escape sequence */ 755 *key = 0; 756 *offset = 0; 757 return INVALID_2022; 758 } 759 togo = (*key << 5) + togo; 760 761 while (hi != low) /*binary search*/{ 762 763 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 764 765 if (mid == oldmid) 766 break; 767 768 if (escSeqStateTable_Key_2022[mid] > togo){ 769 hi = mid; 770 } 771 else if (escSeqStateTable_Key_2022[mid] < togo){ 772 low = mid; 773 } 774 else /*we found it*/{ 775 *key = togo; 776 *offset = mid; 777 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 778 } 779 oldmid = mid; 780 781 } 782 783 *key = 0; 784 *offset = 0; 785 return INVALID_2022; 786 } 787 788 /*runs through a state machine to determine the escape sequence - codepage correspondance 789 */ 790 static void 791 changeState_2022(UConverter* _this, 792 const char** source, 793 const char* sourceLimit, 794 Variant2022 var, 795 UErrorCode* err){ 796 UCNV_TableStates_2022 value; 797 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 798 uint32_t key = myData2022->key; 799 int32_t offset = 0; 800 int8_t initialToULength = _this->toULength; 801 char c; 802 803 value = VALID_NON_TERMINAL_2022; 804 while (*source < sourceLimit) { 805 c = *(*source)++; 806 _this->toUBytes[_this->toULength++]=(uint8_t)c; 807 value = getKey_2022(c,(int32_t *) &key, &offset); 808 809 switch (value){ 810 811 case VALID_NON_TERMINAL_2022 : 812 /* continue with the loop */ 813 break; 814 815 case VALID_TERMINAL_2022: 816 key = 0; 817 goto DONE; 818 819 case INVALID_2022: 820 goto DONE; 821 822 case VALID_MAYBE_TERMINAL_2022: 823 #ifdef U_ENABLE_GENERIC_ISO_2022 824 /* ESC ( B is ambiguous only for ISO_2022 itself */ 825 if(var == ISO_2022) { 826 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 827 _this->toULength = 0; 828 829 /* TODO need to indicate that ESC ( B was seen; if failure, then need to 
replay from source or from MBCS-style replay */ 830 831 /* continue with the loop */ 832 value = VALID_NON_TERMINAL_2022; 833 break; 834 } else 835 #endif 836 { 837 /* not ISO_2022 itself, finish here */ 838 value = VALID_TERMINAL_2022; 839 key = 0; 840 goto DONE; 841 } 842 } 843 } 844 845 DONE: 846 myData2022->key = key; 847 848 if (value == VALID_NON_TERMINAL_2022) { 849 /* indicate that the escape sequence is incomplete: key!=0 */ 850 return; 851 } else if (value == INVALID_2022 ) { 852 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 853 } else /* value == VALID_TERMINAL_2022 */ { 854 switch(var){ 855 #ifdef U_ENABLE_GENERIC_ISO_2022 856 case ISO_2022: 857 { 858 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 859 if(chosenConverterName == NULL) { 860 /* SS2 or SS3 */ 861 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 862 _this->toUCallbackReason = UCNV_UNASSIGNED; 863 return; 864 } 865 866 _this->mode = UCNV_SI; 867 ucnv_close(myData2022->currentConverter); 868 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 869 if(U_SUCCESS(*err)) { 870 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 871 _this->mode = UCNV_SO; 872 } 873 break; 874 } 875 #endif 876 case ISO_2022_JP: 877 { 878 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 879 switch(tempState) { 880 case INVALID_STATE: 881 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 882 break; 883 case SS2_STATE: 884 if(myData2022->toU2022State.cs[2]!=0) { 885 if(myData2022->toU2022State.g<2) { 886 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 887 } 888 myData2022->toU2022State.g=2; 889 } else { 890 /* illegal to have SS2 before a matching designator */ 891 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 892 } 893 break; 894 /* case SS3_STATE: not used in ISO-2022-JP-x */ 895 case ISO8859_1: 896 case ISO8859_7: 897 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 899 } else { 900 /* G2 charset for SS2 */ 901 
myData2022->toU2022State.cs[2]=(int8_t)tempState; 902 } 903 break; 904 default: 905 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 906 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 907 } else { 908 /* G0 charset */ 909 myData2022->toU2022State.cs[0]=(int8_t)tempState; 910 } 911 break; 912 } 913 } 914 break; 915 case ISO_2022_CN: 916 { 917 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 918 switch(tempState) { 919 case INVALID_STATE: 920 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 921 break; 922 case SS2_STATE: 923 if(myData2022->toU2022State.cs[2]!=0) { 924 if(myData2022->toU2022State.g<2) { 925 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 926 } 927 myData2022->toU2022State.g=2; 928 } else { 929 /* illegal to have SS2 before a matching designator */ 930 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 931 } 932 break; 933 case SS3_STATE: 934 if(myData2022->toU2022State.cs[3]!=0) { 935 if(myData2022->toU2022State.g<2) { 936 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 937 } 938 myData2022->toU2022State.g=3; 939 } else { 940 /* illegal to have SS3 before a matching designator */ 941 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 942 } 943 break; 944 case ISO_IR_165: 945 if(myData2022->version==0) { 946 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 947 break; 948 } 949 /*fall through*/ 950 case GB2312_1: 951 /*fall through*/ 952 case CNS_11643_1: 953 myData2022->toU2022State.cs[1]=(int8_t)tempState; 954 break; 955 case CNS_11643_2: 956 myData2022->toU2022State.cs[2]=(int8_t)tempState; 957 break; 958 default: 959 /* other CNS 11643 planes */ 960 if(myData2022->version==0) { 961 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 962 } else { 963 myData2022->toU2022State.cs[3]=(int8_t)tempState; 964 } 965 break; 966 } 967 } 968 break; 969 case ISO_2022_KR: 970 if(offset==0x30){ 971 /* nothing to be done, just accept this one escape sequence */ 972 } else { 973 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 974 } 975 break; 976 977 default: 978 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 979 
break; 980 } 981 } 982 if(U_SUCCESS(*err)) { 983 _this->toULength = 0; 984 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 985 if(_this->toULength>1) { 986 /* 987 * Ticket 5691: consistent illegal sequences: 988 * - We include at least the first byte (ESC) in the illegal sequence. 989 * - If any of the non-initial bytes could be the start of a character, 990 * we stop the illegal sequence before the first one of those. 991 * In escape sequences, all following bytes are "printable", that is, 992 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 993 * they are valid single/lead bytes. 994 * For simplicity, we always only report the initial ESC byte as the 995 * illegal sequence and back out all other bytes we looked at. 996 */ 997 /* Back out some bytes. */ 998 int8_t backOutDistance=_this->toULength-1; 999 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1000 if(backOutDistance<=bytesFromThisBuffer) { 1001 /* same as initialToULength<=1 */ 1002 *source-=backOutDistance; 1003 } else { 1004 /* Back out bytes from the previous buffer: Need to replay them. */ 1005 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1006 /* same as -(initialToULength-1) */ 1007 /* preToULength is negative! 
*/ 1008 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1009 *source-=bytesFromThisBuffer; 1010 } 1011 _this->toULength=1; 1012 } 1013 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1014 _this->toUCallbackReason = UCNV_UNASSIGNED; 1015 } 1016 } 1017 1018 /*Checks the characters of the buffer against valid 2022 escape sequences 1019 *if the match we return a pointer to the initial start of the sequence otherwise 1020 *we return sourceLimit 1021 */ 1022 /*for 2022 looks ahead in the stream 1023 *to determine the longest possible convertible 1024 *data stream 1025 */ 1026 static U_INLINE const char* 1027 getEndOfBuffer_2022(const char** source, 1028 const char* sourceLimit, 1029 UBool flush){ 1030 1031 const char* mySource = *source; 1032 1033 #ifdef U_ENABLE_GENERIC_ISO_2022 1034 if (*source >= sourceLimit) 1035 return sourceLimit; 1036 1037 do{ 1038 1039 if (*mySource == ESC_2022){ 1040 int8_t i; 1041 int32_t key = 0; 1042 int32_t offset; 1043 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1044 1045 /* Kludge: I could not 1046 * figure out the reason for validating an escape sequence 1047 * twice - once here and once in changeState_2022(). 1048 * is it possible to have an ESC character in a ISO2022 1049 * byte stream which is valid in a code page? Is it legal? 
1050 */ 1051 for (i=0; 1052 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1053 i++) { 1054 value = getKey_2022(*(mySource+i), &key, &offset); 1055 } 1056 if (value > 0 || *mySource==ESC_2022) 1057 return mySource; 1058 1059 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1060 return sourceLimit; 1061 } 1062 }while (++mySource < sourceLimit); 1063 1064 return sourceLimit; 1065 #else 1066 while(mySource < sourceLimit && *mySource != ESC_2022) { 1067 ++mySource; 1068 } 1069 return mySource; 1070 #endif 1071 } 1072 1073 1074 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1075 * any future change in _MBCSFromUChar32() function should be reflected here. 1076 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1077 */ 1078 static U_INLINE int32_t 1079 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1080 UChar32 c, 1081 uint32_t* value, 1082 UBool useFallback, 1083 int outputType) 1084 { 1085 const int32_t *cx; 1086 const uint16_t *table; 1087 uint32_t stage2Entry; 1088 uint32_t myValue; 1089 int32_t length; 1090 const uint8_t *p; 1091 /* 1092 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1093 * Use internal version of ucnv_open() that verifies that the new structures are available, 1094 * else U_INTERNAL_PROGRAM_ERROR. 
1095 */ 1096 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1097 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1098 table=sharedData->mbcs.fromUnicodeTable; 1099 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1100 /* get the bytes and the length for the output */ 1101 if(outputType==MBCS_OUTPUT_2){ 1102 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1103 if(myValue<=0xff) { 1104 length=1; 1105 } else { 1106 length=2; 1107 } 1108 } else /* outputType==MBCS_OUTPUT_3 */ { 1109 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1110 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1111 if(myValue<=0xff) { 1112 length=1; 1113 } else if(myValue<=0xffff) { 1114 length=2; 1115 } else { 1116 length=3; 1117 } 1118 } 1119 /* is this code point assigned, or do we use fallbacks? */ 1120 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1121 /* assigned */ 1122 *value=myValue; 1123 return length; 1124 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1125 /* 1126 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1127 * There is no way with this data structure for fallback output 1128 * to be a zero byte. 1129 */ 1130 *value=myValue; 1131 return -length; 1132 } 1133 } 1134 1135 cx=sharedData->mbcs.extIndexes; 1136 if(cx!=NULL) { 1137 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1138 } 1139 1140 /* unassigned */ 1141 return 0; 1142 } 1143 1144 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1145 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1146 * @param retval pointer to output byte 1147 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1148 */ 1149 static U_INLINE int32_t 1150 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1151 UChar32 c, 1152 uint32_t* retval, 1153 UBool useFallback) 1154 { 1155 const uint16_t *table; 1156 int32_t value; 1157 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1158 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1159 return 0; 1160 } 1161 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1162 table=sharedData->mbcs.fromUnicodeTable; 1163 /* get the byte for the output */ 1164 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1165 /* is this code point assigned, or do we use fallbacks? */ 1166 *retval=(uint32_t)(value&0xff); 1167 if(value>=0xf00) { 1168 return 1; /* roundtrip */ 1169 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1170 return -1; /* fallback taken */ 1171 } else { 1172 return 0; /* no mapping */ 1173 } 1174 } 1175 1176 /* 1177 * Check that the result is a 2-byte value with each byte in the range A1..FE 1178 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1179 * to move it to the ISO 2022 range 21..7E. 1180 * Return 0 if out of range. 1181 */ 1182 static U_INLINE uint32_t 1183 _2022FromGR94DBCS(uint32_t value) { 1184 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1185 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1186 ) { 1187 return value - 0x8080; /* shift down to 21..7e byte range */ 1188 } else { 1189 return 0; /* not valid for ISO 2022 */ 1190 } 1191 } 1192 1193 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1194 /* 1195 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1196 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point 1197 * unchanged. 1198 */ 1199 static U_INLINE uint32_t 1200 _2022ToGR94DBCS(uint32_t value) { 1201 uint32_t returnValue = value + 0x8080; 1202 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && 1203 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { 1204 return returnValue; 1205 } else { 1206 return value; 1207 } 1208 } 1209 #endif 1210 1211 #ifdef U_ENABLE_GENERIC_ISO_2022 1212 1213 /********************************************************************************** 1214 * ISO-2022 Converter 1215 * 1216 * 1217 */ 1218 1219 static void 1220 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 1221 UErrorCode* err){ 1222 const char* mySourceLimit, *realSourceLimit; 1223 const char* sourceStart; 1224 const UChar* myTargetStart; 1225 UConverter* saveThis; 1226 UConverterDataISO2022* myData; 1227 int8_t length; 1228 1229 saveThis = args->converter; 1230 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); 1231 1232 realSourceLimit = args->sourceLimit; 1233 while (args->source < realSourceLimit) { 1234 if(myData->key == 0) { /* are we in the middle of an escape sequence? 
*/ 1235 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1236 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); 1237 1238 if(args->source < mySourceLimit) { 1239 if(myData->currentConverter==NULL) { 1240 myData->currentConverter = ucnv_open("ASCII",err); 1241 if(U_FAILURE(*err)){ 1242 return; 1243 } 1244 1245 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 1246 saveThis->mode = UCNV_SO; 1247 } 1248 1249 /* convert to before the ESC or until the end of the buffer */ 1250 myData->isFirstBuffer=FALSE; 1251 sourceStart = args->source; 1252 myTargetStart = args->target; 1253 args->converter = myData->currentConverter; 1254 ucnv_toUnicode(args->converter, 1255 &args->target, 1256 args->targetLimit, 1257 &args->source, 1258 mySourceLimit, 1259 args->offsets, 1260 (UBool)(args->flush && mySourceLimit == realSourceLimit), 1261 err); 1262 args->converter = saveThis; 1263 1264 if (*err == U_BUFFER_OVERFLOW_ERROR) { 1265 /* move the overflow buffer */ 1266 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; 1267 myData->currentConverter->UCharErrorBufferLength = 0; 1268 if(length > 0) { 1269 uprv_memcpy(saveThis->UCharErrorBuffer, 1270 myData->currentConverter->UCharErrorBuffer, 1271 length*U_SIZEOF_UCHAR); 1272 } 1273 return; 1274 } 1275 1276 /* 1277 * At least one of: 1278 * -Error while converting 1279 * -Done with entire buffer 1280 * -Need to write offsets or update the current offset 1281 * (leave that up to the code in ucnv.c) 1282 * 1283 * or else we just stopped at an ESC byte and continue with changeState_2022() 1284 */ 1285 if (U_FAILURE(*err) || 1286 (args->source == realSourceLimit) || 1287 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || 1288 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) 1289 ) { 1290 /* copy partial or error input for truncated detection and error 
handling */ 1291 if(U_FAILURE(*err)) { 1292 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; 1293 if(length > 0) { 1294 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); 1295 } 1296 } else { 1297 length = saveThis->toULength = myData->currentConverter->toULength; 1298 if(length > 0) { 1299 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); 1300 if(args->source < mySourceLimit) { 1301 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ 1302 } 1303 } 1304 } 1305 return; 1306 } 1307 } 1308 } 1309 1310 sourceStart = args->source; 1311 changeState_2022(args->converter, 1312 &(args->source), 1313 realSourceLimit, 1314 ISO_2022, 1315 err); 1316 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { 1317 /* let the ucnv.c code update its current offset */ 1318 return; 1319 } 1320 } 1321 } 1322 1323 #endif 1324 1325 /* 1326 * To Unicode Callback helper function 1327 */ 1328 static void 1329 toUnicodeCallback(UConverter *cnv, 1330 const uint32_t sourceChar, const uint32_t targetUniChar, 1331 UErrorCode* err){ 1332 if(sourceChar>0xff){ 1333 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); 1334 cnv->toUBytes[1] = (uint8_t)sourceChar; 1335 cnv->toULength = 2; 1336 } 1337 else{ 1338 cnv->toUBytes[0] =(char) sourceChar; 1339 cnv->toULength = 1; 1340 } 1341 1342 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ 1343 *err = U_INVALID_CHAR_FOUND; 1344 } 1345 else{ 1346 *err = U_ILLEGAL_CHAR_FOUND; 1347 } 1348 } 1349 1350 /**************************************ISO-2022-JP*************************************************/ 1351 1352 /************************************** IMPORTANT ************************************************** 1353 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and 1354 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). 
* The converter iterates over each Unicode codepoint
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
* processed one char at a time it would make sense to reduce the extra processing a canned converter
* would do as far as possible.
*
* If the implementation of these macros or structure of sharedData struct change in the future, make
* sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*   source : RFC-1554
*
*   JISX201, JISX208,JISX212 : new .cnv data files created
*   KSC5601 : alias to ibm-949 mapping table
*   GB2312 : alias to ibm-1386 mapping table
*   ISO-8859-1 : Algorithmic implemented as LATIN1 case
*   ISO-8859-7 : alias to ibm-9409 mapping table
*/

/*
 * preference order of JP charsets
 * (the fromUnicode code tries charsets that are not currently designated
 * in this order)
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 * (The table is indexed with the charset constant itself.)
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* lengths of the escape sequences in escSeqChars[], in the same order */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this
state until an invalid character is found 1439 * No -> a) go to the next code page and find the character 1440 * iii) Before changing the state increment the current state check if the current state 1441 * is equal to the intitIteration state 1442 * Yes -> A character that cannot be represented in any of the supported encodings 1443 * break and return a U_INVALID_CHARACTER error 1444 * No -> Continue and find the character in next code page 1445 * 1446 * 1447 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages 1448 */ 1449 1450 /* Map 00..7F to Unicode according to JIS X 0201. */ 1451 static U_INLINE uint32_t 1452 jisx201ToU(uint32_t value) { 1453 if(value < 0x5c) { 1454 return value; 1455 } else if(value == 0x5c) { 1456 return 0xa5; 1457 } else if(value == 0x7e) { 1458 return 0x203e; 1459 } else /* value <= 0x7f */ { 1460 return value; 1461 } 1462 } 1463 1464 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ 1465 static U_INLINE uint32_t 1466 jisx201FromU(uint32_t value) { 1467 if(value<=0x7f) { 1468 if(value!=0x5c && value!=0x7e) { 1469 return value; 1470 } 1471 } else if(value==0xa5) { 1472 return 0x5c; 1473 } else if(value==0x203e) { 1474 return 0x7e; 1475 } 1476 return 0xfffe; 1477 } 1478 1479 /* 1480 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding 1481 * to JIS X 0208, and convert it to a pair of 21..7E bytes. 1482 * Return 0 if the byte pair is out of range. 
1483 */ 1484 static U_INLINE uint32_t 1485 _2022FromSJIS(uint32_t value) { 1486 uint8_t trail; 1487 1488 if(value > 0xEFFC) { 1489 return 0; /* beyond JIS X 0208 */ 1490 } 1491 1492 trail = (uint8_t)value; 1493 1494 value &= 0xff00; /* lead byte */ 1495 if(value <= 0x9f00) { 1496 value -= 0x7000; 1497 } else /* 0xe000 <= value <= 0xef00 */ { 1498 value -= 0xb000; 1499 } 1500 value <<= 1; 1501 1502 if(trail <= 0x9e) { 1503 value -= 0x100; 1504 if(trail <= 0x7e) { 1505 value |= trail - 0x1f; 1506 } else { 1507 value |= trail - 0x20; 1508 } 1509 } else /* trail <= 0xfc */ { 1510 value |= trail - 0x7e; 1511 } 1512 return value; 1513 } 1514 1515 /* 1516 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1517 * If either byte is outside 21..7E make sure that the result is not valid 1518 * for Shift-JIS so that the converter catches it. 1519 * Some invalid byte values already turn into equally invalid Shift-JIS 1520 * byte values and need not be tested explicitly. 1521 */ 1522 static U_INLINE void 1523 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1524 if(c1&1) { 1525 ++c1; 1526 if(c2 <= 0x5f) { 1527 c2 += 0x1f; 1528 } else if(c2 <= 0x7e) { 1529 c2 += 0x20; 1530 } else { 1531 c2 = 0; /* invalid */ 1532 } 1533 } else { 1534 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { 1535 c2 += 0x7e; 1536 } else { 1537 c2 = 0; /* invalid */ 1538 } 1539 } 1540 c1 >>= 1; 1541 if(c1 <= 0x2f) { 1542 c1 += 0x70; 1543 } else if(c1 <= 0x3f) { 1544 c1 += 0xb0; 1545 } else { 1546 c1 = 0; /* invalid */ 1547 } 1548 bytes[0] = (char)c1; 1549 bytes[1] = (char)c2; 1550 } 1551 1552 /* 1553 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1554 * Katakana. 1555 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1556 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1557 * These were the only fallbacks in ICU's jisx-208.ucm file. 
1558 */ 1559 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1560 0x2123, /* U+FF61 */ 1561 0x2156, 1562 0x2157, 1563 0x2122, 1564 0x2126, 1565 0x2572, 1566 0x2521, 1567 0x2523, 1568 0x2525, 1569 0x2527, 1570 0x2529, 1571 0x2563, 1572 0x2565, 1573 0x2567, 1574 0x2543, 1575 0x213C, /* U+FF70 */ 1576 0x2522, 1577 0x2524, 1578 0x2526, 1579 0x2528, 1580 0x252A, 1581 0x252B, 1582 0x252D, 1583 0x252F, 1584 0x2531, 1585 0x2533, 1586 0x2535, 1587 0x2537, 1588 0x2539, 1589 0x253B, 1590 0x253D, 1591 0x253F, /* U+FF80 */ 1592 0x2541, 1593 0x2544, 1594 0x2546, 1595 0x2548, 1596 0x254A, 1597 0x254B, 1598 0x254C, 1599 0x254D, 1600 0x254E, 1601 0x254F, 1602 0x2552, 1603 0x2555, 1604 0x2558, 1605 0x255B, 1606 0x255E, 1607 0x255F, /* U+FF90 */ 1608 0x2560, 1609 0x2561, 1610 0x2562, 1611 0x2564, 1612 0x2566, 1613 0x2568, 1614 0x2569, 1615 0x256A, 1616 0x256B, 1617 0x256C, 1618 0x256D, 1619 0x256F, 1620 0x2573, 1621 0x212B, 1622 0x212C /* U+FF9F */ 1623 }; 1624 1625 static void 1626 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1627 UConverter *cnv = args->converter; 1628 UConverterDataISO2022 *converterData; 1629 ISO2022State *pFromU2022State; 1630 uint8_t *target = (uint8_t *) args->target; 1631 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1632 const UChar* source = args->source; 1633 const UChar* sourceLimit = args->sourceLimit; 1634 int32_t* offsets = args->offsets; 1635 UChar32 sourceChar; 1636 char buffer[8]; 1637 int32_t len, outLen; 1638 int8_t choices[10]; 1639 int32_t choiceCount; 1640 uint32_t targetValue = 0; 1641 UBool useFallback; 1642 1643 int32_t i; 1644 int8_t cs, g; 1645 1646 /* set up the state */ 1647 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1648 pFromU2022State = &converterData->fromU2022State; 1649 1650 choiceCount = 0; 1651 1652 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1653 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1654 goto getTrail; 1655 } 1656 1657 while(source < sourceLimit) { 1658 if(target < targetLimit) { 1659 1660 sourceChar = *(source++); 1661 /*check if the char is a First surrogate*/ 1662 if(UTF_IS_SURROGATE(sourceChar)) { 1663 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 1664 getTrail: 1665 /*look ahead to find the trail surrogate*/ 1666 if(source < sourceLimit) { 1667 /* test the following code unit */ 1668 UChar trail=(UChar) *source; 1669 if(UTF_IS_SECOND_SURROGATE(trail)) { 1670 source++; 1671 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 1672 cnv->fromUChar32=0x00; 1673 /* convert this supplementary code point */ 1674 /* exit this condition tree */ 1675 } else { 1676 /* this is an unmatched lead code unit (1st surrogate) */ 1677 /* callback(illegal) */ 1678 *err=U_ILLEGAL_CHAR_FOUND; 1679 cnv->fromUChar32=sourceChar; 1680 break; 1681 } 1682 } else { 1683 /* no more input */ 1684 cnv->fromUChar32=sourceChar; 1685 break; 1686 } 1687 } else { 1688 /* this is an unmatched trail code unit (2nd surrogate) */ 1689 /* callback(illegal) */ 1690 *err=U_ILLEGAL_CHAR_FOUND; 1691 cnv->fromUChar32=sourceChar; 1692 break; 1693 } 1694 } 1695 1696 /* do not convert SO/SI/ESC */ 1697 if(IS_2022_CONTROL(sourceChar)) { 1698 /* callback(illegal) */ 1699 *err=U_ILLEGAL_CHAR_FOUND; 1700 cnv->fromUChar32=sourceChar; 1701 break; 1702 } 1703 1704 /* do the conversion */ 1705 1706 if(choiceCount == 0) { 1707 uint16_t csm; 1708 1709 /* 1710 * The csm variable keeps track of which charsets are allowed 1711 * and not used yet while building the choices[]. 1712 */ 1713 csm = jpCharsetMasks[converterData->version]; 1714 choiceCount = 0; 1715 1716 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1717 if(converterData->version == 3 || converterData->version == 4) { 1718 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1719 } 1720 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1721 csm &= ~CSM(HWKANA_7BIT); 1722 1723 /* try the current G0 charset */ 1724 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1725 csm &= ~CSM(cs); 1726 1727 /* try the current G2 charset */ 1728 if((cs = pFromU2022State->cs[2]) != 0) { 1729 choices[choiceCount++] = cs; 1730 csm &= ~CSM(cs); 1731 } 1732 1733 /* try all the other possible charsets */ 1734 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1735 cs = (int8_t)jpCharsetPref[i]; 1736 if(CSM(cs) & csm) { 1737 choices[choiceCount++] = cs; 1738 csm &= ~CSM(cs); 1739 } 1740 } 1741 } 1742 1743 cs = g = 0; 1744 /* 1745 * len==0: no mapping found yet 1746 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1747 * len>0: found a roundtrip result, done 1748 */ 1749 len = 0; 1750 /* 1751 * We will turn off useFallback after finding a fallback, 1752 * but we still get fallbacks from PUA code points as usual. 1753 * Therefore, we will also need to check that we don't overwrite 1754 * an early fallback with a later one. 1755 */ 1756 useFallback = cnv->useFallback; 1757 1758 for(i = 0; i < choiceCount && len <= 0; ++i) { 1759 uint32_t value; 1760 int32_t len2; 1761 int8_t cs0 = choices[i]; 1762 switch(cs0) { 1763 case ASCII: 1764 if(sourceChar <= 0x7f) { 1765 targetValue = (uint32_t)sourceChar; 1766 len = 1; 1767 cs = cs0; 1768 g = 0; 1769 } 1770 break; 1771 case ISO8859_1: 1772 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1773 targetValue = (uint32_t)sourceChar - 0x80; 1774 len = 1; 1775 cs = cs0; 1776 g = 2; 1777 } 1778 break; 1779 case HWKANA_7BIT: 1780 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1781 if(converterData->version==3) { 1782 /* JIS7: use G1 (SO) */ 1783 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1784 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1785 len = 1; 1786 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1787 g = 1; 1788 } else if(converterData->version==4) { 1789 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1790 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1791 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1792 len = 1; 1793 1794 cs = pFromU2022State->cs[0]; 1795 if(IS_JP_DBCS(cs)) { 1796 /* switch from a DBCS charset to JISX201 */ 1797 cs = (int8_t)JISX201; 1798 } 1799 /* else stay in the current G0 charset */ 1800 g = 0; 1801 } 1802 /* else do not use HWKANA_7BIT with other versions */ 1803 } 1804 break; 1805 case JISX201: 1806 /* G0 SBCS */ 1807 value = jisx201FromU(sourceChar); 1808 if(value <= 0x7f) { 1809 targetValue = value; 1810 len = 1; 1811 cs = cs0; 1812 g = 0; 1813 useFallback = FALSE; 1814 } 1815 break; 1816 case JISX208: 1817 /* G0 DBCS from Shift-JIS table */ 1818 len2 = MBCS_FROM_UCHAR32_ISO2022( 1819 converterData->myConverterArray[cs0], 1820 sourceChar, &value, 1821 useFallback, MBCS_OUTPUT_2); 1822 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1823 value = _2022FromSJIS(value); 1824 if(value != 0) { 1825 targetValue = value; 1826 len = len2; 1827 cs = cs0; 1828 g = 0; 1829 useFallback = FALSE; 1830 } 1831 } else if(len == 0 && useFallback && 1832 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1833 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1834 len = -2; 1835 cs = cs0; 1836 g = 0; 1837 useFallback = FALSE; 1838 } 1839 break; 1840 case ISO8859_7: 1841 /* G0 SBCS forced to 7-bit output */ 1842 len2 = MBCS_SINGLE_FROM_UCHAR32( 1843 converterData->myConverterArray[cs0], 1844 sourceChar, &value, 1845 useFallback); 1846 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1847 targetValue = value - 0x80; 1848 len = len2; 1849 cs = 
cs0; 1850 g = 2; 1851 useFallback = FALSE; 1852 } 1853 break; 1854 default: 1855 /* G0 DBCS */ 1856 len2 = MBCS_FROM_UCHAR32_ISO2022( 1857 converterData->myConverterArray[cs0], 1858 sourceChar, &value, 1859 useFallback, MBCS_OUTPUT_2); 1860 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1861 if(cs0 == KSC5601) { 1862 /* 1863 * Check for valid bytes for the encoding scheme. 1864 * This is necessary because the sub-converter (windows-949) 1865 * has a broader encoding scheme than is valid for 2022. 1866 */ 1867 value = _2022FromGR94DBCS(value); 1868 if(value == 0) { 1869 break; 1870 } 1871 } 1872 targetValue = value; 1873 len = len2; 1874 cs = cs0; 1875 g = 0; 1876 useFallback = FALSE; 1877 } 1878 break; 1879 } 1880 } 1881 1882 if(len != 0) { 1883 if(len < 0) { 1884 len = -len; /* fallback */ 1885 } 1886 outLen = 0; /* count output bytes */ 1887 1888 /* write SI if necessary (only for JIS7) */ 1889 if(pFromU2022State->g == 1 && g == 0) { 1890 buffer[outLen++] = UCNV_SI; 1891 pFromU2022State->g = 0; 1892 } 1893 1894 /* write the designation sequence if necessary */ 1895 if(cs != pFromU2022State->cs[g]) { 1896 int32_t escLen = escSeqCharsLen[cs]; 1897 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1898 outLen += escLen; 1899 pFromU2022State->cs[g] = cs; 1900 1901 /* invalidate the choices[] */ 1902 choiceCount = 0; 1903 } 1904 1905 /* write the shift sequence if necessary */ 1906 if(g != pFromU2022State->g) { 1907 switch(g) { 1908 /* case 0 handled before writing escapes */ 1909 case 1: 1910 buffer[outLen++] = UCNV_SO; 1911 pFromU2022State->g = 1; 1912 break; 1913 default: /* case 2 */ 1914 buffer[outLen++] = 0x1b; 1915 buffer[outLen++] = 0x4e; 1916 break; 1917 /* no case 3: no SS3 in ISO-2022-JP-x */ 1918 } 1919 } 1920 1921 /* write the output bytes */ 1922 if(len == 1) { 1923 buffer[outLen++] = (char)targetValue; 1924 } else /* len == 2 */ { 1925 buffer[outLen++] = (char)(targetValue >> 8); 1926 buffer[outLen++] = 
(char)targetValue; 1927 } 1928 } else { 1929 /* 1930 * if we cannot find the character after checking all codepages 1931 * then this is an error 1932 */ 1933 *err = U_INVALID_CHAR_FOUND; 1934 cnv->fromUChar32=sourceChar; 1935 break; 1936 } 1937 1938 if(sourceChar == CR || sourceChar == LF) { 1939 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1940 pFromU2022State->cs[2] = 0; 1941 choiceCount = 0; 1942 } 1943 1944 /* output outLen>0 bytes in buffer[] */ 1945 if(outLen == 1) { 1946 *target++ = buffer[0]; 1947 if(offsets) { 1948 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1949 } 1950 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1951 *target++ = buffer[0]; 1952 *target++ = buffer[1]; 1953 if(offsets) { 1954 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1955 *offsets++ = sourceIndex; 1956 *offsets++ = sourceIndex; 1957 } 1958 } else { 1959 fromUWriteUInt8( 1960 cnv, 1961 buffer, outLen, 1962 &target, (const char *)targetLimit, 1963 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1964 err); 1965 if(U_FAILURE(*err)) { 1966 break; 1967 } 1968 } 1969 } /* end if(myTargetIndex<myTargetLength) */ 1970 else{ 1971 *err =U_BUFFER_OVERFLOW_ERROR; 1972 break; 1973 } 1974 1975 }/* end while(mySourceIndex<mySourceLength) */ 1976 1977 /* 1978 * the end of the input stream and detection of truncated input 1979 * are handled by the framework, but for ISO-2022-JP conversion 1980 * we need to be in ASCII mode at the very end 1981 * 1982 * conditions: 1983 * successful 1984 * in SO mode or not in ASCII mode 1985 * end of input and no truncated input 1986 */ 1987 if( U_SUCCESS(*err) && 1988 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 1989 args->flush && source>=sourceLimit && cnv->fromUChar32==0 1990 ) { 1991 int32_t sourceIndex; 1992 1993 outLen = 0; 1994 1995 if(pFromU2022State->g != 0) { 1996 buffer[outLen++] = UCNV_SI; 
1997             pFromU2022State->g = 0;
1998         }
1999
2000         if(pFromU2022State->cs[0] != ASCII) {
2001             int32_t escLen = escSeqCharsLen[ASCII];
2002             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2003             outLen += escLen;
2004             pFromU2022State->cs[0] = (int8_t)ASCII;
2005         }
2006
2007         /* get the source index of the last input character */
2008         /*
2009          * TODO this would be simpler and more reliable if we used a pair
2010          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2011          * so that we could simply use the prevSourceIndex here;
2012          * this code gives an incorrect result for the rare case of an unmatched
2013          * trail surrogate that is alone in the last buffer of the text stream
2014          */
2015         sourceIndex=(int32_t)(source-args->source);
2016         if(sourceIndex>0) {
2017             --sourceIndex;
2018             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2019                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2020             ) {
2021                 --sourceIndex;
2022             }
2023         } else {
2024             sourceIndex=-1;
2025         }
2026
2027         fromUWriteUInt8(
2028             cnv,
2029             buffer, outLen,
2030             &target, (const char *)targetLimit,
2031             &offsets, sourceIndex,
2032             err);
2033     }
2034
2035     /*save the state and return */
2036     args->source = source;
2037     args->target = (char*)target;
2038 }
2039
2040 /*************** to unicode *******************/
2041
/*
 * Converts ISO-2022-JP family bytes from args->source into UTF-16 code units
 * in args->target, optionally recording one source offset per output unit
 * in args->offsets.
 *
 * State is kept in the converter's UConverterDataISO2022 (extraInfo):
 *   - myData->key != 0 means a partial escape sequence is pending, so the
 *     function resumes at the escape: label.
 *   - converter->toULength == 1 means the lead byte of a double-byte
 *     character was stored in toUBytes[0] by a previous call, so the function
 *     resumes at getTrailByte:.
 *
 * Per input byte:
 *   - SI/SO change the invoked G set only when myData->version==3 (JIS7 per
 *     the comments below); otherwise the byte is left unconverted and goes to
 *     toUnicodeCallback().
 *   - ESC is handed to changeState_2022(); for version 0 a designation that
 *     follows an empty segment is reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 *   - CR/LF reset G0 to ASCII (unless it is ASCII or JISX201 already), clear
 *     the G2 designation and select G0, then are converted like any byte.
 *   - everything else is converted according to the charset currently
 *     designated for the active G set.
 *
 * On return args->source and args->target are advanced to the consumed and
 * produced positions.  *err is set to U_BUFFER_OVERFLOW_ERROR when the target
 * is full while input remains; other failures come from changeState_2022()
 * or toUnicodeCallback().
 */
2042 static void
2043 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2044                                                UErrorCode* err){
2045     char tempBuf[2];
2046     const char *mySource = (char *) args->source;
2047     UChar *myTarget = args->target;
2048     const char *mySourceLimit = args->sourceLimit;
2049     uint32_t targetUniChar = 0x0000;
2050     uint32_t mySourceChar = 0x0000;
2051     uint32_t tmpSourceChar = 0x0000;
2052     UConverterDataISO2022* myData;
2053     ISO2022State *pToU2022State;
2054     StateEnum cs;
2055
2056     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2057     pToU2022State = &myData->toU2022State;
2058
2059     if(myData->key != 0) {
2060         /* continue with a partial escape sequence */
2061         goto escape;
2062     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2063         /* continue with a partial double-byte character */
2064         mySourceChar = args->converter->toUBytes[0];
2065         args->converter->toULength = 0;
2066         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2067         targetUniChar = missingCharMarker;
2068         goto getTrailByte;
2069     }
2070
2071     while(mySource < mySourceLimit){
2072
2073         targetUniChar =missingCharMarker;
2074
2075         if(myTarget < args->targetLimit){
2076
2077             mySourceChar= (unsigned char) *mySource++;
2078
2079             switch(mySourceChar) {
2080             case UCNV_SI:
2081                 if(myData->version==3) {
2082                     pToU2022State->g=0;
2083                     continue;
2084                 } else {
2085                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2086                     myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2087                     break;
2088                 }
2089
2090             case UCNV_SO:
2091                 if(myData->version==3) {
2092                     /* JIS7: switch to G1 half-width Katakana */
2093                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2094                     pToU2022State->g=1;
2095                     continue;
2096                 } else {
2097                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2098                     myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2099                     break;
2100                 }
2101
2102             case ESC_2022:
                 /* back up so that changeState_2022() sees the ESC byte itself */
2103                 mySource--;
2104 escape:
2105                 {
2106                     const char * mySourceBefore = mySource;
2107                     int8_t toULengthBefore = args->converter->toULength;
2108
2109                     changeState_2022(args->converter,&(mySource),
2110                         mySourceLimit, ISO_2022_JP,err);
2111
2112                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2113                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2114                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2115                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                         /* length = bytes already pending before this call plus the bytes consumed by it */
2116                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2117                     }
2118                 }
2119
2120                 /* invalid or illegal escape sequence */
2121                 if(U_FAILURE(*err)){
2122                     args->target = myTarget;
2123                     args->source = mySource;
2124                     myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2125                     return;
2126                 }
2127                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2128                 if(myData->key==0) {
2129                     myData->isEmptySegment = TRUE;
2130                 }
2131                 continue;
2132
2133             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2134
2135             case CR:
2136                 /*falls through*/
2137             case LF:
2138                 /* automatically reset to single-byte mode */
2139                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2140                     pToU2022State->cs[0] = (int8_t)ASCII;
2141                 }
2142                 pToU2022State->cs[2] = 0;
2143                 pToU2022State->g = 0;
2144                 /* falls through */
2145             default:
2146                 /* convert one or two bytes */
2147                 myData->isEmptySegment = FALSE;
2148                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                 /* the uint8_t subtraction/compare below is a single range test for a1..df */
2149                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2150                     !IS_JP_DBCS(cs)
2151                 ) {
2152                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2153                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2154
2155                     /* return from a single-shift state to the previous one */
2156                     if(pToU2022State->g >= 2) {
2157                         pToU2022State->g=pToU2022State->prevG;
2158                     }
2159                 } else switch(cs) {
2160                 case ASCII:
2161                     if(mySourceChar <= 0x7f) {
2162                         targetUniChar = mySourceChar;
2163                     }
2164                     break;
2165                 case ISO8859_1:
2166                     if(mySourceChar <= 0x7f) {
2167                         targetUniChar = mySourceChar + 0x80;
2168                     }
2169                     /* return from a single-shift state to the previous one */
2170                     pToU2022State->g=pToU2022State->prevG;
2171                     break;
2172                 case ISO8859_7:
2173                     if(mySourceChar <= 0x7f) {
2174                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2175                         targetUniChar =
2176                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2177                                 myData->myConverterArray[cs],
2178                                 mySourceChar + 0x80);
2179                     }
2180                     /* return from a single-shift state to the previous one */
2181                     pToU2022State->g=pToU2022State->prevG;
2182                     break;
2183                 case JISX201:
2184                     if(mySourceChar <= 0x7f) {
2185                         targetUniChar = jisx201ToU(mySourceChar);
2186                     }
2187                     break;
2188                 case HWKANA_7BIT:
2189                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2190                         /* 7-bit halfwidth Katakana */
2191                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2192                     }
2193                     break;
2194                 default:
2195                     /* G0 DBCS */
2196                     if(mySource < mySourceLimit) {
2197                         int leadIsOk, trailIsOk;
2198                         uint8_t trailByte;
2199 getTrailByte:
                         /* peek at the trail byte; it is consumed only in the branches below */
2200                         trailByte = (uint8_t)*mySource;
2201                         /*
2202                          * Ticket 5691: consistent illegal sequences:
2203                          * - We include at least the first byte in the illegal sequence.
2204                          * - If any of the non-initial bytes could be the start of a character,
2205                          *   we stop the illegal sequence before the first one of those.
2206                          *
2207                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2208                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2209                          * Otherwise we convert or report the pair of bytes.
2210                          */
2211                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2212                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2213                         if (leadIsOk && trailIsOk) {
2214                             ++mySource;
2215                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2216                             if(cs == JISX208) {
2217                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2218                                 mySourceChar = tmpSourceChar;
2219                             } else {
2220                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2221                                 mySourceChar = tmpSourceChar;
2222                                 if (cs == KSC5601) {
2223                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2224                                 }
2225                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2226                                 tempBuf[1] = (char)(tmpSourceChar);
2227                             }
2228                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2229                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2230                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2231                             ++mySource;
2232                             /* add another bit so that the code below writes 2 bytes in case of error */
2233                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2234                         }
2235                     } else {
                         /* input ends after the lead byte: save it so the next call can resume at getTrailByte */
2236                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2237                         args->converter->toULength = 1;
2238                         goto endloop;
2239                     }
2240                 }  /* End of inner switch */
2241                 break;
2242             }  /* End of outer switch */
             /* three outcomes: BMP result, supplementary result (> missingCharMarker), or no mapping */
2243             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2244                 if(args->offsets){
2245                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2246                 }
2247                 *(myTarget++)=(UChar)targetUniChar;
2248             }
2249             else if(targetUniChar > missingCharMarker){
2250                 /* disassemble the surrogate pair and write to output*/
2251                 targetUniChar-=0x0010000;
2252                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2253                 if(args->offsets){
2254                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2255                 }
2256                 ++myTarget;
2257                 if(myTarget< args->targetLimit){
2258                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2259                     if(args->offsets){
2260                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2261                     }
2262                     ++myTarget;
2263                 }else{
                     /* no room for the trail surrogate: park it in the converter's UChar overflow buffer */
2264                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2265                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2266                 }
2267
2268             }
2269             else{
2270                 /* Call the callback function*/
2271                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2272                 break;
2273             }
2274         }
2275         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2276             *err =U_BUFFER_OVERFLOW_ERROR;
2277             break;
2278         }
2279     }
2280 endloop:
2281     args->target = myTarget;
2282     args->source = mySource;
2283 }
2284
2285
2286 /***************************************************************
2287 *   Rules for ISO-2022-KR encoding
2288 *   i) The KSC5601 designator sequence should appear only once in a file,
2289 *      at the beginning of a line before any KSC5601 characters. This usually
2290 *      means that it appears by itself on the first line of the file
2291 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2292 *      and SI to shift into single byte mode
2293 */
/*
 * ISO-2022-KR fromUnicode path that delegates to the subconverter stored in
 * myConverterData->currentConverter (the caller in this file uses it when
 * version==1, described there as ibm-25546).
 *
 * args->converter is temporarily replaced by the subconverter for the call to
 * ucnv_MBCSFromUnicodeWithOffsets().  The pending code point (fromUChar32) is
 * copied into the subconverter before the call and back afterwards.  On
 * U_BUFFER_OVERFLOW_ERROR the subconverter's charErrorBuffer contents are
 * moved to the public converter and the subconverter's length is cleared.
 */
2294 static void
2295 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2296
2297     UConverter* saveConv = args->converter;
2298     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2299     args->converter=myConverterData->currentConverter;
2300
2301     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2302     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2303     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2304
2305     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2306         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2307             uprv_memcpy(
2308                 saveConv->charErrorBuffer,
2309                 myConverterData->currentConverter->charErrorBuffer,
2310                 myConverterData->currentConverter->charErrorBufferLength);
2311         }
2312         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2313
        myConverterData->currentConverter->charErrorBufferLength = 0;
2314     }
     /* restore the public converter in the caller's argument structure */
2315     args->converter=saveConv;
2316 }
2317
/*
 * Converts UTF-16 from args->source into ISO-2022-KR bytes in args->target,
 * optionally recording source offsets in args->offsets.
 *
 * When converterData->version==1 the work is delegated to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 *
 * Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022() in
 * the shared data of converterData->currentConverter.  Accepted results are
 * single bytes <= 0x7f and double bytes whose two bytes are both in a1..fe;
 * double bytes are written with 0x80 subtracted from each byte.  SO is
 * written before the first double-byte character after single-byte output,
 * and SI before the first single-byte character after double-byte output.
 * SO, SI and ESC in the input are rejected with U_ILLEGAL_CHAR_FOUND.
 *
 * The single/double-byte mode is carried between calls in
 * args->converter->fromUnicodeStatus, and a pending lead surrogate in
 * args->converter->fromUChar32.  If the call ends successfully in double-byte
 * mode with args->flush set and no pending input, a final SI is written.
 *
 * Bytes that do not fit into the target are stored in
 * args->converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 */
2318 static void
2319 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2320
2321     const UChar *source = args->source;
2322     const UChar *sourceLimit = args->sourceLimit;
2323     unsigned char *target = (unsigned char *) args->target;
2324     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2325     int32_t* offsets = args->offsets;
2326     uint32_t targetByteUnit = 0x0000;
2327     UChar32 sourceChar = 0x0000;
2328     UBool isTargetByteDBCS;
2329     UBool oldIsTargetByteDBCS;
2330     UConverterDataISO2022 *converterData;
2331     UConverterSharedData* sharedData;
2332     UBool useFallback;
2333     int32_t length =0;
2334
2335     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2336     /* if the version is 1 then the user is requesting
2337      * conversion with ibm-25546 pass the arguments to
2338      * MBCS converter and return
2339      */
2340     if(converterData->version==1){
2341         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2342         return;
2343     }
2344
2345     /* initialize data */
2346     sharedData = converterData->currentConverter->sharedData;
2347     useFallback = args->converter->useFallback;
2348     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2349     oldIsTargetByteDBCS = isTargetByteDBCS;
2350
     /* NOTE(review): repeats the assignment made a few lines above; it looks redundant */
2351     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
     /* a code point left over from the previous call: resume at the trail-surrogate check */
2352     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2353         goto getTrail;
2354     }
2355     while(source < sourceLimit){
2356
2357         targetByteUnit = missingCharMarker;
2358
2359         if(target < (unsigned char*) args->targetLimit){
2360             sourceChar = *source++;
2361
2362             /* do not convert SO/SI/ESC */
2363             if(IS_2022_CONTROL(sourceChar)) {
2364                 /* callback(illegal) */
2365                 *err=U_ILLEGAL_CHAR_FOUND;
2366                 args->converter->fromUChar32=sourceChar;
2367                 break;
2368             }
2369
2370             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2371             if(length < 0) {
2372                 length = -length;  /* fallback */
2373             }
2374             /* only DBCS or SBCS characters are expected*/
2375             /* DB characters with high bit set to 1 are expected */
             /* the uint16_t test bounds the lead byte, the uint8_t test bounds the trail byte */
2376             if( length > 2 || length==0 ||
2377                 (length == 1 && targetByteUnit > 0x7f) ||
2378                 (length == 2 &&
2379                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2380                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2381             ) {
2382                 targetByteUnit=missingCharMarker;
2383             }
2384             if (targetByteUnit != missingCharMarker){
2385
2386                 oldIsTargetByteDBCS = isTargetByteDBCS;
2387                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2388                   /* append the shift sequence */
2389                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2390
                     /* room for this one byte is guaranteed by the target check at the top of the loop */
2391                     if (isTargetByteDBCS)
2392                         *target++ = UCNV_SO;
2393                     else
2394                         *target++ = UCNV_SI;
2395                     if(offsets)
2396                         *(offsets++) = (int32_t)(source - args->source-1);
2397                 }
2398                 /* write the targetUniChar  to target */
2399                 if(targetByteUnit <= 0x00FF){
2400                     if( target < targetLimit){
2401                         *(target++) = (unsigned char) targetByteUnit;
2402                         if(offsets){
2403                             *(offsets++) = (int32_t)(source - args->source-1);
2404                         }
2405
2406                     }else{
2407                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2408                         *err = U_BUFFER_OVERFLOW_ERROR;
2409                     }
2410                 }else{
                     /* double byte: each byte is written with 0x80 subtracted (7-bit form) */
2411                     if(target < targetLimit){
2412                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2413                         if(offsets){
2414                             *(offsets++) = (int32_t)(source - args->source-1);
2415                         }
2416                         if(target < targetLimit){
2417                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2418                             if(offsets){
2419                                 *(offsets++) = (int32_t)(source - args->source-1);
2420                             }
2421                         }else{
2422                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2423                             *err = U_BUFFER_OVERFLOW_ERROR;
2424                         }
2425                     }else{
2426                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2427                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2428                         *err = U_BUFFER_OVERFLOW_ERROR;
2429                     }
2430                 }
2431
2432             }
2433             else{
2434                 /* oops.. the code point is unassigned
2435                  * set the error and reason
2436                  */
2437
2438                 /*check if the char is a First surrogate*/
2439                 if(UTF_IS_SURROGATE(sourceChar)) {
2440                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2441 getTrail:
2442                         /*look ahead to find the trail surrogate*/
2443                         if(source <  sourceLimit) {
2444                             /* test the following code unit */
2445                             UChar trail=(UChar) *source;
2446                             if(UTF_IS_SECOND_SURROGATE(trail)) {
2447                                 source++;
2448                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                                 /* the assembled supplementary code point is reported as unassigned */
2449                                 *err = U_INVALID_CHAR_FOUND;
2450                                 /* convert this surrogate code point */
2451                                 /* exit this condition tree */
2452                             } else {
2453                                 /* this is an unmatched lead code unit (1st surrogate) */
2454                                 /* callback(illegal) */
2455                                 *err=U_ILLEGAL_CHAR_FOUND;
2456                             }
2457                         } else {
2458                             /* no more input */
2459                             *err = U_ZERO_ERROR;
2460                         }
2461                     } else {
2462                         /* this is an unmatched trail code unit (2nd surrogate) */
2463                         /* callback(illegal) */
2464                         *err=U_ILLEGAL_CHAR_FOUND;
2465                     }
2466                 } else {
2467                     /* callback(unassigned) for a BMP code point */
2468                     *err = U_INVALID_CHAR_FOUND;
2469                 }
2470
                 /* remember the offending or pending code point on the converter, then stop */
2471                 args->converter->fromUChar32=sourceChar;
2472                 break;
2473             }
2474         } /* end if(myTargetIndex<myTargetLength) */
2475         else{
2476             *err =U_BUFFER_OVERFLOW_ERROR;
2477             break;
2478         }
2479
2480     }/* end while(mySourceIndex<mySourceLength) */
2481
2482     /*
2483      * the end of the input stream and detection of truncated input
2484      * are handled by the framework, but for ISO-2022-KR conversion
2485      * we need to be in ASCII mode at the very end
2486      *
2487      * conditions:
2488      *   successful
2489      *   not in ASCII mode
2490      *   end of input and no truncated input
2491      */
2492     if( U_SUCCESS(*err) &&
2493         isTargetByteDBCS &&
2494         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2495     ) {
2496         int32_t sourceIndex;
2497
2498         /* we are switching to ASCII */
2499         isTargetByteDBCS=FALSE;
2500
2501         /* get the source index of the last input character */
2502         /*
2503          * TODO this would be simpler and more reliable if we used a pair
2504          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2505          * so that we could simply use the prevSourceIndex here;
2506          * this code gives an incorrect result for the rare case of an unmatched
2507          * trail surrogate that is alone in the last buffer of the text stream
2508          */
2509         sourceIndex=(int32_t)(source-args->source);
2510         if(sourceIndex>0) {
2511             --sourceIndex;
2512             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2513                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2514             ) {
2515                 --sourceIndex;
2516             }
2517         } else {
2518             sourceIndex=-1;
2519         }
2520
2521         fromUWriteUInt8(
2522             args->converter,
2523             SHIFT_IN_STR, 1,
2524             &target, (const char *)targetLimit,
2525             &offsets, sourceIndex,
2526             err);
2527     }
2528
2529     /*save the state and return */
2530     args->source = source;
2531     args->target = (char*)target;
2532     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2533 }
2534
2535 /************************ To Unicode ***************************************/
2536
/*
 * ISO-2022-KR toUnicode path that delegates to the subconverter stored in
 * myData->currentConverter (the caller in this file uses it when version==1).
 *
 * A copy of the caller's argument structure (limited to the smaller of
 * args->size and sizeof(UConverterToUnicodeArgs)) is pointed at the
 * subconverter.  Input is then converted in runs delimited by
 * getEndOfBuffer_2022(), and escape sequences between runs are handled by
 * changeState_2022().
 */
2537 static void
2538 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2539                                                             UErrorCode* err){
2540     char const* sourceStart;
2541     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2542
2543     UConverterToUnicodeArgs subArgs;
2544     int32_t minArgsSize;
2545
2546     /* set up the subconverter arguments */
2547     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2548         minArgsSize = args->size;
2549     } else {
2550         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2551     }
2552
2553     uprv_memcpy(&subArgs, args, minArgsSize);
2554     subArgs.size = (uint16_t)minArgsSize;
2555
subArgs.converter = myData->currentConverter; 2556 2557 /* remember the original start of the input for offsets */ 2558 sourceStart = args->source; 2559 2560 if(myData->key != 0) { 2561 /* continue with a partial escape sequence */ 2562 goto escape; 2563 } 2564 2565 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2566 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2567 subArgs.source = args->source; 2568 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2569 if(subArgs.source != subArgs.sourceLimit) { 2570 /* 2571 * get the current partial byte sequence 2572 * 2573 * it needs to be moved between the public and the subconverter 2574 * so that the conversion framework, which only sees the public 2575 * converter, can handle truncated and illegal input etc. 2576 */ 2577 if(args->converter->toULength > 0) { 2578 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2579 } 2580 subArgs.converter->toULength = args->converter->toULength; 2581 2582 /* 2583 * Convert up to the end of the input, or to before the next escape character. 2584 * Does not handle conversion extensions because the preToU[] state etc. 2585 * is not copied. 
2586 */ 2587 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2588 2589 if(args->offsets != NULL && sourceStart != args->source) { 2590 /* update offsets to base them on the actual start of the input */ 2591 int32_t *offsets = args->offsets; 2592 UChar *target = args->target; 2593 int32_t delta = (int32_t)(args->source - sourceStart); 2594 while(target < subArgs.target) { 2595 if(*offsets >= 0) { 2596 *offsets += delta; 2597 } 2598 ++offsets; 2599 ++target; 2600 } 2601 } 2602 args->source = subArgs.source; 2603 args->target = subArgs.target; 2604 args->offsets = subArgs.offsets; 2605 2606 /* copy input/error/overflow buffers */ 2607 if(subArgs.converter->toULength > 0) { 2608 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2609 } 2610 args->converter->toULength = subArgs.converter->toULength; 2611 2612 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2613 if(subArgs.converter->UCharErrorBufferLength > 0) { 2614 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2615 subArgs.converter->UCharErrorBufferLength); 2616 } 2617 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2618 subArgs.converter->UCharErrorBufferLength = 0; 2619 } 2620 } 2621 2622 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2623 return; 2624 } 2625 2626 escape: 2627 changeState_2022(args->converter, 2628 &(args->source), 2629 args->sourceLimit, 2630 ISO_2022_KR, 2631 err); 2632 } 2633 } 2634 2635 static void 2636 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2637 UErrorCode* err){ 2638 char tempBuf[2]; 2639 const char *mySource = ( char *) args->source; 2640 UChar *myTarget = args->target; 2641 const char *mySourceLimit = args->sourceLimit; 2642 UChar32 targetUniChar = 0x0000; 2643 UChar mySourceChar = 0x0000; 2644 UConverterDataISO2022* myData; 2645 UConverterSharedData* sharedData ; 2646 UBool useFallback; 2647 2648 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2649 if(myData->version==1){ 2650 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2651 return; 2652 } 2653 2654 /* initialize state */ 2655 sharedData = myData->currentConverter->sharedData; 2656 useFallback = args->converter->useFallback; 2657 2658 if(myData->key != 0) { 2659 /* continue with a partial escape sequence */ 2660 goto escape; 2661 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2662 /* continue with a partial double-byte character */ 2663 mySourceChar = args->converter->toUBytes[0]; 2664 args->converter->toULength = 0; 2665 goto getTrailByte; 2666 } 2667 2668 while(mySource< mySourceLimit){ 2669 2670 if(myTarget < args->targetLimit){ 2671 2672 mySourceChar= (unsigned char) *mySource++; 2673 2674 if(mySourceChar==UCNV_SI){ 2675 myData->toU2022State.g = 0; 2676 if (myData->isEmptySegment) { 2677 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2678 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2679 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2680 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2681 args->converter->toULength = 1; 2682 args->target = myTarget; 2683 args->source = mySource; 2684 return; 2685 } 2686 /*consume the source */ 2687 continue; 2688 }else if(mySourceChar==UCNV_SO){ 2689 myData->toU2022State.g = 1; 2690 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2691 /*consume the source */ 2692 continue; 2693 }else if(mySourceChar==ESC_2022){ 2694 mySource--; 2695 escape: 2696 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2697 changeState_2022(args->converter,&(mySource), 2698 mySourceLimit, ISO_2022_KR, err); 2699 if(U_FAILURE(*err)){ 2700 args->target = myTarget; 2701 args->source = mySource; 2702 return; 2703 } 2704 continue; 2705 } 2706 2707 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2708 if(myData->toU2022State.g == 1) { 2709 if(mySource < mySourceLimit) { 2710 int leadIsOk, trailIsOk; 2711 uint8_t trailByte; 2712 getTrailByte: 2713 targetUniChar = missingCharMarker; 2714 trailByte = (uint8_t)*mySource; 2715 /* 2716 * Ticket 5691: consistent illegal sequences: 2717 * - We include at least the first byte in the illegal sequence. 2718 * - If any of the non-initial bytes could be the start of a character, 2719 * we stop the illegal sequence before the first one of those. 2720 * 2721 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2722 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2723 * Otherwise we convert or report the pair of bytes. 2724 */ 2725 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2726 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2727 if (leadIsOk && trailIsOk) { 2728 ++mySource; 2729 tempBuf[0] = (char)(mySourceChar + 0x80); 2730 tempBuf[1] = (char)(trailByte + 0x80); 2731 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2732 mySourceChar = (mySourceChar << 8) | trailByte; 2733 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2734 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2735 ++mySource; 2736 /* add another bit so that the code below writes 2 bytes in case of error */ 2737 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2738 } 2739 } else { 2740 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2741 args->converter->toULength = 1; 2742 break; 2743 } 2744 } 2745 else if(mySourceChar <= 0x7f) { 2746 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2747 } else { 2748 targetUniChar = 0xffff; 2749 } 2750 if(targetUniChar < 0xfffe){ 2751 if(args->offsets) { 2752 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2753 } 2754 *(myTarget++)=(UChar)targetUniChar; 2755 } 2756 else { 2757 /* Call the callback function*/ 2758 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2759 break; 2760 } 2761 } 2762 else{ 2763 *err =U_BUFFER_OVERFLOW_ERROR; 2764 break; 2765 } 2766 } 2767 args->target = myTarget; 2768 args->source = mySource; 2769 } 2770 2771 /*************************** END ISO2022-KR *********************************/ 2772 2773 /*************************** ISO-2022-CN ********************************* 2774 * 2775 * Rules for ISO-2022-CN Encoding: 2776 * i) The designator sequence must appear once on a line before any instance 2777 * of character set it designates. 2778 * ii) If two lines contain characters from the same character set, both lines 2779 * must include the designator sequence. 2780 * iii) Once the designator sequence is known, a shifting sequence has to be found 2781 * to invoke the shifting 2782 * iv) All lines start in ASCII and end in ASCII. 2783 * v) Four shifting sequences are employed for this purpose: 2784 * 2785 * Sequcence ASCII Eq Charsets 2786 * ---------- ------- --------- 2787 * SI <SI> US-ASCII 2788 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2789 * SS2 <ESC>N CNS-11643-1992 Plane 2 2790 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2791 * 2792 * vi) 2793 * SOdesignator : ESC "$" ")" finalchar_for_SO 2794 * SS2designator : ESC "$" "*" finalchar_for_SS2 2795 * SS3designator : ESC "$" "+" finalchar_for_SS3 2796 * 2797 * ESC $ ) A Indicates the bytes following SO are Chinese 2798 * characters as defined in GB 2312-80, until 2799 * another SOdesignation appears 2800 * 2801 * 2802 * ESC $ ) E Indicates the bytes following SO are as defined 2803 * in ISO-IR-165 (for details, see section 2.1), 2804 * until another SOdesignation appears 2805 * 2806 * ESC $ ) G Indicates the bytes following SO are as defined 2807 * in CNS 11643-plane-1, until another 2808 * SOdesignation appears 2809 * 2810 * ESC $ * H Indicates the two 
bytes immediately following
2811 * SS2 is a Chinese character as defined in CNS
2812 * 11643-plane-2, until another SS2designation
2813 * appears
2814 * (Meaning <ESC>N must preceed every 2 byte
2815 * sequence.)
2816 *
2817 * ESC $ + I Indicates the immediate two bytes following SS3
2818 * is a Chinese character as defined in CNS
2819 * 11643-plane-3, until another SS3designation
2820 * appears
2821 * (Meaning <ESC>O must preceed every 2 byte
2822 * sequence.)
2823 *
2824 * ESC $ + J Indicates the immediate two bytes following SS3
2825 * is a Chinese character as defined in CNS
2826 * 11643-plane-4, until another SS3designation
2827 * appears
2828 * (In English: <ESC>O must preceed every 2 byte
2829 * sequence.)
2830 *
2831 * ESC $ + K Indicates the immediate two bytes following SS3
2832 * is a Chinese character as defined in CNS
2833 * 11643-plane-5, until another SS3designation
2834 * appears
2835 *
2836 * ESC $ + L Indicates the immediate two bytes following SS3
2837 * is a Chinese character as defined in CNS
2838 * 11643-plane-6, until another SS3designation
2839 * appears
2840 *
2841 * ESC $ + M Indicates the immediate two bytes following SS3
2842 * is a Chinese character as defined in CNS
2843 * 11643-plane-7, until another SS3designation
2844 * appears
2845 *
2846 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2847 * has its own designation information before any Chinese characters
2848 * appear
2849 *
2850 */
2851
2852 /* The following are defined this way to make the strings truly readonly */
/*
 * Designator escape sequences, each ESC '$' <intermediate> <final>:
 * intermediate ')' (0x29) designates the SO set, '*' (0x2A) the SS2 set and
 * '+' (0x2B) the SS3 set, matching the SOdesignator / SS2designator /
 * SS3designator rules listed in the ISO-2022-CN comment.
 */
2853 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2854 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2855 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2856 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2857 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2858 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2859 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2860 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2861 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2862
2863 /********************** ISO2022-CN Data **************************/
/*
 * Table of the byte sequences that select each ISO-2022-CN charset.
 * Entry 0 is the SI string (ASCII); entries 1..9 are the designators for
 * GB 2312, ISO-IR-165 and CNS 11643 planes 1 through 7, in that order.
 * NOTE(review): presumably indexed by the CN charset state enum declared
 * elsewhere in this file; confirm the enum order matches this table.
 */
2864 static const char* const escSeqCharsCN[10] ={
2865         SHIFT_IN_STR,           /* ASCII */
2866         GB_2312_80_STR,
2867         ISO_IR_165_STR,
2868         CNS_11643_1992_Plane_1_STR,
2869         CNS_11643_1992_Plane_2_STR,
2870         CNS_11643_1992_Plane_3_STR,
2871         CNS_11643_1992_Plane_4_STR,
2872         CNS_11643_1992_Plane_5_STR,
2873         CNS_11643_1992_Plane_6_STR,
2874         CNS_11643_1992_Plane_7_STR
2875 };
2876
2877 static void
2878 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2879     UConverter *cnv = args->converter;
2880     UConverterDataISO2022 *converterData;
2881     ISO2022State *pFromU2022State;
2882     uint8_t *target = (uint8_t *) args->target;
2883     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2884     const UChar* source = args->source;
2885     const UChar* sourceLimit = args->sourceLimit;
2886     int32_t* offsets = args->offsets;
2887     UChar32 sourceChar;
2888     char buffer[8];
2889     int32_t len;
2890     int8_t choices[3];
2891     int32_t choiceCount;
2892     uint32_t targetValue = 0;
2893     UBool useFallback;
2894
2895     /* set up the state */
2896     converterData = (UConverterDataISO2022*)cnv->extraInfo;
2897     pFromU2022State = &converterData->fromU2022State;
2898
2899     choiceCount = 0;
2900
2901     /* check if the last codepoint of previous buffer was a lead surrogate*/
2902     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2903         goto getTrail;
2904     }
2905
2906     while( source < sourceLimit){
2907         if(target < targetLimit){
2908
2909             sourceChar = *(source++);
2910             /*check if the char is a First surrogate*/
2911             if(UTF_IS_SURROGATE(sourceChar)) {
2912                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2913 getTrail:
2914                     /*look ahead to find the trail surrogate*/
2915                     if(source < sourceLimit) {
2916 /* test the following code unit */ 2917 UChar trail=(UChar) *source; 2918 if(UTF_IS_SECOND_SURROGATE(trail)) { 2919 source++; 2920 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2921 cnv->fromUChar32=0x00; 2922 /* convert this supplementary code point */ 2923 /* exit this condition tree */ 2924 } else { 2925 /* this is an unmatched lead code unit (1st surrogate) */ 2926 /* callback(illegal) */ 2927 *err=U_ILLEGAL_CHAR_FOUND; 2928 cnv->fromUChar32=sourceChar; 2929 break; 2930 } 2931 } else { 2932 /* no more input */ 2933 cnv->fromUChar32=sourceChar; 2934 break; 2935 } 2936 } else { 2937 /* this is an unmatched trail code unit (2nd surrogate) */ 2938 /* callback(illegal) */ 2939 *err=U_ILLEGAL_CHAR_FOUND; 2940 cnv->fromUChar32=sourceChar; 2941 break; 2942 } 2943 } 2944 2945 /* do the conversion */ 2946 if(sourceChar <= 0x007f ){ 2947 /* do not convert SO/SI/ESC */ 2948 if(IS_2022_CONTROL(sourceChar)) { 2949 /* callback(illegal) */ 2950 *err=U_ILLEGAL_CHAR_FOUND; 2951 cnv->fromUChar32=sourceChar; 2952 break; 2953 } 2954 2955 /* US-ASCII */ 2956 if(pFromU2022State->g == 0) { 2957 buffer[0] = (char)sourceChar; 2958 len = 1; 2959 } else { 2960 buffer[0] = UCNV_SI; 2961 buffer[1] = (char)sourceChar; 2962 len = 2; 2963 pFromU2022State->g = 0; 2964 choiceCount = 0; 2965 } 2966 if(sourceChar == CR || sourceChar == LF) { 2967 /* reset the state at the end of a line */ 2968 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2969 choiceCount = 0; 2970 } 2971 } 2972 else{ 2973 /* convert U+0080..U+10ffff */ 2974 int32_t i; 2975 int8_t cs, g; 2976 2977 if(choiceCount == 0) { 2978 /* try the current SO/G1 converter first */ 2979 choices[0] = pFromU2022State->cs[1]; 2980 2981 /* default to GB2312_1 if none is designated yet */ 2982 if(choices[0] == 0) { 2983 choices[0] = GB2312_1; 2984 } 2985 2986 if(converterData->version == 0) { 2987 /* ISO-2022-CN */ 2988 2989 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 2990 if(choices[0] == 
GB2312_1) { 2991 choices[1] = (int8_t)CNS_11643_1; 2992 } else { 2993 choices[1] = (int8_t)GB2312_1; 2994 } 2995 2996 choiceCount = 2; 2997 } else { 2998 /* ISO-2022-CN-EXT */ 2999 3000 /* try one of the other converters */ 3001 switch(choices[0]) { 3002 case GB2312_1: 3003 choices[1] = (int8_t)CNS_11643_1; 3004 choices[2] = (int8_t)ISO_IR_165; 3005 break; 3006 case ISO_IR_165: 3007 choices[1] = (int8_t)GB2312_1; 3008 choices[2] = (int8_t)CNS_11643_1; 3009 break; 3010 default: /* CNS_11643_x */ 3011 choices[1] = (int8_t)GB2312_1; 3012 choices[2] = (int8_t)ISO_IR_165; 3013 break; 3014 } 3015 3016 choiceCount = 3; 3017 } 3018 } 3019 3020 cs = g = 0; 3021 /* 3022 * len==0: no mapping found yet 3023 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3024 * len>0: found a roundtrip result, done 3025 */ 3026 len = 0; 3027 /* 3028 * We will turn off useFallback after finding a fallback, 3029 * but we still get fallbacks from PUA code points as usual. 3030 * Therefore, we will also need to check that we don't overwrite 3031 * an early fallback with a later one. 
3032 */ 3033 useFallback = cnv->useFallback; 3034 3035 for(i = 0; i < choiceCount && len <= 0; ++i) { 3036 int8_t cs0 = choices[i]; 3037 if(cs0 > 0) { 3038 uint32_t value; 3039 int32_t len2; 3040 if(cs0 >= CNS_11643_0) { 3041 len2 = MBCS_FROM_UCHAR32_ISO2022( 3042 converterData->myConverterArray[CNS_11643], 3043 sourceChar, 3044 &value, 3045 useFallback, 3046 MBCS_OUTPUT_3); 3047 if(len2 == 3 || (len2 == -3 && len == 0)) { 3048 targetValue = value; 3049 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3050 if(len2 >= 0) { 3051 len = 2; 3052 } else { 3053 len = -2; 3054 useFallback = FALSE; 3055 } 3056 if(cs == CNS_11643_1) { 3057 g = 1; 3058 } else if(cs == CNS_11643_2) { 3059 g = 2; 3060 } else /* plane 3..7 */ if(converterData->version == 1) { 3061 g = 3; 3062 } else { 3063 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3064 len = 0; 3065 } 3066 } 3067 } else { 3068 /* GB2312_1 or ISO-IR-165 */ 3069 len2 = MBCS_FROM_UCHAR32_ISO2022( 3070 converterData->myConverterArray[cs0], 3071 sourceChar, 3072 &value, 3073 useFallback, 3074 MBCS_OUTPUT_2); 3075 if(len2 == 2 || (len2 == -2 && len == 0)) { 3076 targetValue = value; 3077 len = len2; 3078 cs = cs0; 3079 g = 1; 3080 useFallback = FALSE; 3081 } 3082 } 3083 } 3084 } 3085 3086 if(len != 0) { 3087 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3088 3089 /* write the designation sequence if necessary */ 3090 if(cs != pFromU2022State->cs[g]) { 3091 if(cs < CNS_11643) { 3092 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3093 } else { 3094 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3095 } 3096 len = 4; 3097 pFromU2022State->cs[g] = cs; 3098 if(g == 1) { 3099 /* changing the SO/G1 charset invalidates the choices[] */ 3100 choiceCount = 0; 3101 } 3102 } 3103 3104 /* write the shift sequence if necessary */ 3105 if(g != pFromU2022State->g) { 3106 switch(g) { 3107 case 1: 3108 buffer[len++] = UCNV_SO; 3109 3110 /* set the new state only if it is the locking shift 
SO/G1, not for SS2 or SS3 */ 3111 pFromU2022State->g = 1; 3112 break; 3113 case 2: 3114 buffer[len++] = 0x1b; 3115 buffer[len++] = 0x4e; 3116 break; 3117 default: /* case 3 */ 3118 buffer[len++] = 0x1b; 3119 buffer[len++] = 0x4f; 3120 break; 3121 } 3122 } 3123 3124 /* write the two output bytes */ 3125 buffer[len++] = (char)(targetValue >> 8); 3126 buffer[len++] = (char)targetValue; 3127 } else { 3128 /* if we cannot find the character after checking all codepages 3129 * then this is an error 3130 */ 3131 *err = U_INVALID_CHAR_FOUND; 3132 cnv->fromUChar32=sourceChar; 3133 break; 3134 } 3135 } 3136 3137 /* output len>0 bytes in buffer[] */ 3138 if(len == 1) { 3139 *target++ = buffer[0]; 3140 if(offsets) { 3141 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3142 } 3143 } else if(len == 2 && (target + 2) <= targetLimit) { 3144 *target++ = buffer[0]; 3145 *target++ = buffer[1]; 3146 if(offsets) { 3147 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3148 *offsets++ = sourceIndex; 3149 *offsets++ = sourceIndex; 3150 } 3151 } else { 3152 fromUWriteUInt8( 3153 cnv, 3154 buffer, len, 3155 &target, (const char *)targetLimit, 3156 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3157 err); 3158 if(U_FAILURE(*err)) { 3159 break; 3160 } 3161 } 3162 } /* end if(myTargetIndex<myTargetLength) */ 3163 else{ 3164 *err =U_BUFFER_OVERFLOW_ERROR; 3165 break; 3166 } 3167 3168 }/* end while(mySourceIndex<mySourceLength) */ 3169 3170 /* 3171 * the end of the input stream and detection of truncated input 3172 * are handled by the framework, but for ISO-2022-CN conversion 3173 * we need to be in ASCII mode at the very end 3174 * 3175 * conditions: 3176 * successful 3177 * not in ASCII mode 3178 * end of input and no truncated input 3179 */ 3180 if( U_SUCCESS(*err) && 3181 pFromU2022State->g!=0 && 3182 args->flush && source>=sourceLimit && cnv->fromUChar32==0 3183 ) { 3184 int32_t sourceIndex; 3185 3186 /* 
we are switching to ASCII */ 3187 pFromU2022State->g=0; 3188 3189 /* get the source index of the last input character */ 3190 /* 3191 * TODO this would be simpler and more reliable if we used a pair 3192 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3193 * so that we could simply use the prevSourceIndex here; 3194 * this code gives an incorrect result for the rare case of an unmatched 3195 * trail surrogate that is alone in the last buffer of the text stream 3196 */ 3197 sourceIndex=(int32_t)(source-args->source); 3198 if(sourceIndex>0) { 3199 --sourceIndex; 3200 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3201 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3202 ) { 3203 --sourceIndex; 3204 } 3205 } else { 3206 sourceIndex=-1; 3207 } 3208 3209 fromUWriteUInt8( 3210 cnv, 3211 SHIFT_IN_STR, 1, 3212 &target, (const char *)targetLimit, 3213 &offsets, sourceIndex, 3214 err); 3215 } 3216 3217 /*save the state and return */ 3218 args->source = source; 3219 args->target = (char*)target; 3220 } 3221 3222 3223 static void 3224 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3225 UErrorCode* err){ 3226 char tempBuf[3]; 3227 const char *mySource = (char *) args->source; 3228 UChar *myTarget = args->target; 3229 const char *mySourceLimit = args->sourceLimit; 3230 uint32_t targetUniChar = 0x0000; 3231 uint32_t mySourceChar = 0x0000; 3232 UConverterDataISO2022* myData; 3233 ISO2022State *pToU2022State; 3234 3235 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3236 pToU2022State = &myData->toU2022State; 3237 3238 if(myData->key != 0) { 3239 /* continue with a partial escape sequence */ 3240 goto escape; 3241 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3242 /* continue with a partial double-byte character */ 3243 mySourceChar = args->converter->toUBytes[0]; 3244 args->converter->toULength = 0; 3245 targetUniChar = missingCharMarker; 3246 goto getTrailByte; 
3247 } 3248 3249 while(mySource < mySourceLimit){ 3250 3251 targetUniChar =missingCharMarker; 3252 3253 if(myTarget < args->targetLimit){ 3254 3255 mySourceChar= (unsigned char) *mySource++; 3256 3257 switch(mySourceChar){ 3258 case UCNV_SI: 3259 pToU2022State->g=0; 3260 if (myData->isEmptySegment) { 3261 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3262 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3263 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3264 args->converter->toUBytes[0] = mySourceChar; 3265 args->converter->toULength = 1; 3266 args->target = myTarget; 3267 args->source = mySource; 3268 return; 3269 } 3270 continue; 3271 3272 case UCNV_SO: 3273 if(pToU2022State->cs[1] != 0) { 3274 pToU2022State->g=1; 3275 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3276 continue; 3277 } else { 3278 /* illegal to have SO before a matching designator */ 3279 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3280 break; 3281 } 3282 3283 case ESC_2022: 3284 mySource--; 3285 escape: 3286 { 3287 const char * mySourceBefore = mySource; 3288 int8_t toULengthBefore = args->converter->toULength; 3289 3290 changeState_2022(args->converter,&(mySource), 3291 mySourceLimit, ISO_2022_CN,err); 3292 3293 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3294 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3295 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3296 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3297 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3298 } 3299 } 3300 3301 /* invalid or illegal escape sequence */ 3302 if(U_FAILURE(*err)){ 3303 args->target = myTarget; 3304 args->source = mySource; 3305 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 3306 return; 3307 } 3308 continue; 3309 3310 /* ISO-2022-CN does 
not use single-byte (C1) SS2 and SS3 */ 3311 3312 case CR: 3313 /*falls through*/ 3314 case LF: 3315 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3316 /* falls through */ 3317 default: 3318 /* convert one or two bytes */ 3319 myData->isEmptySegment = FALSE; 3320 if(pToU2022State->g != 0) { 3321 if(mySource < mySourceLimit) { 3322 UConverterSharedData *cnv; 3323 StateEnum tempState; 3324 int32_t tempBufLen; 3325 int leadIsOk, trailIsOk; 3326 uint8_t trailByte; 3327 getTrailByte: 3328 trailByte = (uint8_t)*mySource; 3329 /* 3330 * Ticket 5691: consistent illegal sequences: 3331 * - We include at least the first byte in the illegal sequence. 3332 * - If any of the non-initial bytes could be the start of a character, 3333 * we stop the illegal sequence before the first one of those. 3334 * 3335 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3336 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3337 * Otherwise we convert or report the pair of bytes. 
3338 */ 3339 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3340 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3341 if (leadIsOk && trailIsOk) { 3342 ++mySource; 3343 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3344 if(tempState >= CNS_11643_0) { 3345 cnv = myData->myConverterArray[CNS_11643]; 3346 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3347 tempBuf[1] = (char) (mySourceChar); 3348 tempBuf[2] = (char) trailByte; 3349 tempBufLen = 3; 3350 3351 }else{ 3352 cnv = myData->myConverterArray[tempState]; 3353 tempBuf[0] = (char) (mySourceChar); 3354 tempBuf[1] = (char) trailByte; 3355 tempBufLen = 2; 3356 } 3357 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3358 mySourceChar = (mySourceChar << 8) | trailByte; 3359 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3360 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3361 ++mySource; 3362 /* add another bit so that the code below writes 2 bytes in case of error */ 3363 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3364 } 3365 if(pToU2022State->g>=2) { 3366 /* return from a single-shift state to the previous one */ 3367 pToU2022State->g=pToU2022State->prevG; 3368 } 3369 } else { 3370 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3371 args->converter->toULength = 1; 3372 goto endloop; 3373 } 3374 } 3375 else{ 3376 if(mySourceChar <= 0x7f) { 3377 targetUniChar = (UChar) mySourceChar; 3378 } 3379 } 3380 break; 3381 } 3382 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3383 if(args->offsets){ 3384 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3385 } 3386 *(myTarget++)=(UChar)targetUniChar; 3387 } 3388 else if(targetUniChar > missingCharMarker){ 3389 /* disassemble the surrogate pair and write to output*/ 3390 targetUniChar-=0x0010000; 3391 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3392 if(args->offsets){ 3393 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3394 } 3395 ++myTarget; 3396 if(myTarget< args->targetLimit){ 3397 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3398 if(args->offsets){ 3399 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3400 } 3401 ++myTarget; 3402 }else{ 3403 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3404 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3405 } 3406 3407 } 3408 else{ 3409 /* Call the callback function*/ 3410 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3411 break; 3412 } 3413 } 3414 else{ 3415 *err =U_BUFFER_OVERFLOW_ERROR; 3416 break; 3417 } 3418 } 3419 endloop: 3420 args->target = myTarget; 3421 args->source = mySource; 3422 } 3423 3424 static void 3425 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3426 UConverter *cnv = args->converter; 3427 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3428 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3429 char *p, *subchar; 3430 char buffer[8]; 3431 int32_t length; 3432 3433 subchar=(char *)cnv->subChars; 3434 length=cnv->subCharLen; /* assume length==1 for most variants */ 3435 3436 p = buffer; 3437 switch(myConverterData->locale[0]){ 3438 case 'j': 3439 { 3440 int8_t cs; 3441 3442 if(pFromU2022State->g == 1) { 3443 /* JIS7: switch from G1 to G0 */ 3444 pFromU2022State->g = 0; 3445 *p++ = UCNV_SI; 3446 } 3447 3448 cs = pFromU2022State->cs[0]; 3449 if(cs != ASCII && cs != JISX201) { 3450 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3451 pFromU2022State->cs[0] = (int8_t)ASCII; 3452 *p++ = '\x1b'; 3453 *p++ = '\x28'; 3454 *p++ = '\x42'; 3455 } 3456 3457 *p++ = subchar[0]; 3458 break; 3459 } 3460 case 'c': 3461 if(pFromU2022State->g != 0) { 3462 /* not in ASCII mode: switch to ASCII */ 3463 pFromU2022State->g = 0; 3464 *p++ = UCNV_SI; 3465 } 3466 *p++ = subchar[0]; 3467 break; 3468 case 'k': 3469 if(myConverterData->version == 0) { 3470 if(length == 1) { 3471 if((UBool)args->converter->fromUnicodeStatus) { 3472 /* in DBCS mode: switch to SBCS */ 3473 args->converter->fromUnicodeStatus = 0; 3474 *p++ = UCNV_SI; 3475 } 3476 *p++ = subchar[0]; 3477 } else /* length == 2*/ { 3478 if(!(UBool)args->converter->fromUnicodeStatus) { 3479 /* in SBCS mode: switch to DBCS */ 3480 args->converter->fromUnicodeStatus = 1; 3481 *p++ = UCNV_SO; 3482 } 3483 *p++ = subchar[0]; 3484 *p++ = subchar[1]; 3485 } 3486 break; 3487 } else { 3488 /* save the subconverter's substitution string */ 3489 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3490 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3491 3492 /* set our substitution string into the subconverter */ 3493 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3494 myConverterData->currentConverter->subCharLen = (int8_t)length; 3495 3496 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3497 args->converter = myConverterData->currentConverter; 3498 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3499 ucnv_cbFromUWriteSub(args, 0, err); 3500 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3501 args->converter = cnv; 3502 3503 /* restore the subconverter's substitution string */ 3504 myConverterData->currentConverter->subChars = currentSubChars; 3505 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3506 3507 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3508 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3509 uprv_memcpy( 3510 cnv->charErrorBuffer, 3511 myConverterData->currentConverter->charErrorBuffer, 3512 myConverterData->currentConverter->charErrorBufferLength); 3513 } 3514 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3515 myConverterData->currentConverter->charErrorBufferLength = 0; 3516 } 3517 return; 3518 } 3519 default: 3520 /* not expected */ 3521 break; 3522 } 3523 ucnv_cbFromUWriteBytes(args, 3524 buffer, (int32_t)(p - buffer), 3525 offsetIndex, err); 3526 } 3527 3528 /* 3529 * Structure for cloning an ISO 2022 converter into a single memory block. 3530 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3531 * and then ucnv_safeClone() of the sub-converter may additionally align 3532 * currentConverter inside the cloneStruct, for which we need the deadSpace 3533 * after currentConverter. 3534 * This is because UAlignedMemory may be larger than the actually 3535 * necessary alignment size for the platform. 3536 * The other cloneStruct fields will not be moved around, 3537 * and are aligned properly with cloneStruct's alignment. 
3538 */ 3539 struct cloneStruct 3540 { 3541 UConverter cnv; 3542 UConverter currentConverter; 3543 UAlignedMemory deadSpace; 3544 UConverterDataISO2022 mydata; 3545 }; 3546 3547 3548 static UConverter * 3549 _ISO_2022_SafeClone( 3550 const UConverter *cnv, 3551 void *stackBuffer, 3552 int32_t *pBufferSize, 3553 UErrorCode *status) 3554 { 3555 struct cloneStruct * localClone; 3556 UConverterDataISO2022 *cnvData; 3557 int32_t i, size; 3558 3559 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3560 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3561 return NULL; 3562 } 3563 3564 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3565 localClone = (struct cloneStruct *)stackBuffer; 3566 3567 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3568 3569 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3570 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3571 localClone->cnv.isExtraLocal = TRUE; 3572 3573 /* share the subconverters */ 3574 3575 if(cnvData->currentConverter != NULL) { 3576 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3577 localClone->mydata.currentConverter = 3578 ucnv_safeClone(cnvData->currentConverter, 3579 &localClone->currentConverter, 3580 &size, status); 3581 if(U_FAILURE(*status)) { 3582 return NULL; 3583 } 3584 } 3585 3586 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3587 if(cnvData->myConverterArray[i] != NULL) { 3588 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3589 } 3590 } 3591 3592 return &localClone->cnv; 3593 } 3594 3595 static void 3596 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3597 const USetAdder *sa, 3598 UConverterUnicodeSet which, 3599 UErrorCode *pErrorCode) 3600 { 3601 int32_t i; 3602 UConverterDataISO2022* cnvData; 3603 3604 if (U_FAILURE(*pErrorCode)) { 3605 return; 3606 } 3607 #ifdef U_ENABLE_GENERIC_ISO_2022 3608 if (cnv->sharedData == &_ISO2022Data) { 
3609 /* We use UTF-8 in this case */ 3610 sa->addRange(sa->set, 0, 0xd7FF); 3611 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3612 return; 3613 } 3614 #endif 3615 3616 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3617 3618 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3619 switch(cnvData->locale[0]){ 3620 case 'j': 3621 /* include JIS X 0201 which is hardcoded */ 3622 sa->add(sa->set, 0xa5); 3623 sa->add(sa->set, 0x203e); 3624 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3625 /* include Latin-1 for some variants of JP */ 3626 sa->addRange(sa->set, 0, 0xff); 3627 } else { 3628 /* include ASCII for JP */ 3629 sa->addRange(sa->set, 0, 0x7f); 3630 } 3631 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3632 /* 3633 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3634 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3635 * use half-width Katakana. 3636 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3637 * half-width Katakana via the ESC ( I sequence. 3638 * However, we only emit (fromUnicode) half-width Katakana according to the 3639 * definition of each variant. 3640 * 3641 * When including fallbacks, 3642 * we need to include half-width Katakana Unicode code points for all JP variants because 3643 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3644 */ 3645 /* include half-width Katakana for JP */ 3646 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3647 } 3648 break; 3649 case 'c': 3650 case 'z': 3651 /* include ASCII for CN */ 3652 sa->addRange(sa->set, 0, 0x7f); 3653 break; 3654 case 'k': 3655 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3656 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3657 cnvData->currentConverter, sa, which, pErrorCode); 3658 /* the loop over myConverterArray[] will simply not find another converter */ 3659 break; 3660 default: 3661 break; 3662 } 3663 3664 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3665 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3666 cnvData->version==0 && i==CNS_11643 3667 ) { 3668 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3669 ucnv_MBCSGetUnicodeSetForBytes( 3670 cnvData->myConverterArray[i], 3671 sa, UCNV_ROUNDTRIP_SET, 3672 0, 0x81, 0x82, 3673 pErrorCode); 3674 } 3675 #endif 3676 3677 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3678 UConverterSetFilter filter; 3679 if(cnvData->myConverterArray[i]!=NULL) { 3680 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3681 cnvData->version==0 && i==CNS_11643 3682 ) { 3683 /* 3684 * Version-specific for CN: 3685 * CN version 0 does not map CNS planes 3..7 although 3686 * they are all available in the CNS conversion table; 3687 * CN version 1 (-EXT) does map them all. 3688 * The two versions create different Unicode sets. 3689 */ 3690 filter=UCNV_SET_FILTER_2022_CN; 3691 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3692 /* 3693 * Only add code points that map to Shift-JIS codes 3694 * corresponding to JIS X 0208. 
3695 */ 3696 filter=UCNV_SET_FILTER_SJIS; 3697 } else if(i==KSC5601) { 3698 /* 3699 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3700 * are broader than GR94. 3701 */ 3702 filter=UCNV_SET_FILTER_GR94DBCS; 3703 } else { 3704 filter=UCNV_SET_FILTER_NONE; 3705 } 3706 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3707 } 3708 } 3709 3710 /* 3711 * ISO 2022 converters must not convert SO/SI/ESC despite what 3712 * sub-converters do by themselves. 3713 * Remove these characters from the set. 3714 */ 3715 sa->remove(sa->set, 0x0e); 3716 sa->remove(sa->set, 0x0f); 3717 sa->remove(sa->set, 0x1b); 3718 3719 /* ISO 2022 converters do not convert C1 controls either */ 3720 sa->removeRange(sa->set, 0x80, 0x9f); 3721 } 3722 3723 static const UConverterImpl _ISO2022Impl={ 3724 UCNV_ISO_2022, 3725 3726 NULL, 3727 NULL, 3728 3729 _ISO2022Open, 3730 _ISO2022Close, 3731 _ISO2022Reset, 3732 3733 #ifdef U_ENABLE_GENERIC_ISO_2022 3734 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3735 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3736 ucnv_fromUnicode_UTF8, 3737 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3738 #else 3739 NULL, 3740 NULL, 3741 NULL, 3742 NULL, 3743 #endif 3744 NULL, 3745 3746 NULL, 3747 _ISO2022getName, 3748 _ISO_2022_WriteSub, 3749 _ISO_2022_SafeClone, 3750 _ISO_2022_GetUnicodeSet 3751 }; 3752 static const UConverterStaticData _ISO2022StaticData={ 3753 sizeof(UConverterStaticData), 3754 "ISO_2022", 3755 2022, 3756 UCNV_IBM, 3757 UCNV_ISO_2022, 3758 1, 3759 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3760 { 0x1a, 0, 0, 0 }, 3761 1, 3762 FALSE, 3763 FALSE, 3764 0, 3765 0, 3766 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3767 }; 3768 const UConverterSharedData _ISO2022Data={ 3769 sizeof(UConverterSharedData), 3770 ~((uint32_t) 0), 3771 NULL, 3772 NULL, 3773 &_ISO2022StaticData, 3774 FALSE, 3775 &_ISO2022Impl, 3776 0 3777 }; 3778 3779 
/*************JP****************/ 3780 static const UConverterImpl _ISO2022JPImpl={ 3781 UCNV_ISO_2022, 3782 3783 NULL, 3784 NULL, 3785 3786 _ISO2022Open, 3787 _ISO2022Close, 3788 _ISO2022Reset, 3789 3790 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3791 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3792 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3793 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3794 NULL, 3795 3796 NULL, 3797 _ISO2022getName, 3798 _ISO_2022_WriteSub, 3799 _ISO_2022_SafeClone, 3800 _ISO_2022_GetUnicodeSet 3801 }; 3802 static const UConverterStaticData _ISO2022JPStaticData={ 3803 sizeof(UConverterStaticData), 3804 "ISO_2022_JP", 3805 0, 3806 UCNV_IBM, 3807 UCNV_ISO_2022, 3808 1, 3809 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3810 { 0x1a, 0, 0, 0 }, 3811 1, 3812 FALSE, 3813 FALSE, 3814 0, 3815 0, 3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3817 }; 3818 static const UConverterSharedData _ISO2022JPData={ 3819 sizeof(UConverterSharedData), 3820 ~((uint32_t) 0), 3821 NULL, 3822 NULL, 3823 &_ISO2022JPStaticData, 3824 FALSE, 3825 &_ISO2022JPImpl, 3826 0 3827 }; 3828 3829 /************* KR ***************/ 3830 static const UConverterImpl _ISO2022KRImpl={ 3831 UCNV_ISO_2022, 3832 3833 NULL, 3834 NULL, 3835 3836 _ISO2022Open, 3837 _ISO2022Close, 3838 _ISO2022Reset, 3839 3840 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3841 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3842 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3843 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3844 NULL, 3845 3846 NULL, 3847 _ISO2022getName, 3848 _ISO_2022_WriteSub, 3849 _ISO_2022_SafeClone, 3850 _ISO_2022_GetUnicodeSet 3851 }; 3852 static const UConverterStaticData _ISO2022KRStaticData={ 3853 sizeof(UConverterStaticData), 3854 "ISO_2022_KR", 3855 0, 3856 UCNV_IBM, 3857 UCNV_ISO_2022, 3858 1, 3859 3, /* max 3 bytes per UChar: SO+DBCS */ 3860 { 0x1a, 0, 0, 0 }, 3861 1, 3862 FALSE, 3863 FALSE, 3864 0, 3865 0, 3866 { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3867 }; 3868 static const UConverterSharedData _ISO2022KRData={ 3869 sizeof(UConverterSharedData), 3870 ~((uint32_t) 0), 3871 NULL, 3872 NULL, 3873 &_ISO2022KRStaticData, 3874 FALSE, 3875 &_ISO2022KRImpl, 3876 0 3877 }; 3878 3879 /*************** CN ***************/ 3880 static const UConverterImpl _ISO2022CNImpl={ 3881 3882 UCNV_ISO_2022, 3883 3884 NULL, 3885 NULL, 3886 3887 _ISO2022Open, 3888 _ISO2022Close, 3889 _ISO2022Reset, 3890 3891 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3892 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3893 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3894 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3895 NULL, 3896 3897 NULL, 3898 _ISO2022getName, 3899 _ISO_2022_WriteSub, 3900 _ISO_2022_SafeClone, 3901 _ISO_2022_GetUnicodeSet 3902 }; 3903 static const UConverterStaticData _ISO2022CNStaticData={ 3904 sizeof(UConverterStaticData), 3905 "ISO_2022_CN", 3906 0, 3907 UCNV_IBM, 3908 UCNV_ISO_2022, 3909 1, 3910 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3911 { 0x1a, 0, 0, 0 }, 3912 1, 3913 FALSE, 3914 FALSE, 3915 0, 3916 0, 3917 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3918 }; 3919 static const UConverterSharedData _ISO2022CNData={ 3920 sizeof(UConverterSharedData), 3921 ~((uint32_t) 0), 3922 NULL, 3923 NULL, 3924 &_ISO2022CNStaticData, 3925 FALSE, 3926 &_ISO2022CNImpl, 3927 0 3928 }; 3929 3930 3931 3932 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3933