1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "unicode/utf16.h" 38 #include "ucnv_imp.h" 39 #include "ucnv_bld.h" 40 #include "ucnv_cnv.h" 41 #include "ucnvmbcs.h" 42 #include "cstring.h" 43 #include "cmemory.h" 44 #include "uassert.h" 45 46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 47 48 #ifdef U_ENABLE_GENERIC_ISO_2022 49 /* 50 * I am disabling the generic ISO-2022 converter after proposing to do so on 51 * the icu mailing list two days ago. 52 * 53 * Reasons: 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 55 * its designation sequences, single shifts with return to the previous state, 56 * switch-with-no-return to UTF-16BE or similar, etc. 57 * This is unlike the language-specific variants like ISO-2022-JP which 58 * require a much smaller repertoire of ISO-2022 features. 
59 * These variants continue to be supported. 60 * 2. I believe that no one is really using the generic ISO-2022 converter 61 * but rather always one of the language-specific variants. 62 * Note that ICU's generic ISO-2022 converter has always output one escape 63 * sequence followed by UTF-8 for the whole stream. 64 * 3. Switching between subcharsets is extremely slow, because each time 65 * the previous converter is closed and a new one opened, 66 * without any kind of caching, least-recently-used list, etc. 67 * 4. The code is currently buggy, and given the above it does not seem 68 * reasonable to spend the time on maintenance. 69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 70 * This means, for example, that when ISO-8859-7 is designated, the following 71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 72 * The ICU ISO-2022 converter does not handle this - and has no information 73 * about which subconverter would have to be shifted vs. which is designed 74 * for 7-bit ISO-2022. 75 * 76 * Markus Scherer 2003-dec-03 77 */ 78 #endif 79 80 static const char SHIFT_IN_STR[] = "\x0F"; 81 // static const char SHIFT_OUT_STR[] = "\x0E"; 82 83 #define CR 0x0D 84 #define LF 0x0A 85 #define H_TAB 0x09 86 #define V_TAB 0x0B 87 #define SPACE 0x20 88 89 enum { 90 HWKANA_START=0xff61, 91 HWKANA_END=0xff9f 92 }; 93 94 /* 95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 96 * as bytes 21..7E. (Subtract 0x80.) 97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 98 * as bytes 20..7F. (Subtract 0x80.) 99 * Do not encode C1 control codes with native bytes 80..9F 100 * as bytes 00..1F (C0 control codes). 101 */ 102 enum { 103 GR94_START=0xa1, 104 GR94_END=0xfe, 105 GR96_START=0xa0, 106 GR96_END=0xff 107 }; 108 109 /* 110 * ISO 2022 control codes must not be converted from Unicode 111 * because they would mess up the byte stream. 
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 113 * corresponding to SO, SI, and ESC. 114 */ 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 116 117 /* for ISO-2022-JP and -CN implementations */ 118 typedef enum { 119 /* shared values */ 120 INVALID_STATE=-1, 121 ASCII = 0, 122 123 SS2_STATE=0x10, 124 SS3_STATE, 125 126 /* JP */ 127 ISO8859_1 = 1 , 128 ISO8859_7 = 2 , 129 JISX201 = 3, 130 JISX208 = 4, 131 JISX212 = 5, 132 GB2312 =6, 133 KSC5601 =7, 134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 135 136 /* CN */ 137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 138 GB2312_1=1, 139 ISO_IR_165=2, 140 CNS_11643=3, 141 142 /* 143 * these are used in StateEnum and ISO2022State variables, 144 * but CNS_11643 must be used to index into myConverterArray[] 145 */ 146 CNS_11643_0=0x20, 147 CNS_11643_1, 148 CNS_11643_2, 149 CNS_11643_3, 150 CNS_11643_4, 151 CNS_11643_5, 152 CNS_11643_6, 153 CNS_11643_7 154 } StateEnum; 155 156 /* is the StateEnum charset value for a DBCS charset? */ 157 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 158 159 #define CSM(cs) ((uint16_t)1<<(cs)) 160 161 /* 162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 164 * 165 * Note: The converter uses some leniency: 166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 167 * all versions, not just JIS7 and JIS8. 168 * - ICU does not distinguish between different versions of JIS X 0208. 
169 */ 170 enum { MAX_JA_VERSION=4 }; 171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 177 }; 178 179 typedef enum { 180 ASCII1=0, 181 LATIN1, 182 SBCS, 183 DBCS, 184 MBCS, 185 HWKANA 186 }Cnv2022Type; 187 188 typedef struct ISO2022State { 189 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 190 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 191 int8_t prevG; /* g before single shift (SS2 or SS3) */ 192 } ISO2022State; 193 194 #define UCNV_OPTIONS_VERSION_MASK 0xf 195 #define UCNV_2022_MAX_CONVERTERS 10 196 197 typedef struct{ 198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 199 UConverter *currentConverter; 200 Cnv2022Type currentType; 201 ISO2022State toU2022State, fromU2022State; 202 uint32_t key; 203 uint32_t version; 204 #ifdef U_ENABLE_GENERIC_ISO_2022 205 UBool isFirstBuffer; 206 #endif 207 UBool isEmptySegment; 208 char name[30]; 209 char locale[3]; 210 }UConverterDataISO2022; 211 212 /* Protos */ 213 /* ISO-2022 ----------------------------------------------------------------- */ 214 215 /*Forward declaration */ 216 U_CFUNC void 217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 218 UErrorCode * err); 219 U_CFUNC void 220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 221 UErrorCode * err); 222 223 #define ESC_2022 0x1B /*ESC*/ 224 225 typedef enum 226 { 227 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 228 VALID_NON_TERMINAL_2022 = 0, 
/*so far corresponds to a valid iso 2022 escape sequence*/ 229 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 230 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 231 } UCNV_TableStates_2022; 232 233 /* 234 * The way these state transition arrays work is: 235 * ex : ESC$B is the sequence for JISX208 236 * a) First Iteration: char is ESC 237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 238 * int x = normalize_esq_chars_2022[27] which is equal to 1 239 * ii) Search for this value in escSeqStateTable_Key_2022[] 240 * value of x is stored at escSeqStateTable_Key_2022[0] 241 * iii) Save this index as offset 242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 244 * b) Switch on this state and continue to next char 245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 246 * which is normalize_esq_chars_2022[36] == 4 247 * ii) x is currently 1(from above) 248 * x<<=5 -- x is now 32 249 * x+=normalize_esq_chars_2022[36] 250 * now x is 36 251 * iii) Search for this value in escSeqStateTable_Key_2022[] 252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 255 * c) Switch on this state and continue to next char 256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 257 * ii) x is currently 36 (from above) 258 * x<<=5 -- x is now 1152 259 * x+=normalize_esq_chars_2022[66] 260 * now x is 1161 261 * iii) Search for this value in escSeqStateTable_Key_2022[] 262 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 263 * iv) Get state of this sequence from 
escSeqStateTable_Value_2022[24] 264 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 265 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 266 */ 267 268 269 /*Below are the 3 arrays depicting a state transition table*/ 270 static const int8_t normalize_esq_chars_2022[256] = { 271 /* 0 1 2 3 4 5 6 7 8 9 */ 272 273 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 275 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 276 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 277 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 278 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 279 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 280 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 281 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 282 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 298 ,0 ,0 ,0 ,0 ,0 ,0 299 }; 300 301 #ifdef U_ENABLE_GENERIC_ISO_2022 302 /* 303 * When the generic ISO-2022 converter is completely removed, not just disabled 304 * per #ifdef, then the following state table and the associated tables that are 305 * dimensioned with MAX_STATES_2022 should be trimmed. 306 * 307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 308 * the associated escape sequences starting with ESC ( B should be removed. 309 * This includes the ones with key values 1097 and all of the ones above 1000000. 310 * 311 * For the latter, the tables can simply be truncated. 
312 * For the former, since the tables must be kept parallel, it is probably best 313 * to simply duplicate an adjacent table cell, parallel in all tables. 314 * 315 * It may make sense to restructure the tables, especially by using small search 316 * tables for the variants instead of indexing them parallel to the table here. 317 */ 318 #endif 319 320 #define MAX_STATES_2022 74 321 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 322 /* 0 1 2 3 4 5 6 7 8 9 */ 323 324 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 325 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 326 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 327 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 328 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 329 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 330 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 331 ,35947631 ,35947635 ,35947636 ,35947638 332 }; 333 334 #ifdef U_ENABLE_GENERIC_ISO_2022 335 336 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 337 /* 0 1 2 3 4 5 6 7 8 9 */ 338 339 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 340 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 341 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 342 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 343 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 344 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 345 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL 
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 346 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 347 }; 348 349 #endif 350 351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 352 /* 0 1 2 3 4 5 6 7 8 9 */ 353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 361 }; 362 363 364 /* Type def for refactoring changeState_2022 code*/ 365 typedef enum{ 366 #ifdef 
U_ENABLE_GENERIC_ISO_2022 367 ISO_2022=0, 368 #endif 369 ISO_2022_JP=1, 370 ISO_2022_KR=2, 371 ISO_2022_CN=3 372 } Variant2022; 373 374 /*********** ISO 2022 Converter Protos ***********/ 375 static void 376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 377 378 static void 379 _ISO2022Close(UConverter *converter); 380 381 static void 382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 383 384 static const char* 385 _ISO2022getName(const UConverter* cnv); 386 387 static void 388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 389 390 static UConverter * 391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 392 393 #ifdef U_ENABLE_GENERIC_ISO_2022 394 static void 395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 396 #endif 397 398 namespace { 399 400 /*const UConverterSharedData _ISO2022Data;*/ 401 extern const UConverterSharedData _ISO2022JPData; 402 extern const UConverterSharedData _ISO2022KRData; 403 extern const UConverterSharedData _ISO2022CNData; 404 405 } // namespace 406 407 /*************** Converter implementations ******************/ 408 409 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 410 static inline void 411 fromUWriteUInt8(UConverter *cnv, 412 const char *bytes, int32_t length, 413 uint8_t **target, const char *targetLimit, 414 int32_t **offsets, 415 int32_t sourceIndex, 416 UErrorCode *pErrorCode) 417 { 418 char *targetChars = (char *)*target; 419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 420 offsets, sourceIndex, pErrorCode); 421 *target = (uint8_t*)targetChars; 422 423 } 424 425 static inline void 426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 427 if(myConverterData->version == 1) { 428 UConverter *cnv = myConverterData->currentConverter; 429 430 cnv->toUnicodeStatus=0; /* offset */ 431 cnv->mode=0; /* state */ 432 cnv->toULength=0; /* byteIndex */ 433 } 434 } 435 436 static inline void 437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 438 /* in ISO-2022-KR the designator sequence appears only once 439 * in a file so we append it only once 440 */ 441 if( converter->charErrorBufferLength==0){ 442 443 converter->charErrorBufferLength = 4; 444 converter->charErrorBuffer[0] = 0x1b; 445 converter->charErrorBuffer[1] = 0x24; 446 converter->charErrorBuffer[2] = 0x29; 447 converter->charErrorBuffer[3] = 0x43; 448 } 449 if(myConverterData->version == 1) { 450 UConverter *cnv = myConverterData->currentConverter; 451 452 cnv->fromUChar32=0; 453 cnv->fromUnicodeStatus=1; /* prevLength */ 454 } 455 } 456 457 static void 458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 459 460 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 461 462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 463 if(cnv->extraInfo != NULL) { 464 UConverterNamePieces stackPieces; 465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 467 uint32_t version; 468 469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 470 
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 472 myConverterData->currentType = ASCII1; 473 cnv->fromUnicodeStatus =FALSE; 474 if(pArgs->locale){ 475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 476 } 477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 478 myConverterData->version = version; 479 /* Begin Google-specific change. */ 480 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 481 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 482 if((myLocale[0]=='j' && 483 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 484 myLocale[1]=='s') && 485 (myLocale[2]=='_' || myLocale[2]=='\0'))) 486 { 487 size_t len=0; 488 /* open the required converters and cache them */ 489 if(version>MAX_JA_VERSION) { 490 /* prevent indexing beyond jpCharsetMasks[] */ 491 myConverterData->version = version = 0; 492 } 493 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 494 myConverterData->myConverterArray[ISO8859_7] = 495 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 496 } 497 if (myLocale[1]=='k') { /* Use KDDI's version. */ 498 myConverterData->myConverterArray[JISX208] = 499 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 500 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */ 501 myConverterData->myConverterArray[JISX208] = 502 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 503 } else { 504 /* 505 * Change for http://b/issue?id=937017 : 506 * Restore JIS X 0208 ISO-2022-JP mappings from before 507 * sharing the table with the Shift-JIS converter 508 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797). 509 * TODO(mscherer): Create and use a new, unified Google Shift-JIS 510 * table for both Shift-JIS and ISO-2022-JP. 511 */ 512 myConverterData->myConverterArray[JISX208] = 513 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode); 514 } 515 /* End Google-specific change. 
*/ 516 if(jpCharsetMasks[version]&CSM(JISX212)) { 517 myConverterData->myConverterArray[JISX212] = 518 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 519 } 520 if(jpCharsetMasks[version]&CSM(GB2312)) { 521 myConverterData->myConverterArray[GB2312] = 522 /* BEGIN android-changed */ 523 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 524 /* END android-changed */ 525 } 526 if(jpCharsetMasks[version]&CSM(KSC5601)) { 527 myConverterData->myConverterArray[KSC5601] = 528 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 529 } 530 531 /* set the function pointers to appropriate funtions */ 532 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 533 uprv_strcpy(myConverterData->locale,"ja"); 534 535 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 536 len = uprv_strlen(myConverterData->name); 537 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 538 myConverterData->name[len+1]='\0'; 539 } 540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 541 (myLocale[2]=='_' || myLocale[2]=='\0')) 542 { 543 const char *cnvName; 544 if(version==1) { 545 cnvName="icu-internal-25546"; 546 } else { 547 /* BEGIN android-changed */ 548 cnvName="ksc_5601"; 549 /* END android-changed */ 550 myConverterData->version=version=0; 551 } 552 if(pArgs->onlyTestIsLoadable) { 553 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 554 uprv_free(cnv->extraInfo); 555 cnv->extraInfo=NULL; 556 return; 557 } else { 558 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 559 if (U_FAILURE(*errorCode)) { 560 _ISO2022Close(cnv); 561 return; 562 } 563 564 if(version==1) { 565 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 566 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 567 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 568 }else{ 569 
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 570 } 571 572 /* initialize the state variables */ 573 setInitialStateToUnicodeKR(cnv, myConverterData); 574 setInitialStateFromUnicodeKR(cnv, myConverterData); 575 576 /* set the function pointers to appropriate funtions */ 577 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 578 uprv_strcpy(myConverterData->locale,"ko"); 579 } 580 } 581 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 582 (myLocale[2]=='_' || myLocale[2]=='\0')) 583 { 584 585 /* open the required converters and cache them */ 586 /* BEGIN android-changed */ 587 myConverterData->myConverterArray[GB2312_1] = 588 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 589 if(version==1) { 590 myConverterData->myConverterArray[ISO_IR_165] = 591 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 592 } 593 myConverterData->myConverterArray[CNS_11643] = 594 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 595 /* END android-changed */ 596 597 598 /* set the function pointers to appropriate funtions */ 599 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 600 uprv_strcpy(myConverterData->locale,"cn"); 601 602 if (version==0){ 603 myConverterData->version = 0; 604 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 605 }else if (version==1){ 606 myConverterData->version = 1; 607 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 608 }else { 609 myConverterData->version = 2; 610 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 611 } 612 } 613 else{ 614 #ifdef U_ENABLE_GENERIC_ISO_2022 615 myConverterData->isFirstBuffer = TRUE; 616 617 /* append the UTF-8 escape sequence */ 618 cnv->charErrorBufferLength = 3; 619 cnv->charErrorBuffer[0] = 0x1b; 620 cnv->charErrorBuffer[1] = 0x25; 621 cnv->charErrorBuffer[2] = 0x42; 622 623 
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 624 /* initialize the state variables */ 625 uprv_strcpy(myConverterData->name,"ISO_2022"); 626 #else 627 *errorCode = U_UNSUPPORTED_ERROR; 628 return; 629 #endif 630 } 631 632 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 633 634 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 635 _ISO2022Close(cnv); 636 } 637 } else { 638 *errorCode = U_MEMORY_ALLOCATION_ERROR; 639 } 640 } 641 642 643 static void 644 _ISO2022Close(UConverter *converter) { 645 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 646 UConverterSharedData **array = myData->myConverterArray; 647 int32_t i; 648 649 if (converter->extraInfo != NULL) { 650 /*close the array of converter pointers and free the memory*/ 651 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 652 if(array[i]!=NULL) { 653 ucnv_unloadSharedDataIfReady(array[i]); 654 } 655 } 656 657 ucnv_close(myData->currentConverter); 658 659 if(!converter->isExtraLocal){ 660 uprv_free (converter->extraInfo); 661 converter->extraInfo = NULL; 662 } 663 } 664 } 665 666 static void 667 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 668 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 669 if(choice<=UCNV_RESET_TO_UNICODE) { 670 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 671 myConverterData->key = 0; 672 myConverterData->isEmptySegment = FALSE; 673 } 674 if(choice!=UCNV_RESET_TO_UNICODE) { 675 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 676 } 677 #ifdef U_ENABLE_GENERIC_ISO_2022 678 if(myConverterData->locale[0] == 0){ 679 if(choice<=UCNV_RESET_TO_UNICODE) { 680 myConverterData->isFirstBuffer = TRUE; 681 myConverterData->key = 0; 682 if (converter->mode == UCNV_SO){ 683 ucnv_close (myConverterData->currentConverter); 684 myConverterData->currentConverter=NULL; 685 } 686 converter->mode = UCNV_SI; 687 } 688 
if(choice!=UCNV_RESET_TO_UNICODE) { 689 /* re-append UTF-8 escape sequence */ 690 converter->charErrorBufferLength = 3; 691 converter->charErrorBuffer[0] = 0x1b; 692 converter->charErrorBuffer[1] = 0x28; 693 converter->charErrorBuffer[2] = 0x42; 694 } 695 } 696 else 697 #endif 698 { 699 /* reset the state variables */ 700 if(myConverterData->locale[0] == 'k'){ 701 if(choice<=UCNV_RESET_TO_UNICODE) { 702 setInitialStateToUnicodeKR(converter, myConverterData); 703 } 704 if(choice!=UCNV_RESET_TO_UNICODE) { 705 setInitialStateFromUnicodeKR(converter, myConverterData); 706 } 707 } 708 } 709 } 710 711 static const char* 712 _ISO2022getName(const UConverter* cnv){ 713 if(cnv->extraInfo){ 714 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 715 return myData->name; 716 } 717 return NULL; 718 } 719 720 721 /*************** to unicode *******************/ 722 /**************************************************************************** 723 * Recognized escape sequences are 724 * <ESC>(B ASCII 725 * <ESC>.A ISO-8859-1 726 * <ESC>.F ISO-8859-7 727 * <ESC>(J JISX-201 728 * <ESC>(I JISX-201 729 * <ESC>$B JISX-208 730 * <ESC>$@ JISX-208 731 * <ESC>$(D JISX-212 732 * <ESC>$A GB2312 733 * <ESC>$(C KSC5601 734 */ 735 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 736 /* 0 1 2 3 4 5 6 7 8 9 */ 737 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 738 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 739 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 740 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 741 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE 742 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 743 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 744 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 745 }; 746 747 /*************** to unicode *******************/ 748 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 749 /* 0 1 2 3 4 5 6 7 8 9 */ 750 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 751 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 752 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 753 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 754 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 755 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 756 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 757 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 758 }; 759 760 761 static UCNV_TableStates_2022 762 getKey_2022(char c,int32_t* key,int32_t* offset){ 763 int32_t togo; 764 int32_t low = 0; 765 int32_t hi = MAX_STATES_2022; 766 int32_t oldmid=0; 767 768 togo = normalize_esq_chars_2022[(uint8_t)c]; 769 if(togo == 0) { 770 /* not a valid character anywhere in an escape 
sequence */ 771 *key = 0; 772 *offset = 0; 773 return INVALID_2022; 774 } 775 togo = (*key << 5) + togo; 776 777 while (hi != low) /*binary search*/{ 778 779 int32_t mid = (hi+low) >> 1; /*Finds median*/ 780 781 if (mid == oldmid) 782 break; 783 784 if (escSeqStateTable_Key_2022[mid] > togo){ 785 hi = mid; 786 } 787 else if (escSeqStateTable_Key_2022[mid] < togo){ 788 low = mid; 789 } 790 else /*we found it*/{ 791 *key = togo; 792 *offset = mid; 793 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 794 } 795 oldmid = mid; 796 797 } 798 799 *key = 0; 800 *offset = 0; 801 return INVALID_2022; 802 } 803 804 /*runs through a state machine to determine the escape sequence - codepage correspondance 805 */ 806 static void 807 changeState_2022(UConverter* _this, 808 const char** source, 809 const char* sourceLimit, 810 Variant2022 var, 811 UErrorCode* err){ 812 UCNV_TableStates_2022 value; 813 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 814 uint32_t key = myData2022->key; 815 int32_t offset = 0; 816 int8_t initialToULength = _this->toULength; 817 char c; 818 819 value = VALID_NON_TERMINAL_2022; 820 while (*source < sourceLimit) { 821 c = *(*source)++; 822 _this->toUBytes[_this->toULength++]=(uint8_t)c; 823 value = getKey_2022(c,(int32_t *) &key, &offset); 824 825 switch (value){ 826 827 case VALID_NON_TERMINAL_2022 : 828 /* continue with the loop */ 829 break; 830 831 case VALID_TERMINAL_2022: 832 key = 0; 833 goto DONE; 834 835 case INVALID_2022: 836 goto DONE; 837 838 case VALID_MAYBE_TERMINAL_2022: 839 #ifdef U_ENABLE_GENERIC_ISO_2022 840 /* ESC ( B is ambiguous only for ISO_2022 itself */ 841 if(var == ISO_2022) { 842 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 843 _this->toULength = 0; 844 845 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 846 847 /* continue with the loop */ 848 value = 
VALID_NON_TERMINAL_2022; 849 break; 850 } else 851 #endif 852 { 853 /* not ISO_2022 itself, finish here */ 854 value = VALID_TERMINAL_2022; 855 key = 0; 856 goto DONE; 857 } 858 } 859 } 860 861 DONE: 862 myData2022->key = key; 863 864 if (value == VALID_NON_TERMINAL_2022) { 865 /* indicate that the escape sequence is incomplete: key!=0 */ 866 return; 867 } else if (value == INVALID_2022 ) { 868 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 869 } else /* value == VALID_TERMINAL_2022 */ { 870 switch(var){ 871 #ifdef U_ENABLE_GENERIC_ISO_2022 872 case ISO_2022: 873 { 874 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 875 if(chosenConverterName == NULL) { 876 /* SS2 or SS3 */ 877 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 878 _this->toUCallbackReason = UCNV_UNASSIGNED; 879 return; 880 } 881 882 _this->mode = UCNV_SI; 883 ucnv_close(myData2022->currentConverter); 884 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 885 if(U_SUCCESS(*err)) { 886 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 887 _this->mode = UCNV_SO; 888 } 889 break; 890 } 891 #endif 892 case ISO_2022_JP: 893 { 894 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 895 switch(tempState) { 896 case INVALID_STATE: 897 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 898 break; 899 case SS2_STATE: 900 if(myData2022->toU2022State.cs[2]!=0) { 901 if(myData2022->toU2022State.g<2) { 902 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 903 } 904 myData2022->toU2022State.g=2; 905 } else { 906 /* illegal to have SS2 before a matching designator */ 907 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 908 } 909 break; 910 /* case SS3_STATE: not used in ISO-2022-JP-x */ 911 case ISO8859_1: 912 case ISO8859_7: 913 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 914 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 915 } else { 916 /* G2 charset for SS2 */ 917 myData2022->toU2022State.cs[2]=(int8_t)tempState; 918 } 919 break; 920 default: 921 
if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 922 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 923 } else { 924 /* G0 charset */ 925 myData2022->toU2022State.cs[0]=(int8_t)tempState; 926 } 927 break; 928 } 929 } 930 break; 931 case ISO_2022_CN: 932 { 933 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 934 switch(tempState) { 935 case INVALID_STATE: 936 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 937 break; 938 case SS2_STATE: 939 if(myData2022->toU2022State.cs[2]!=0) { 940 if(myData2022->toU2022State.g<2) { 941 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 942 } 943 myData2022->toU2022State.g=2; 944 } else { 945 /* illegal to have SS2 before a matching designator */ 946 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 947 } 948 break; 949 case SS3_STATE: 950 if(myData2022->toU2022State.cs[3]!=0) { 951 if(myData2022->toU2022State.g<2) { 952 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 953 } 954 myData2022->toU2022State.g=3; 955 } else { 956 /* illegal to have SS3 before a matching designator */ 957 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 958 } 959 break; 960 case ISO_IR_165: 961 if(myData2022->version==0) { 962 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 963 break; 964 } 965 /*fall through*/ 966 case GB2312_1: 967 /*fall through*/ 968 case CNS_11643_1: 969 myData2022->toU2022State.cs[1]=(int8_t)tempState; 970 break; 971 case CNS_11643_2: 972 myData2022->toU2022State.cs[2]=(int8_t)tempState; 973 break; 974 default: 975 /* other CNS 11643 planes */ 976 if(myData2022->version==0) { 977 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 978 } else { 979 myData2022->toU2022State.cs[3]=(int8_t)tempState; 980 } 981 break; 982 } 983 } 984 break; 985 case ISO_2022_KR: 986 if(offset==0x30){ 987 /* nothing to be done, just accept this one escape sequence */ 988 } else { 989 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 990 } 991 break; 992 993 default: 994 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 995 break; 996 } 997 } 998 if(U_SUCCESS(*err)) { 999 _this->toULength = 0; 1000 } else 
if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 1001 if(_this->toULength>1) { 1002 /* 1003 * Ticket 5691: consistent illegal sequences: 1004 * - We include at least the first byte (ESC) in the illegal sequence. 1005 * - If any of the non-initial bytes could be the start of a character, 1006 * we stop the illegal sequence before the first one of those. 1007 * In escape sequences, all following bytes are "printable", that is, 1008 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 1009 * they are valid single/lead bytes. 1010 * For simplicity, we always only report the initial ESC byte as the 1011 * illegal sequence and back out all other bytes we looked at. 1012 */ 1013 /* Back out some bytes. */ 1014 int8_t backOutDistance=_this->toULength-1; 1015 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1016 if(backOutDistance<=bytesFromThisBuffer) { 1017 /* same as initialToULength<=1 */ 1018 *source-=backOutDistance; 1019 } else { 1020 /* Back out bytes from the previous buffer: Need to replay them. */ 1021 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1022 /* same as -(initialToULength-1) */ 1023 /* preToULength is negative! 
*/ 1024 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1025 *source-=bytesFromThisBuffer; 1026 } 1027 _this->toULength=1; 1028 } 1029 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1030 _this->toUCallbackReason = UCNV_UNASSIGNED; 1031 } 1032 } 1033 1034 /*Checks the characters of the buffer against valid 2022 escape sequences 1035 *if the match we return a pointer to the initial start of the sequence otherwise 1036 *we return sourceLimit 1037 */ 1038 /*for 2022 looks ahead in the stream 1039 *to determine the longest possible convertible 1040 *data stream 1041 */ 1042 static inline const char* 1043 getEndOfBuffer_2022(const char** source, 1044 const char* sourceLimit, 1045 UBool /*flush*/){ 1046 1047 const char* mySource = *source; 1048 1049 #ifdef U_ENABLE_GENERIC_ISO_2022 1050 if (*source >= sourceLimit) 1051 return sourceLimit; 1052 1053 do{ 1054 1055 if (*mySource == ESC_2022){ 1056 int8_t i; 1057 int32_t key = 0; 1058 int32_t offset; 1059 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1060 1061 /* Kludge: I could not 1062 * figure out the reason for validating an escape sequence 1063 * twice - once here and once in changeState_2022(). 1064 * is it possible to have an ESC character in a ISO2022 1065 * byte stream which is valid in a code page? Is it legal? 
1066 */ 1067 for (i=0; 1068 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1069 i++) { 1070 value = getKey_2022(*(mySource+i), &key, &offset); 1071 } 1072 if (value > 0 || *mySource==ESC_2022) 1073 return mySource; 1074 1075 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1076 return sourceLimit; 1077 } 1078 }while (++mySource < sourceLimit); 1079 1080 return sourceLimit; 1081 #else 1082 while(mySource < sourceLimit && *mySource != ESC_2022) { 1083 ++mySource; 1084 } 1085 return mySource; 1086 #endif 1087 } 1088 1089 1090 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1091 * any future change in _MBCSFromUChar32() function should be reflected here. 1092 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1093 */ 1094 static inline int32_t 1095 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1096 UChar32 c, 1097 uint32_t* value, 1098 UBool useFallback, 1099 int outputType) 1100 { 1101 const int32_t *cx; 1102 const uint16_t *table; 1103 uint32_t stage2Entry; 1104 uint32_t myValue; 1105 int32_t length; 1106 const uint8_t *p; 1107 /* 1108 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1109 * Use internal version of ucnv_open() that verifies that the new structures are available, 1110 * else U_INTERNAL_PROGRAM_ERROR. 
1111 */ 1112 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1113 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1114 table=sharedData->mbcs.fromUnicodeTable; 1115 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1116 /* get the bytes and the length for the output */ 1117 if(outputType==MBCS_OUTPUT_2){ 1118 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1119 if(myValue<=0xff) { 1120 length=1; 1121 } else { 1122 length=2; 1123 } 1124 } else /* outputType==MBCS_OUTPUT_3 */ { 1125 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1126 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1127 if(myValue<=0xff) { 1128 length=1; 1129 } else if(myValue<=0xffff) { 1130 length=2; 1131 } else { 1132 length=3; 1133 } 1134 } 1135 /* is this code point assigned, or do we use fallbacks? */ 1136 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1137 /* assigned */ 1138 *value=myValue; 1139 return length; 1140 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1141 /* 1142 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1143 * There is no way with this data structure for fallback output 1144 * to be a zero byte. 1145 */ 1146 *value=myValue; 1147 return -length; 1148 } 1149 } 1150 1151 cx=sharedData->mbcs.extIndexes; 1152 if(cx!=NULL) { 1153 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1154 } 1155 1156 /* unassigned */ 1157 return 0; 1158 } 1159 1160 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1161 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1162 * @param retval pointer to output byte 1163 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1164 */ 1165 static inline int32_t 1166 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1167 UChar32 c, 1168 uint32_t* retval, 1169 UBool useFallback) 1170 { 1171 const uint16_t *table; 1172 int32_t value; 1173 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1174 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1175 return 0; 1176 } 1177 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1178 table=sharedData->mbcs.fromUnicodeTable; 1179 /* get the byte for the output */ 1180 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1181 /* is this code point assigned, or do we use fallbacks? */ 1182 *retval=(uint32_t)(value&0xff); 1183 if(value>=0xf00) { 1184 return 1; /* roundtrip */ 1185 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1186 return -1; /* fallback taken */ 1187 } else { 1188 return 0; /* no mapping */ 1189 } 1190 } 1191 1192 /* 1193 * Check that the result is a 2-byte value with each byte in the range A1..FE 1194 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1195 * to move it to the ISO 2022 range 21..7E. 1196 * Return 0 if out of range. 1197 */ 1198 static inline uint32_t 1199 _2022FromGR94DBCS(uint32_t value) { 1200 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1201 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1202 ) { 1203 return value - 0x8080; /* shift down to 21..7e byte range */ 1204 } else { 1205 return 0; /* not valid for ISO 2022 */ 1206 } 1207 } 1208 1209 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1210 /* 1211 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1212 * 2 byte value that is in the range A1..FE for each byte. 
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 * (Compiled out; kept for reference only.)
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is used up or something
 * has to be reported to the caller:
 *  1. While no escape sequence is pending (key==0), convert the bytes up to the
 *     next ESC (or the end of the buffer) with the current sub-converter,
 *     opening an "ASCII" sub-converter if none is set yet.
 *  2. Let changeState_2022() consume the escape sequence.
 *
 * Overflow output and partial/erroneous input bytes of the sub-converter are
 * copied into this converter so that the framework can handle them.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily swap in the sub-converter; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a partial escape sequence (key!=0) */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending source bytes in the converter's toUBytes[]/toULength
 * (two bytes when sourceChar is above 0xff, one byte otherwise) and sets the
 * error code:
 *   U_INVALID_CHAR_FOUND when targetUniChar equals missingCharMarker-1 (0xfffe),
 *   U_ILLEGAL_CHAR_FOUND for any other targetUniChar.
 *
 * @param cnv           converter that receives the error bytes
 * @param sourceChar    the one- or two-byte source character
 * @param targetUniChar the lookup result that marks the kind of failure
 * @param err           out: the error code to report
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        /* lead byte first */
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}
1371 * The converter iterates over each Unicode codepoint 1372 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is 1373 * processed one char at a time it would make sense to reduce the extra processing a canned converter 1374 * would do as far as possible. 1375 * 1376 * If the implementation of these macros or structure of sharedData struct change in the future, make 1377 * sure that ISO-2022 is also changed. 1378 *************************************************************************************************** 1379 */ 1380 1381 /*************************************************************************************************** 1382 * Rules for ISO-2022-jp encoding 1383 * (i) Escape sequences must be fully contained within a line they should not 1384 * span new lines or CRs 1385 * (ii) If the last character on a line is represented by two bytes then an ASCII or 1386 * JIS-Roman character escape sequence should follow before the line terminates 1387 * (iii) If the first character on the line is represented by two bytes then a two 1388 * byte character escape sequence should precede it 1389 * (iv) If no escape sequence is encountered then the characters are ASCII 1390 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, 1391 * and invoked with SS2 (ESC N). 1392 * (vi) If there is any G0 designation in text, there must be a switch to 1393 * ASCII or to JIS X 0201-Roman before a space character (but not 1394 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control 1395 * characters such as tab or CRLF. 
/*
 * Supported encodings for ISO-2022-JP (source: RFC-1554):
 *     ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1, ISO-8859-7
 *
 *     JISX201, JISX208, JISX212 : new .cnv data files created
 *     KSC5601                   : alias to ibm-949 mapping table
 *     GB2312                    : alias to ibm-1386 mapping table
 *     ISO-8859-1                : Algorithmic implemented as LATIN1 case
 *     ISO-8859-7                : alias to ibm-9409 mapping table
 */

/*
 * preference order of JP charsets
 * (the order in which the from-Unicode code tries charsets that are allowed
 * for the converter version but not currently designated)
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * Designation escape sequences, indexed by StateEnum charset value.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Byte length of each escSeqChars[] entry; same indexing,
 * keep the two tables in sync.
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
/*
 * JIS X 0201 (Roman half) to Unicode for bytes 00..7F.
 * Identical to ASCII except for two positions:
 *   5C -> U+00A5 YEN SIGN, 7E -> U+203E OVERLINE.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * Unicode to JIS X 0201 (Roman half), result in 00..7F.
 * U+005C and U+007E have no JIS X 0201 byte because their positions are
 * taken by U+00A5 and U+203E.
 * Returns U+FFFE if the code point is unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
/*
 * Shift-JIS to JIS X 0208: takes a valid Shift-JIS byte pair and turns it
 * into the corresponding pair of 21..7E bytes.
 * Pairs above EFFC lie outside the JIS X 0208 range; 0 is returned for them.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t row;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* two JIS rows share one Shift-JIS lead byte: rebase the lead, then double it */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;  /* leads 81..9F vs. E0..EF */
    row <<= 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second (even) row of the pair */
        return row | (uint32_t)(trail - 0x7e);
    }

    /* trail up to 9E: first (odd) row; Shift-JIS skips trail byte 7F */
    row -= 0x100;
    return row | (uint32_t)(trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}

/*
 * JIS X 0208 to Shift-JIS: converts a pair of 21..7E bytes into bytes[0..1].
 * A byte outside 21..7E is turned into something that is not valid Shift-JIS
 * (explicitly 0 where needed), so that the Shift-JIS converter rejects the pair.
 * Some out-of-range inputs map to equally invalid Shift-JIS bytes by themselves
 * and are therefore not tested here.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) == 0) {
        /* even row: trail bytes 9F..FC */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd row: trail bytes 40..7E and 80..9E */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1574 */ 1575 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1576 0x2123, /* U+FF61 */ 1577 0x2156, 1578 0x2157, 1579 0x2122, 1580 0x2126, 1581 0x2572, 1582 0x2521, 1583 0x2523, 1584 0x2525, 1585 0x2527, 1586 0x2529, 1587 0x2563, 1588 0x2565, 1589 0x2567, 1590 0x2543, 1591 0x213C, /* U+FF70 */ 1592 0x2522, 1593 0x2524, 1594 0x2526, 1595 0x2528, 1596 0x252A, 1597 0x252B, 1598 0x252D, 1599 0x252F, 1600 0x2531, 1601 0x2533, 1602 0x2535, 1603 0x2537, 1604 0x2539, 1605 0x253B, 1606 0x253D, 1607 0x253F, /* U+FF80 */ 1608 0x2541, 1609 0x2544, 1610 0x2546, 1611 0x2548, 1612 0x254A, 1613 0x254B, 1614 0x254C, 1615 0x254D, 1616 0x254E, 1617 0x254F, 1618 0x2552, 1619 0x2555, 1620 0x2558, 1621 0x255B, 1622 0x255E, 1623 0x255F, /* U+FF90 */ 1624 0x2560, 1625 0x2561, 1626 0x2562, 1627 0x2564, 1628 0x2566, 1629 0x2568, 1630 0x2569, 1631 0x256A, 1632 0x256B, 1633 0x256C, 1634 0x256D, 1635 0x256F, 1636 0x2573, 1637 0x212B, 1638 0x212C /* U+FF9F */ 1639 }; 1640 1641 static void 1642 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1643 UConverter *cnv = args->converter; 1644 UConverterDataISO2022 *converterData; 1645 ISO2022State *pFromU2022State; 1646 uint8_t *target = (uint8_t *) args->target; 1647 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1648 const UChar* source = args->source; 1649 const UChar* sourceLimit = args->sourceLimit; 1650 int32_t* offsets = args->offsets; 1651 UChar32 sourceChar; 1652 char buffer[8]; 1653 int32_t len, outLen; 1654 int8_t choices[10]; 1655 int32_t choiceCount; 1656 uint32_t targetValue = 0; 1657 UBool useFallback; 1658 1659 int32_t i; 1660 int8_t cs, g; 1661 1662 /* set up the state */ 1663 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1664 pFromU2022State = &converterData->fromU2022State; 1665 1666 choiceCount = 0; 1667 1668 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1669 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1670 goto getTrail; 1671 } 1672 1673 while(source < sourceLimit) { 1674 if(target < targetLimit) { 1675 1676 sourceChar = *(source++); 1677 /*check if the char is a First surrogate*/ 1678 if(U16_IS_SURROGATE(sourceChar)) { 1679 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1680 getTrail: 1681 /*look ahead to find the trail surrogate*/ 1682 if(source < sourceLimit) { 1683 /* test the following code unit */ 1684 UChar trail=(UChar) *source; 1685 if(U16_IS_TRAIL(trail)) { 1686 source++; 1687 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1688 cnv->fromUChar32=0x00; 1689 /* convert this supplementary code point */ 1690 /* exit this condition tree */ 1691 } else { 1692 /* this is an unmatched lead code unit (1st surrogate) */ 1693 /* callback(illegal) */ 1694 *err=U_ILLEGAL_CHAR_FOUND; 1695 cnv->fromUChar32=sourceChar; 1696 break; 1697 } 1698 } else { 1699 /* no more input */ 1700 cnv->fromUChar32=sourceChar; 1701 break; 1702 } 1703 } else { 1704 /* this is an unmatched trail code unit (2nd surrogate) */ 1705 /* callback(illegal) */ 1706 *err=U_ILLEGAL_CHAR_FOUND; 1707 cnv->fromUChar32=sourceChar; 1708 break; 1709 } 1710 } 1711 1712 /* do not convert SO/SI/ESC */ 1713 if(IS_2022_CONTROL(sourceChar)) { 1714 /* callback(illegal) */ 1715 *err=U_ILLEGAL_CHAR_FOUND; 1716 cnv->fromUChar32=sourceChar; 1717 break; 1718 } 1719 1720 /* do the conversion */ 1721 1722 if(choiceCount == 0) { 1723 uint16_t csm; 1724 1725 /* 1726 * The csm variable keeps track of which charsets are allowed 1727 * and not used yet while building the choices[]. 1728 */ 1729 csm = jpCharsetMasks[converterData->version]; 1730 choiceCount = 0; 1731 1732 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1733 if(converterData->version == 3 || converterData->version == 4) { 1734 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1735 } 1736 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1737 csm &= ~CSM(HWKANA_7BIT); 1738 1739 /* try the current G0 charset */ 1740 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1741 csm &= ~CSM(cs); 1742 1743 /* try the current G2 charset */ 1744 if((cs = pFromU2022State->cs[2]) != 0) { 1745 choices[choiceCount++] = cs; 1746 csm &= ~CSM(cs); 1747 } 1748 1749 /* try all the other possible charsets */ 1750 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1751 cs = (int8_t)jpCharsetPref[i]; 1752 if(CSM(cs) & csm) { 1753 choices[choiceCount++] = cs; 1754 csm &= ~CSM(cs); 1755 } 1756 } 1757 } 1758 1759 cs = g = 0; 1760 /* 1761 * len==0: no mapping found yet 1762 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1763 * len>0: found a roundtrip result, done 1764 */ 1765 len = 0; 1766 /* 1767 * We will turn off useFallback after finding a fallback, 1768 * but we still get fallbacks from PUA code points as usual. 1769 * Therefore, we will also need to check that we don't overwrite 1770 * an early fallback with a later one. 1771 */ 1772 useFallback = cnv->useFallback; 1773 1774 for(i = 0; i < choiceCount && len <= 0; ++i) { 1775 uint32_t value; 1776 int32_t len2; 1777 int8_t cs0 = choices[i]; 1778 switch(cs0) { 1779 case ASCII: 1780 if(sourceChar <= 0x7f) { 1781 targetValue = (uint32_t)sourceChar; 1782 len = 1; 1783 cs = cs0; 1784 g = 0; 1785 } 1786 break; 1787 case ISO8859_1: 1788 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1789 targetValue = (uint32_t)sourceChar - 0x80; 1790 len = 1; 1791 cs = cs0; 1792 g = 2; 1793 } 1794 break; 1795 case HWKANA_7BIT: 1796 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1797 if(converterData->version==3) { 1798 /* JIS7: use G1 (SO) */ 1799 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1800 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1801 len = 1; 1802 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1803 g = 1; 1804 } else if(converterData->version==4) { 1805 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1806 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1807 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1808 len = 1; 1809 1810 cs = pFromU2022State->cs[0]; 1811 if(IS_JP_DBCS(cs)) { 1812 /* switch from a DBCS charset to JISX201 */ 1813 cs = (int8_t)JISX201; 1814 } 1815 /* else stay in the current G0 charset */ 1816 g = 0; 1817 } 1818 /* else do not use HWKANA_7BIT with other versions */ 1819 } 1820 break; 1821 case JISX201: 1822 /* G0 SBCS */ 1823 value = jisx201FromU(sourceChar); 1824 if(value <= 0x7f) { 1825 targetValue = value; 1826 len = 1; 1827 cs = cs0; 1828 g = 0; 1829 useFallback = FALSE; 1830 } 1831 break; 1832 case JISX208: 1833 /* G0 DBCS from Shift-JIS table */ 1834 len2 = MBCS_FROM_UCHAR32_ISO2022( 1835 converterData->myConverterArray[cs0], 1836 sourceChar, &value, 1837 useFallback, MBCS_OUTPUT_2); 1838 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1839 value = _2022FromSJIS(value); 1840 if(value != 0) { 1841 targetValue = value; 1842 len = len2; 1843 cs = cs0; 1844 g = 0; 1845 useFallback = FALSE; 1846 } 1847 } else if(len == 0 && useFallback && 1848 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1849 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1850 len = -2; 1851 cs = cs0; 1852 g = 0; 1853 useFallback = FALSE; 1854 } 1855 break; 1856 case ISO8859_7: 1857 /* G0 SBCS forced to 7-bit output */ 1858 len2 = MBCS_SINGLE_FROM_UCHAR32( 1859 converterData->myConverterArray[cs0], 1860 sourceChar, &value, 1861 useFallback); 1862 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1863 targetValue = value - 0x80; 1864 len = len2; 1865 cs = 
cs0; 1866 g = 2; 1867 useFallback = FALSE; 1868 } 1869 break; 1870 default: 1871 /* G0 DBCS */ 1872 len2 = MBCS_FROM_UCHAR32_ISO2022( 1873 converterData->myConverterArray[cs0], 1874 sourceChar, &value, 1875 useFallback, MBCS_OUTPUT_2); 1876 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1877 if(cs0 == KSC5601) { 1878 /* 1879 * Check for valid bytes for the encoding scheme. 1880 * This is necessary because the sub-converter (windows-949) 1881 * has a broader encoding scheme than is valid for 2022. 1882 */ 1883 value = _2022FromGR94DBCS(value); 1884 if(value == 0) { 1885 break; 1886 } 1887 } 1888 targetValue = value; 1889 len = len2; 1890 cs = cs0; 1891 g = 0; 1892 useFallback = FALSE; 1893 } 1894 break; 1895 } 1896 } 1897 1898 if(len != 0) { 1899 if(len < 0) { 1900 len = -len; /* fallback */ 1901 } 1902 outLen = 0; /* count output bytes */ 1903 1904 /* write SI if necessary (only for JIS7) */ 1905 if(pFromU2022State->g == 1 && g == 0) { 1906 buffer[outLen++] = UCNV_SI; 1907 pFromU2022State->g = 0; 1908 } 1909 1910 /* write the designation sequence if necessary */ 1911 if(cs != pFromU2022State->cs[g]) { 1912 int32_t escLen = escSeqCharsLen[cs]; 1913 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1914 outLen += escLen; 1915 pFromU2022State->cs[g] = cs; 1916 1917 /* invalidate the choices[] */ 1918 choiceCount = 0; 1919 } 1920 1921 /* write the shift sequence if necessary */ 1922 if(g != pFromU2022State->g) { 1923 switch(g) { 1924 /* case 0 handled before writing escapes */ 1925 case 1: 1926 buffer[outLen++] = UCNV_SO; 1927 pFromU2022State->g = 1; 1928 break; 1929 default: /* case 2 */ 1930 buffer[outLen++] = 0x1b; 1931 buffer[outLen++] = 0x4e; 1932 break; 1933 /* no case 3: no SS3 in ISO-2022-JP-x */ 1934 } 1935 } 1936 1937 /* write the output bytes */ 1938 if(len == 1) { 1939 buffer[outLen++] = (char)targetValue; 1940 } else /* len == 2 */ { 1941 buffer[outLen++] = (char)(targetValue >> 8); 1942 buffer[outLen++] = 
(char)targetValue; 1943 } 1944 } else { 1945 /* 1946 * if we cannot find the character after checking all codepages 1947 * then this is an error 1948 */ 1949 *err = U_INVALID_CHAR_FOUND; 1950 cnv->fromUChar32=sourceChar; 1951 break; 1952 } 1953 1954 if(sourceChar == CR || sourceChar == LF) { 1955 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1956 pFromU2022State->cs[2] = 0; 1957 choiceCount = 0; 1958 } 1959 1960 /* output outLen>0 bytes in buffer[] */ 1961 if(outLen == 1) { 1962 *target++ = buffer[0]; 1963 if(offsets) { 1964 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1965 } 1966 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1967 *target++ = buffer[0]; 1968 *target++ = buffer[1]; 1969 if(offsets) { 1970 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1971 *offsets++ = sourceIndex; 1972 *offsets++ = sourceIndex; 1973 } 1974 } else { 1975 fromUWriteUInt8( 1976 cnv, 1977 buffer, outLen, 1978 &target, (const char *)targetLimit, 1979 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1980 err); 1981 if(U_FAILURE(*err)) { 1982 break; 1983 } 1984 } 1985 } /* end if(myTargetIndex<myTargetLength) */ 1986 else{ 1987 *err =U_BUFFER_OVERFLOW_ERROR; 1988 break; 1989 } 1990 1991 }/* end while(mySourceIndex<mySourceLength) */ 1992 1993 /* 1994 * the end of the input stream and detection of truncated input 1995 * are handled by the framework, but for ISO-2022-JP conversion 1996 * we need to be in ASCII mode at the very end 1997 * 1998 * conditions: 1999 * successful 2000 * in SO mode or not in ASCII mode 2001 * end of input and no truncated input 2002 */ 2003 if( U_SUCCESS(*err) && 2004 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2005 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2006 ) { 2007 int32_t sourceIndex; 2008 2009 outLen = 0; 2010 2011 if(pFromU2022State->g != 0) { 2012 buffer[outLen++] = UCNV_SI; 
/*************** to unicode *******************/

/*
 * Converts ISO-2022-JP family input bytes to UTF-16 code units and, when
 * args->offsets is set, records for each output unit the source index of the
 * byte sequence that produced it.
 *
 * State that survives between calls is kept in the converter:
 *  - myData->key != 0: an escape sequence is still being collected;
 *    processing resumes at the "escape" label.
 *  - args->converter->toULength == 1: toUBytes[0] holds the lead byte of a
 *    double-byte character; processing resumes at "getTrailByte".
 *  - myData->toU2022State: the designated charsets cs[] and the active set g.
 *
 * @param args conversion arguments; on return args->source and args->target
 *             point past the consumed input and the written output.
 * @param err  set to U_BUFFER_OVERFLOW_ERROR when the target is full, to
 *             U_ILLEGAL_ESCAPE_SEQUENCE for an empty segment in version 0,
 *             or to whatever changeState_2022()/toUnicodeCallback() report.
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    /* 32 bits wide: an illegal byte pair is tagged with 0x10000 further down */
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* targetUniChar is still missingCharMarker, so the byte goes to the callback below */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* targetUniChar is still missingCharMarker, so the byte goes to the callback below */
                    break;
                }

            case ESC_2022:
                /* step back so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    /* remember where the sequence started so its length can be reported on error */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        /* the 7-bit byte stands for the 8-bit code 0x80 higher */
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek only; the trail byte is consumed further down when it belongs to this character */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                /* the JIS X 0208 lookup goes through tempBuf filled by _2022ToSJIS() */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                    } else {
                        /* input ends after the lead byte: store it and resume at getTrailByte on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UTF-16 code unit */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{ /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* write the advanced positions back for the caller */
    args->target = myTarget;
    args->source = mySource;
}
1 : 2)); 2277 } 2278 ++myTarget; 2279 }else{ 2280 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 2281 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 2282 } 2283 2284 } 2285 else{ 2286 /* Call the callback function*/ 2287 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2288 break; 2289 } 2290 } 2291 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ 2292 *err =U_BUFFER_OVERFLOW_ERROR; 2293 break; 2294 } 2295 } 2296 endloop: 2297 args->target = myTarget; 2298 args->source = mySource; 2299 } 2300 2301 2302 /*************************************************************** 2303 * Rules for ISO-2022-KR encoding 2304 * i) The KSC5601 designator sequence should appear only once in a file, 2305 * at the begining of a line before any KSC5601 characters. This usually 2306 * means that it appears by itself on the first line of the file 2307 * ii) There are only 2 shifting sequences SO to shift into double byte mode 2308 * and SI to shift into single byte mode 2309 */ 2310 static void 2311 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2312 2313 UConverter* saveConv = args->converter; 2314 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; 2315 args->converter=myConverterData->currentConverter; 2316 2317 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; 2318 ucnv_MBCSFromUnicodeWithOffsets(args,err); 2319 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 2320 2321 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2322 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 2323 uprv_memcpy( 2324 saveConv->charErrorBuffer, 2325 myConverterData->currentConverter->charErrorBuffer, 2326 myConverterData->currentConverter->charErrorBufferLength); 2327 } 2328 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 2329 
/*
 * Converts UTF-16 to ISO-2022-KR bytes and, when args->offsets is set,
 * records the source index for every output byte.
 *
 * Version 1 converters are delegated to the _IBM variant. Otherwise each
 * code unit is looked up with MBCS_FROM_UCHAR32_ISO2022(); only single bytes
 * up to 0x7f and double bytes with both bytes in A1..FE are accepted.
 * Double bytes are written with 0x80 subtracted from each byte, and SO or SI
 * is emitted whenever the output switches between single and double byte.
 * The single/double-byte mode is carried across calls in
 * args->converter->fromUnicodeStatus; a pending lead surrogate is carried in
 * args->converter->fromUChar32 and resumed at the "getTrail" label.
 *
 * @param args conversion arguments; args->source and args->target are
 *             advanced on return.
 * @param err  U_BUFFER_OVERFLOW_ERROR when the target is full,
 *             U_ILLEGAL_CHAR_FOUND for SO/SI/ESC and unpaired surrogates,
 *             U_INVALID_CHAR_FOUND for code points without a mapping.
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): repeats the assignment made two statements above */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a lead surrogate left over from the previous buffer is resumed inside the loop body */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                 /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* there is room for this byte: target < targetLimit was checked at the top of the iteration */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetUniChar to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        /* target is full: the byte goes to the converter's overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double byte: each byte is written with 0x80 subtracted */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source < sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* the combined supplementary code point is reported as unassigned */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending or pending code point for the next call */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more so that a surrogate pair is indexed at its lead unit */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* append the final SI */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
isTargetByteDBCS && 2510 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2511 ) { 2512 int32_t sourceIndex; 2513 2514 /* we are switching to ASCII */ 2515 isTargetByteDBCS=FALSE; 2516 2517 /* get the source index of the last input character */ 2518 /* 2519 * TODO this would be simpler and more reliable if we used a pair 2520 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2521 * so that we could simply use the prevSourceIndex here; 2522 * this code gives an incorrect result for the rare case of an unmatched 2523 * trail surrogate that is alone in the last buffer of the text stream 2524 */ 2525 sourceIndex=(int32_t)(source-args->source); 2526 if(sourceIndex>0) { 2527 --sourceIndex; 2528 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2529 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2530 ) { 2531 --sourceIndex; 2532 } 2533 } else { 2534 sourceIndex=-1; 2535 } 2536 2537 fromUWriteUInt8( 2538 args->converter, 2539 SHIFT_IN_STR, 1, 2540 &target, (const char *)targetLimit, 2541 &offsets, sourceIndex, 2542 err); 2543 } 2544 2545 /*save the state and return */ 2546 args->source = source; 2547 args->target = (char*)target; 2548 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2549 } 2550 2551 /************************ To Unicode ***************************************/ 2552 2553 static void 2554 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2555 UErrorCode* err){ 2556 char const* sourceStart; 2557 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2558 2559 UConverterToUnicodeArgs subArgs; 2560 int32_t minArgsSize; 2561 2562 /* set up the subconverter arguments */ 2563 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2564 minArgsSize = args->size; 2565 } else { 2566 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2567 } 2568 2569 uprv_memcpy(&subArgs, args, minArgsSize); 2570 subArgs.size = (uint16_t)minArgsSize; 2571 subArgs.converter = 
myData->currentConverter; 2572 2573 /* remember the original start of the input for offsets */ 2574 sourceStart = args->source; 2575 2576 if(myData->key != 0) { 2577 /* continue with a partial escape sequence */ 2578 goto escape; 2579 } 2580 2581 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2582 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2583 subArgs.source = args->source; 2584 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2585 if(subArgs.source != subArgs.sourceLimit) { 2586 /* 2587 * get the current partial byte sequence 2588 * 2589 * it needs to be moved between the public and the subconverter 2590 * so that the conversion framework, which only sees the public 2591 * converter, can handle truncated and illegal input etc. 2592 */ 2593 if(args->converter->toULength > 0) { 2594 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2595 } 2596 subArgs.converter->toULength = args->converter->toULength; 2597 2598 /* 2599 * Convert up to the end of the input, or to before the next escape character. 2600 * Does not handle conversion extensions because the preToU[] state etc. 2601 * is not copied. 
2602 */ 2603 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2604 2605 if(args->offsets != NULL && sourceStart != args->source) { 2606 /* update offsets to base them on the actual start of the input */ 2607 int32_t *offsets = args->offsets; 2608 UChar *target = args->target; 2609 int32_t delta = (int32_t)(args->source - sourceStart); 2610 while(target < subArgs.target) { 2611 if(*offsets >= 0) { 2612 *offsets += delta; 2613 } 2614 ++offsets; 2615 ++target; 2616 } 2617 } 2618 args->source = subArgs.source; 2619 args->target = subArgs.target; 2620 args->offsets = subArgs.offsets; 2621 2622 /* copy input/error/overflow buffers */ 2623 if(subArgs.converter->toULength > 0) { 2624 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2625 } 2626 args->converter->toULength = subArgs.converter->toULength; 2627 2628 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2629 if(subArgs.converter->UCharErrorBufferLength > 0) { 2630 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2631 subArgs.converter->UCharErrorBufferLength); 2632 } 2633 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2634 subArgs.converter->UCharErrorBufferLength = 0; 2635 } 2636 } 2637 2638 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2639 return; 2640 } 2641 2642 escape: 2643 changeState_2022(args->converter, 2644 &(args->source), 2645 args->sourceLimit, 2646 ISO_2022_KR, 2647 err); 2648 } 2649 } 2650 2651 static void 2652 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2653 UErrorCode* err){ 2654 char tempBuf[2]; 2655 const char *mySource = ( char *) args->source; 2656 UChar *myTarget = args->target; 2657 const char *mySourceLimit = args->sourceLimit; 2658 UChar32 targetUniChar = 0x0000; 2659 UChar mySourceChar = 0x0000; 2660 UConverterDataISO2022* myData; 2661 UConverterSharedData* sharedData ; 2662 UBool useFallback; 2663 2664 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2665 if(myData->version==1){ 2666 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2667 return; 2668 } 2669 2670 /* initialize state */ 2671 sharedData = myData->currentConverter->sharedData; 2672 useFallback = args->converter->useFallback; 2673 2674 if(myData->key != 0) { 2675 /* continue with a partial escape sequence */ 2676 goto escape; 2677 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2678 /* continue with a partial double-byte character */ 2679 mySourceChar = args->converter->toUBytes[0]; 2680 args->converter->toULength = 0; 2681 goto getTrailByte; 2682 } 2683 2684 while(mySource< mySourceLimit){ 2685 2686 if(myTarget < args->targetLimit){ 2687 2688 mySourceChar= (unsigned char) *mySource++; 2689 2690 if(mySourceChar==UCNV_SI){ 2691 myData->toU2022State.g = 0; 2692 if (myData->isEmptySegment) { 2693 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2694 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2695 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2696 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2697 args->converter->toULength = 1; 2698 args->target = myTarget; 2699 args->source = mySource; 2700 return; 2701 } 2702 /*consume the source */ 2703 continue; 2704 }else if(mySourceChar==UCNV_SO){ 2705 myData->toU2022State.g = 1; 2706 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2707 /*consume the source */ 2708 continue; 2709 }else if(mySourceChar==ESC_2022){ 2710 mySource--; 2711 escape: 2712 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2713 changeState_2022(args->converter,&(mySource), 2714 mySourceLimit, ISO_2022_KR, err); 2715 if(U_FAILURE(*err)){ 2716 args->target = myTarget; 2717 args->source = mySource; 2718 return; 2719 } 2720 continue; 2721 } 2722 2723 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2724 if(myData->toU2022State.g == 1) { 2725 if(mySource < mySourceLimit) { 2726 int leadIsOk, trailIsOk; 2727 uint8_t trailByte; 2728 getTrailByte: 2729 targetUniChar = missingCharMarker; 2730 trailByte = (uint8_t)*mySource; 2731 /* 2732 * Ticket 5691: consistent illegal sequences: 2733 * - We include at least the first byte in the illegal sequence. 2734 * - If any of the non-initial bytes could be the start of a character, 2735 * we stop the illegal sequence before the first one of those. 2736 * 2737 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2738 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2739 * Otherwise we convert or report the pair of bytes. 2740 */ 2741 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2742 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2743 if (leadIsOk && trailIsOk) { 2744 ++mySource; 2745 tempBuf[0] = (char)(mySourceChar + 0x80); 2746 tempBuf[1] = (char)(trailByte + 0x80); 2747 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2748 mySourceChar = (mySourceChar << 8) | trailByte; 2749 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2750 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2751 ++mySource; 2752 /* add another bit so that the code below writes 2 bytes in case of error */ 2753 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2754 } 2755 } else { 2756 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2757 args->converter->toULength = 1; 2758 break; 2759 } 2760 } 2761 else if(mySourceChar <= 0x7f) { 2762 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2763 } else { 2764 targetUniChar = 0xffff; 2765 } 2766 if(targetUniChar < 0xfffe){ 2767 if(args->offsets) { 2768 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
/*************************** END ISO2022-KR *********************************/

/*************************** ISO-2022-CN *********************************
*
* Rules for ISO-2022-CN Encoding:
* i)   The designator sequence must appear once on a line before any instance
*      of character set it designates.
* ii)  If two lines contain characters from the same character set, both lines
*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the  shifting
* iv)  All lines start in ASCII and end in ASCII.
* v)   Four shifting sequences are employed for this purpose:
*
*      Sequence    ASCII Eq    Charsets
*      ----------  -------    ---------
*      SI           <SI>       US-ASCII
*      SO           <SO>       CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2          <ESC>N     CNS-11643-1992 Plane 2
*      SS3          <ESC>O     CNS-11643-1992 Planes 3-7
*
* vi)
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
*      SS2designator : ESC "$" "*" finalchar_for_SS2
*      SS3designator : ESC "$" "+" finalchar_for_SS3
*
*      ESC $ ) A       Indicates the bytes following SO are Chinese
*       characters as defined in GB 2312-80, until
*       another SOdesignation appears
*
*
*      ESC $ ) E       Indicates the bytes following SO are as defined
*       in ISO-IR-165 (for details, see section 2.1),
*       until another SOdesignation appears
*
*      ESC $ ) G       Indicates the bytes following SO are as defined
*       in CNS 11643-plane-1, until another
*       SOdesignation appears
*
*      ESC $ * H       Indicates the two bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must precede every 2 byte
*       sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must precede every 2 byte
*       sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must precede every 2 byte
*       sequence.)
*
*      ESC $ + K       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-5, until another SS3designation
*       appears
*
*      ESC $ + L       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-6, until another SS3designation
*       appears
*
*      ESC $ + M       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-7, until another SS3designation
*       appears
*
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
*       has its own designation information before any Chinese characters
*       appear
*
*/

/* The following are defined this way to make the strings truly readonly */
/* Each string is a 4-byte designator: ESC '$' followed by ')', '*' or '+' and the final character. */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Byte sequence that selects each ISO-2022-CN charset. Entry 0 is the SI
 * shift byte, not a designator.
 * NOTE(review): the index is presumably the charset's StateEnum value - confirm against the enum.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643-1992 plane 7 */
};
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2876 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2877 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2878 2879 /********************** ISO2022-CN Data **************************/ 2880 static const char* const escSeqCharsCN[10] ={ 2881 SHIFT_IN_STR, /* 0 ASCII */ 2882 GB_2312_80_STR, /* 1 GB2312_1 */ 2883 ISO_IR_165_STR, /* 2 ISO_IR_165 */ 2884 CNS_11643_1992_Plane_1_STR, 2885 CNS_11643_1992_Plane_2_STR, 2886 CNS_11643_1992_Plane_3_STR, 2887 CNS_11643_1992_Plane_4_STR, 2888 CNS_11643_1992_Plane_5_STR, 2889 CNS_11643_1992_Plane_6_STR, 2890 CNS_11643_1992_Plane_7_STR 2891 }; 2892 2893 static void 2894 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2895 UConverter *cnv = args->converter; 2896 UConverterDataISO2022 *converterData; 2897 ISO2022State *pFromU2022State; 2898 uint8_t *target = (uint8_t *) args->target; 2899 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2900 const UChar* source = args->source; 2901 const UChar* sourceLimit = args->sourceLimit; 2902 int32_t* offsets = args->offsets; 2903 UChar32 sourceChar; 2904 char buffer[8]; 2905 int32_t len; 2906 int8_t choices[3]; 2907 int32_t choiceCount; 2908 uint32_t targetValue = 0; 2909 UBool useFallback; 2910 2911 /* set up the state */ 2912 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2913 pFromU2022State = &converterData->fromU2022State; 2914 2915 choiceCount = 0; 2916 2917 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2918 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2919 goto getTrail; 2920 } 2921 2922 while( source < sourceLimit){ 2923 if(target < targetLimit){ 2924 2925 sourceChar = *(source++); 2926 /*check if the char is a First surrogate*/ 2927 if(U16_IS_SURROGATE(sourceChar)) { 2928 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2929 getTrail: 2930 /*look ahead to find the trail 
surrogate*/ 2931 if(source < sourceLimit) { 2932 /* test the following code unit */ 2933 UChar trail=(UChar) *source; 2934 if(U16_IS_TRAIL(trail)) { 2935 source++; 2936 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2937 cnv->fromUChar32=0x00; 2938 /* convert this supplementary code point */ 2939 /* exit this condition tree */ 2940 } else { 2941 /* this is an unmatched lead code unit (1st surrogate) */ 2942 /* callback(illegal) */ 2943 *err=U_ILLEGAL_CHAR_FOUND; 2944 cnv->fromUChar32=sourceChar; 2945 break; 2946 } 2947 } else { 2948 /* no more input */ 2949 cnv->fromUChar32=sourceChar; 2950 break; 2951 } 2952 } else { 2953 /* this is an unmatched trail code unit (2nd surrogate) */ 2954 /* callback(illegal) */ 2955 *err=U_ILLEGAL_CHAR_FOUND; 2956 cnv->fromUChar32=sourceChar; 2957 break; 2958 } 2959 } 2960 2961 /* do the conversion */ 2962 if(sourceChar <= 0x007f ){ 2963 /* do not convert SO/SI/ESC */ 2964 if(IS_2022_CONTROL(sourceChar)) { 2965 /* callback(illegal) */ 2966 *err=U_ILLEGAL_CHAR_FOUND; 2967 cnv->fromUChar32=sourceChar; 2968 break; 2969 } 2970 2971 /* US-ASCII */ 2972 if(pFromU2022State->g == 0) { 2973 buffer[0] = (char)sourceChar; 2974 len = 1; 2975 } else { 2976 buffer[0] = UCNV_SI; 2977 buffer[1] = (char)sourceChar; 2978 len = 2; 2979 pFromU2022State->g = 0; 2980 choiceCount = 0; 2981 } 2982 if(sourceChar == CR || sourceChar == LF) { 2983 /* reset the state at the end of a line */ 2984 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2985 choiceCount = 0; 2986 } 2987 } 2988 else{ 2989 /* convert U+0080..U+10ffff */ 2990 int32_t i; 2991 int8_t cs, g; 2992 2993 if(choiceCount == 0) { 2994 /* try the current SO/G1 converter first */ 2995 choices[0] = pFromU2022State->cs[1]; 2996 2997 /* default to GB2312_1 if none is designated yet */ 2998 if(choices[0] == 0) { 2999 choices[0] = GB2312_1; 3000 } 3001 3002 if(converterData->version == 0) { 3003 /* ISO-2022-CN */ 3004 3005 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3006 if(choices[0] == GB2312_1) { 3007 choices[1] = (int8_t)CNS_11643_1; 3008 } else { 3009 choices[1] = (int8_t)GB2312_1; 3010 } 3011 3012 choiceCount = 2; 3013 } else if (converterData->version == 1) { 3014 /* ISO-2022-CN-EXT */ 3015 3016 /* try one of the other converters */ 3017 switch(choices[0]) { 3018 case GB2312_1: 3019 choices[1] = (int8_t)CNS_11643_1; 3020 choices[2] = (int8_t)ISO_IR_165; 3021 break; 3022 case ISO_IR_165: 3023 choices[1] = (int8_t)GB2312_1; 3024 choices[2] = (int8_t)CNS_11643_1; 3025 break; 3026 default: /* CNS_11643_x */ 3027 choices[1] = (int8_t)GB2312_1; 3028 choices[2] = (int8_t)ISO_IR_165; 3029 break; 3030 } 3031 3032 choiceCount = 3; 3033 } else { 3034 choices[0] = (int8_t)CNS_11643_1; 3035 choices[1] = (int8_t)GB2312_1; 3036 } 3037 } 3038 3039 cs = g = 0; 3040 /* 3041 * len==0: no mapping found yet 3042 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3043 * len>0: found a roundtrip result, done 3044 */ 3045 len = 0; 3046 /* 3047 * We will turn off useFallback after finding a fallback, 3048 * but we still get fallbacks from PUA code points as usual. 3049 * Therefore, we will also need to check that we don't overwrite 3050 * an early fallback with a later one. 
3051 */ 3052 useFallback = cnv->useFallback; 3053 3054 for(i = 0; i < choiceCount && len <= 0; ++i) { 3055 int8_t cs0 = choices[i]; 3056 if(cs0 > 0) { 3057 uint32_t value; 3058 int32_t len2; 3059 if(cs0 >= CNS_11643_0) { 3060 len2 = MBCS_FROM_UCHAR32_ISO2022( 3061 converterData->myConverterArray[CNS_11643], 3062 sourceChar, 3063 &value, 3064 useFallback, 3065 MBCS_OUTPUT_3); 3066 if(len2 == 3 || (len2 == -3 && len == 0)) { 3067 targetValue = value; 3068 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3069 if(len2 >= 0) { 3070 len = 2; 3071 } else { 3072 len = -2; 3073 useFallback = FALSE; 3074 } 3075 if(cs == CNS_11643_1) { 3076 g = 1; 3077 } else if(cs == CNS_11643_2) { 3078 g = 2; 3079 } else /* plane 3..7 */ if(converterData->version == 1) { 3080 g = 3; 3081 } else { 3082 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3083 len = 0; 3084 } 3085 } 3086 } else { 3087 /* GB2312_1 or ISO-IR-165 */ 3088 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3089 len2 = MBCS_FROM_UCHAR32_ISO2022( 3090 converterData->myConverterArray[cs0], 3091 sourceChar, 3092 &value, 3093 useFallback, 3094 MBCS_OUTPUT_2); 3095 if(len2 == 2 || (len2 == -2 && len == 0)) { 3096 targetValue = value; 3097 len = len2; 3098 cs = cs0; 3099 g = 1; 3100 useFallback = FALSE; 3101 } 3102 } 3103 } 3104 } 3105 3106 if(len != 0) { 3107 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3108 3109 /* write the designation sequence if necessary */ 3110 if(cs != pFromU2022State->cs[g]) { 3111 if(cs < CNS_11643) { 3112 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3113 } else { 3114 U_ASSERT(cs >= CNS_11643_1); 3115 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3116 } 3117 len = 4; 3118 pFromU2022State->cs[g] = cs; 3119 if(g == 1) { 3120 /* changing the SO/G1 charset invalidates the choices[] */ 3121 choiceCount = 0; 3122 } 3123 } 3124 3125 /* write the shift sequence if necessary */ 3126 if(g != pFromU2022State->g) { 3127 switch(g) { 3128 case 1: 3129 
buffer[len++] = UCNV_SO; 3130 3131 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3132 pFromU2022State->g = 1; 3133 break; 3134 case 2: 3135 buffer[len++] = 0x1b; 3136 buffer[len++] = 0x4e; 3137 break; 3138 default: /* case 3 */ 3139 buffer[len++] = 0x1b; 3140 buffer[len++] = 0x4f; 3141 break; 3142 } 3143 } 3144 3145 /* write the two output bytes */ 3146 buffer[len++] = (char)(targetValue >> 8); 3147 buffer[len++] = (char)targetValue; 3148 } else { 3149 /* if we cannot find the character after checking all codepages 3150 * then this is an error 3151 */ 3152 *err = U_INVALID_CHAR_FOUND; 3153 cnv->fromUChar32=sourceChar; 3154 break; 3155 } 3156 } 3157 3158 /* output len>0 bytes in buffer[] */ 3159 if(len == 1) { 3160 *target++ = buffer[0]; 3161 if(offsets) { 3162 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3163 } 3164 } else if(len == 2 && (target + 2) <= targetLimit) { 3165 *target++ = buffer[0]; 3166 *target++ = buffer[1]; 3167 if(offsets) { 3168 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3169 *offsets++ = sourceIndex; 3170 *offsets++ = sourceIndex; 3171 } 3172 } else { 3173 fromUWriteUInt8( 3174 cnv, 3175 buffer, len, 3176 &target, (const char *)targetLimit, 3177 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3178 err); 3179 if(U_FAILURE(*err)) { 3180 break; 3181 } 3182 } 3183 } /* end if(myTargetIndex<myTargetLength) */ 3184 else{ 3185 *err =U_BUFFER_OVERFLOW_ERROR; 3186 break; 3187 } 3188 3189 }/* end while(mySourceIndex<mySourceLength) */ 3190 3191 /* 3192 * the end of the input stream and detection of truncated input 3193 * are handled by the framework, but for ISO-2022-CN conversion 3194 * we need to be in ASCII mode at the very end 3195 * 3196 * conditions: 3197 * successful 3198 * not in ASCII mode 3199 * end of input and no truncated input 3200 */ 3201 if( U_SUCCESS(*err) && 3202 pFromU2022State->g!=0 && 3203 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3204 ) { 3205 int32_t sourceIndex; 3206 3207 /* we are switching to ASCII */ 3208 pFromU2022State->g=0; 3209 3210 /* get the source index of the last input character */ 3211 /* 3212 * TODO this would be simpler and more reliable if we used a pair 3213 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3214 * so that we could simply use the prevSourceIndex here; 3215 * this code gives an incorrect result for the rare case of an unmatched 3216 * trail surrogate that is alone in the last buffer of the text stream 3217 */ 3218 sourceIndex=(int32_t)(source-args->source); 3219 if(sourceIndex>0) { 3220 --sourceIndex; 3221 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3222 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3223 ) { 3224 --sourceIndex; 3225 } 3226 } else { 3227 sourceIndex=-1; 3228 } 3229 3230 fromUWriteUInt8( 3231 cnv, 3232 SHIFT_IN_STR, 1, 3233 &target, (const char *)targetLimit, 3234 &offsets, sourceIndex, 3235 err); 3236 } 3237 3238 /*save the state and return */ 3239 args->source = source; 3240 args->target = (char*)target; 3241 } 3242 3243 3244 static void 3245 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3246 UErrorCode* err){ 3247 char tempBuf[3]; 3248 const char *mySource = (char *) args->source; 3249 UChar *myTarget = args->target; 3250 const char *mySourceLimit = args->sourceLimit; 3251 uint32_t targetUniChar = 0x0000; 3252 uint32_t mySourceChar = 0x0000; 3253 UConverterDataISO2022* myData; 3254 ISO2022State *pToU2022State; 3255 3256 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3257 pToU2022State = &myData->toU2022State; 3258 3259 if(myData->key != 0) { 3260 /* continue with a partial escape sequence */ 3261 goto escape; 3262 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3263 /* continue with a partial double-byte character */ 3264 mySourceChar = args->converter->toUBytes[0]; 3265 
args->converter->toULength = 0; 3266 targetUniChar = missingCharMarker; 3267 goto getTrailByte; 3268 } 3269 3270 while(mySource < mySourceLimit){ 3271 3272 targetUniChar =missingCharMarker; 3273 3274 if(myTarget < args->targetLimit){ 3275 3276 mySourceChar= (unsigned char) *mySource++; 3277 3278 switch(mySourceChar){ 3279 case UCNV_SI: 3280 pToU2022State->g=0; 3281 if (myData->isEmptySegment) { 3282 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3283 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3284 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3285 args->converter->toUBytes[0] = mySourceChar; 3286 args->converter->toULength = 1; 3287 args->target = myTarget; 3288 args->source = mySource; 3289 return; 3290 } 3291 continue; 3292 3293 case UCNV_SO: 3294 if(pToU2022State->cs[1] != 0) { 3295 pToU2022State->g=1; 3296 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3297 continue; 3298 } else { 3299 /* illegal to have SO before a matching designator */ 3300 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3301 break; 3302 } 3303 3304 case ESC_2022: 3305 mySource--; 3306 escape: 3307 { 3308 const char * mySourceBefore = mySource; 3309 int8_t toULengthBefore = args->converter->toULength; 3310 3311 changeState_2022(args->converter,&(mySource), 3312 mySourceLimit, ISO_2022_CN,err); 3313 3314 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3315 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3316 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3317 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3318 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3319 } 3320 } 3321 3322 /* invalid or illegal escape sequence */ 3323 if(U_FAILURE(*err)){ 3324 args->target = myTarget; 3325 args->source = mySource; 3326 myData->isEmptySegment = FALSE; /* Reset to 
avoid future spurious errors */ 3327 return; 3328 } 3329 continue; 3330 3331 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3332 3333 case CR: 3334 /*falls through*/ 3335 case LF: 3336 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3337 /* falls through */ 3338 default: 3339 /* convert one or two bytes */ 3340 myData->isEmptySegment = FALSE; 3341 if(pToU2022State->g != 0) { 3342 if(mySource < mySourceLimit) { 3343 UConverterSharedData *cnv; 3344 StateEnum tempState; 3345 int32_t tempBufLen; 3346 int leadIsOk, trailIsOk; 3347 uint8_t trailByte; 3348 getTrailByte: 3349 trailByte = (uint8_t)*mySource; 3350 /* 3351 * Ticket 5691: consistent illegal sequences: 3352 * - We include at least the first byte in the illegal sequence. 3353 * - If any of the non-initial bytes could be the start of a character, 3354 * we stop the illegal sequence before the first one of those. 3355 * 3356 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3357 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3358 * Otherwise we convert or report the pair of bytes. 
3359 */ 3360 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3361 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3362 if (leadIsOk && trailIsOk) { 3363 ++mySource; 3364 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3365 if(tempState >= CNS_11643_0) { 3366 cnv = myData->myConverterArray[CNS_11643]; 3367 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3368 tempBuf[1] = (char) (mySourceChar); 3369 tempBuf[2] = (char) trailByte; 3370 tempBufLen = 3; 3371 3372 }else{ 3373 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3374 cnv = myData->myConverterArray[tempState]; 3375 tempBuf[0] = (char) (mySourceChar); 3376 tempBuf[1] = (char) trailByte; 3377 tempBufLen = 2; 3378 } 3379 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3380 mySourceChar = (mySourceChar << 8) | trailByte; 3381 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3382 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3383 ++mySource; 3384 /* add another bit so that the code below writes 2 bytes in case of error */ 3385 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3386 } 3387 if(pToU2022State->g>=2) { 3388 /* return from a single-shift state to the previous one */ 3389 pToU2022State->g=pToU2022State->prevG; 3390 } 3391 } else { 3392 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3393 args->converter->toULength = 1; 3394 goto endloop; 3395 } 3396 } 3397 else{ 3398 if(mySourceChar <= 0x7f) { 3399 targetUniChar = (UChar) mySourceChar; 3400 } 3401 } 3402 break; 3403 } 3404 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3405 if(args->offsets){ 3406 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3407 } 3408 *(myTarget++)=(UChar)targetUniChar; 3409 } 3410 else if(targetUniChar > missingCharMarker){ 3411 /* disassemble the surrogate pair and write to output*/ 3412 targetUniChar-=0x0010000; 3413 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3414 if(args->offsets){ 3415 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3416 } 3417 ++myTarget; 3418 if(myTarget< args->targetLimit){ 3419 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3420 if(args->offsets){ 3421 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3422 } 3423 ++myTarget; 3424 }else{ 3425 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3426 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3427 } 3428 3429 } 3430 else{ 3431 /* Call the callback function*/ 3432 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3433 break; 3434 } 3435 } 3436 else{ 3437 *err =U_BUFFER_OVERFLOW_ERROR; 3438 break; 3439 } 3440 } 3441 endloop: 3442 args->target = myTarget; 3443 args->source = mySource; 3444 } 3445 3446 static void 3447 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3448 UConverter *cnv = args->converter; 3449 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3450 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3451 char *p, *subchar; 3452 char buffer[8]; 3453 int32_t length; 3454 3455 subchar=(char *)cnv->subChars; 3456 length=cnv->subCharLen; /* assume length==1 for most variants */ 3457 3458 p = buffer; 3459 switch(myConverterData->locale[0]){ 3460 case 'j': 3461 { 3462 int8_t cs; 3463 3464 if(pFromU2022State->g == 1) { 3465 /* JIS7: switch from G1 to G0 */ 3466 pFromU2022State->g = 0; 3467 *p++ = UCNV_SI; 3468 } 3469 3470 cs = pFromU2022State->cs[0]; 3471 if(cs != ASCII && cs != JISX201) { 3472 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3473 pFromU2022State->cs[0] = (int8_t)ASCII; 3474 *p++ = '\x1b'; 3475 *p++ = '\x28'; 3476 *p++ = '\x42'; 3477 } 3478 3479 *p++ = subchar[0]; 3480 break; 3481 } 3482 case 'c': 3483 if(pFromU2022State->g != 0) { 3484 /* not in ASCII mode: switch to ASCII */ 3485 pFromU2022State->g = 0; 3486 *p++ = UCNV_SI; 3487 } 3488 *p++ = subchar[0]; 3489 break; 3490 case 'k': 3491 if(myConverterData->version == 0) { 3492 if(length == 1) { 3493 if((UBool)args->converter->fromUnicodeStatus) { 3494 /* in DBCS mode: switch to SBCS */ 3495 args->converter->fromUnicodeStatus = 0; 3496 *p++ = UCNV_SI; 3497 } 3498 *p++ = subchar[0]; 3499 } else /* length == 2*/ { 3500 if(!(UBool)args->converter->fromUnicodeStatus) { 3501 /* in SBCS mode: switch to DBCS */ 3502 args->converter->fromUnicodeStatus = 1; 3503 *p++ = UCNV_SO; 3504 } 3505 *p++ = subchar[0]; 3506 *p++ = subchar[1]; 3507 } 3508 break; 3509 } else { 3510 /* save the subconverter's substitution string */ 3511 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3512 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3513 3514 /* set our substitution string into the subconverter */ 3515 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3516 myConverterData->currentConverter->subCharLen = (int8_t)length; 3517 3518 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3519 args->converter = myConverterData->currentConverter; 3520 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3521 ucnv_cbFromUWriteSub(args, 0, err); 3522 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3523 args->converter = cnv; 3524 3525 /* restore the subconverter's substitution string */ 3526 myConverterData->currentConverter->subChars = currentSubChars; 3527 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3528 3529 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3530 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3531 uprv_memcpy( 3532 cnv->charErrorBuffer, 3533 myConverterData->currentConverter->charErrorBuffer, 3534 myConverterData->currentConverter->charErrorBufferLength); 3535 } 3536 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3537 myConverterData->currentConverter->charErrorBufferLength = 0; 3538 } 3539 return; 3540 } 3541 default: 3542 /* not expected */ 3543 break; 3544 } 3545 ucnv_cbFromUWriteBytes(args, 3546 buffer, (int32_t)(p - buffer), 3547 offsetIndex, err); 3548 } 3549 3550 /* 3551 * Structure for cloning an ISO 2022 converter into a single memory block. 3552 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3553 * and then ucnv_safeClone() of the sub-converter may additionally align 3554 * currentConverter inside the cloneStruct, for which we need the deadSpace 3555 * after currentConverter. 3556 * This is because UAlignedMemory may be larger than the actually 3557 * necessary alignment size for the platform. 3558 * The other cloneStruct fields will not be moved around, 3559 * and are aligned properly with cloneStruct's alignment. 
3560 */ 3561 struct cloneStruct 3562 { 3563 UConverter cnv; 3564 UConverter currentConverter; 3565 UAlignedMemory deadSpace; 3566 UConverterDataISO2022 mydata; 3567 }; 3568 3569 3570 static UConverter * 3571 _ISO_2022_SafeClone( 3572 const UConverter *cnv, 3573 void *stackBuffer, 3574 int32_t *pBufferSize, 3575 UErrorCode *status) 3576 { 3577 struct cloneStruct * localClone; 3578 UConverterDataISO2022 *cnvData; 3579 int32_t i, size; 3580 3581 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3582 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3583 return NULL; 3584 } 3585 3586 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3587 localClone = (struct cloneStruct *)stackBuffer; 3588 3589 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3590 3591 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3592 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3593 localClone->cnv.isExtraLocal = TRUE; 3594 3595 /* share the subconverters */ 3596 3597 if(cnvData->currentConverter != NULL) { 3598 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3599 localClone->mydata.currentConverter = 3600 ucnv_safeClone(cnvData->currentConverter, 3601 &localClone->currentConverter, 3602 &size, status); 3603 if(U_FAILURE(*status)) { 3604 return NULL; 3605 } 3606 } 3607 3608 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3609 if(cnvData->myConverterArray[i] != NULL) { 3610 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3611 } 3612 } 3613 3614 return &localClone->cnv; 3615 } 3616 3617 static void 3618 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3619 const USetAdder *sa, 3620 UConverterUnicodeSet which, 3621 UErrorCode *pErrorCode) 3622 { 3623 int32_t i; 3624 UConverterDataISO2022* cnvData; 3625 3626 if (U_FAILURE(*pErrorCode)) { 3627 return; 3628 } 3629 #ifdef U_ENABLE_GENERIC_ISO_2022 3630 if (cnv->sharedData == &_ISO2022Data) { 
3631 /* We use UTF-8 in this case */ 3632 sa->addRange(sa->set, 0, 0xd7FF); 3633 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3634 return; 3635 } 3636 #endif 3637 3638 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3639 3640 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3641 switch(cnvData->locale[0]){ 3642 case 'j': 3643 /* include JIS X 0201 which is hardcoded */ 3644 sa->add(sa->set, 0xa5); 3645 sa->add(sa->set, 0x203e); 3646 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3647 /* include Latin-1 for some variants of JP */ 3648 sa->addRange(sa->set, 0, 0xff); 3649 } else { 3650 /* include ASCII for JP */ 3651 sa->addRange(sa->set, 0, 0x7f); 3652 } 3653 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3654 /* 3655 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3656 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3657 * use half-width Katakana. 3658 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3659 * half-width Katakana via the ESC ( I sequence. 3660 * However, we only emit (fromUnicode) half-width Katakana according to the 3661 * definition of each variant. 3662 * 3663 * When including fallbacks, 3664 * we need to include half-width Katakana Unicode code points for all JP variants because 3665 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3666 */ 3667 /* include half-width Katakana for JP */ 3668 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3669 } 3670 break; 3671 case 'c': 3672 case 'z': 3673 /* include ASCII for CN */ 3674 sa->addRange(sa->set, 0, 0x7f); 3675 break; 3676 case 'k': 3677 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3678 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3679 cnvData->currentConverter, sa, which, pErrorCode); 3680 /* the loop over myConverterArray[] will simply not find another converter */ 3681 break; 3682 default: 3683 break; 3684 } 3685 3686 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3687 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3688 cnvData->version==0 && i==CNS_11643 3689 ) { 3690 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3691 ucnv_MBCSGetUnicodeSetForBytes( 3692 cnvData->myConverterArray[i], 3693 sa, UCNV_ROUNDTRIP_SET, 3694 0, 0x81, 0x82, 3695 pErrorCode); 3696 } 3697 #endif 3698 3699 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3700 UConverterSetFilter filter; 3701 if(cnvData->myConverterArray[i]!=NULL) { 3702 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3703 cnvData->version==0 && i==CNS_11643 3704 ) { 3705 /* 3706 * Version-specific for CN: 3707 * CN version 0 does not map CNS planes 3..7 although 3708 * they are all available in the CNS conversion table; 3709 * CN version 1 (-EXT) does map them all. 3710 * The two versions create different Unicode sets. 3711 */ 3712 filter=UCNV_SET_FILTER_2022_CN; 3713 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3714 /* 3715 * Only add code points that map to Shift-JIS codes 3716 * corresponding to JIS X 0208. 
3717 */ 3718 filter=UCNV_SET_FILTER_SJIS; 3719 } else if(i==KSC5601) { 3720 /* 3721 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3722 * are broader than GR94. 3723 */ 3724 filter=UCNV_SET_FILTER_GR94DBCS; 3725 } else { 3726 filter=UCNV_SET_FILTER_NONE; 3727 } 3728 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3729 } 3730 } 3731 3732 /* 3733 * ISO 2022 converters must not convert SO/SI/ESC despite what 3734 * sub-converters do by themselves. 3735 * Remove these characters from the set. 3736 */ 3737 sa->remove(sa->set, 0x0e); 3738 sa->remove(sa->set, 0x0f); 3739 sa->remove(sa->set, 0x1b); 3740 3741 /* ISO 2022 converters do not convert C1 controls either */ 3742 sa->removeRange(sa->set, 0x80, 0x9f); 3743 } 3744 3745 static const UConverterImpl _ISO2022Impl={ 3746 UCNV_ISO_2022, 3747 3748 NULL, 3749 NULL, 3750 3751 _ISO2022Open, 3752 _ISO2022Close, 3753 _ISO2022Reset, 3754 3755 #ifdef U_ENABLE_GENERIC_ISO_2022 3756 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3757 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3758 ucnv_fromUnicode_UTF8, 3759 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3760 #else 3761 NULL, 3762 NULL, 3763 NULL, 3764 NULL, 3765 #endif 3766 NULL, 3767 3768 NULL, 3769 _ISO2022getName, 3770 _ISO_2022_WriteSub, 3771 _ISO_2022_SafeClone, 3772 _ISO_2022_GetUnicodeSet, 3773 3774 NULL, 3775 NULL 3776 }; 3777 static const UConverterStaticData _ISO2022StaticData={ 3778 sizeof(UConverterStaticData), 3779 "ISO_2022", 3780 2022, 3781 UCNV_IBM, 3782 UCNV_ISO_2022, 3783 1, 3784 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3785 { 0x1a, 0, 0, 0 }, 3786 1, 3787 FALSE, 3788 FALSE, 3789 0, 3790 0, 3791 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3792 }; 3793 const UConverterSharedData _ISO2022Data={ 3794 sizeof(UConverterSharedData), 3795 ~((uint32_t) 0), 3796 NULL, 3797 NULL, 3798 &_ISO2022StaticData, 3799 FALSE, 3800 &_ISO2022Impl, 3801 0, 
UCNV_MBCS_TABLE_INITIALIZER 3802 }; 3803 3804 /*************JP****************/ 3805 static const UConverterImpl _ISO2022JPImpl={ 3806 UCNV_ISO_2022, 3807 3808 NULL, 3809 NULL, 3810 3811 _ISO2022Open, 3812 _ISO2022Close, 3813 _ISO2022Reset, 3814 3815 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3816 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3817 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3818 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3819 NULL, 3820 3821 NULL, 3822 _ISO2022getName, 3823 _ISO_2022_WriteSub, 3824 _ISO_2022_SafeClone, 3825 _ISO_2022_GetUnicodeSet, 3826 3827 NULL, 3828 NULL 3829 }; 3830 static const UConverterStaticData _ISO2022JPStaticData={ 3831 sizeof(UConverterStaticData), 3832 "ISO_2022_JP", 3833 0, 3834 UCNV_IBM, 3835 UCNV_ISO_2022, 3836 1, 3837 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3838 { 0x1a, 0, 0, 0 }, 3839 1, 3840 FALSE, 3841 FALSE, 3842 0, 3843 0, 3844 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3845 }; 3846 3847 namespace { 3848 3849 const UConverterSharedData _ISO2022JPData={ 3850 sizeof(UConverterSharedData), 3851 ~((uint32_t) 0), 3852 NULL, 3853 NULL, 3854 &_ISO2022JPStaticData, 3855 FALSE, 3856 &_ISO2022JPImpl, 3857 0, UCNV_MBCS_TABLE_INITIALIZER 3858 }; 3859 3860 } // namespace 3861 3862 /************* KR ***************/ 3863 static const UConverterImpl _ISO2022KRImpl={ 3864 UCNV_ISO_2022, 3865 3866 NULL, 3867 NULL, 3868 3869 _ISO2022Open, 3870 _ISO2022Close, 3871 _ISO2022Reset, 3872 3873 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3874 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3875 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3876 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3877 NULL, 3878 3879 NULL, 3880 _ISO2022getName, 3881 _ISO_2022_WriteSub, 3882 _ISO_2022_SafeClone, 3883 _ISO_2022_GetUnicodeSet, 3884 3885 NULL, 3886 NULL 3887 }; 3888 static const UConverterStaticData _ISO2022KRStaticData={ 3889 sizeof(UConverterStaticData), 3890 "ISO_2022_KR", 
3891 0, 3892 UCNV_IBM, 3893 UCNV_ISO_2022, 3894 1, 3895 3, /* max 3 bytes per UChar: SO+DBCS */ 3896 { 0x1a, 0, 0, 0 }, 3897 1, 3898 FALSE, 3899 FALSE, 3900 0, 3901 0, 3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3903 }; 3904 3905 namespace { 3906 3907 const UConverterSharedData _ISO2022KRData={ 3908 sizeof(UConverterSharedData), 3909 ~((uint32_t) 0), 3910 NULL, 3911 NULL, 3912 &_ISO2022KRStaticData, 3913 FALSE, 3914 &_ISO2022KRImpl, 3915 0, UCNV_MBCS_TABLE_INITIALIZER 3916 }; 3917 3918 } // namespace 3919 3920 /*************** CN ***************/ 3921 static const UConverterImpl _ISO2022CNImpl={ 3922 3923 UCNV_ISO_2022, 3924 3925 NULL, 3926 NULL, 3927 3928 _ISO2022Open, 3929 _ISO2022Close, 3930 _ISO2022Reset, 3931 3932 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3933 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3934 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3935 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3936 NULL, 3937 3938 NULL, 3939 _ISO2022getName, 3940 _ISO_2022_WriteSub, 3941 _ISO_2022_SafeClone, 3942 _ISO_2022_GetUnicodeSet, 3943 3944 NULL, 3945 NULL 3946 }; 3947 static const UConverterStaticData _ISO2022CNStaticData={ 3948 sizeof(UConverterStaticData), 3949 "ISO_2022_CN", 3950 0, 3951 UCNV_IBM, 3952 UCNV_ISO_2022, 3953 1, 3954 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3955 { 0x1a, 0, 0, 0 }, 3956 1, 3957 FALSE, 3958 FALSE, 3959 0, 3960 0, 3961 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3962 }; 3963 3964 namespace { 3965 3966 const UConverterSharedData _ISO2022CNData={ 3967 sizeof(UConverterSharedData), 3968 ~((uint32_t) 0), 3969 NULL, 3970 NULL, 3971 &_ISO2022CNStaticData, 3972 FALSE, 3973 &_ISO2022CNImpl, 3974 0, UCNV_MBCS_TABLE_INITIALIZER 3975 }; 3976 3977 } // namespace 3978 3979 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3980