/*
**********************************************************************
*   Copyright (C) 2000-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv2022.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000feb03
*   created by: Markus W. Scherer
*
*   Change history:
*
*   06/29/2000  helena  Major rewrite of the callback APIs.
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
*                       Changed implementation of toUnicode
*                       function
*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
*                       ucnvebdc.c
*   09/20/2000  Ram     Added support for ISO-2022-CN
*                       Added implementations for getNextUChar()
*                       for specific 2022 country variants.
*   10/31/2000  Ram     Implemented offsets logic functions
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "unicode/utf16.h"
#include "ucnv_imp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#include "cmemory.h"
#include "uassert.h"

/*
 * Element count of a true array, as int32_t.
 * Only meaningful for arrays; applied to a pointer it yields sizeof(pointer)/sizeof(element).
 */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * I am disabling the generic ISO-2022 converter after proposing to do so on
 * the icu mailing list two days ago.
 *
 * Reasons:
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
 *    its designation sequences, single shifts with return to the previous state,
 *    switch-with-no-return to UTF-16BE or similar, etc.
 *    This is unlike the language-specific variants like ISO-2022-JP which
 *    require a much smaller repertoire of ISO-2022 features.
 *    These variants continue to be supported.
 * 2. I believe that no one is really using the generic ISO-2022 converter
 *    but rather always one of the language-specific variants.
 *    Note that ICU's generic ISO-2022 converter has always output one escape
 *    sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 *    the previous converter is closed and a new one opened,
 *    without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 *    reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 *    This means, for example, that when ISO-8859-7 is designated, the following
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 *    The ICU ISO-2022 converter does not handle this - and has no information
 *    about which subconverter would have to be shifted vs. which is designed
 *    for 7-bit ISO-2022.
 *
 * Markus Scherer 2003-dec-03
 */
#endif

/* The single byte 0x0F (SI, per the IS_2022_CONTROL comment below) as a NUL-terminated string. */
static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the C0 controls / whitespace that get special treatment. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/*
 * Inclusive code point range U+FF61..U+FF9F.
 * NOTE(review): by name this is the Unicode halfwidth Katakana range
 * (see HWKANA_7BIT in StateEnum) - confirm against the users further down the file.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The (c)<0x20 test runs first so that the shift count stays below 32.
 * The argument is evaluated twice; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 *
 * The JP and CN groups deliberately reuse the same small integers
 * (e.g. ISO8859_1 == GB2312_1 == 1, JISX201 == CNS_11643 == 3), so a value
 * is only meaningful together with the variant (JP vs. CN) it is used for.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        /* single-shift pseudo states; SS3_STATE is implicitly 0x11 */
        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;

/*
 * is the StateEnum charset value for a DBCS charset?
 * True for the JP values 4..7: JISX208, JISX212, GB2312, KSC5601.
 * The argument is evaluated twice.
 */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/*
 * Charset mask: a uint16_t with only bit number cs set.
 * The JP charset values used with it in this file are 0..8, which fit in 16 bits.
 */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
*/
/* Highest supported "version" option for locale=ja; also the last valid index of jpCharsetMasks[]. */
enum { MAX_JA_VERSION=4 };
/* Indexed by version 0..MAX_JA_VERSION. Versions 2, 3 and 4 currently allow the same set of charsets. */
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};

/* Kind of sub-converter; stored in UConverterDataISO2022.currentType. */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;

/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The low 4 bits of the converter options carry the variant version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Dimension of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter state of an ISO-2022 converter, stored in UConverter.extraInfo. */
typedef struct{
    /* cached shared data of the sub-charsets, indexed by the small StateEnum values */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* a complete sub-converter; opened by the KR variant (and by the disabled generic one) */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    ISO2022State toU2022State, fromU2022State;
    /* key of the escape sequence read so far, see getKey_2022(); 0 when no sequence is pending */
    uint32_t key;
    /* version option, normalized by _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;
    /* e.g. "ISO_2022,locale=ja,version=0": 28 characters plus the terminating NUL */
    char name[30];
    /* two-letter variant code ("ja", "ko", "cn") plus NUL; stays empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

/* Result of looking up the escape sequence read so far; also the element values of escSeqStateTable_Value_2022[]. */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
 * The way these state transition arrays work is:
 * ex : ESC$B is the sequence for JISX208
 *      a) First Iteration: char is ESC
 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[0]
 *        iii) Save this index as offset
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      b) Switch on this state and continue to next char
 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 *             which is normalize_esq_chars_2022[36] == 4
 *         ii) x is currently 1(from above)
 *               x<<=5 -- x is now 32
 *               x+=normalize_esq_chars_2022[36]
 *               now x is 36
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      c) Switch on this state and continue to next char
 *          i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 *         ii) x is currently 36 (from above)
 *             x<<=5 -- x is now 1152
 *             x+=normalize_esq_chars_2022[66]
 *             now x is 1161
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 *             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *          v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 */


/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small code 1..29 if the byte can occur in one of
 * the known escape sequences, and to 0 otherwise. Every code fits in the
 * 5 bits by which getKey_2022() shifts the key per byte.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
* For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74
/*
 * Keys of all known escape sequences and of their proper prefixes.
 * A key is built one byte at a time as (key<<5)+normalize_esq_chars_2022[byte].
 * The values are in strictly ascending order because getKey_2022() binary-searches
 * this table; the index of a match is the "offset" into the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Parallel to escSeqStateTable_Key_2022[]: name of the converter to open for
 * the escape sequence at the same index, or NULL where there is none.
 * Only used by the generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif

/*
 * Parallel to escSeqStateTable_Key_2022[]: whether the key at the same index
 * is a complete escape sequence, a proper prefix of one, or (index 10,
 * key 1097, ESC ( B) both. The values are UCNV_TableStates_2022 constants.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};


/* Type def for refactoring changeState_2022 code*/
/* Selects which variant's escape-sequence handling changeState_2022() applies. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;

/*********** ISO 2022 Converter Protos ***********/
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

static void
 _ISO2022Close(UConverter *converter);

static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/*
 * Shared-data objects of the three language variants; _ISO2022Open() stores
 * the address of one of them in cnv->sharedData. Only declared here.
 */
namespace {

/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;

}  // namespace

/*************** Converter implementations ******************/

/* The purpose of this function is to get around gcc compiler warnings.
*/ 410 static inline void 411 fromUWriteUInt8(UConverter *cnv, 412 const char *bytes, int32_t length, 413 uint8_t **target, const char *targetLimit, 414 int32_t **offsets, 415 int32_t sourceIndex, 416 UErrorCode *pErrorCode) 417 { 418 char *targetChars = (char *)*target; 419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 420 offsets, sourceIndex, pErrorCode); 421 *target = (uint8_t*)targetChars; 422 423 } 424 425 static inline void 426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 427 if(myConverterData->version == 1) { 428 UConverter *cnv = myConverterData->currentConverter; 429 430 cnv->toUnicodeStatus=0; /* offset */ 431 cnv->mode=0; /* state */ 432 cnv->toULength=0; /* byteIndex */ 433 } 434 } 435 436 static inline void 437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 438 /* in ISO-2022-KR the designator sequence appears only once 439 * in a file so we append it only once 440 */ 441 if( converter->charErrorBufferLength==0){ 442 443 converter->charErrorBufferLength = 4; 444 converter->charErrorBuffer[0] = 0x1b; 445 converter->charErrorBuffer[1] = 0x24; 446 converter->charErrorBuffer[2] = 0x29; 447 converter->charErrorBuffer[3] = 0x43; 448 } 449 if(myConverterData->version == 1) { 450 UConverter *cnv = myConverterData->currentConverter; 451 452 cnv->fromUChar32=0; 453 cnv->fromUnicodeStatus=1; /* prevLength */ 454 } 455 } 456 457 static void 458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 459 460 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 461 462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 463 if(cnv->extraInfo != NULL) { 464 UConverterNamePieces stackPieces; 465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 467 uint32_t version; 468 469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 470 
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 472 myConverterData->currentType = ASCII1; 473 cnv->fromUnicodeStatus =FALSE; 474 if(pArgs->locale){ 475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 476 } 477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 478 myConverterData->version = version; 479 480 /* BEGIN android-changed */ 481 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 482 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 483 if((myLocale[0]=='j' && 484 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 485 myLocale[1]=='s') && 486 (myLocale[2]=='_' || myLocale[2]=='\0'))) 487 { 488 size_t len=0; 489 /* open the required converters and cache them */ 490 if(version>MAX_JA_VERSION) { 491 /* prevent indexing beyond jpCharsetMasks[] */ 492 myConverterData->version = version = 0; 493 } 494 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 495 myConverterData->myConverterArray[ISO8859_7] = 496 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 497 } 498 if (myLocale[1]=='k') { /* Use KDDI's version. */ 499 myConverterData->myConverterArray[JISX208] = 500 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 501 } else if (myLocale[1]=='s') { /* Use SoftBank's version. 
*/ 502 myConverterData->myConverterArray[JISX208] = 503 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 504 } else { 505 myConverterData->myConverterArray[JISX208] = 506 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 507 } 508 /* END android-changed */ 509 510 if(jpCharsetMasks[version]&CSM(JISX212)) { 511 myConverterData->myConverterArray[JISX212] = 512 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 513 } 514 if(jpCharsetMasks[version]&CSM(GB2312)) { 515 myConverterData->myConverterArray[GB2312] = 516 /* BEGIN android-changed */ 517 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 518 /* END android-changed */ 519 } 520 if(jpCharsetMasks[version]&CSM(KSC5601)) { 521 myConverterData->myConverterArray[KSC5601] = 522 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 523 } 524 525 /* set the function pointers to appropriate funtions */ 526 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 527 uprv_strcpy(myConverterData->locale,"ja"); 528 529 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 530 len = uprv_strlen(myConverterData->name); 531 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 532 myConverterData->name[len+1]='\0'; 533 } 534 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 535 (myLocale[2]=='_' || myLocale[2]=='\0')) 536 { 537 const char *cnvName; 538 if(version==1) { 539 cnvName="icu-internal-25546"; 540 } else { 541 /* BEGIN android-changed */ 542 cnvName="ksc_5601"; 543 /* END android-changed */ 544 myConverterData->version=version=0; 545 } 546 if(pArgs->onlyTestIsLoadable) { 547 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 548 uprv_free(cnv->extraInfo); 549 cnv->extraInfo=NULL; 550 return; 551 } else { 552 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 553 if (U_FAILURE(*errorCode)) { 554 
_ISO2022Close(cnv); 555 return; 556 } 557 558 if(version==1) { 559 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 560 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 561 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 562 }else{ 563 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 564 } 565 566 /* initialize the state variables */ 567 setInitialStateToUnicodeKR(cnv, myConverterData); 568 setInitialStateFromUnicodeKR(cnv, myConverterData); 569 570 /* set the function pointers to appropriate funtions */ 571 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 572 uprv_strcpy(myConverterData->locale,"ko"); 573 } 574 } 575 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 576 (myLocale[2]=='_' || myLocale[2]=='\0')) 577 { 578 579 /* open the required converters and cache them */ 580 /* BEGIN android-changed */ 581 myConverterData->myConverterArray[GB2312_1] = 582 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 583 if(version==1) { 584 myConverterData->myConverterArray[ISO_IR_165] = 585 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 586 } 587 myConverterData->myConverterArray[CNS_11643] = 588 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 589 /* END android-changed */ 590 591 592 /* set the function pointers to appropriate funtions */ 593 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 594 uprv_strcpy(myConverterData->locale,"cn"); 595 596 if (version==0){ 597 myConverterData->version = 0; 598 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 599 }else if (version==1){ 600 myConverterData->version = 1; 601 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 602 }else { 603 myConverterData->version = 2; 604 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 605 } 606 } 607 
else{ 608 #ifdef U_ENABLE_GENERIC_ISO_2022 609 myConverterData->isFirstBuffer = TRUE; 610 611 /* append the UTF-8 escape sequence */ 612 cnv->charErrorBufferLength = 3; 613 cnv->charErrorBuffer[0] = 0x1b; 614 cnv->charErrorBuffer[1] = 0x25; 615 cnv->charErrorBuffer[2] = 0x42; 616 617 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 618 /* initialize the state variables */ 619 uprv_strcpy(myConverterData->name,"ISO_2022"); 620 #else 621 *errorCode = U_UNSUPPORTED_ERROR; 622 return; 623 #endif 624 } 625 626 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 627 628 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 629 _ISO2022Close(cnv); 630 } 631 } else { 632 *errorCode = U_MEMORY_ALLOCATION_ERROR; 633 } 634 } 635 636 637 static void 638 _ISO2022Close(UConverter *converter) { 639 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 640 UConverterSharedData **array = myData->myConverterArray; 641 int32_t i; 642 643 if (converter->extraInfo != NULL) { 644 /*close the array of converter pointers and free the memory*/ 645 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 646 if(array[i]!=NULL) { 647 ucnv_unloadSharedDataIfReady(array[i]); 648 } 649 } 650 651 ucnv_close(myData->currentConverter); 652 653 if(!converter->isExtraLocal){ 654 uprv_free (converter->extraInfo); 655 converter->extraInfo = NULL; 656 } 657 } 658 } 659 660 static void 661 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 662 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 663 if(choice<=UCNV_RESET_TO_UNICODE) { 664 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 665 myConverterData->key = 0; 666 myConverterData->isEmptySegment = FALSE; 667 } 668 if(choice!=UCNV_RESET_TO_UNICODE) { 669 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 670 } 671 #ifdef U_ENABLE_GENERIC_ISO_2022 672 if(myConverterData->locale[0] == 0){ 673 
if(choice<=UCNV_RESET_TO_UNICODE) { 674 myConverterData->isFirstBuffer = TRUE; 675 myConverterData->key = 0; 676 if (converter->mode == UCNV_SO){ 677 ucnv_close (myConverterData->currentConverter); 678 myConverterData->currentConverter=NULL; 679 } 680 converter->mode = UCNV_SI; 681 } 682 if(choice!=UCNV_RESET_TO_UNICODE) { 683 /* re-append UTF-8 escape sequence */ 684 converter->charErrorBufferLength = 3; 685 converter->charErrorBuffer[0] = 0x1b; 686 converter->charErrorBuffer[1] = 0x28; 687 converter->charErrorBuffer[2] = 0x42; 688 } 689 } 690 else 691 #endif 692 { 693 /* reset the state variables */ 694 if(myConverterData->locale[0] == 'k'){ 695 if(choice<=UCNV_RESET_TO_UNICODE) { 696 setInitialStateToUnicodeKR(converter, myConverterData); 697 } 698 if(choice!=UCNV_RESET_TO_UNICODE) { 699 setInitialStateFromUnicodeKR(converter, myConverterData); 700 } 701 } 702 } 703 } 704 705 static const char* 706 _ISO2022getName(const UConverter* cnv){ 707 if(cnv->extraInfo){ 708 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 709 return myData->name; 710 } 711 return NULL; 712 } 713 714 715 /*************** to unicode *******************/ 716 /**************************************************************************** 717 * Recognized escape sequences are 718 * <ESC>(B ASCII 719 * <ESC>.A ISO-8859-1 720 * <ESC>.F ISO-8859-7 721 * <ESC>(J JISX-201 722 * <ESC>(I JISX-201 723 * <ESC>$B JISX-208 724 * <ESC>$@ JISX-208 725 * <ESC>$(D JISX-212 726 * <ESC>$A GB2312 727 * <ESC>$(C KSC5601 728 */ 729 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 730 /* 0 1 2 3 4 5 6 7 8 9 */ 731 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 732 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 733 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 734 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 735 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 736 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 737 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 738 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 739 }; 740 741 /*************** to unicode *******************/ 742 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 743 /* 0 1 2 3 4 5 6 7 8 9 */ 744 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 745 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 746 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 747 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 748 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 749 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 750 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 751 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 752 }; 753 754 755 static 
UCNV_TableStates_2022 756 getKey_2022(char c,int32_t* key,int32_t* offset){ 757 int32_t togo; 758 int32_t low = 0; 759 int32_t hi = MAX_STATES_2022; 760 int32_t oldmid=0; 761 762 togo = normalize_esq_chars_2022[(uint8_t)c]; 763 if(togo == 0) { 764 /* not a valid character anywhere in an escape sequence */ 765 *key = 0; 766 *offset = 0; 767 return INVALID_2022; 768 } 769 togo = (*key << 5) + togo; 770 771 while (hi != low) /*binary search*/{ 772 773 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 774 775 if (mid == oldmid) 776 break; 777 778 if (escSeqStateTable_Key_2022[mid] > togo){ 779 hi = mid; 780 } 781 else if (escSeqStateTable_Key_2022[mid] < togo){ 782 low = mid; 783 } 784 else /*we found it*/{ 785 *key = togo; 786 *offset = mid; 787 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 788 } 789 oldmid = mid; 790 791 } 792 793 *key = 0; 794 *offset = 0; 795 return INVALID_2022; 796 } 797 798 /*runs through a state machine to determine the escape sequence - codepage correspondance 799 */ 800 static void 801 changeState_2022(UConverter* _this, 802 const char** source, 803 const char* sourceLimit, 804 Variant2022 var, 805 UErrorCode* err){ 806 UCNV_TableStates_2022 value; 807 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 808 uint32_t key = myData2022->key; 809 int32_t offset = 0; 810 int8_t initialToULength = _this->toULength; 811 char c; 812 813 value = VALID_NON_TERMINAL_2022; 814 while (*source < sourceLimit) { 815 c = *(*source)++; 816 _this->toUBytes[_this->toULength++]=(uint8_t)c; 817 value = getKey_2022(c,(int32_t *) &key, &offset); 818 819 switch (value){ 820 821 case VALID_NON_TERMINAL_2022 : 822 /* continue with the loop */ 823 break; 824 825 case VALID_TERMINAL_2022: 826 key = 0; 827 goto DONE; 828 829 case INVALID_2022: 830 goto DONE; 831 832 case VALID_MAYBE_TERMINAL_2022: 833 #ifdef U_ENABLE_GENERIC_ISO_2022 834 /* ESC ( B is ambiguous only for ISO_2022 itself */ 835 if(var == ISO_2022) { 836 /* 
discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 837 _this->toULength = 0; 838 839 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 840 841 /* continue with the loop */ 842 value = VALID_NON_TERMINAL_2022; 843 break; 844 } else 845 #endif 846 { 847 /* not ISO_2022 itself, finish here */ 848 value = VALID_TERMINAL_2022; 849 key = 0; 850 goto DONE; 851 } 852 } 853 } 854 855 DONE: 856 myData2022->key = key; 857 858 if (value == VALID_NON_TERMINAL_2022) { 859 /* indicate that the escape sequence is incomplete: key!=0 */ 860 return; 861 } else if (value == INVALID_2022 ) { 862 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 863 } else /* value == VALID_TERMINAL_2022 */ { 864 switch(var){ 865 #ifdef U_ENABLE_GENERIC_ISO_2022 866 case ISO_2022: 867 { 868 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 869 if(chosenConverterName == NULL) { 870 /* SS2 or SS3 */ 871 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 872 _this->toUCallbackReason = UCNV_UNASSIGNED; 873 return; 874 } 875 876 _this->mode = UCNV_SI; 877 ucnv_close(myData2022->currentConverter); 878 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 879 if(U_SUCCESS(*err)) { 880 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 881 _this->mode = UCNV_SO; 882 } 883 break; 884 } 885 #endif 886 case ISO_2022_JP: 887 { 888 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 889 switch(tempState) { 890 case INVALID_STATE: 891 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 892 break; 893 case SS2_STATE: 894 if(myData2022->toU2022State.cs[2]!=0) { 895 if(myData2022->toU2022State.g<2) { 896 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 897 } 898 myData2022->toU2022State.g=2; 899 } else { 900 /* illegal to have SS2 before a matching designator */ 901 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 902 } 903 break; 904 /* case SS3_STATE: not used in ISO-2022-JP-x */ 905 case 
ISO8859_1: 906 case ISO8859_7: 907 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 908 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 909 } else { 910 /* G2 charset for SS2 */ 911 myData2022->toU2022State.cs[2]=(int8_t)tempState; 912 } 913 break; 914 default: 915 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 917 } else { 918 /* G0 charset */ 919 myData2022->toU2022State.cs[0]=(int8_t)tempState; 920 } 921 break; 922 } 923 } 924 break; 925 case ISO_2022_CN: 926 { 927 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 928 switch(tempState) { 929 case INVALID_STATE: 930 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 931 break; 932 case SS2_STATE: 933 if(myData2022->toU2022State.cs[2]!=0) { 934 if(myData2022->toU2022State.g<2) { 935 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 936 } 937 myData2022->toU2022State.g=2; 938 } else { 939 /* illegal to have SS2 before a matching designator */ 940 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 941 } 942 break; 943 case SS3_STATE: 944 if(myData2022->toU2022State.cs[3]!=0) { 945 if(myData2022->toU2022State.g<2) { 946 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 947 } 948 myData2022->toU2022State.g=3; 949 } else { 950 /* illegal to have SS3 before a matching designator */ 951 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 952 } 953 break; 954 case ISO_IR_165: 955 if(myData2022->version==0) { 956 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 957 break; 958 } 959 /*fall through*/ 960 case GB2312_1: 961 /*fall through*/ 962 case CNS_11643_1: 963 myData2022->toU2022State.cs[1]=(int8_t)tempState; 964 break; 965 case CNS_11643_2: 966 myData2022->toU2022State.cs[2]=(int8_t)tempState; 967 break; 968 default: 969 /* other CNS 11643 planes */ 970 if(myData2022->version==0) { 971 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 972 } else { 973 myData2022->toU2022State.cs[3]=(int8_t)tempState; 974 } 975 break; 976 } 977 } 978 break; 979 case ISO_2022_KR: 980 if(offset==0x30){ 981 /* 
nothing to be done, just accept this one escape sequence */ 982 } else { 983 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 984 } 985 break; 986 987 default: 988 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 989 break; 990 } 991 } 992 if(U_SUCCESS(*err)) { 993 _this->toULength = 0; 994 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 995 if(_this->toULength>1) { 996 /* 997 * Ticket 5691: consistent illegal sequences: 998 * - We include at least the first byte (ESC) in the illegal sequence. 999 * - If any of the non-initial bytes could be the start of a character, 1000 * we stop the illegal sequence before the first one of those. 1001 * In escape sequences, all following bytes are "printable", that is, 1002 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 1003 * they are valid single/lead bytes. 1004 * For simplicity, we always only report the initial ESC byte as the 1005 * illegal sequence and back out all other bytes we looked at. 1006 */ 1007 /* Back out some bytes. */ 1008 int8_t backOutDistance=_this->toULength-1; 1009 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1010 if(backOutDistance<=bytesFromThisBuffer) { 1011 /* same as initialToULength<=1 */ 1012 *source-=backOutDistance; 1013 } else { 1014 /* Back out bytes from the previous buffer: Need to replay them. */ 1015 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1016 /* same as -(initialToULength-1) */ 1017 /* preToULength is negative! 
*/ 1018 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1019 *source-=bytesFromThisBuffer; 1020 } 1021 _this->toULength=1; 1022 } 1023 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1024 _this->toUCallbackReason = UCNV_UNASSIGNED; 1025 } 1026 } 1027 1028 /*Checks the characters of the buffer against valid 2022 escape sequences 1029 *if the match we return a pointer to the initial start of the sequence otherwise 1030 *we return sourceLimit 1031 */ 1032 /*for 2022 looks ahead in the stream 1033 *to determine the longest possible convertible 1034 *data stream 1035 */ 1036 static inline const char* 1037 getEndOfBuffer_2022(const char** source, 1038 const char* sourceLimit, 1039 UBool /*flush*/){ 1040 1041 const char* mySource = *source; 1042 1043 #ifdef U_ENABLE_GENERIC_ISO_2022 1044 if (*source >= sourceLimit) 1045 return sourceLimit; 1046 1047 do{ 1048 1049 if (*mySource == ESC_2022){ 1050 int8_t i; 1051 int32_t key = 0; 1052 int32_t offset; 1053 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1054 1055 /* Kludge: I could not 1056 * figure out the reason for validating an escape sequence 1057 * twice - once here and once in changeState_2022(). 1058 * is it possible to have an ESC character in a ISO2022 1059 * byte stream which is valid in a code page? Is it legal? 
1060 */ 1061 for (i=0; 1062 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1063 i++) { 1064 value = getKey_2022(*(mySource+i), &key, &offset); 1065 } 1066 if (value > 0 || *mySource==ESC_2022) 1067 return mySource; 1068 1069 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1070 return sourceLimit; 1071 } 1072 }while (++mySource < sourceLimit); 1073 1074 return sourceLimit; 1075 #else 1076 while(mySource < sourceLimit && *mySource != ESC_2022) { 1077 ++mySource; 1078 } 1079 return mySource; 1080 #endif 1081 } 1082 1083 1084 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1085 * any future change in _MBCSFromUChar32() function should be reflected here. 1086 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1087 */ 1088 static inline int32_t 1089 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1090 UChar32 c, 1091 uint32_t* value, 1092 UBool useFallback, 1093 int outputType) 1094 { 1095 const int32_t *cx; 1096 const uint16_t *table; 1097 uint32_t stage2Entry; 1098 uint32_t myValue; 1099 int32_t length; 1100 const uint8_t *p; 1101 /* 1102 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1103 * Use internal version of ucnv_open() that verifies that the new structures are available, 1104 * else U_INTERNAL_PROGRAM_ERROR. 
1105 */ 1106 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1107 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1108 table=sharedData->mbcs.fromUnicodeTable; 1109 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1110 /* get the bytes and the length for the output */ 1111 if(outputType==MBCS_OUTPUT_2){ 1112 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1113 if(myValue<=0xff) { 1114 length=1; 1115 } else { 1116 length=2; 1117 } 1118 } else /* outputType==MBCS_OUTPUT_3 */ { 1119 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1120 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1121 if(myValue<=0xff) { 1122 length=1; 1123 } else if(myValue<=0xffff) { 1124 length=2; 1125 } else { 1126 length=3; 1127 } 1128 } 1129 /* is this code point assigned, or do we use fallbacks? */ 1130 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1131 /* assigned */ 1132 *value=myValue; 1133 return length; 1134 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1135 /* 1136 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1137 * There is no way with this data structure for fallback output 1138 * to be a zero byte. 1139 */ 1140 *value=myValue; 1141 return -length; 1142 } 1143 } 1144 1145 cx=sharedData->mbcs.extIndexes; 1146 if(cx!=NULL) { 1147 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1148 } 1149 1150 /* unassigned */ 1151 return 0; 1152 } 1153 1154 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1155 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1156 * @param retval pointer to output byte 1157 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1158 */ 1159 static inline int32_t 1160 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1161 UChar32 c, 1162 uint32_t* retval, 1163 UBool useFallback) 1164 { 1165 const uint16_t *table; 1166 int32_t value; 1167 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1168 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1169 return 0; 1170 } 1171 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1172 table=sharedData->mbcs.fromUnicodeTable; 1173 /* get the byte for the output */ 1174 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1175 /* is this code point assigned, or do we use fallbacks? */ 1176 *retval=(uint32_t)(value&0xff); 1177 if(value>=0xf00) { 1178 return 1; /* roundtrip */ 1179 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1180 return -1; /* fallback taken */ 1181 } else { 1182 return 0; /* no mapping */ 1183 } 1184 } 1185 1186 /* 1187 * Check that the result is a 2-byte value with each byte in the range A1..FE 1188 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1189 * to move it to the ISO 2022 range 21..7E. 1190 * Return 0 if out of range. 1191 */ 1192 static inline uint32_t 1193 _2022FromGR94DBCS(uint32_t value) { 1194 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1195 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1196 ) { 1197 return value - 0x8080; /* shift down to 21..7e byte range */ 1198 } else { 1199 return 0; /* not valid for ISO 2022 */ 1200 } 1201 } 1202 1203 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1204 /* 1205 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1206 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point 1207 * unchanged. 1208 */ 1209 static inline uint32_t 1210 _2022ToGR94DBCS(uint32_t value) { 1211 uint32_t returnValue = value + 0x8080; 1212 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && 1213 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { 1214 return returnValue; 1215 } else { 1216 return value; 1217 } 1218 } 1219 #endif 1220 1221 #ifdef U_ENABLE_GENERIC_ISO_2022 1222 1223 /********************************************************************************** 1224 * ISO-2022 Converter 1225 * 1226 * 1227 */ 1228 1229 static void 1230 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 1231 UErrorCode* err){ 1232 const char* mySourceLimit, *realSourceLimit; 1233 const char* sourceStart; 1234 const UChar* myTargetStart; 1235 UConverter* saveThis; 1236 UConverterDataISO2022* myData; 1237 int8_t length; 1238 1239 saveThis = args->converter; 1240 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); 1241 1242 realSourceLimit = args->sourceLimit; 1243 while (args->source < realSourceLimit) { 1244 if(myData->key == 0) { /* are we in the middle of an escape sequence? 
*/ 1245 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1246 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); 1247 1248 if(args->source < mySourceLimit) { 1249 if(myData->currentConverter==NULL) { 1250 myData->currentConverter = ucnv_open("ASCII",err); 1251 if(U_FAILURE(*err)){ 1252 return; 1253 } 1254 1255 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 1256 saveThis->mode = UCNV_SO; 1257 } 1258 1259 /* convert to before the ESC or until the end of the buffer */ 1260 myData->isFirstBuffer=FALSE; 1261 sourceStart = args->source; 1262 myTargetStart = args->target; 1263 args->converter = myData->currentConverter; 1264 ucnv_toUnicode(args->converter, 1265 &args->target, 1266 args->targetLimit, 1267 &args->source, 1268 mySourceLimit, 1269 args->offsets, 1270 (UBool)(args->flush && mySourceLimit == realSourceLimit), 1271 err); 1272 args->converter = saveThis; 1273 1274 if (*err == U_BUFFER_OVERFLOW_ERROR) { 1275 /* move the overflow buffer */ 1276 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; 1277 myData->currentConverter->UCharErrorBufferLength = 0; 1278 if(length > 0) { 1279 uprv_memcpy(saveThis->UCharErrorBuffer, 1280 myData->currentConverter->UCharErrorBuffer, 1281 length*U_SIZEOF_UCHAR); 1282 } 1283 return; 1284 } 1285 1286 /* 1287 * At least one of: 1288 * -Error while converting 1289 * -Done with entire buffer 1290 * -Need to write offsets or update the current offset 1291 * (leave that up to the code in ucnv.c) 1292 * 1293 * or else we just stopped at an ESC byte and continue with changeState_2022() 1294 */ 1295 if (U_FAILURE(*err) || 1296 (args->source == realSourceLimit) || 1297 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || 1298 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) 1299 ) { 1300 /* copy partial or error input for truncated detection and error 
handling */ 1301 if(U_FAILURE(*err)) { 1302 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; 1303 if(length > 0) { 1304 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); 1305 } 1306 } else { 1307 length = saveThis->toULength = myData->currentConverter->toULength; 1308 if(length > 0) { 1309 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); 1310 if(args->source < mySourceLimit) { 1311 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ 1312 } 1313 } 1314 } 1315 return; 1316 } 1317 } 1318 } 1319 1320 sourceStart = args->source; 1321 changeState_2022(args->converter, 1322 &(args->source), 1323 realSourceLimit, 1324 ISO_2022, 1325 err); 1326 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { 1327 /* let the ucnv.c code update its current offset */ 1328 return; 1329 } 1330 } 1331 } 1332 1333 #endif 1334 1335 /* 1336 * To Unicode Callback helper function 1337 */ 1338 static void 1339 toUnicodeCallback(UConverter *cnv, 1340 const uint32_t sourceChar, const uint32_t targetUniChar, 1341 UErrorCode* err){ 1342 if(sourceChar>0xff){ 1343 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); 1344 cnv->toUBytes[1] = (uint8_t)sourceChar; 1345 cnv->toULength = 2; 1346 } 1347 else{ 1348 cnv->toUBytes[0] =(char) sourceChar; 1349 cnv->toULength = 1; 1350 } 1351 1352 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ 1353 *err = U_INVALID_CHAR_FOUND; 1354 } 1355 else{ 1356 *err = U_ILLEGAL_CHAR_FOUND; 1357 } 1358 } 1359 1360 /**************************************ISO-2022-JP*************************************************/ 1361 1362 /************************************** IMPORTANT ************************************************** 1363 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and 1364 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). 
* The converter iterates over each Unicode code point
* to obtain the equivalent code points from the supported codepages. Since the source buffer is
* processed one char at a time, it makes sense to reduce the extra processing that a canned converter
* would do as far as possible.
*
* If the implementation of these macros or the structure of the sharedData struct changes in the future,
* make sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-JP encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
1390 * (vi) Supported encodings: 1391 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 1392 * 1393 * source : RFC-1554 1394 * 1395 * JISX201, JISX208,JISX212 : new .cnv data files created 1396 * KSC5601 : alias to ibm-949 mapping table 1397 * GB2312 : alias to ibm-1386 mapping table 1398 * ISO-8859-1 : Algorithmic implemented as LATIN1 case 1399 * ISO-8859-7 : alisas to ibm-9409 mapping table 1400 */ 1401 1402 /* preference order of JP charsets */ 1403 static const StateEnum jpCharsetPref[]={ 1404 ASCII, 1405 JISX201, 1406 ISO8859_1, 1407 ISO8859_7, 1408 JISX208, 1409 JISX212, 1410 GB2312, 1411 KSC5601, 1412 HWKANA_7BIT 1413 }; 1414 1415 /* 1416 * The escape sequences must be in order of the enum constants like JISX201 = 3, 1417 * not in order of jpCharsetPref[]! 1418 */ 1419 static const char escSeqChars[][6] ={ 1420 "\x1B\x28\x42", /* <ESC>(B ASCII */ 1421 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ 1422 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ 1423 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ 1424 "\x1B\x24\x42", /* <ESC>$B JISX-208 */ 1425 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ 1426 "\x1B\x24\x41", /* <ESC>$A GB2312 */ 1427 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ 1428 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ 1429 1430 }; 1431 static const int8_t escSeqCharsLen[] ={ 1432 3, /* length of <ESC>(B ASCII */ 1433 3, /* length of <ESC>.A ISO-8859-1 */ 1434 3, /* length of <ESC>.F ISO-8859-7 */ 1435 3, /* length of <ESC>(J JISX-201 */ 1436 3, /* length of <ESC>$B JISX-208 */ 1437 4, /* length of <ESC>$(D JISX-212 */ 1438 3, /* length of <ESC>$A GB2312 */ 1439 4, /* length of <ESC>$(C KSC5601 */ 1440 3 /* length of <ESC>(I HWKANA_7BIT */ 1441 }; 1442 1443 /* 1444 * The iteration over various code pages works this way: 1445 * i) Get the currentState from myConverterData->currentState 1446 * ii) Check if the character is mapped to a valid character in the currentState 1447 * Yes -> a) set the initIterState to currentState 1448 * b) remain in this 
/*
*          state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state increment the current state check if the current state
*      is equal to the initIteration state
*      Yes ->  A character that cannot be represented in any of the supported encodings
*          break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/

/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}

/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are taken
 * by U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}

/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
/*
 * _2022FromSJIS(): convert a valid Shift-JIS byte pair (lead byte in bits
 * 15..8, trail byte in bits 7..0) to the corresponding pair of 21..7E bytes.
 * Values above 0xEFFC are beyond JIS X 0208 and yield 0.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two consecutive 21..7E lead byte values */
    value <<= 1;

    if(trail <= 0x9e) {
        /* trail bytes up to 9E belong to the first of the two lead byte values */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        value |= trail - 0x7e;
    }
    return value;
}

/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 * The result is written to bytes[0] (lead) and bytes[1] (trail);
 * a byte that is detected as invalid is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}

/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 */
1568 */ 1569 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1570 0x2123, /* U+FF61 */ 1571 0x2156, 1572 0x2157, 1573 0x2122, 1574 0x2126, 1575 0x2572, 1576 0x2521, 1577 0x2523, 1578 0x2525, 1579 0x2527, 1580 0x2529, 1581 0x2563, 1582 0x2565, 1583 0x2567, 1584 0x2543, 1585 0x213C, /* U+FF70 */ 1586 0x2522, 1587 0x2524, 1588 0x2526, 1589 0x2528, 1590 0x252A, 1591 0x252B, 1592 0x252D, 1593 0x252F, 1594 0x2531, 1595 0x2533, 1596 0x2535, 1597 0x2537, 1598 0x2539, 1599 0x253B, 1600 0x253D, 1601 0x253F, /* U+FF80 */ 1602 0x2541, 1603 0x2544, 1604 0x2546, 1605 0x2548, 1606 0x254A, 1607 0x254B, 1608 0x254C, 1609 0x254D, 1610 0x254E, 1611 0x254F, 1612 0x2552, 1613 0x2555, 1614 0x2558, 1615 0x255B, 1616 0x255E, 1617 0x255F, /* U+FF90 */ 1618 0x2560, 1619 0x2561, 1620 0x2562, 1621 0x2564, 1622 0x2566, 1623 0x2568, 1624 0x2569, 1625 0x256A, 1626 0x256B, 1627 0x256C, 1628 0x256D, 1629 0x256F, 1630 0x2573, 1631 0x212B, 1632 0x212C /* U+FF9F */ 1633 }; 1634 1635 static void 1636 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1637 UConverter *cnv = args->converter; 1638 UConverterDataISO2022 *converterData; 1639 ISO2022State *pFromU2022State; 1640 uint8_t *target = (uint8_t *) args->target; 1641 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1642 const UChar* source = args->source; 1643 const UChar* sourceLimit = args->sourceLimit; 1644 int32_t* offsets = args->offsets; 1645 UChar32 sourceChar; 1646 char buffer[8]; 1647 int32_t len, outLen; 1648 int8_t choices[10]; 1649 int32_t choiceCount; 1650 uint32_t targetValue = 0; 1651 UBool useFallback; 1652 1653 int32_t i; 1654 int8_t cs, g; 1655 1656 /* set up the state */ 1657 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1658 pFromU2022State = &converterData->fromU2022State; 1659 1660 choiceCount = 0; 1661 1662 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1663 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1664 goto getTrail; 1665 } 1666 1667 while(source < sourceLimit) { 1668 if(target < targetLimit) { 1669 1670 sourceChar = *(source++); 1671 /*check if the char is a First surrogate*/ 1672 if(U16_IS_SURROGATE(sourceChar)) { 1673 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1674 getTrail: 1675 /*look ahead to find the trail surrogate*/ 1676 if(source < sourceLimit) { 1677 /* test the following code unit */ 1678 UChar trail=(UChar) *source; 1679 if(U16_IS_TRAIL(trail)) { 1680 source++; 1681 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1682 cnv->fromUChar32=0x00; 1683 /* convert this supplementary code point */ 1684 /* exit this condition tree */ 1685 } else { 1686 /* this is an unmatched lead code unit (1st surrogate) */ 1687 /* callback(illegal) */ 1688 *err=U_ILLEGAL_CHAR_FOUND; 1689 cnv->fromUChar32=sourceChar; 1690 break; 1691 } 1692 } else { 1693 /* no more input */ 1694 cnv->fromUChar32=sourceChar; 1695 break; 1696 } 1697 } else { 1698 /* this is an unmatched trail code unit (2nd surrogate) */ 1699 /* callback(illegal) */ 1700 *err=U_ILLEGAL_CHAR_FOUND; 1701 cnv->fromUChar32=sourceChar; 1702 break; 1703 } 1704 } 1705 1706 /* do not convert SO/SI/ESC */ 1707 if(IS_2022_CONTROL(sourceChar)) { 1708 /* callback(illegal) */ 1709 *err=U_ILLEGAL_CHAR_FOUND; 1710 cnv->fromUChar32=sourceChar; 1711 break; 1712 } 1713 1714 /* do the conversion */ 1715 1716 if(choiceCount == 0) { 1717 uint16_t csm; 1718 1719 /* 1720 * The csm variable keeps track of which charsets are allowed 1721 * and not used yet while building the choices[]. 1722 */ 1723 csm = jpCharsetMasks[converterData->version]; 1724 choiceCount = 0; 1725 1726 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1727 if(converterData->version == 3 || converterData->version == 4) { 1728 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1729 } 1730 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1731 csm &= ~CSM(HWKANA_7BIT); 1732 1733 /* try the current G0 charset */ 1734 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1735 csm &= ~CSM(cs); 1736 1737 /* try the current G2 charset */ 1738 if((cs = pFromU2022State->cs[2]) != 0) { 1739 choices[choiceCount++] = cs; 1740 csm &= ~CSM(cs); 1741 } 1742 1743 /* try all the other possible charsets */ 1744 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1745 cs = (int8_t)jpCharsetPref[i]; 1746 if(CSM(cs) & csm) { 1747 choices[choiceCount++] = cs; 1748 csm &= ~CSM(cs); 1749 } 1750 } 1751 } 1752 1753 cs = g = 0; 1754 /* 1755 * len==0: no mapping found yet 1756 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1757 * len>0: found a roundtrip result, done 1758 */ 1759 len = 0; 1760 /* 1761 * We will turn off useFallback after finding a fallback, 1762 * but we still get fallbacks from PUA code points as usual. 1763 * Therefore, we will also need to check that we don't overwrite 1764 * an early fallback with a later one. 1765 */ 1766 useFallback = cnv->useFallback; 1767 1768 for(i = 0; i < choiceCount && len <= 0; ++i) { 1769 uint32_t value; 1770 int32_t len2; 1771 int8_t cs0 = choices[i]; 1772 switch(cs0) { 1773 case ASCII: 1774 if(sourceChar <= 0x7f) { 1775 targetValue = (uint32_t)sourceChar; 1776 len = 1; 1777 cs = cs0; 1778 g = 0; 1779 } 1780 break; 1781 case ISO8859_1: 1782 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1783 targetValue = (uint32_t)sourceChar - 0x80; 1784 len = 1; 1785 cs = cs0; 1786 g = 2; 1787 } 1788 break; 1789 case HWKANA_7BIT: 1790 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1791 if(converterData->version==3) { 1792 /* JIS7: use G1 (SO) */ 1793 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1794 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1795 len = 1; 1796 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1797 g = 1; 1798 } else if(converterData->version==4) { 1799 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1800 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1801 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1802 len = 1; 1803 1804 cs = pFromU2022State->cs[0]; 1805 if(IS_JP_DBCS(cs)) { 1806 /* switch from a DBCS charset to JISX201 */ 1807 cs = (int8_t)JISX201; 1808 } 1809 /* else stay in the current G0 charset */ 1810 g = 0; 1811 } 1812 /* else do not use HWKANA_7BIT with other versions */ 1813 } 1814 break; 1815 case JISX201: 1816 /* G0 SBCS */ 1817 value = jisx201FromU(sourceChar); 1818 if(value <= 0x7f) { 1819 targetValue = value; 1820 len = 1; 1821 cs = cs0; 1822 g = 0; 1823 useFallback = FALSE; 1824 } 1825 break; 1826 case JISX208: 1827 /* G0 DBCS from Shift-JIS table */ 1828 len2 = MBCS_FROM_UCHAR32_ISO2022( 1829 converterData->myConverterArray[cs0], 1830 sourceChar, &value, 1831 useFallback, MBCS_OUTPUT_2); 1832 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1833 value = _2022FromSJIS(value); 1834 if(value != 0) { 1835 targetValue = value; 1836 len = len2; 1837 cs = cs0; 1838 g = 0; 1839 useFallback = FALSE; 1840 } 1841 } else if(len == 0 && useFallback && 1842 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1843 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1844 len = -2; 1845 cs = cs0; 1846 g = 0; 1847 useFallback = FALSE; 1848 } 1849 break; 1850 case ISO8859_7: 1851 /* G0 SBCS forced to 7-bit output */ 1852 len2 = MBCS_SINGLE_FROM_UCHAR32( 1853 converterData->myConverterArray[cs0], 1854 sourceChar, &value, 1855 useFallback); 1856 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1857 targetValue = value - 0x80; 1858 len = len2; 1859 cs = 
cs0; 1860 g = 2; 1861 useFallback = FALSE; 1862 } 1863 break; 1864 default: 1865 /* G0 DBCS */ 1866 len2 = MBCS_FROM_UCHAR32_ISO2022( 1867 converterData->myConverterArray[cs0], 1868 sourceChar, &value, 1869 useFallback, MBCS_OUTPUT_2); 1870 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1871 if(cs0 == KSC5601) { 1872 /* 1873 * Check for valid bytes for the encoding scheme. 1874 * This is necessary because the sub-converter (windows-949) 1875 * has a broader encoding scheme than is valid for 2022. 1876 */ 1877 value = _2022FromGR94DBCS(value); 1878 if(value == 0) { 1879 break; 1880 } 1881 } 1882 targetValue = value; 1883 len = len2; 1884 cs = cs0; 1885 g = 0; 1886 useFallback = FALSE; 1887 } 1888 break; 1889 } 1890 } 1891 1892 if(len != 0) { 1893 if(len < 0) { 1894 len = -len; /* fallback */ 1895 } 1896 outLen = 0; /* count output bytes */ 1897 1898 /* write SI if necessary (only for JIS7) */ 1899 if(pFromU2022State->g == 1 && g == 0) { 1900 buffer[outLen++] = UCNV_SI; 1901 pFromU2022State->g = 0; 1902 } 1903 1904 /* write the designation sequence if necessary */ 1905 if(cs != pFromU2022State->cs[g]) { 1906 int32_t escLen = escSeqCharsLen[cs]; 1907 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1908 outLen += escLen; 1909 pFromU2022State->cs[g] = cs; 1910 1911 /* invalidate the choices[] */ 1912 choiceCount = 0; 1913 } 1914 1915 /* write the shift sequence if necessary */ 1916 if(g != pFromU2022State->g) { 1917 switch(g) { 1918 /* case 0 handled before writing escapes */ 1919 case 1: 1920 buffer[outLen++] = UCNV_SO; 1921 pFromU2022State->g = 1; 1922 break; 1923 default: /* case 2 */ 1924 buffer[outLen++] = 0x1b; 1925 buffer[outLen++] = 0x4e; 1926 break; 1927 /* no case 3: no SS3 in ISO-2022-JP-x */ 1928 } 1929 } 1930 1931 /* write the output bytes */ 1932 if(len == 1) { 1933 buffer[outLen++] = (char)targetValue; 1934 } else /* len == 2 */ { 1935 buffer[outLen++] = (char)(targetValue >> 8); 1936 buffer[outLen++] = 
(char)targetValue; 1937 } 1938 } else { 1939 /* 1940 * if we cannot find the character after checking all codepages 1941 * then this is an error 1942 */ 1943 *err = U_INVALID_CHAR_FOUND; 1944 cnv->fromUChar32=sourceChar; 1945 break; 1946 } 1947 1948 if(sourceChar == CR || sourceChar == LF) { 1949 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1950 pFromU2022State->cs[2] = 0; 1951 choiceCount = 0; 1952 } 1953 1954 /* output outLen>0 bytes in buffer[] */ 1955 if(outLen == 1) { 1956 *target++ = buffer[0]; 1957 if(offsets) { 1958 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1959 } 1960 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1961 *target++ = buffer[0]; 1962 *target++ = buffer[1]; 1963 if(offsets) { 1964 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1965 *offsets++ = sourceIndex; 1966 *offsets++ = sourceIndex; 1967 } 1968 } else { 1969 fromUWriteUInt8( 1970 cnv, 1971 buffer, outLen, 1972 &target, (const char *)targetLimit, 1973 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1974 err); 1975 if(U_FAILURE(*err)) { 1976 break; 1977 } 1978 } 1979 } /* end if(myTargetIndex<myTargetLength) */ 1980 else{ 1981 *err =U_BUFFER_OVERFLOW_ERROR; 1982 break; 1983 } 1984 1985 }/* end while(mySourceIndex<mySourceLength) */ 1986 1987 /* 1988 * the end of the input stream and detection of truncated input 1989 * are handled by the framework, but for ISO-2022-JP conversion 1990 * we need to be in ASCII mode at the very end 1991 * 1992 * conditions: 1993 * successful 1994 * in SO mode or not in ASCII mode 1995 * end of input and no truncated input 1996 */ 1997 if( U_SUCCESS(*err) && 1998 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 1999 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2000 ) { 2001 int32_t sourceIndex; 2002 2003 outLen = 0; 2004 2005 if(pFromU2022State->g != 0) { 2006 buffer[outLen++] = UCNV_SI; 
/* (tail of the ISO-2022-JP fromUnicode end-of-stream reset: SI has just been queued, so G0 is invoked again) */
            pFromU2022State->g = 0;
        }

        /* queue the ASCII designation if G0 currently holds some other charset */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more when the last unit is the trail half of a pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* nothing was consumed from this buffer */
            sourceIndex=-1;
        }

        /* emit the queued SI and/or ASCII designation, all attributed to sourceIndex */
        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}

/*************** to unicode *******************/

/*
 * Converts ISO-2022-JP family bytes from args->source into UTF-16 code units
 * at args->target, recording per-unit source offsets when args->offsets is set.
 *
 * Resumable state kept in the converter between calls:
 *  - myData->key != 0: an escape sequence was cut off by the end of the
 *    previous buffer; processing re-enters at the "escape" label.
 *  - toULength == 1: a double-byte lead byte is parked in toUBytes[0];
 *    processing re-enters at "getTrailByte" (only when at least one input
 *    byte and one target unit are available).
 *
 * Byte handling visible below:
 *  - SI/SO switch between G0 and G1 only for version 3 (JIS7); in other
 *    versions they fall out of the switch with no mapping and end up in the
 *    callback path.
 *  - ESC is handed to changeState_2022(); for version 0 a designation that
 *    closes a segment with no characters in it is reported as
 *    U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR.
 *  - CR/LF reset G0 to ASCII (unless it is ASCII or JISX201 already), clear
 *    the G2 designation and select G0 before being converted themselves.
 *  - For version 4 (JIS8), bytes A1..DF map directly to halfwidth katakana
 *    while the current charset is not a JP DBCS charset.
 *
 * On return args->source and args->target are advanced. *err is set to
 * U_BUFFER_OVERFLOW_ERROR when input remains but the target is full, or to
 * whatever changeState_2022()/toUnicodeCallback() report.
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    /* remembered so the length of an irregular (empty-segment) sequence can be reported */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                /* JIS X 0208 is looked up through a Shift-JIS table, so re-encode the pair first */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        /* else: only the lead byte is consumed; targetUniChar stays at missingCharMarker */
                    } else {
                        /* lead byte is the last byte of this buffer: park it for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    /* the offset points at the first byte of the 1- or 2-byte input character */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: keep it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}


/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
*      and SI to shift into single byte mode
*/
/*
 * ISO-2022-KR fromUnicode for the IBM variant: runs the MBCS implementation on
 * myConverterData->currentConverter in place of the public converter, carrying
 * the pending fromUChar32 into the subconverter before the call and back out
 * afterwards, and moving any overflow bytes to the public converter.
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){

    UConverter* saveConv = args->converter;
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
    args->converter=myConverterData->currentConverter;

    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;

    if(*err == U_BUFFER_OVERFLOW_ERROR) {
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
            uprv_memcpy(
                saveConv->charErrorBuffer,
                myConverterData->currentConverter->charErrorBuffer,
                myConverterData->currentConverter->charErrorBufferLength);
        }
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
/* (tail of the KR IBM fromUnicode wrapper: the public converter now holds the overflow length, so clear the subconverter's) */
        myConverterData->currentConverter->charErrorBufferLength = 0;
    }
    /* put the public converter back into the caller's args */
    args->converter=saveConv;
}

/*
 * Converts UTF-16 from args->source to ISO-2022-KR bytes at args->target,
 * recording per-byte source offsets when args->offsets is set.
 *
 * - Version 1 is delegated entirely to the _IBM variant.
 * - converter->fromUnicodeStatus carries the "currently in double-byte mode"
 *   flag between calls; it is read on entry and written back on exit.
 * - converter->fromUChar32 != 0 on entry means a lead surrogate is pending;
 *   processing re-enters at "getTrail" when the target has room.
 * - SO/SI/ESC in the input are rejected with U_ILLEGAL_CHAR_FOUND.
 * - A mapping is accepted only if it is one byte <= 0x7f, or two bytes that
 *   both lie in A1..FE; double bytes are written with 0x80 subtracted from
 *   each byte, preceded by SO/SI whenever the byte width changes.
 * - With args->flush at the end of input and no pending surrogate, a final
 *   SI is written if the stream is still in double-byte mode.
 *
 * *err is set to U_BUFFER_OVERFLOW_ERROR (surplus bytes go to
 * charErrorBuffer), U_INVALID_CHAR_FOUND (no acceptable mapping) or
 * U_ILLEGAL_CHAR_FOUND (control code or unpaired surrogate).
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): this second read of fromUnicodeStatus repeats the assignment two lines up */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* resume with a lead surrogate left over from the previous buffer */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            /* two-byte results must be in A1A1..FEFE with the low byte in A1..FE as well */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* room for this one byte is guaranteed by the target check at the top of the loop */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetUniChar  to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double byte: each byte is written with 0x80 subtracted (7-bit form) */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* a completed supplementary code point is always reported as unmappable here */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            /* the lead surrogate is stored in fromUChar32 below, with no error raised */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* write the closing SI, attributed to the last input character */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}

/************************ To Unicode ***************************************/

/*
 * ISO-2022-KR toUnicode for the IBM variant: splits the input at the position
 * returned by getEndOfBuffer_2022(), converts each piece with the MBCS
 * implementation on myData->currentConverter, and hands whatever follows to
 * changeState_2022(). Partial-input bytes and overflow units are copied
 * between the public converter and the subconverter around each MBCS call.
 */
static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
                                                            UErrorCode* err){
    char const* sourceStart;
    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);

    UConverterToUnicodeArgs subArgs;
    int32_t minArgsSize;

    /* set up the subconverter arguments */
    /* copy no more than the smaller of the caller's struct size and ours */
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
        minArgsSize = args->size;
    } else {
        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
    }

    uprv_memcpy(&subArgs, args, minArgsSize);
    subArgs.size = (uint16_t)minArgsSize;
    subArgs.converter = myData->currentConverter;

    /* remember the original start of the input for offsets */
    sourceStart = args->source;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    }

    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
        subArgs.source = args->source;
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
        if(subArgs.source != subArgs.sourceLimit) {
            /*
             * get the current partial byte sequence
             *
             * it needs to be moved between the public and the subconverter
             * so that the conversion framework, which only sees the public
             * converter, can handle truncated and illegal input etc.
             */
            if(args->converter->toULength > 0) {
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
            }
            subArgs.converter->toULength = args->converter->toULength;

            /*
             * Convert up to the end of the input, or to before the next escape character.
             * Does not handle conversion extensions because the preToU[] state etc.
             * is not copied.
2596 */ 2597 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2598 2599 if(args->offsets != NULL && sourceStart != args->source) { 2600 /* update offsets to base them on the actual start of the input */ 2601 int32_t *offsets = args->offsets; 2602 UChar *target = args->target; 2603 int32_t delta = (int32_t)(args->source - sourceStart); 2604 while(target < subArgs.target) { 2605 if(*offsets >= 0) { 2606 *offsets += delta; 2607 } 2608 ++offsets; 2609 ++target; 2610 } 2611 } 2612 args->source = subArgs.source; 2613 args->target = subArgs.target; 2614 args->offsets = subArgs.offsets; 2615 2616 /* copy input/error/overflow buffers */ 2617 if(subArgs.converter->toULength > 0) { 2618 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2619 } 2620 args->converter->toULength = subArgs.converter->toULength; 2621 2622 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2623 if(subArgs.converter->UCharErrorBufferLength > 0) { 2624 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2625 subArgs.converter->UCharErrorBufferLength); 2626 } 2627 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2628 subArgs.converter->UCharErrorBufferLength = 0; 2629 } 2630 } 2631 2632 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2633 return; 2634 } 2635 2636 escape: 2637 changeState_2022(args->converter, 2638 &(args->source), 2639 args->sourceLimit, 2640 ISO_2022_KR, 2641 err); 2642 } 2643 } 2644 2645 static void 2646 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2647 UErrorCode* err){ 2648 char tempBuf[2]; 2649 const char *mySource = ( char *) args->source; 2650 UChar *myTarget = args->target; 2651 const char *mySourceLimit = args->sourceLimit; 2652 UChar32 targetUniChar = 0x0000; 2653 UChar mySourceChar = 0x0000; 2654 UConverterDataISO2022* myData; 2655 UConverterSharedData* sharedData ; 2656 UBool useFallback; 2657 2658 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2659 if(myData->version==1){ 2660 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2661 return; 2662 } 2663 2664 /* initialize state */ 2665 sharedData = myData->currentConverter->sharedData; 2666 useFallback = args->converter->useFallback; 2667 2668 if(myData->key != 0) { 2669 /* continue with a partial escape sequence */ 2670 goto escape; 2671 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2672 /* continue with a partial double-byte character */ 2673 mySourceChar = args->converter->toUBytes[0]; 2674 args->converter->toULength = 0; 2675 goto getTrailByte; 2676 } 2677 2678 while(mySource< mySourceLimit){ 2679 2680 if(myTarget < args->targetLimit){ 2681 2682 mySourceChar= (unsigned char) *mySource++; 2683 2684 if(mySourceChar==UCNV_SI){ 2685 myData->toU2022State.g = 0; 2686 if (myData->isEmptySegment) { 2687 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2688 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2689 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2690 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2691 args->converter->toULength = 1; 2692 args->target = myTarget; 2693 args->source = mySource; 2694 return; 2695 } 2696 /*consume the source */ 2697 continue; 2698 }else if(mySourceChar==UCNV_SO){ 2699 myData->toU2022State.g = 1; 2700 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2701 /*consume the source */ 2702 continue; 2703 }else if(mySourceChar==ESC_2022){ 2704 mySource--; 2705 escape: 2706 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2707 changeState_2022(args->converter,&(mySource), 2708 mySourceLimit, ISO_2022_KR, err); 2709 if(U_FAILURE(*err)){ 2710 args->target = myTarget; 2711 args->source = mySource; 2712 return; 2713 } 2714 continue; 2715 } 2716 2717 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2718 if(myData->toU2022State.g == 1) { 2719 if(mySource < mySourceLimit) { 2720 int leadIsOk, trailIsOk; 2721 uint8_t trailByte; 2722 getTrailByte: 2723 targetUniChar = missingCharMarker; 2724 trailByte = (uint8_t)*mySource; 2725 /* 2726 * Ticket 5691: consistent illegal sequences: 2727 * - We include at least the first byte in the illegal sequence. 2728 * - If any of the non-initial bytes could be the start of a character, 2729 * we stop the illegal sequence before the first one of those. 2730 * 2731 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2732 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2733 * Otherwise we convert or report the pair of bytes. 2734 */ 2735 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2736 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2737 if (leadIsOk && trailIsOk) { 2738 ++mySource; 2739 tempBuf[0] = (char)(mySourceChar + 0x80); 2740 tempBuf[1] = (char)(trailByte + 0x80); 2741 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2742 mySourceChar = (mySourceChar << 8) | trailByte; 2743 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2744 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2745 ++mySource; 2746 /* add another bit so that the code below writes 2 bytes in case of error */ 2747 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2748 } 2749 } else { 2750 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2751 args->converter->toULength = 1; 2752 break; 2753 } 2754 } 2755 else if(mySourceChar <= 0x7f) { 2756 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2757 } else { 2758 targetUniChar = 0xffff; 2759 } 2760 if(targetUniChar < 0xfffe){ 2761 if(args->offsets) { 2762 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2763 } 2764 *(myTarget++)=(UChar)targetUniChar; 2765 } 2766 else { 2767 /* Call the callback function*/ 2768 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2769 break; 2770 } 2771 } 2772 else{ 2773 *err =U_BUFFER_OVERFLOW_ERROR; 2774 break; 2775 } 2776 } 2777 args->target = myTarget; 2778 args->source = mySource; 2779 } 2780 2781 /*************************** END ISO2022-KR *********************************/ 2782 2783 /*************************** ISO-2022-CN ********************************* 2784 * 2785 * Rules for ISO-2022-CN Encoding: 2786 * i) The designator sequence must appear once on a line before any instance 2787 * of character set it designates. 2788 * ii) If two lines contain characters from the same character set, both lines 2789 * must include the designator sequence. 2790 * iii) Once the designator sequence is known, a shifting sequence has to be found 2791 * to invoke the shifting 2792 * iv) All lines start in ASCII and end in ASCII. 2793 * v) Four shifting sequences are employed for this purpose: 2794 * 2795 * Sequcence ASCII Eq Charsets 2796 * ---------- ------- --------- 2797 * SI <SI> US-ASCII 2798 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2799 * SS2 <ESC>N CNS-11643-1992 Plane 2 2800 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2801 * 2802 * vi) 2803 * SOdesignator : ESC "$" ")" finalchar_for_SO 2804 * SS2designator : ESC "$" "*" finalchar_for_SS2 2805 * SS3designator : ESC "$" "+" finalchar_for_SS3 2806 * 2807 * ESC $ ) A Indicates the bytes following SO are Chinese 2808 * characters as defined in GB 2312-80, until 2809 * another SOdesignation appears 2810 * 2811 * 2812 * ESC $ ) E Indicates the bytes following SO are as defined 2813 * in ISO-IR-165 (for details, see section 2.1), 2814 * until another SOdesignation appears 2815 * 2816 * ESC $ ) G Indicates the bytes following SO are as defined 2817 * in CNS 11643-plane-1, until another 2818 * SOdesignation appears 2819 * 2820 * ESC $ * H Indicates the two 
bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must preceed every 2 byte
*       sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must preceed every 2 byte
*       sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must preceed every 2 byte
*       sequence.)
*
*      ESC $ + K       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-5, until another SS3designation
*       appears
*
*      ESC $ + L       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-6, until another SS3designation
*       appears
*
*      ESC $ + M       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-7, until another SS3designation
*       appears
*
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
*       has its own designation information before any Chinese characters
*       appear
*
*/

/* The following are defined this way to make the strings truly readonly */
/* Each string is ESC (1B) '$' (24), then ')' (29), '*' (2A) or '+' (2B), then the final byte. */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Byte sequences that select each ISO-2022-CN sub-charset. Entry 0 is the
 * one-byte SI string rather than an escape sequence; entries 1..9 are the
 * four-byte designations defined above.
 * NOTE(review): presumably indexed by the CN charset state values (the
 * existing per-entry comments name ASCII, GB2312_1 and ISO_IR_165 for 0..2);
 * confirm the enum order for entries 3..9.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,
        CNS_11643_1992_Plane_2_STR,
        CNS_11643_1992_Plane_3_STR,
        CNS_11643_1992_Plane_4_STR,
        CNS_11643_1992_Plane_5_STR,
        CNS_11643_1992_Plane_6_STR,
        CNS_11643_1992_Plane_7_STR
};

static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];
    int32_t len;
    int8_t choices[3];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    /* set up the state */
    converterData = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while( source < sourceLimit){
        if(target < targetLimit){

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
             if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail
surrogate*/ 2925 if(source < sourceLimit) { 2926 /* test the following code unit */ 2927 UChar trail=(UChar) *source; 2928 if(U16_IS_TRAIL(trail)) { 2929 source++; 2930 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2931 cnv->fromUChar32=0x00; 2932 /* convert this supplementary code point */ 2933 /* exit this condition tree */ 2934 } else { 2935 /* this is an unmatched lead code unit (1st surrogate) */ 2936 /* callback(illegal) */ 2937 *err=U_ILLEGAL_CHAR_FOUND; 2938 cnv->fromUChar32=sourceChar; 2939 break; 2940 } 2941 } else { 2942 /* no more input */ 2943 cnv->fromUChar32=sourceChar; 2944 break; 2945 } 2946 } else { 2947 /* this is an unmatched trail code unit (2nd surrogate) */ 2948 /* callback(illegal) */ 2949 *err=U_ILLEGAL_CHAR_FOUND; 2950 cnv->fromUChar32=sourceChar; 2951 break; 2952 } 2953 } 2954 2955 /* do the conversion */ 2956 if(sourceChar <= 0x007f ){ 2957 /* do not convert SO/SI/ESC */ 2958 if(IS_2022_CONTROL(sourceChar)) { 2959 /* callback(illegal) */ 2960 *err=U_ILLEGAL_CHAR_FOUND; 2961 cnv->fromUChar32=sourceChar; 2962 break; 2963 } 2964 2965 /* US-ASCII */ 2966 if(pFromU2022State->g == 0) { 2967 buffer[0] = (char)sourceChar; 2968 len = 1; 2969 } else { 2970 buffer[0] = UCNV_SI; 2971 buffer[1] = (char)sourceChar; 2972 len = 2; 2973 pFromU2022State->g = 0; 2974 choiceCount = 0; 2975 } 2976 if(sourceChar == CR || sourceChar == LF) { 2977 /* reset the state at the end of a line */ 2978 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2979 choiceCount = 0; 2980 } 2981 } 2982 else{ 2983 /* convert U+0080..U+10ffff */ 2984 int32_t i; 2985 int8_t cs, g; 2986 2987 if(choiceCount == 0) { 2988 /* try the current SO/G1 converter first */ 2989 choices[0] = pFromU2022State->cs[1]; 2990 2991 /* default to GB2312_1 if none is designated yet */ 2992 if(choices[0] == 0) { 2993 choices[0] = GB2312_1; 2994 } 2995 2996 if(converterData->version == 0) { 2997 /* ISO-2022-CN */ 2998 2999 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3000 if(choices[0] == GB2312_1) { 3001 choices[1] = (int8_t)CNS_11643_1; 3002 } else { 3003 choices[1] = (int8_t)GB2312_1; 3004 } 3005 3006 choiceCount = 2; 3007 } else if (converterData->version == 1) { 3008 /* ISO-2022-CN-EXT */ 3009 3010 /* try one of the other converters */ 3011 switch(choices[0]) { 3012 case GB2312_1: 3013 choices[1] = (int8_t)CNS_11643_1; 3014 choices[2] = (int8_t)ISO_IR_165; 3015 break; 3016 case ISO_IR_165: 3017 choices[1] = (int8_t)GB2312_1; 3018 choices[2] = (int8_t)CNS_11643_1; 3019 break; 3020 default: /* CNS_11643_x */ 3021 choices[1] = (int8_t)GB2312_1; 3022 choices[2] = (int8_t)ISO_IR_165; 3023 break; 3024 } 3025 3026 choiceCount = 3; 3027 } else { 3028 choices[0] = (int8_t)CNS_11643_1; 3029 choices[1] = (int8_t)GB2312_1; 3030 } 3031 } 3032 3033 cs = g = 0; 3034 /* 3035 * len==0: no mapping found yet 3036 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3037 * len>0: found a roundtrip result, done 3038 */ 3039 len = 0; 3040 /* 3041 * We will turn off useFallback after finding a fallback, 3042 * but we still get fallbacks from PUA code points as usual. 3043 * Therefore, we will also need to check that we don't overwrite 3044 * an early fallback with a later one. 
3045 */ 3046 useFallback = cnv->useFallback; 3047 3048 for(i = 0; i < choiceCount && len <= 0; ++i) { 3049 int8_t cs0 = choices[i]; 3050 if(cs0 > 0) { 3051 uint32_t value; 3052 int32_t len2; 3053 if(cs0 >= CNS_11643_0) { 3054 len2 = MBCS_FROM_UCHAR32_ISO2022( 3055 converterData->myConverterArray[CNS_11643], 3056 sourceChar, 3057 &value, 3058 useFallback, 3059 MBCS_OUTPUT_3); 3060 if(len2 == 3 || (len2 == -3 && len == 0)) { 3061 targetValue = value; 3062 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3063 if(len2 >= 0) { 3064 len = 2; 3065 } else { 3066 len = -2; 3067 useFallback = FALSE; 3068 } 3069 if(cs == CNS_11643_1) { 3070 g = 1; 3071 } else if(cs == CNS_11643_2) { 3072 g = 2; 3073 } else /* plane 3..7 */ if(converterData->version == 1) { 3074 g = 3; 3075 } else { 3076 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3077 len = 0; 3078 } 3079 } 3080 } else { 3081 /* GB2312_1 or ISO-IR-165 */ 3082 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3083 len2 = MBCS_FROM_UCHAR32_ISO2022( 3084 converterData->myConverterArray[cs0], 3085 sourceChar, 3086 &value, 3087 useFallback, 3088 MBCS_OUTPUT_2); 3089 if(len2 == 2 || (len2 == -2 && len == 0)) { 3090 targetValue = value; 3091 len = len2; 3092 cs = cs0; 3093 g = 1; 3094 useFallback = FALSE; 3095 } 3096 } 3097 } 3098 } 3099 3100 if(len != 0) { 3101 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3102 3103 /* write the designation sequence if necessary */ 3104 if(cs != pFromU2022State->cs[g]) { 3105 if(cs < CNS_11643) { 3106 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3107 } else { 3108 U_ASSERT(cs >= CNS_11643_1); 3109 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3110 } 3111 len = 4; 3112 pFromU2022State->cs[g] = cs; 3113 if(g == 1) { 3114 /* changing the SO/G1 charset invalidates the choices[] */ 3115 choiceCount = 0; 3116 } 3117 } 3118 3119 /* write the shift sequence if necessary */ 3120 if(g != pFromU2022State->g) { 3121 switch(g) { 3122 case 1: 3123 
buffer[len++] = UCNV_SO; 3124 3125 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3126 pFromU2022State->g = 1; 3127 break; 3128 case 2: 3129 buffer[len++] = 0x1b; 3130 buffer[len++] = 0x4e; 3131 break; 3132 default: /* case 3 */ 3133 buffer[len++] = 0x1b; 3134 buffer[len++] = 0x4f; 3135 break; 3136 } 3137 } 3138 3139 /* write the two output bytes */ 3140 buffer[len++] = (char)(targetValue >> 8); 3141 buffer[len++] = (char)targetValue; 3142 } else { 3143 /* if we cannot find the character after checking all codepages 3144 * then this is an error 3145 */ 3146 *err = U_INVALID_CHAR_FOUND; 3147 cnv->fromUChar32=sourceChar; 3148 break; 3149 } 3150 } 3151 3152 /* output len>0 bytes in buffer[] */ 3153 if(len == 1) { 3154 *target++ = buffer[0]; 3155 if(offsets) { 3156 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3157 } 3158 } else if(len == 2 && (target + 2) <= targetLimit) { 3159 *target++ = buffer[0]; 3160 *target++ = buffer[1]; 3161 if(offsets) { 3162 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3163 *offsets++ = sourceIndex; 3164 *offsets++ = sourceIndex; 3165 } 3166 } else { 3167 fromUWriteUInt8( 3168 cnv, 3169 buffer, len, 3170 &target, (const char *)targetLimit, 3171 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3172 err); 3173 if(U_FAILURE(*err)) { 3174 break; 3175 } 3176 } 3177 } /* end if(myTargetIndex<myTargetLength) */ 3178 else{ 3179 *err =U_BUFFER_OVERFLOW_ERROR; 3180 break; 3181 } 3182 3183 }/* end while(mySourceIndex<mySourceLength) */ 3184 3185 /* 3186 * the end of the input stream and detection of truncated input 3187 * are handled by the framework, but for ISO-2022-CN conversion 3188 * we need to be in ASCII mode at the very end 3189 * 3190 * conditions: 3191 * successful 3192 * not in ASCII mode 3193 * end of input and no truncated input 3194 */ 3195 if( U_SUCCESS(*err) && 3196 pFromU2022State->g!=0 && 3197 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3198 ) { 3199 int32_t sourceIndex; 3200 3201 /* we are switching to ASCII */ 3202 pFromU2022State->g=0; 3203 3204 /* get the source index of the last input character */ 3205 /* 3206 * TODO this would be simpler and more reliable if we used a pair 3207 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3208 * so that we could simply use the prevSourceIndex here; 3209 * this code gives an incorrect result for the rare case of an unmatched 3210 * trail surrogate that is alone in the last buffer of the text stream 3211 */ 3212 sourceIndex=(int32_t)(source-args->source); 3213 if(sourceIndex>0) { 3214 --sourceIndex; 3215 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3216 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3217 ) { 3218 --sourceIndex; 3219 } 3220 } else { 3221 sourceIndex=-1; 3222 } 3223 3224 fromUWriteUInt8( 3225 cnv, 3226 SHIFT_IN_STR, 1, 3227 &target, (const char *)targetLimit, 3228 &offsets, sourceIndex, 3229 err); 3230 } 3231 3232 /*save the state and return */ 3233 args->source = source; 3234 args->target = (char*)target; 3235 } 3236 3237 3238 static void 3239 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3240 UErrorCode* err){ 3241 char tempBuf[3]; 3242 const char *mySource = (char *) args->source; 3243 UChar *myTarget = args->target; 3244 const char *mySourceLimit = args->sourceLimit; 3245 uint32_t targetUniChar = 0x0000; 3246 uint32_t mySourceChar = 0x0000; 3247 UConverterDataISO2022* myData; 3248 ISO2022State *pToU2022State; 3249 3250 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3251 pToU2022State = &myData->toU2022State; 3252 3253 if(myData->key != 0) { 3254 /* continue with a partial escape sequence */ 3255 goto escape; 3256 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3257 /* continue with a partial double-byte character */ 3258 mySourceChar = args->converter->toUBytes[0]; 3259 
args->converter->toULength = 0; 3260 targetUniChar = missingCharMarker; 3261 goto getTrailByte; 3262 } 3263 3264 while(mySource < mySourceLimit){ 3265 3266 targetUniChar =missingCharMarker; 3267 3268 if(myTarget < args->targetLimit){ 3269 3270 mySourceChar= (unsigned char) *mySource++; 3271 3272 switch(mySourceChar){ 3273 case UCNV_SI: 3274 pToU2022State->g=0; 3275 if (myData->isEmptySegment) { 3276 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3277 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3278 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3279 args->converter->toUBytes[0] = mySourceChar; 3280 args->converter->toULength = 1; 3281 args->target = myTarget; 3282 args->source = mySource; 3283 return; 3284 } 3285 continue; 3286 3287 case UCNV_SO: 3288 if(pToU2022State->cs[1] != 0) { 3289 pToU2022State->g=1; 3290 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3291 continue; 3292 } else { 3293 /* illegal to have SO before a matching designator */ 3294 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3295 break; 3296 } 3297 3298 case ESC_2022: 3299 mySource--; 3300 escape: 3301 { 3302 const char * mySourceBefore = mySource; 3303 int8_t toULengthBefore = args->converter->toULength; 3304 3305 changeState_2022(args->converter,&(mySource), 3306 mySourceLimit, ISO_2022_CN,err); 3307 3308 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3309 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3310 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3311 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3312 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3313 } 3314 } 3315 3316 /* invalid or illegal escape sequence */ 3317 if(U_FAILURE(*err)){ 3318 args->target = myTarget; 3319 args->source = mySource; 3320 myData->isEmptySegment = FALSE; /* Reset to 
avoid future spurious errors */ 3321 return; 3322 } 3323 continue; 3324 3325 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3326 3327 case CR: 3328 /*falls through*/ 3329 case LF: 3330 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3331 /* falls through */ 3332 default: 3333 /* convert one or two bytes */ 3334 myData->isEmptySegment = FALSE; 3335 if(pToU2022State->g != 0) { 3336 if(mySource < mySourceLimit) { 3337 UConverterSharedData *cnv; 3338 StateEnum tempState; 3339 int32_t tempBufLen; 3340 int leadIsOk, trailIsOk; 3341 uint8_t trailByte; 3342 getTrailByte: 3343 trailByte = (uint8_t)*mySource; 3344 /* 3345 * Ticket 5691: consistent illegal sequences: 3346 * - We include at least the first byte in the illegal sequence. 3347 * - If any of the non-initial bytes could be the start of a character, 3348 * we stop the illegal sequence before the first one of those. 3349 * 3350 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3351 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3352 * Otherwise we convert or report the pair of bytes. 
3353 */ 3354 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3355 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3356 if (leadIsOk && trailIsOk) { 3357 ++mySource; 3358 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3359 if(tempState >= CNS_11643_0) { 3360 cnv = myData->myConverterArray[CNS_11643]; 3361 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3362 tempBuf[1] = (char) (mySourceChar); 3363 tempBuf[2] = (char) trailByte; 3364 tempBufLen = 3; 3365 3366 }else{ 3367 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3368 cnv = myData->myConverterArray[tempState]; 3369 tempBuf[0] = (char) (mySourceChar); 3370 tempBuf[1] = (char) trailByte; 3371 tempBufLen = 2; 3372 } 3373 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3374 mySourceChar = (mySourceChar << 8) | trailByte; 3375 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3376 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3377 ++mySource; 3378 /* add another bit so that the code below writes 2 bytes in case of error */ 3379 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3380 } 3381 if(pToU2022State->g>=2) { 3382 /* return from a single-shift state to the previous one */ 3383 pToU2022State->g=pToU2022State->prevG; 3384 } 3385 } else { 3386 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3387 args->converter->toULength = 1; 3388 goto endloop; 3389 } 3390 } 3391 else{ 3392 if(mySourceChar <= 0x7f) { 3393 targetUniChar = (UChar) mySourceChar; 3394 } 3395 } 3396 break; 3397 } 3398 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3399 if(args->offsets){ 3400 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3401 } 3402 *(myTarget++)=(UChar)targetUniChar; 3403 } 3404 else if(targetUniChar > missingCharMarker){ 3405 /* disassemble the surrogate pair and write to output*/ 3406 targetUniChar-=0x0010000; 3407 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3408 if(args->offsets){ 3409 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3410 } 3411 ++myTarget; 3412 if(myTarget< args->targetLimit){ 3413 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3414 if(args->offsets){ 3415 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3416 } 3417 ++myTarget; 3418 }else{ 3419 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3420 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3421 } 3422 3423 } 3424 else{ 3425 /* Call the callback function*/ 3426 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3427 break; 3428 } 3429 } 3430 else{ 3431 *err =U_BUFFER_OVERFLOW_ERROR; 3432 break; 3433 } 3434 } 3435 endloop: 3436 args->target = myTarget; 3437 args->source = mySource; 3438 } 3439 3440 static void 3441 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3442 UConverter *cnv = args->converter; 3443 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3444 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3445 char *p, *subchar; 3446 char buffer[8]; 3447 int32_t length; 3448 3449 subchar=(char *)cnv->subChars; 3450 length=cnv->subCharLen; /* assume length==1 for most variants */ 3451 3452 p = buffer; 3453 switch(myConverterData->locale[0]){ 3454 case 'j': 3455 { 3456 int8_t cs; 3457 3458 if(pFromU2022State->g == 1) { 3459 /* JIS7: switch from G1 to G0 */ 3460 pFromU2022State->g = 0; 3461 *p++ = UCNV_SI; 3462 } 3463 3464 cs = pFromU2022State->cs[0]; 3465 if(cs != ASCII && cs != JISX201) { 3466 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3467 pFromU2022State->cs[0] = (int8_t)ASCII; 3468 *p++ = '\x1b'; 3469 *p++ = '\x28'; 3470 *p++ = '\x42'; 3471 } 3472 3473 *p++ = subchar[0]; 3474 break; 3475 } 3476 case 'c': 3477 if(pFromU2022State->g != 0) { 3478 /* not in ASCII mode: switch to ASCII */ 3479 pFromU2022State->g = 0; 3480 *p++ = UCNV_SI; 3481 } 3482 *p++ = subchar[0]; 3483 break; 3484 case 'k': 3485 if(myConverterData->version == 0) { 3486 if(length == 1) { 3487 if((UBool)args->converter->fromUnicodeStatus) { 3488 /* in DBCS mode: switch to SBCS */ 3489 args->converter->fromUnicodeStatus = 0; 3490 *p++ = UCNV_SI; 3491 } 3492 *p++ = subchar[0]; 3493 } else /* length == 2*/ { 3494 if(!(UBool)args->converter->fromUnicodeStatus) { 3495 /* in SBCS mode: switch to DBCS */ 3496 args->converter->fromUnicodeStatus = 1; 3497 *p++ = UCNV_SO; 3498 } 3499 *p++ = subchar[0]; 3500 *p++ = subchar[1]; 3501 } 3502 break; 3503 } else { 3504 /* save the subconverter's substitution string */ 3505 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3506 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3507 3508 /* set our substitution string into the subconverter */ 3509 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3510 myConverterData->currentConverter->subCharLen = (int8_t)length; 3511 3512 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3513 args->converter = myConverterData->currentConverter; 3514 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3515 ucnv_cbFromUWriteSub(args, 0, err); 3516 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3517 args->converter = cnv; 3518 3519 /* restore the subconverter's substitution string */ 3520 myConverterData->currentConverter->subChars = currentSubChars; 3521 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3522 3523 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3524 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3525 uprv_memcpy( 3526 cnv->charErrorBuffer, 3527 myConverterData->currentConverter->charErrorBuffer, 3528 myConverterData->currentConverter->charErrorBufferLength); 3529 } 3530 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3531 myConverterData->currentConverter->charErrorBufferLength = 0; 3532 } 3533 return; 3534 } 3535 default: 3536 /* not expected */ 3537 break; 3538 } 3539 ucnv_cbFromUWriteBytes(args, 3540 buffer, (int32_t)(p - buffer), 3541 offsetIndex, err); 3542 } 3543 3544 /* 3545 * Structure for cloning an ISO 2022 converter into a single memory block. 3546 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3547 * and then ucnv_safeClone() of the sub-converter may additionally align 3548 * currentConverter inside the cloneStruct, for which we need the deadSpace 3549 * after currentConverter. 3550 * This is because UAlignedMemory may be larger than the actually 3551 * necessary alignment size for the platform. 3552 * The other cloneStruct fields will not be moved around, 3553 * and are aligned properly with cloneStruct's alignment. 
3554 */ 3555 struct cloneStruct 3556 { 3557 UConverter cnv; 3558 UConverter currentConverter; 3559 UAlignedMemory deadSpace; 3560 UConverterDataISO2022 mydata; 3561 }; 3562 3563 3564 static UConverter * 3565 _ISO_2022_SafeClone( 3566 const UConverter *cnv, 3567 void *stackBuffer, 3568 int32_t *pBufferSize, 3569 UErrorCode *status) 3570 { 3571 struct cloneStruct * localClone; 3572 UConverterDataISO2022 *cnvData; 3573 int32_t i, size; 3574 3575 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3576 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3577 return NULL; 3578 } 3579 3580 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3581 localClone = (struct cloneStruct *)stackBuffer; 3582 3583 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3584 3585 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3586 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3587 localClone->cnv.isExtraLocal = TRUE; 3588 3589 /* share the subconverters */ 3590 3591 if(cnvData->currentConverter != NULL) { 3592 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3593 localClone->mydata.currentConverter = 3594 ucnv_safeClone(cnvData->currentConverter, 3595 &localClone->currentConverter, 3596 &size, status); 3597 if(U_FAILURE(*status)) { 3598 return NULL; 3599 } 3600 } 3601 3602 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3603 if(cnvData->myConverterArray[i] != NULL) { 3604 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3605 } 3606 } 3607 3608 return &localClone->cnv; 3609 } 3610 3611 static void 3612 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3613 const USetAdder *sa, 3614 UConverterUnicodeSet which, 3615 UErrorCode *pErrorCode) 3616 { 3617 int32_t i; 3618 UConverterDataISO2022* cnvData; 3619 3620 if (U_FAILURE(*pErrorCode)) { 3621 return; 3622 } 3623 #ifdef U_ENABLE_GENERIC_ISO_2022 3624 if (cnv->sharedData == &_ISO2022Data) { 
3625 /* We use UTF-8 in this case */ 3626 sa->addRange(sa->set, 0, 0xd7FF); 3627 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3628 return; 3629 } 3630 #endif 3631 3632 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3633 3634 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3635 switch(cnvData->locale[0]){ 3636 case 'j': 3637 /* include JIS X 0201 which is hardcoded */ 3638 sa->add(sa->set, 0xa5); 3639 sa->add(sa->set, 0x203e); 3640 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3641 /* include Latin-1 for some variants of JP */ 3642 sa->addRange(sa->set, 0, 0xff); 3643 } else { 3644 /* include ASCII for JP */ 3645 sa->addRange(sa->set, 0, 0x7f); 3646 } 3647 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3648 /* 3649 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3650 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3651 * use half-width Katakana. 3652 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3653 * half-width Katakana via the ESC ( I sequence. 3654 * However, we only emit (fromUnicode) half-width Katakana according to the 3655 * definition of each variant. 3656 * 3657 * When including fallbacks, 3658 * we need to include half-width Katakana Unicode code points for all JP variants because 3659 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3660 */ 3661 /* include half-width Katakana for JP */ 3662 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3663 } 3664 break; 3665 case 'c': 3666 case 'z': 3667 /* include ASCII for CN */ 3668 sa->addRange(sa->set, 0, 0x7f); 3669 break; 3670 case 'k': 3671 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3672 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3673 cnvData->currentConverter, sa, which, pErrorCode); 3674 /* the loop over myConverterArray[] will simply not find another converter */ 3675 break; 3676 default: 3677 break; 3678 } 3679 3680 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3681 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3682 cnvData->version==0 && i==CNS_11643 3683 ) { 3684 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3685 ucnv_MBCSGetUnicodeSetForBytes( 3686 cnvData->myConverterArray[i], 3687 sa, UCNV_ROUNDTRIP_SET, 3688 0, 0x81, 0x82, 3689 pErrorCode); 3690 } 3691 #endif 3692 3693 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3694 UConverterSetFilter filter; 3695 if(cnvData->myConverterArray[i]!=NULL) { 3696 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3697 cnvData->version==0 && i==CNS_11643 3698 ) { 3699 /* 3700 * Version-specific for CN: 3701 * CN version 0 does not map CNS planes 3..7 although 3702 * they are all available in the CNS conversion table; 3703 * CN version 1 (-EXT) does map them all. 3704 * The two versions create different Unicode sets. 3705 */ 3706 filter=UCNV_SET_FILTER_2022_CN; 3707 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3708 /* 3709 * Only add code points that map to Shift-JIS codes 3710 * corresponding to JIS X 0208. 
3711 */ 3712 filter=UCNV_SET_FILTER_SJIS; 3713 } else if(i==KSC5601) { 3714 /* 3715 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3716 * are broader than GR94. 3717 */ 3718 filter=UCNV_SET_FILTER_GR94DBCS; 3719 } else { 3720 filter=UCNV_SET_FILTER_NONE; 3721 } 3722 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3723 } 3724 } 3725 3726 /* 3727 * ISO 2022 converters must not convert SO/SI/ESC despite what 3728 * sub-converters do by themselves. 3729 * Remove these characters from the set. 3730 */ 3731 sa->remove(sa->set, 0x0e); 3732 sa->remove(sa->set, 0x0f); 3733 sa->remove(sa->set, 0x1b); 3734 3735 /* ISO 2022 converters do not convert C1 controls either */ 3736 sa->removeRange(sa->set, 0x80, 0x9f); 3737 } 3738 3739 static const UConverterImpl _ISO2022Impl={ 3740 UCNV_ISO_2022, 3741 3742 NULL, 3743 NULL, 3744 3745 _ISO2022Open, 3746 _ISO2022Close, 3747 _ISO2022Reset, 3748 3749 #ifdef U_ENABLE_GENERIC_ISO_2022 3750 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3751 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3752 ucnv_fromUnicode_UTF8, 3753 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3754 #else 3755 NULL, 3756 NULL, 3757 NULL, 3758 NULL, 3759 #endif 3760 NULL, 3761 3762 NULL, 3763 _ISO2022getName, 3764 _ISO_2022_WriteSub, 3765 _ISO_2022_SafeClone, 3766 _ISO_2022_GetUnicodeSet, 3767 3768 NULL, 3769 NULL 3770 }; 3771 static const UConverterStaticData _ISO2022StaticData={ 3772 sizeof(UConverterStaticData), 3773 "ISO_2022", 3774 2022, 3775 UCNV_IBM, 3776 UCNV_ISO_2022, 3777 1, 3778 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3779 { 0x1a, 0, 0, 0 }, 3780 1, 3781 FALSE, 3782 FALSE, 3783 0, 3784 0, 3785 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3786 }; 3787 const UConverterSharedData _ISO2022Data={ 3788 sizeof(UConverterSharedData), 3789 ~((uint32_t) 0), 3790 NULL, 3791 NULL, 3792 &_ISO2022StaticData, 3793 FALSE, 3794 &_ISO2022Impl, 3795 0, 
UCNV_MBCS_TABLE_INITIALIZER 3796 }; 3797 3798 /*************JP****************/ 3799 static const UConverterImpl _ISO2022JPImpl={ 3800 UCNV_ISO_2022, 3801 3802 NULL, 3803 NULL, 3804 3805 _ISO2022Open, 3806 _ISO2022Close, 3807 _ISO2022Reset, 3808 3809 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3810 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3811 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3812 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3813 NULL, 3814 3815 NULL, 3816 _ISO2022getName, 3817 _ISO_2022_WriteSub, 3818 _ISO_2022_SafeClone, 3819 _ISO_2022_GetUnicodeSet, 3820 3821 NULL, 3822 NULL 3823 }; 3824 static const UConverterStaticData _ISO2022JPStaticData={ 3825 sizeof(UConverterStaticData), 3826 "ISO_2022_JP", 3827 0, 3828 UCNV_IBM, 3829 UCNV_ISO_2022, 3830 1, 3831 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3832 { 0x1a, 0, 0, 0 }, 3833 1, 3834 FALSE, 3835 FALSE, 3836 0, 3837 0, 3838 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3839 }; 3840 3841 namespace { 3842 3843 const UConverterSharedData _ISO2022JPData={ 3844 sizeof(UConverterSharedData), 3845 ~((uint32_t) 0), 3846 NULL, 3847 NULL, 3848 &_ISO2022JPStaticData, 3849 FALSE, 3850 &_ISO2022JPImpl, 3851 0, UCNV_MBCS_TABLE_INITIALIZER 3852 }; 3853 3854 } // namespace 3855 3856 /************* KR ***************/ 3857 static const UConverterImpl _ISO2022KRImpl={ 3858 UCNV_ISO_2022, 3859 3860 NULL, 3861 NULL, 3862 3863 _ISO2022Open, 3864 _ISO2022Close, 3865 _ISO2022Reset, 3866 3867 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3868 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3869 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3870 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3871 NULL, 3872 3873 NULL, 3874 _ISO2022getName, 3875 _ISO_2022_WriteSub, 3876 _ISO_2022_SafeClone, 3877 _ISO_2022_GetUnicodeSet, 3878 3879 NULL, 3880 NULL 3881 }; 3882 static const UConverterStaticData _ISO2022KRStaticData={ 3883 sizeof(UConverterStaticData), 3884 "ISO_2022_KR", 
3885 0, 3886 UCNV_IBM, 3887 UCNV_ISO_2022, 3888 1, 3889 3, /* max 3 bytes per UChar: SO+DBCS */ 3890 { 0x1a, 0, 0, 0 }, 3891 1, 3892 FALSE, 3893 FALSE, 3894 0, 3895 0, 3896 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3897 }; 3898 3899 namespace { 3900 3901 const UConverterSharedData _ISO2022KRData={ 3902 sizeof(UConverterSharedData), 3903 ~((uint32_t) 0), 3904 NULL, 3905 NULL, 3906 &_ISO2022KRStaticData, 3907 FALSE, 3908 &_ISO2022KRImpl, 3909 0, UCNV_MBCS_TABLE_INITIALIZER 3910 }; 3911 3912 } // namespace 3913 3914 /*************** CN ***************/ 3915 static const UConverterImpl _ISO2022CNImpl={ 3916 3917 UCNV_ISO_2022, 3918 3919 NULL, 3920 NULL, 3921 3922 _ISO2022Open, 3923 _ISO2022Close, 3924 _ISO2022Reset, 3925 3926 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3927 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3928 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3929 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3930 NULL, 3931 3932 NULL, 3933 _ISO2022getName, 3934 _ISO_2022_WriteSub, 3935 _ISO_2022_SafeClone, 3936 _ISO_2022_GetUnicodeSet, 3937 3938 NULL, 3939 NULL 3940 }; 3941 static const UConverterStaticData _ISO2022CNStaticData={ 3942 sizeof(UConverterStaticData), 3943 "ISO_2022_CN", 3944 0, 3945 UCNV_IBM, 3946 UCNV_ISO_2022, 3947 1, 3948 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3949 { 0x1a, 0, 0, 0 }, 3950 1, 3951 FALSE, 3952 FALSE, 3953 0, 3954 0, 3955 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3956 }; 3957 3958 namespace { 3959 3960 const UConverterSharedData _ISO2022CNData={ 3961 sizeof(UConverterSharedData), 3962 ~((uint32_t) 0), 3963 NULL, 3964 NULL, 3965 &_ISO2022CNStaticData, 3966 FALSE, 3967 &_ISO2022CNImpl, 3968 0, UCNV_MBCS_TABLE_INITIALIZER 3969 }; 3970 3971 } // namespace 3972 3973 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3974