1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "unicode/utf16.h" 38 #include "ucnv_imp.h" 39 #include "ucnv_bld.h" 40 #include "ucnv_cnv.h" 41 #include "ucnvmbcs.h" 42 #include "cstring.h" 43 #include "cmemory.h" 44 #include "uassert.h" 45 46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 47 48 #ifdef U_ENABLE_GENERIC_ISO_2022 49 /* 50 * I am disabling the generic ISO-2022 converter after proposing to do so on 51 * the icu mailing list two days ago. 52 * 53 * Reasons: 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 55 * its designation sequences, single shifts with return to the previous state, 56 * switch-with-no-return to UTF-16BE or similar, etc. 57 * This is unlike the language-specific variants like ISO-2022-JP which 58 * require a much smaller repertoire of ISO-2022 features. 
59 * These variants continue to be supported. 60 * 2. I believe that no one is really using the generic ISO-2022 converter 61 * but rather always one of the language-specific variants. 62 * Note that ICU's generic ISO-2022 converter has always output one escape 63 * sequence followed by UTF-8 for the whole stream. 64 * 3. Switching between subcharsets is extremely slow, because each time 65 * the previous converter is closed and a new one opened, 66 * without any kind of caching, least-recently-used list, etc. 67 * 4. The code is currently buggy, and given the above it does not seem 68 * reasonable to spend the time on maintenance. 69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 70 * This means, for example, that when ISO-8859-7 is designated, the following 71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 72 * The ICU ISO-2022 converter does not handle this - and has no information 73 * about which subconverter would have to be shifted vs. which is designed 74 * for 7-bit ISO-2022. 75 * 76 * Markus Scherer 2003-dec-03 77 */ 78 #endif 79 80 static const char SHIFT_IN_STR[] = "\x0F"; 81 // static const char SHIFT_OUT_STR[] = "\x0E"; 82 83 #define CR 0x0D 84 #define LF 0x0A 85 #define H_TAB 0x09 86 #define V_TAB 0x0B 87 #define SPACE 0x20 88 89 enum { 90 HWKANA_START=0xff61, 91 HWKANA_END=0xff9f 92 }; 93 94 /* 95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 96 * as bytes 21..7E. (Subtract 0x80.) 97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 98 * as bytes 20..7F. (Subtract 0x80.) 99 * Do not encode C1 control codes with native bytes 80..9F 100 * as bytes 00..1F (C0 control codes). 101 */ 102 enum { 103 GR94_START=0xa1, 104 GR94_END=0xfe, 105 GR96_START=0xa0, 106 GR96_END=0xff 107 }; 108 109 /* 110 * ISO 2022 control codes must not be converted from Unicode 111 * because they would mess up the byte stream. 
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 113 * corresponding to SO, SI, and ESC. 114 */ 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 116 117 /* for ISO-2022-JP and -CN implementations */ 118 typedef enum { 119 /* shared values */ 120 INVALID_STATE=-1, 121 ASCII = 0, 122 123 SS2_STATE=0x10, 124 SS3_STATE, 125 126 /* JP */ 127 ISO8859_1 = 1 , 128 ISO8859_7 = 2 , 129 JISX201 = 3, 130 JISX208 = 4, 131 JISX212 = 5, 132 GB2312 =6, 133 KSC5601 =7, 134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 135 136 /* CN */ 137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 138 GB2312_1=1, 139 ISO_IR_165=2, 140 CNS_11643=3, 141 142 /* 143 * these are used in StateEnum and ISO2022State variables, 144 * but CNS_11643 must be used to index into myConverterArray[] 145 */ 146 CNS_11643_0=0x20, 147 CNS_11643_1, 148 CNS_11643_2, 149 CNS_11643_3, 150 CNS_11643_4, 151 CNS_11643_5, 152 CNS_11643_6, 153 CNS_11643_7 154 } StateEnum; 155 156 /* is the StateEnum charset value for a DBCS charset? */ 157 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 158 159 #define CSM(cs) ((uint16_t)1<<(cs)) 160 161 /* 162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 164 * 165 * Note: The converter uses some leniency: 166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 167 * all versions, not just JIS7 and JIS8. 168 * - ICU does not distinguish between different versions of JIS X 0208. 
169 */ 170 enum { MAX_JA_VERSION=4 }; 171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 177 }; 178 179 typedef enum { 180 ASCII1=0, 181 LATIN1, 182 SBCS, 183 DBCS, 184 MBCS, 185 HWKANA 186 }Cnv2022Type; 187 188 typedef struct ISO2022State { 189 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 190 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 191 int8_t prevG; /* g before single shift (SS2 or SS3) */ 192 } ISO2022State; 193 194 #define UCNV_OPTIONS_VERSION_MASK 0xf 195 #define UCNV_2022_MAX_CONVERTERS 10 196 197 typedef struct{ 198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 199 UConverter *currentConverter; 200 Cnv2022Type currentType; 201 ISO2022State toU2022State, fromU2022State; 202 uint32_t key; 203 uint32_t version; 204 #ifdef U_ENABLE_GENERIC_ISO_2022 205 UBool isFirstBuffer; 206 #endif 207 UBool isEmptySegment; 208 char name[30]; 209 char locale[3]; 210 }UConverterDataISO2022; 211 212 /* Protos */ 213 /* ISO-2022 ----------------------------------------------------------------- */ 214 215 /*Forward declaration */ 216 U_CFUNC void 217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 218 UErrorCode * err); 219 U_CFUNC void 220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 221 UErrorCode * err); 222 223 #define ESC_2022 0x1B /*ESC*/ 224 225 typedef enum 226 { 227 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 228 VALID_NON_TERMINAL_2022 = 0, 
/*so far corresponds to a valid iso 2022 escape sequence*/ 229 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 230 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 231 } UCNV_TableStates_2022; 232 233 /* 234 * The way these state transition arrays work is: 235 * ex : ESC$B is the sequence for JISX208 236 * a) First Iteration: char is ESC 237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 238 * int x = normalize_esq_chars_2022[27] which is equal to 1 239 * ii) Search for this value in escSeqStateTable_Key_2022[] 240 * value of x is stored at escSeqStateTable_Key_2022[0] 241 * iii) Save this index as offset 242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 244 * b) Switch on this state and continue to next char 245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 246 * which is normalize_esq_chars_2022[36] == 4 247 * ii) x is currently 1(from above) 248 * x<<=5 -- x is now 32 249 * x+=normalize_esq_chars_2022[36] 250 * now x is 36 251 * iii) Search for this value in escSeqStateTable_Key_2022[] 252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 255 * c) Switch on this state and continue to next char 256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 257 * ii) x is currently 36 (from above) 258 * x<<=5 -- x is now 1152 259 * x+=normalize_esq_chars_2022[66] 260 * now x is 1161 261 * iii) Search for this value in escSeqStateTable_Key_2022[] 262 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21 263 * iv) Get state of this sequence from 
escSeqStateTable_Value_2022[21] 264 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 265 * v) Get the converter name form escSeqStateTable_Result_2022[21] which is JISX208 266 */ 267 268 269 /*Below are the 3 arrays depicting a state transition table*/ 270 static const int8_t normalize_esq_chars_2022[256] = { 271 /* 0 1 2 3 4 5 6 7 8 9 */ 272 273 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 275 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 276 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 277 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 278 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 279 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 280 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 281 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 282 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 298 ,0 ,0 ,0 ,0 ,0 ,0 299 }; 300 301 #ifdef U_ENABLE_GENERIC_ISO_2022 302 /* 303 * When the generic ISO-2022 converter is completely removed, not just disabled 304 * per #ifdef, then the following state table and the associated tables that are 305 * dimensioned with MAX_STATES_2022 should be trimmed. 306 * 307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 308 * the associated escape sequences starting with ESC ( B should be removed. 309 * This includes the ones with key values 1097 and all of the ones above 1000000. 310 * 311 * For the latter, the tables can simply be truncated. 
312 * For the former, since the tables must be kept parallel, it is probably best 313 * to simply duplicate an adjacent table cell, parallel in all tables. 314 * 315 * It may make sense to restructure the tables, especially by using small search 316 * tables for the variants instead of indexing them parallel to the table here. 317 */ 318 #endif 319 320 #define MAX_STATES_2022 74 321 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 322 /* 0 1 2 3 4 5 6 7 8 9 */ 323 324 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 325 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 326 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 327 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 328 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 329 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 330 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 331 ,35947631 ,35947635 ,35947636 ,35947638 332 }; 333 334 #ifdef U_ENABLE_GENERIC_ISO_2022 335 336 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 337 /* 0 1 2 3 4 5 6 7 8 9 */ 338 339 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 340 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 341 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 342 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 343 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 344 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 345 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL 
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 346 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 347 }; 348 349 #endif 350 351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 352 /* 0 1 2 3 4 5 6 7 8 9 */ 353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 361 }; 362 363 364 /* Type def for refactoring changeState_2022 code*/ 365 typedef enum{ 366 #ifdef 
U_ENABLE_GENERIC_ISO_2022 367 ISO_2022=0, 368 #endif 369 ISO_2022_JP=1, 370 ISO_2022_KR=2, 371 ISO_2022_CN=3 372 } Variant2022; 373 374 /*********** ISO 2022 Converter Protos ***********/ 375 static void 376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 377 378 static void 379 _ISO2022Close(UConverter *converter); 380 381 static void 382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 383 384 static const char* 385 _ISO2022getName(const UConverter* cnv); 386 387 static void 388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 389 390 static UConverter * 391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 392 393 #ifdef U_ENABLE_GENERIC_ISO_2022 394 static void 395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 396 #endif 397 398 namespace { 399 400 /*const UConverterSharedData _ISO2022Data;*/ 401 extern const UConverterSharedData _ISO2022JPData; 402 extern const UConverterSharedData _ISO2022KRData; 403 extern const UConverterSharedData _ISO2022CNData; 404 405 } // namespace 406 407 /*************** Converter implementations ******************/ 408 409 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 410 static inline void 411 fromUWriteUInt8(UConverter *cnv, 412 const char *bytes, int32_t length, 413 uint8_t **target, const char *targetLimit, 414 int32_t **offsets, 415 int32_t sourceIndex, 416 UErrorCode *pErrorCode) 417 { 418 char *targetChars = (char *)*target; 419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 420 offsets, sourceIndex, pErrorCode); 421 *target = (uint8_t*)targetChars; 422 423 } 424 425 static inline void 426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 427 if(myConverterData->version == 1) { 428 UConverter *cnv = myConverterData->currentConverter; 429 430 cnv->toUnicodeStatus=0; /* offset */ 431 cnv->mode=0; /* state */ 432 cnv->toULength=0; /* byteIndex */ 433 } 434 } 435 436 static inline void 437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 438 /* in ISO-2022-KR the designator sequence appears only once 439 * in a file so we append it only once 440 */ 441 if( converter->charErrorBufferLength==0){ 442 443 converter->charErrorBufferLength = 4; 444 converter->charErrorBuffer[0] = 0x1b; 445 converter->charErrorBuffer[1] = 0x24; 446 converter->charErrorBuffer[2] = 0x29; 447 converter->charErrorBuffer[3] = 0x43; 448 } 449 if(myConverterData->version == 1) { 450 UConverter *cnv = myConverterData->currentConverter; 451 452 cnv->fromUChar32=0; 453 cnv->fromUnicodeStatus=1; /* prevLength */ 454 } 455 } 456 457 static void 458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 459 460 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 461 462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 463 if(cnv->extraInfo != NULL) { 464 UConverterNamePieces stackPieces; 465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 467 uint32_t version; 468 469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 470 
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 472 myConverterData->currentType = ASCII1; 473 cnv->fromUnicodeStatus =FALSE; 474 if(pArgs->locale){ 475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 476 } 477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 478 myConverterData->version = version; 479 /* Begin Google-specific change. */ 480 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 481 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 482 if((myLocale[0]=='j' && 483 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 484 myLocale[1]=='s') && 485 (myLocale[2]=='_' || myLocale[2]=='\0'))) 486 { 487 size_t len=0; 488 /* open the required converters and cache them */ 489 if(version>MAX_JA_VERSION) { 490 /* prevent indexing beyond jpCharsetMasks[] */ 491 myConverterData->version = version = 0; 492 } 493 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 494 myConverterData->myConverterArray[ISO8859_7] = 495 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 496 } 497 if (myLocale[1]=='k') { /* Use KDDI's version. */ 498 myConverterData->myConverterArray[JISX208] = 499 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 500 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */ 501 myConverterData->myConverterArray[JISX208] = 502 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 503 } else { 504 /* 505 * Change for http://b/issue?id=937017 : 506 * Restore JIS X 0208 ISO-2022-JP mappings from before 507 * sharing the table with the Shift-JIS converter 508 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797). 509 * TODO(mscherer): Create and use a new, unified Google Shift-JIS 510 * table for both Shift-JIS and ISO-2022-JP. 511 */ 512 myConverterData->myConverterArray[JISX208] = 513 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode); 514 } 515 /* End Google-specific change. 
*/ 516 /* END android-changed */ 517 518 if(jpCharsetMasks[version]&CSM(JISX212)) { 519 myConverterData->myConverterArray[JISX212] = 520 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 521 } 522 if(jpCharsetMasks[version]&CSM(GB2312)) { 523 myConverterData->myConverterArray[GB2312] = 524 /* BEGIN android-changed */ 525 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 526 /* END android-changed */ 527 } 528 if(jpCharsetMasks[version]&CSM(KSC5601)) { 529 myConverterData->myConverterArray[KSC5601] = 530 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 531 } 532 533 /* set the function pointers to appropriate funtions */ 534 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 535 uprv_strcpy(myConverterData->locale,"ja"); 536 537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 538 len = uprv_strlen(myConverterData->name); 539 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 540 myConverterData->name[len+1]='\0'; 541 } 542 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 543 (myLocale[2]=='_' || myLocale[2]=='\0')) 544 { 545 const char *cnvName; 546 if(version==1) { 547 cnvName="icu-internal-25546"; 548 } else { 549 /* BEGIN android-changed */ 550 cnvName="ksc_5601"; 551 /* END android-changed */ 552 myConverterData->version=version=0; 553 } 554 if(pArgs->onlyTestIsLoadable) { 555 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 556 uprv_free(cnv->extraInfo); 557 cnv->extraInfo=NULL; 558 return; 559 } else { 560 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 561 if (U_FAILURE(*errorCode)) { 562 _ISO2022Close(cnv); 563 return; 564 } 565 566 if(version==1) { 567 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 568 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 569 cnv->subCharLen = 
myConverterData->currentConverter->subCharLen; 570 }else{ 571 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 572 } 573 574 /* initialize the state variables */ 575 setInitialStateToUnicodeKR(cnv, myConverterData); 576 setInitialStateFromUnicodeKR(cnv, myConverterData); 577 578 /* set the function pointers to appropriate funtions */ 579 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 580 uprv_strcpy(myConverterData->locale,"ko"); 581 } 582 } 583 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 584 (myLocale[2]=='_' || myLocale[2]=='\0')) 585 { 586 587 /* open the required converters and cache them */ 588 /* BEGIN android-changed */ 589 myConverterData->myConverterArray[GB2312_1] = 590 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 591 if(version==1) { 592 myConverterData->myConverterArray[ISO_IR_165] = 593 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 594 } 595 myConverterData->myConverterArray[CNS_11643] = 596 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 597 /* END android-changed */ 598 599 600 /* set the function pointers to appropriate funtions */ 601 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 602 uprv_strcpy(myConverterData->locale,"cn"); 603 604 if (version==0){ 605 myConverterData->version = 0; 606 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 607 }else if (version==1){ 608 myConverterData->version = 1; 609 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 610 }else { 611 myConverterData->version = 2; 612 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 613 } 614 } 615 else{ 616 #ifdef U_ENABLE_GENERIC_ISO_2022 617 myConverterData->isFirstBuffer = TRUE; 618 619 /* append the UTF-8 escape sequence */ 620 cnv->charErrorBufferLength = 3; 621 cnv->charErrorBuffer[0] = 0x1b; 622 cnv->charErrorBuffer[1] = 0x25; 623 
cnv->charErrorBuffer[2] = 0x42; 624 625 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 626 /* initialize the state variables */ 627 uprv_strcpy(myConverterData->name,"ISO_2022"); 628 #else 629 *errorCode = U_UNSUPPORTED_ERROR; 630 return; 631 #endif 632 } 633 634 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 635 636 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 637 _ISO2022Close(cnv); 638 } 639 } else { 640 *errorCode = U_MEMORY_ALLOCATION_ERROR; 641 } 642 } 643 644 645 static void 646 _ISO2022Close(UConverter *converter) { 647 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 648 UConverterSharedData **array = myData->myConverterArray; 649 int32_t i; 650 651 if (converter->extraInfo != NULL) { 652 /*close the array of converter pointers and free the memory*/ 653 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 654 if(array[i]!=NULL) { 655 ucnv_unloadSharedDataIfReady(array[i]); 656 } 657 } 658 659 ucnv_close(myData->currentConverter); 660 661 if(!converter->isExtraLocal){ 662 uprv_free (converter->extraInfo); 663 converter->extraInfo = NULL; 664 } 665 } 666 } 667 668 static void 669 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 670 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 671 if(choice<=UCNV_RESET_TO_UNICODE) { 672 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 673 myConverterData->key = 0; 674 myConverterData->isEmptySegment = FALSE; 675 } 676 if(choice!=UCNV_RESET_TO_UNICODE) { 677 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 678 } 679 #ifdef U_ENABLE_GENERIC_ISO_2022 680 if(myConverterData->locale[0] == 0){ 681 if(choice<=UCNV_RESET_TO_UNICODE) { 682 myConverterData->isFirstBuffer = TRUE; 683 myConverterData->key = 0; 684 if (converter->mode == UCNV_SO){ 685 ucnv_close (myConverterData->currentConverter); 686 myConverterData->currentConverter=NULL; 687 } 688 converter->mode = 
UCNV_SI; 689 } 690 if(choice!=UCNV_RESET_TO_UNICODE) { 691 /* re-append UTF-8 escape sequence */ 692 converter->charErrorBufferLength = 3; 693 converter->charErrorBuffer[0] = 0x1b; 694 converter->charErrorBuffer[1] = 0x28; 695 converter->charErrorBuffer[2] = 0x42; 696 } 697 } 698 else 699 #endif 700 { 701 /* reset the state variables */ 702 if(myConverterData->locale[0] == 'k'){ 703 if(choice<=UCNV_RESET_TO_UNICODE) { 704 setInitialStateToUnicodeKR(converter, myConverterData); 705 } 706 if(choice!=UCNV_RESET_TO_UNICODE) { 707 setInitialStateFromUnicodeKR(converter, myConverterData); 708 } 709 } 710 } 711 } 712 713 static const char* 714 _ISO2022getName(const UConverter* cnv){ 715 if(cnv->extraInfo){ 716 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 717 return myData->name; 718 } 719 return NULL; 720 } 721 722 723 /*************** to unicode *******************/ 724 /**************************************************************************** 725 * Recognized escape sequences are 726 * <ESC>(B ASCII 727 * <ESC>.A ISO-8859-1 728 * <ESC>.F ISO-8859-7 729 * <ESC>(J JISX-201 730 * <ESC>(I JISX-201 731 * <ESC>$B JISX-208 732 * <ESC>$@ JISX-208 733 * <ESC>$(D JISX-212 734 * <ESC>$A GB2312 735 * <ESC>$(C KSC5601 736 */ 737 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 738 /* 0 1 2 3 4 5 6 7 8 9 */ 739 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 740 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 741 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 742 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 743 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 744 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 745 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 746 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 747 }; 748 749 /*************** to unicode *******************/ 750 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 751 /* 0 1 2 3 4 5 6 7 8 9 */ 752 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 753 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 754 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 755 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 756 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 757 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 760 }; 761 762 763 static UCNV_TableStates_2022 764 getKey_2022(char c,int32_t* key,int32_t* offset){ 765 int32_t togo; 766 int32_t low = 0; 767 int32_t hi = MAX_STATES_2022; 768 int32_t oldmid=0; 769 770 togo = normalize_esq_chars_2022[(uint8_t)c]; 771 if(togo == 0) { 772 /* not a valid 
character anywhere in an escape sequence */ 773 *key = 0; 774 *offset = 0; 775 return INVALID_2022; 776 } 777 togo = (*key << 5) + togo; 778 779 while (hi != low) /*binary search*/{ 780 781 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 782 783 if (mid == oldmid) 784 break; 785 786 if (escSeqStateTable_Key_2022[mid] > togo){ 787 hi = mid; 788 } 789 else if (escSeqStateTable_Key_2022[mid] < togo){ 790 low = mid; 791 } 792 else /*we found it*/{ 793 *key = togo; 794 *offset = mid; 795 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 796 } 797 oldmid = mid; 798 799 } 800 801 *key = 0; 802 *offset = 0; 803 return INVALID_2022; 804 } 805 806 /*runs through a state machine to determine the escape sequence - codepage correspondance 807 */ 808 static void 809 changeState_2022(UConverter* _this, 810 const char** source, 811 const char* sourceLimit, 812 Variant2022 var, 813 UErrorCode* err){ 814 UCNV_TableStates_2022 value; 815 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 816 uint32_t key = myData2022->key; 817 int32_t offset = 0; 818 int8_t initialToULength = _this->toULength; 819 char c; 820 821 value = VALID_NON_TERMINAL_2022; 822 while (*source < sourceLimit) { 823 c = *(*source)++; 824 _this->toUBytes[_this->toULength++]=(uint8_t)c; 825 value = getKey_2022(c,(int32_t *) &key, &offset); 826 827 switch (value){ 828 829 case VALID_NON_TERMINAL_2022 : 830 /* continue with the loop */ 831 break; 832 833 case VALID_TERMINAL_2022: 834 key = 0; 835 goto DONE; 836 837 case INVALID_2022: 838 goto DONE; 839 840 case VALID_MAYBE_TERMINAL_2022: 841 #ifdef U_ENABLE_GENERIC_ISO_2022 842 /* ESC ( B is ambiguous only for ISO_2022 itself */ 843 if(var == ISO_2022) { 844 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 845 _this->toULength = 0; 846 847 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 848 849 /* continue with the 
loop */ 850 value = VALID_NON_TERMINAL_2022; 851 break; 852 } else 853 #endif 854 { 855 /* not ISO_2022 itself, finish here */ 856 value = VALID_TERMINAL_2022; 857 key = 0; 858 goto DONE; 859 } 860 } 861 } 862 863 DONE: 864 myData2022->key = key; 865 866 if (value == VALID_NON_TERMINAL_2022) { 867 /* indicate that the escape sequence is incomplete: key!=0 */ 868 return; 869 } else if (value == INVALID_2022 ) { 870 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 871 } else /* value == VALID_TERMINAL_2022 */ { 872 switch(var){ 873 #ifdef U_ENABLE_GENERIC_ISO_2022 874 case ISO_2022: 875 { 876 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 877 if(chosenConverterName == NULL) { 878 /* SS2 or SS3 */ 879 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 880 _this->toUCallbackReason = UCNV_UNASSIGNED; 881 return; 882 } 883 884 _this->mode = UCNV_SI; 885 ucnv_close(myData2022->currentConverter); 886 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 887 if(U_SUCCESS(*err)) { 888 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 889 _this->mode = UCNV_SO; 890 } 891 break; 892 } 893 #endif 894 case ISO_2022_JP: 895 { 896 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 897 switch(tempState) { 898 case INVALID_STATE: 899 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 900 break; 901 case SS2_STATE: 902 if(myData2022->toU2022State.cs[2]!=0) { 903 if(myData2022->toU2022State.g<2) { 904 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 905 } 906 myData2022->toU2022State.g=2; 907 } else { 908 /* illegal to have SS2 before a matching designator */ 909 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 910 } 911 break; 912 /* case SS3_STATE: not used in ISO-2022-JP-x */ 913 case ISO8859_1: 914 case ISO8859_7: 915 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 917 } else { 918 /* G2 charset for SS2 */ 919 myData2022->toU2022State.cs[2]=(int8_t)tempState; 920 } 921 break; 922 default: 
923 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 925 } else { 926 /* G0 charset */ 927 myData2022->toU2022State.cs[0]=(int8_t)tempState; 928 } 929 break; 930 } 931 } 932 break; 933 case ISO_2022_CN: 934 { 935 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 936 switch(tempState) { 937 case INVALID_STATE: 938 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 939 break; 940 case SS2_STATE: 941 if(myData2022->toU2022State.cs[2]!=0) { 942 if(myData2022->toU2022State.g<2) { 943 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 944 } 945 myData2022->toU2022State.g=2; 946 } else { 947 /* illegal to have SS2 before a matching designator */ 948 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 949 } 950 break; 951 case SS3_STATE: 952 if(myData2022->toU2022State.cs[3]!=0) { 953 if(myData2022->toU2022State.g<2) { 954 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 955 } 956 myData2022->toU2022State.g=3; 957 } else { 958 /* illegal to have SS3 before a matching designator */ 959 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 960 } 961 break; 962 case ISO_IR_165: 963 if(myData2022->version==0) { 964 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 965 break; 966 } 967 /*fall through*/ 968 case GB2312_1: 969 /*fall through*/ 970 case CNS_11643_1: 971 myData2022->toU2022State.cs[1]=(int8_t)tempState; 972 break; 973 case CNS_11643_2: 974 myData2022->toU2022State.cs[2]=(int8_t)tempState; 975 break; 976 default: 977 /* other CNS 11643 planes */ 978 if(myData2022->version==0) { 979 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 980 } else { 981 myData2022->toU2022State.cs[3]=(int8_t)tempState; 982 } 983 break; 984 } 985 } 986 break; 987 case ISO_2022_KR: 988 if(offset==0x30){ 989 /* nothing to be done, just accept this one escape sequence */ 990 } else { 991 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 992 } 993 break; 994 995 default: 996 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 997 break; 998 } 999 } 1000 if(U_SUCCESS(*err)) { 1001 _this->toULength = 0; 1002 } 
else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 1003 if(_this->toULength>1) { 1004 /* 1005 * Ticket 5691: consistent illegal sequences: 1006 * - We include at least the first byte (ESC) in the illegal sequence. 1007 * - If any of the non-initial bytes could be the start of a character, 1008 * we stop the illegal sequence before the first one of those. 1009 * In escape sequences, all following bytes are "printable", that is, 1010 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 1011 * they are valid single/lead bytes. 1012 * For simplicity, we always only report the initial ESC byte as the 1013 * illegal sequence and back out all other bytes we looked at. 1014 */ 1015 /* Back out some bytes. */ 1016 int8_t backOutDistance=_this->toULength-1; 1017 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1018 if(backOutDistance<=bytesFromThisBuffer) { 1019 /* same as initialToULength<=1 */ 1020 *source-=backOutDistance; 1021 } else { 1022 /* Back out bytes from the previous buffer: Need to replay them. */ 1023 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1024 /* same as -(initialToULength-1) */ 1025 /* preToULength is negative! 
*/ 1026 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1027 *source-=bytesFromThisBuffer; 1028 } 1029 _this->toULength=1; 1030 } 1031 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1032 _this->toUCallbackReason = UCNV_UNASSIGNED; 1033 } 1034 } 1035 1036 /*Checks the characters of the buffer against valid 2022 escape sequences 1037 *if the match we return a pointer to the initial start of the sequence otherwise 1038 *we return sourceLimit 1039 */ 1040 /*for 2022 looks ahead in the stream 1041 *to determine the longest possible convertible 1042 *data stream 1043 */ 1044 static inline const char* 1045 getEndOfBuffer_2022(const char** source, 1046 const char* sourceLimit, 1047 UBool /*flush*/){ 1048 1049 const char* mySource = *source; 1050 1051 #ifdef U_ENABLE_GENERIC_ISO_2022 1052 if (*source >= sourceLimit) 1053 return sourceLimit; 1054 1055 do{ 1056 1057 if (*mySource == ESC_2022){ 1058 int8_t i; 1059 int32_t key = 0; 1060 int32_t offset; 1061 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1062 1063 /* Kludge: I could not 1064 * figure out the reason for validating an escape sequence 1065 * twice - once here and once in changeState_2022(). 1066 * is it possible to have an ESC character in a ISO2022 1067 * byte stream which is valid in a code page? Is it legal? 
1068 */ 1069 for (i=0; 1070 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1071 i++) { 1072 value = getKey_2022(*(mySource+i), &key, &offset); 1073 } 1074 if (value > 0 || *mySource==ESC_2022) 1075 return mySource; 1076 1077 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1078 return sourceLimit; 1079 } 1080 }while (++mySource < sourceLimit); 1081 1082 return sourceLimit; 1083 #else 1084 while(mySource < sourceLimit && *mySource != ESC_2022) { 1085 ++mySource; 1086 } 1087 return mySource; 1088 #endif 1089 } 1090 1091 1092 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1093 * any future change in _MBCSFromUChar32() function should be reflected here. 1094 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1095 */ 1096 static inline int32_t 1097 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1098 UChar32 c, 1099 uint32_t* value, 1100 UBool useFallback, 1101 int outputType) 1102 { 1103 const int32_t *cx; 1104 const uint16_t *table; 1105 uint32_t stage2Entry; 1106 uint32_t myValue; 1107 int32_t length; 1108 const uint8_t *p; 1109 /* 1110 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1111 * Use internal version of ucnv_open() that verifies that the new structures are available, 1112 * else U_INTERNAL_PROGRAM_ERROR. 
1113 */ 1114 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1115 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1116 table=sharedData->mbcs.fromUnicodeTable; 1117 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1118 /* get the bytes and the length for the output */ 1119 if(outputType==MBCS_OUTPUT_2){ 1120 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1121 if(myValue<=0xff) { 1122 length=1; 1123 } else { 1124 length=2; 1125 } 1126 } else /* outputType==MBCS_OUTPUT_3 */ { 1127 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1128 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1129 if(myValue<=0xff) { 1130 length=1; 1131 } else if(myValue<=0xffff) { 1132 length=2; 1133 } else { 1134 length=3; 1135 } 1136 } 1137 /* is this code point assigned, or do we use fallbacks? */ 1138 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1139 /* assigned */ 1140 *value=myValue; 1141 return length; 1142 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1143 /* 1144 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1145 * There is no way with this data structure for fallback output 1146 * to be a zero byte. 1147 */ 1148 *value=myValue; 1149 return -length; 1150 } 1151 } 1152 1153 cx=sharedData->mbcs.extIndexes; 1154 if(cx!=NULL) { 1155 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1156 } 1157 1158 /* unassigned */ 1159 return 0; 1160 } 1161 1162 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1163 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1164 * @param retval pointer to output byte 1165 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1166 */ 1167 static inline int32_t 1168 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1169 UChar32 c, 1170 uint32_t* retval, 1171 UBool useFallback) 1172 { 1173 const uint16_t *table; 1174 int32_t value; 1175 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1176 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1177 return 0; 1178 } 1179 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1180 table=sharedData->mbcs.fromUnicodeTable; 1181 /* get the byte for the output */ 1182 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1183 /* is this code point assigned, or do we use fallbacks? */ 1184 *retval=(uint32_t)(value&0xff); 1185 if(value>=0xf00) { 1186 return 1; /* roundtrip */ 1187 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1188 return -1; /* fallback taken */ 1189 } else { 1190 return 0; /* no mapping */ 1191 } 1192 } 1193 1194 /* 1195 * Check that the result is a 2-byte value with each byte in the range A1..FE 1196 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1197 * to move it to the ISO 2022 range 21..7E. 1198 * Return 0 if out of range. 1199 */ 1200 static inline uint32_t 1201 _2022FromGR94DBCS(uint32_t value) { 1202 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1203 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1204 ) { 1205 return value - 0x8080; /* shift down to 21..7e byte range */ 1206 } else { 1207 return 0; /* not valid for ISO 2022 */ 1208 } 1209 } 1210 1211 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1212 /* 1213 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1214 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point 1215 * unchanged. 1216 */ 1217 static inline uint32_t 1218 _2022ToGR94DBCS(uint32_t value) { 1219 uint32_t returnValue = value + 0x8080; 1220 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && 1221 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { 1222 return returnValue; 1223 } else { 1224 return value; 1225 } 1226 } 1227 #endif 1228 1229 #ifdef U_ENABLE_GENERIC_ISO_2022 1230 1231 /********************************************************************************** 1232 * ISO-2022 Converter 1233 * 1234 * 1235 */ 1236 1237 static void 1238 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 1239 UErrorCode* err){ 1240 const char* mySourceLimit, *realSourceLimit; 1241 const char* sourceStart; 1242 const UChar* myTargetStart; 1243 UConverter* saveThis; 1244 UConverterDataISO2022* myData; 1245 int8_t length; 1246 1247 saveThis = args->converter; 1248 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); 1249 1250 realSourceLimit = args->sourceLimit; 1251 while (args->source < realSourceLimit) { 1252 if(myData->key == 0) { /* are we in the middle of an escape sequence? 
*/ 1253 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 1254 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); 1255 1256 if(args->source < mySourceLimit) { 1257 if(myData->currentConverter==NULL) { 1258 myData->currentConverter = ucnv_open("ASCII",err); 1259 if(U_FAILURE(*err)){ 1260 return; 1261 } 1262 1263 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 1264 saveThis->mode = UCNV_SO; 1265 } 1266 1267 /* convert to before the ESC or until the end of the buffer */ 1268 myData->isFirstBuffer=FALSE; 1269 sourceStart = args->source; 1270 myTargetStart = args->target; 1271 args->converter = myData->currentConverter; 1272 ucnv_toUnicode(args->converter, 1273 &args->target, 1274 args->targetLimit, 1275 &args->source, 1276 mySourceLimit, 1277 args->offsets, 1278 (UBool)(args->flush && mySourceLimit == realSourceLimit), 1279 err); 1280 args->converter = saveThis; 1281 1282 if (*err == U_BUFFER_OVERFLOW_ERROR) { 1283 /* move the overflow buffer */ 1284 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; 1285 myData->currentConverter->UCharErrorBufferLength = 0; 1286 if(length > 0) { 1287 uprv_memcpy(saveThis->UCharErrorBuffer, 1288 myData->currentConverter->UCharErrorBuffer, 1289 length*U_SIZEOF_UCHAR); 1290 } 1291 return; 1292 } 1293 1294 /* 1295 * At least one of: 1296 * -Error while converting 1297 * -Done with entire buffer 1298 * -Need to write offsets or update the current offset 1299 * (leave that up to the code in ucnv.c) 1300 * 1301 * or else we just stopped at an ESC byte and continue with changeState_2022() 1302 */ 1303 if (U_FAILURE(*err) || 1304 (args->source == realSourceLimit) || 1305 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || 1306 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) 1307 ) { 1308 /* copy partial or error input for truncated detection and error 
handling */ 1309 if(U_FAILURE(*err)) { 1310 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; 1311 if(length > 0) { 1312 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); 1313 } 1314 } else { 1315 length = saveThis->toULength = myData->currentConverter->toULength; 1316 if(length > 0) { 1317 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); 1318 if(args->source < mySourceLimit) { 1319 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ 1320 } 1321 } 1322 } 1323 return; 1324 } 1325 } 1326 } 1327 1328 sourceStart = args->source; 1329 changeState_2022(args->converter, 1330 &(args->source), 1331 realSourceLimit, 1332 ISO_2022, 1333 err); 1334 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { 1335 /* let the ucnv.c code update its current offset */ 1336 return; 1337 } 1338 } 1339 } 1340 1341 #endif 1342 1343 /* 1344 * To Unicode Callback helper function 1345 */ 1346 static void 1347 toUnicodeCallback(UConverter *cnv, 1348 const uint32_t sourceChar, const uint32_t targetUniChar, 1349 UErrorCode* err){ 1350 if(sourceChar>0xff){ 1351 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); 1352 cnv->toUBytes[1] = (uint8_t)sourceChar; 1353 cnv->toULength = 2; 1354 } 1355 else{ 1356 cnv->toUBytes[0] =(char) sourceChar; 1357 cnv->toULength = 1; 1358 } 1359 1360 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ 1361 *err = U_INVALID_CHAR_FOUND; 1362 } 1363 else{ 1364 *err = U_ILLEGAL_CHAR_FOUND; 1365 } 1366 } 1367 1368 /**************************************ISO-2022-JP*************************************************/ 1369 1370 /************************************** IMPORTANT ************************************************** 1371 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and 1372 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). 
* The converter iterates over each Unicode codepoint
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
* processed one char at a time it would make sense to reduce the extra processing a canned converter
* would do as far as possible.
*
* If the implementation of these macros or structure of sharedData struct change in the future, make
* sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
1398 * (vi) Supported encodings: 1399 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 1400 * 1401 * source : RFC-1554 1402 * 1403 * JISX201, JISX208,JISX212 : new .cnv data files created 1404 * KSC5601 : alias to ibm-949 mapping table 1405 * GB2312 : alias to ibm-1386 mapping table 1406 * ISO-8859-1 : Algorithmic implemented as LATIN1 case 1407 * ISO-8859-7 : alisas to ibm-9409 mapping table 1408 */ 1409 1410 /* preference order of JP charsets */ 1411 static const StateEnum jpCharsetPref[]={ 1412 ASCII, 1413 JISX201, 1414 ISO8859_1, 1415 ISO8859_7, 1416 JISX208, 1417 JISX212, 1418 GB2312, 1419 KSC5601, 1420 HWKANA_7BIT 1421 }; 1422 1423 /* 1424 * The escape sequences must be in order of the enum constants like JISX201 = 3, 1425 * not in order of jpCharsetPref[]! 1426 */ 1427 static const char escSeqChars[][6] ={ 1428 "\x1B\x28\x42", /* <ESC>(B ASCII */ 1429 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ 1430 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ 1431 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ 1432 "\x1B\x24\x42", /* <ESC>$B JISX-208 */ 1433 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ 1434 "\x1B\x24\x41", /* <ESC>$A GB2312 */ 1435 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ 1436 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ 1437 1438 }; 1439 static const int8_t escSeqCharsLen[] ={ 1440 3, /* length of <ESC>(B ASCII */ 1441 3, /* length of <ESC>.A ISO-8859-1 */ 1442 3, /* length of <ESC>.F ISO-8859-7 */ 1443 3, /* length of <ESC>(J JISX-201 */ 1444 3, /* length of <ESC>$B JISX-208 */ 1445 4, /* length of <ESC>$(D JISX-212 */ 1446 3, /* length of <ESC>$A GB2312 */ 1447 4, /* length of <ESC>$(C KSC5601 */ 1448 3 /* length of <ESC>(I HWKANA_7BIT */ 1449 }; 1450 1451 /* 1452 * The iteration over various code pages works this way: 1453 * i) Get the currentState from myConverterData->currentState 1454 * ii) Check if the character is mapped to a valid character in the currentState 1455 * Yes -> a) set the initIterState to currentState 1456 * b) remain in this 
state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state increment the current state check if the current state
*      is equal to the initIteration state
*      Yes ->  A character that cannot be represented in any of the supported encodings
*       break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/

/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from ASCII: 5C maps to U+00A5 and 7E maps to U+203E.
 * Every other value is returned unchanged; this function does not range-check,
 * so the caller must pass a value in 00..7F.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values
 * are used for U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}

/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
1501 */ 1502 static inline uint32_t 1503 _2022FromSJIS(uint32_t value) { 1504 uint8_t trail; 1505 1506 if(value > 0xEFFC) { 1507 return 0; /* beyond JIS X 0208 */ 1508 } 1509 1510 trail = (uint8_t)value; 1511 1512 value &= 0xff00; /* lead byte */ 1513 if(value <= 0x9f00) { 1514 value -= 0x7000; 1515 } else /* 0xe000 <= value <= 0xef00 */ { 1516 value -= 0xb000; 1517 } 1518 value <<= 1; 1519 1520 if(trail <= 0x9e) { 1521 value -= 0x100; 1522 if(trail <= 0x7e) { 1523 value |= trail - 0x1f; 1524 } else { 1525 value |= trail - 0x20; 1526 } 1527 } else /* trail <= 0xfc */ { 1528 value |= trail - 0x7e; 1529 } 1530 return value; 1531 } 1532 1533 /* 1534 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1535 * If either byte is outside 21..7E make sure that the result is not valid 1536 * for Shift-JIS so that the converter catches it. 1537 * Some invalid byte values already turn into equally invalid Shift-JIS 1538 * byte values and need not be tested explicitly. 1539 */ 1540 static inline void 1541 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1542 if(c1&1) { 1543 ++c1; 1544 if(c2 <= 0x5f) { 1545 c2 += 0x1f; 1546 } else if(c2 <= 0x7e) { 1547 c2 += 0x20; 1548 } else { 1549 c2 = 0; /* invalid */ 1550 } 1551 } else { 1552 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { 1553 c2 += 0x7e; 1554 } else { 1555 c2 = 0; /* invalid */ 1556 } 1557 } 1558 c1 >>= 1; 1559 if(c1 <= 0x2f) { 1560 c1 += 0x70; 1561 } else if(c1 <= 0x3f) { 1562 c1 += 0xb0; 1563 } else { 1564 c1 = 0; /* invalid */ 1565 } 1566 bytes[0] = (char)c1; 1567 bytes[1] = (char)c2; 1568 } 1569 1570 /* 1571 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1572 * Katakana. 1573 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1574 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1575 * These were the only fallbacks in ICU's jisx-208.ucm file. 
1576 */ 1577 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1578 0x2123, /* U+FF61 */ 1579 0x2156, 1580 0x2157, 1581 0x2122, 1582 0x2126, 1583 0x2572, 1584 0x2521, 1585 0x2523, 1586 0x2525, 1587 0x2527, 1588 0x2529, 1589 0x2563, 1590 0x2565, 1591 0x2567, 1592 0x2543, 1593 0x213C, /* U+FF70 */ 1594 0x2522, 1595 0x2524, 1596 0x2526, 1597 0x2528, 1598 0x252A, 1599 0x252B, 1600 0x252D, 1601 0x252F, 1602 0x2531, 1603 0x2533, 1604 0x2535, 1605 0x2537, 1606 0x2539, 1607 0x253B, 1608 0x253D, 1609 0x253F, /* U+FF80 */ 1610 0x2541, 1611 0x2544, 1612 0x2546, 1613 0x2548, 1614 0x254A, 1615 0x254B, 1616 0x254C, 1617 0x254D, 1618 0x254E, 1619 0x254F, 1620 0x2552, 1621 0x2555, 1622 0x2558, 1623 0x255B, 1624 0x255E, 1625 0x255F, /* U+FF90 */ 1626 0x2560, 1627 0x2561, 1628 0x2562, 1629 0x2564, 1630 0x2566, 1631 0x2568, 1632 0x2569, 1633 0x256A, 1634 0x256B, 1635 0x256C, 1636 0x256D, 1637 0x256F, 1638 0x2573, 1639 0x212B, 1640 0x212C /* U+FF9F */ 1641 }; 1642 1643 static void 1644 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1645 UConverter *cnv = args->converter; 1646 UConverterDataISO2022 *converterData; 1647 ISO2022State *pFromU2022State; 1648 uint8_t *target = (uint8_t *) args->target; 1649 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1650 const UChar* source = args->source; 1651 const UChar* sourceLimit = args->sourceLimit; 1652 int32_t* offsets = args->offsets; 1653 UChar32 sourceChar; 1654 char buffer[8]; 1655 int32_t len, outLen; 1656 int8_t choices[10]; 1657 int32_t choiceCount; 1658 uint32_t targetValue = 0; 1659 UBool useFallback; 1660 1661 int32_t i; 1662 int8_t cs, g; 1663 1664 /* set up the state */ 1665 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1666 pFromU2022State = &converterData->fromU2022State; 1667 1668 choiceCount = 0; 1669 1670 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1671 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1672 goto getTrail; 1673 } 1674 1675 while(source < sourceLimit) { 1676 if(target < targetLimit) { 1677 1678 sourceChar = *(source++); 1679 /*check if the char is a First surrogate*/ 1680 if(U16_IS_SURROGATE(sourceChar)) { 1681 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1682 getTrail: 1683 /*look ahead to find the trail surrogate*/ 1684 if(source < sourceLimit) { 1685 /* test the following code unit */ 1686 UChar trail=(UChar) *source; 1687 if(U16_IS_TRAIL(trail)) { 1688 source++; 1689 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1690 cnv->fromUChar32=0x00; 1691 /* convert this supplementary code point */ 1692 /* exit this condition tree */ 1693 } else { 1694 /* this is an unmatched lead code unit (1st surrogate) */ 1695 /* callback(illegal) */ 1696 *err=U_ILLEGAL_CHAR_FOUND; 1697 cnv->fromUChar32=sourceChar; 1698 break; 1699 } 1700 } else { 1701 /* no more input */ 1702 cnv->fromUChar32=sourceChar; 1703 break; 1704 } 1705 } else { 1706 /* this is an unmatched trail code unit (2nd surrogate) */ 1707 /* callback(illegal) */ 1708 *err=U_ILLEGAL_CHAR_FOUND; 1709 cnv->fromUChar32=sourceChar; 1710 break; 1711 } 1712 } 1713 1714 /* do not convert SO/SI/ESC */ 1715 if(IS_2022_CONTROL(sourceChar)) { 1716 /* callback(illegal) */ 1717 *err=U_ILLEGAL_CHAR_FOUND; 1718 cnv->fromUChar32=sourceChar; 1719 break; 1720 } 1721 1722 /* do the conversion */ 1723 1724 if(choiceCount == 0) { 1725 uint16_t csm; 1726 1727 /* 1728 * The csm variable keeps track of which charsets are allowed 1729 * and not used yet while building the choices[]. 1730 */ 1731 csm = jpCharsetMasks[converterData->version]; 1732 choiceCount = 0; 1733 1734 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1735 if(converterData->version == 3 || converterData->version == 4) { 1736 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1737 } 1738 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1739 csm &= ~CSM(HWKANA_7BIT); 1740 1741 /* try the current G0 charset */ 1742 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1743 csm &= ~CSM(cs); 1744 1745 /* try the current G2 charset */ 1746 if((cs = pFromU2022State->cs[2]) != 0) { 1747 choices[choiceCount++] = cs; 1748 csm &= ~CSM(cs); 1749 } 1750 1751 /* try all the other possible charsets */ 1752 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1753 cs = (int8_t)jpCharsetPref[i]; 1754 if(CSM(cs) & csm) { 1755 choices[choiceCount++] = cs; 1756 csm &= ~CSM(cs); 1757 } 1758 } 1759 } 1760 1761 cs = g = 0; 1762 /* 1763 * len==0: no mapping found yet 1764 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1765 * len>0: found a roundtrip result, done 1766 */ 1767 len = 0; 1768 /* 1769 * We will turn off useFallback after finding a fallback, 1770 * but we still get fallbacks from PUA code points as usual. 1771 * Therefore, we will also need to check that we don't overwrite 1772 * an early fallback with a later one. 1773 */ 1774 useFallback = cnv->useFallback; 1775 1776 for(i = 0; i < choiceCount && len <= 0; ++i) { 1777 uint32_t value; 1778 int32_t len2; 1779 int8_t cs0 = choices[i]; 1780 switch(cs0) { 1781 case ASCII: 1782 if(sourceChar <= 0x7f) { 1783 targetValue = (uint32_t)sourceChar; 1784 len = 1; 1785 cs = cs0; 1786 g = 0; 1787 } 1788 break; 1789 case ISO8859_1: 1790 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1791 targetValue = (uint32_t)sourceChar - 0x80; 1792 len = 1; 1793 cs = cs0; 1794 g = 2; 1795 } 1796 break; 1797 case HWKANA_7BIT: 1798 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1799 if(converterData->version==3) { 1800 /* JIS7: use G1 (SO) */ 1801 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1802 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1803 len = 1; 1804 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1805 g = 1; 1806 } else if(converterData->version==4) { 1807 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1808 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1809 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1810 len = 1; 1811 1812 cs = pFromU2022State->cs[0]; 1813 if(IS_JP_DBCS(cs)) { 1814 /* switch from a DBCS charset to JISX201 */ 1815 cs = (int8_t)JISX201; 1816 } 1817 /* else stay in the current G0 charset */ 1818 g = 0; 1819 } 1820 /* else do not use HWKANA_7BIT with other versions */ 1821 } 1822 break; 1823 case JISX201: 1824 /* G0 SBCS */ 1825 value = jisx201FromU(sourceChar); 1826 if(value <= 0x7f) { 1827 targetValue = value; 1828 len = 1; 1829 cs = cs0; 1830 g = 0; 1831 useFallback = FALSE; 1832 } 1833 break; 1834 case JISX208: 1835 /* G0 DBCS from Shift-JIS table */ 1836 len2 = MBCS_FROM_UCHAR32_ISO2022( 1837 converterData->myConverterArray[cs0], 1838 sourceChar, &value, 1839 useFallback, MBCS_OUTPUT_2); 1840 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1841 value = _2022FromSJIS(value); 1842 if(value != 0) { 1843 targetValue = value; 1844 len = len2; 1845 cs = cs0; 1846 g = 0; 1847 useFallback = FALSE; 1848 } 1849 } else if(len == 0 && useFallback && 1850 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1851 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1852 len = -2; 1853 cs = cs0; 1854 g = 0; 1855 useFallback = FALSE; 1856 } 1857 break; 1858 case ISO8859_7: 1859 /* G0 SBCS forced to 7-bit output */ 1860 len2 = MBCS_SINGLE_FROM_UCHAR32( 1861 converterData->myConverterArray[cs0], 1862 sourceChar, &value, 1863 useFallback); 1864 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1865 targetValue = value - 0x80; 1866 len = len2; 1867 cs = 
cs0; 1868 g = 2; 1869 useFallback = FALSE; 1870 } 1871 break; 1872 default: 1873 /* G0 DBCS */ 1874 len2 = MBCS_FROM_UCHAR32_ISO2022( 1875 converterData->myConverterArray[cs0], 1876 sourceChar, &value, 1877 useFallback, MBCS_OUTPUT_2); 1878 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1879 if(cs0 == KSC5601) { 1880 /* 1881 * Check for valid bytes for the encoding scheme. 1882 * This is necessary because the sub-converter (windows-949) 1883 * has a broader encoding scheme than is valid for 2022. 1884 */ 1885 value = _2022FromGR94DBCS(value); 1886 if(value == 0) { 1887 break; 1888 } 1889 } 1890 targetValue = value; 1891 len = len2; 1892 cs = cs0; 1893 g = 0; 1894 useFallback = FALSE; 1895 } 1896 break; 1897 } 1898 } 1899 1900 if(len != 0) { 1901 if(len < 0) { 1902 len = -len; /* fallback */ 1903 } 1904 outLen = 0; /* count output bytes */ 1905 1906 /* write SI if necessary (only for JIS7) */ 1907 if(pFromU2022State->g == 1 && g == 0) { 1908 buffer[outLen++] = UCNV_SI; 1909 pFromU2022State->g = 0; 1910 } 1911 1912 /* write the designation sequence if necessary */ 1913 if(cs != pFromU2022State->cs[g]) { 1914 int32_t escLen = escSeqCharsLen[cs]; 1915 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1916 outLen += escLen; 1917 pFromU2022State->cs[g] = cs; 1918 1919 /* invalidate the choices[] */ 1920 choiceCount = 0; 1921 } 1922 1923 /* write the shift sequence if necessary */ 1924 if(g != pFromU2022State->g) { 1925 switch(g) { 1926 /* case 0 handled before writing escapes */ 1927 case 1: 1928 buffer[outLen++] = UCNV_SO; 1929 pFromU2022State->g = 1; 1930 break; 1931 default: /* case 2 */ 1932 buffer[outLen++] = 0x1b; 1933 buffer[outLen++] = 0x4e; 1934 break; 1935 /* no case 3: no SS3 in ISO-2022-JP-x */ 1936 } 1937 } 1938 1939 /* write the output bytes */ 1940 if(len == 1) { 1941 buffer[outLen++] = (char)targetValue; 1942 } else /* len == 2 */ { 1943 buffer[outLen++] = (char)(targetValue >> 8); 1944 buffer[outLen++] = 
(char)targetValue; 1945 } 1946 } else { 1947 /* 1948 * if we cannot find the character after checking all codepages 1949 * then this is an error 1950 */ 1951 *err = U_INVALID_CHAR_FOUND; 1952 cnv->fromUChar32=sourceChar; 1953 break; 1954 } 1955 1956 if(sourceChar == CR || sourceChar == LF) { 1957 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1958 pFromU2022State->cs[2] = 0; 1959 choiceCount = 0; 1960 } 1961 1962 /* output outLen>0 bytes in buffer[] */ 1963 if(outLen == 1) { 1964 *target++ = buffer[0]; 1965 if(offsets) { 1966 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1967 } 1968 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1969 *target++ = buffer[0]; 1970 *target++ = buffer[1]; 1971 if(offsets) { 1972 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1973 *offsets++ = sourceIndex; 1974 *offsets++ = sourceIndex; 1975 } 1976 } else { 1977 fromUWriteUInt8( 1978 cnv, 1979 buffer, outLen, 1980 &target, (const char *)targetLimit, 1981 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1982 err); 1983 if(U_FAILURE(*err)) { 1984 break; 1985 } 1986 } 1987 } /* end if(myTargetIndex<myTargetLength) */ 1988 else{ 1989 *err =U_BUFFER_OVERFLOW_ERROR; 1990 break; 1991 } 1992 1993 }/* end while(mySourceIndex<mySourceLength) */ 1994 1995 /* 1996 * the end of the input stream and detection of truncated input 1997 * are handled by the framework, but for ISO-2022-JP conversion 1998 * we need to be in ASCII mode at the very end 1999 * 2000 * conditions: 2001 * successful 2002 * in SO mode or not in ASCII mode 2003 * end of input and no truncated input 2004 */ 2005 if( U_SUCCESS(*err) && 2006 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2007 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2008 ) { 2009 int32_t sourceIndex; 2010 2011 outLen = 0; 2012 2013 if(pFromU2022State->g != 0) { 2014 buffer[outLen++] = UCNV_SI; 
2015 pFromU2022State->g = 0; 2016 } 2017 2018 if(pFromU2022State->cs[0] != ASCII) { 2019 int32_t escLen = escSeqCharsLen[ASCII]; 2020 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); 2021 outLen += escLen; 2022 pFromU2022State->cs[0] = (int8_t)ASCII; 2023 } 2024 2025 /* get the source index of the last input character */ 2026 /* 2027 * TODO this would be simpler and more reliable if we used a pair 2028 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2029 * so that we could simply use the prevSourceIndex here; 2030 * this code gives an incorrect result for the rare case of an unmatched 2031 * trail surrogate that is alone in the last buffer of the text stream 2032 */ 2033 sourceIndex=(int32_t)(source-args->source); 2034 if(sourceIndex>0) { 2035 --sourceIndex; 2036 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2037 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2038 ) { 2039 --sourceIndex; 2040 } 2041 } else { 2042 sourceIndex=-1; 2043 } 2044 2045 fromUWriteUInt8( 2046 cnv, 2047 buffer, outLen, 2048 &target, (const char *)targetLimit, 2049 &offsets, sourceIndex, 2050 err); 2051 } 2052 2053 /*save the state and return */ 2054 args->source = source; 2055 args->target = (char*)target; 2056 } 2057 2058 /*************** to unicode *******************/ 2059 2060 static void 2061 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2062 UErrorCode* err){ 2063 char tempBuf[2]; 2064 const char *mySource = (char *) args->source; 2065 UChar *myTarget = args->target; 2066 const char *mySourceLimit = args->sourceLimit; 2067 uint32_t targetUniChar = 0x0000; 2068 uint32_t mySourceChar = 0x0000; 2069 uint32_t tmpSourceChar = 0x0000; 2070 UConverterDataISO2022* myData; 2071 ISO2022State *pToU2022State; 2072 StateEnum cs; 2073 2074 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2075 pToU2022State = &myData->toU2022State; 2076 2077 if(myData->key != 0) { 2078 /* continue with a partial escape sequence */ 2079 goto 
escape; 2080 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2081 /* continue with a partial double-byte character */ 2082 mySourceChar = args->converter->toUBytes[0]; 2083 args->converter->toULength = 0; 2084 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 2085 targetUniChar = missingCharMarker; 2086 goto getTrailByte; 2087 } 2088 2089 while(mySource < mySourceLimit){ 2090 2091 targetUniChar =missingCharMarker; 2092 2093 if(myTarget < args->targetLimit){ 2094 2095 mySourceChar= (unsigned char) *mySource++; 2096 2097 switch(mySourceChar) { 2098 case UCNV_SI: 2099 if(myData->version==3) { 2100 pToU2022State->g=0; 2101 continue; 2102 } else { 2103 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2104 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 2105 break; 2106 } 2107 2108 case UCNV_SO: 2109 if(myData->version==3) { 2110 /* JIS7: switch to G1 half-width Katakana */ 2111 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; 2112 pToU2022State->g=1; 2113 continue; 2114 } else { 2115 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2116 myData->isEmptySegment = FALSE; /* reset this, we have a different error */ 2117 break; 2118 } 2119 2120 case ESC_2022: 2121 mySource--; 2122 escape: 2123 { 2124 const char * mySourceBefore = mySource; 2125 int8_t toULengthBefore = args->converter->toULength; 2126 2127 changeState_2022(args->converter,&(mySource), 2128 mySourceLimit, ISO_2022_JP,err); 2129 2130 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ 2131 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 2132 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2133 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2134 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 2135 } 2136 } 2137 2138 /* invalid or illegal escape sequence */ 2139 if(U_FAILURE(*err)){ 2140 
args->target = myTarget; 2141 args->source = mySource; 2142 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 2143 return; 2144 } 2145 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ 2146 if(myData->key==0) { 2147 myData->isEmptySegment = TRUE; 2148 } 2149 continue; 2150 2151 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ 2152 2153 case CR: 2154 /*falls through*/ 2155 case LF: 2156 /* automatically reset to single-byte mode */ 2157 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { 2158 pToU2022State->cs[0] = (int8_t)ASCII; 2159 } 2160 pToU2022State->cs[2] = 0; 2161 pToU2022State->g = 0; 2162 /* falls through */ 2163 default: 2164 /* convert one or two bytes */ 2165 myData->isEmptySegment = FALSE; 2166 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 2167 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && 2168 !IS_JP_DBCS(cs) 2169 ) { 2170 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ 2171 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); 2172 2173 /* return from a single-shift state to the previous one */ 2174 if(pToU2022State->g >= 2) { 2175 pToU2022State->g=pToU2022State->prevG; 2176 } 2177 } else switch(cs) { 2178 case ASCII: 2179 if(mySourceChar <= 0x7f) { 2180 targetUniChar = mySourceChar; 2181 } 2182 break; 2183 case ISO8859_1: 2184 if(mySourceChar <= 0x7f) { 2185 targetUniChar = mySourceChar + 0x80; 2186 } 2187 /* return from a single-shift state to the previous one */ 2188 pToU2022State->g=pToU2022State->prevG; 2189 break; 2190 case ISO8859_7: 2191 if(mySourceChar <= 0x7f) { 2192 /* convert mySourceChar+0x80 to use a normal 8-bit table */ 2193 targetUniChar = 2194 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( 2195 myData->myConverterArray[cs], 2196 mySourceChar + 0x80); 2197 } 2198 /* return from a single-shift state to the previous one */ 2199 pToU2022State->g=pToU2022State->prevG; 2200 break; 
2201 case JISX201: 2202 if(mySourceChar <= 0x7f) { 2203 targetUniChar = jisx201ToU(mySourceChar); 2204 } 2205 break; 2206 case HWKANA_7BIT: 2207 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { 2208 /* 7-bit halfwidth Katakana */ 2209 targetUniChar = mySourceChar + (HWKANA_START - 0x21); 2210 } 2211 break; 2212 default: 2213 /* G0 DBCS */ 2214 if(mySource < mySourceLimit) { 2215 int leadIsOk, trailIsOk; 2216 uint8_t trailByte; 2217 getTrailByte: 2218 trailByte = (uint8_t)*mySource; 2219 /* 2220 * Ticket 5691: consistent illegal sequences: 2221 * - We include at least the first byte in the illegal sequence. 2222 * - If any of the non-initial bytes could be the start of a character, 2223 * we stop the illegal sequence before the first one of those. 2224 * 2225 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2226 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2227 * Otherwise we convert or report the pair of bytes. 2228 */ 2229 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2230 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2231 if (leadIsOk && trailIsOk) { 2232 ++mySource; 2233 tmpSourceChar = (mySourceChar << 8) | trailByte; 2234 if(cs == JISX208) { 2235 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); 2236 mySourceChar = tmpSourceChar; 2237 } else { 2238 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ 2239 mySourceChar = tmpSourceChar; 2240 if (cs == KSC5601) { 2241 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ 2242 } 2243 tempBuf[0] = (char)(tmpSourceChar >> 8); 2244 tempBuf[1] = (char)(tmpSourceChar); 2245 } 2246 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); 2247 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2248 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2249 ++mySource; 2250 /* add another bit so that the code below writes 2 bytes in case of error */ 2251 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2252 } 2253 } else { 2254 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2255 args->converter->toULength = 1; 2256 goto endloop; 2257 } 2258 } /* End of inner switch */ 2259 break; 2260 } /* End of outer switch */ 2261 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 2262 if(args->offsets){ 2263 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2264 } 2265 *(myTarget++)=(UChar)targetUniChar; 2266 } 2267 else if(targetUniChar > missingCharMarker){ 2268 /* disassemble the surrogate pair and write to output*/ 2269 targetUniChar-=0x0010000; 2270 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 2271 if(args->offsets){ 2272 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2273 } 2274 ++myTarget; 2275 if(myTarget< args->targetLimit){ 2276 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 2277 if(args->offsets){ 2278 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 2279 } 2280 ++myTarget; 2281 }else{ 2282 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 2283 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 2284 } 2285 2286 } 2287 else{ 2288 /* Call the callback function*/ 2289 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2290 break; 2291 } 2292 } 2293 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ 2294 *err =U_BUFFER_OVERFLOW_ERROR; 2295 break; 2296 } 2297 } 2298 endloop: 2299 args->target = myTarget; 2300 args->source = mySource; 2301 } 2302 2303 2304 /*************************************************************** 2305 * Rules for ISO-2022-KR encoding 2306 * i) The KSC5601 designator sequence should appear only once in a file, 2307 * at the begining of a line before any KSC5601 characters. This usually 2308 * means that it appears by itself on the first line of the file 2309 * ii) There are only 2 shifting sequences SO to shift into double byte mode 2310 * and SI to shift into single byte mode 2311 */ 2312 static void 2313 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2314 2315 UConverter* saveConv = args->converter; 2316 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; 2317 args->converter=myConverterData->currentConverter; 2318 2319 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; 2320 ucnv_MBCSFromUnicodeWithOffsets(args,err); 2321 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 2322 2323 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2324 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 2325 uprv_memcpy( 2326 saveConv->charErrorBuffer, 2327 myConverterData->currentConverter->charErrorBuffer, 2328 myConverterData->currentConverter->charErrorBufferLength); 2329 } 2330 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 2331 
myConverterData->currentConverter->charErrorBufferLength = 0; 2332 } 2333 args->converter=saveConv; 2334 } 2335 2336 static void 2337 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2338 2339 const UChar *source = args->source; 2340 const UChar *sourceLimit = args->sourceLimit; 2341 unsigned char *target = (unsigned char *) args->target; 2342 unsigned char *targetLimit = (unsigned char *) args->targetLimit; 2343 int32_t* offsets = args->offsets; 2344 uint32_t targetByteUnit = 0x0000; 2345 UChar32 sourceChar = 0x0000; 2346 UBool isTargetByteDBCS; 2347 UBool oldIsTargetByteDBCS; 2348 UConverterDataISO2022 *converterData; 2349 UConverterSharedData* sharedData; 2350 UBool useFallback; 2351 int32_t length =0; 2352 2353 converterData=(UConverterDataISO2022*)args->converter->extraInfo; 2354 /* if the version is 1 then the user is requesting 2355 * conversion with ibm-25546 pass the arguments to 2356 * MBCS converter and return 2357 */ 2358 if(converterData->version==1){ 2359 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2360 return; 2361 } 2362 2363 /* initialize data */ 2364 sharedData = converterData->currentConverter->sharedData; 2365 useFallback = args->converter->useFallback; 2366 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 2367 oldIsTargetByteDBCS = isTargetByteDBCS; 2368 2369 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; 2370 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 2371 goto getTrail; 2372 } 2373 while(source < sourceLimit){ 2374 2375 targetByteUnit = missingCharMarker; 2376 2377 if(target < (unsigned char*) args->targetLimit){ 2378 sourceChar = *source++; 2379 2380 /* do not convert SO/SI/ESC */ 2381 if(IS_2022_CONTROL(sourceChar)) { 2382 /* callback(illegal) */ 2383 *err=U_ILLEGAL_CHAR_FOUND; 2384 args->converter->fromUChar32=sourceChar; 2385 break; 2386 } 2387 2388 length = 
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 2389 if(length < 0) { 2390 length = -length; /* fallback */ 2391 } 2392 /* only DBCS or SBCS characters are expected*/ 2393 /* DB characters with high bit set to 1 are expected */ 2394 if( length > 2 || length==0 || 2395 (length == 1 && targetByteUnit > 0x7f) || 2396 (length == 2 && 2397 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 2398 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 2399 ) { 2400 targetByteUnit=missingCharMarker; 2401 } 2402 if (targetByteUnit != missingCharMarker){ 2403 2404 oldIsTargetByteDBCS = isTargetByteDBCS; 2405 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 2406 /* append the shift sequence */ 2407 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 2408 2409 if (isTargetByteDBCS) 2410 *target++ = UCNV_SO; 2411 else 2412 *target++ = UCNV_SI; 2413 if(offsets) 2414 *(offsets++) = (int32_t)(source - args->source-1); 2415 } 2416 /* write the targetUniChar to target */ 2417 if(targetByteUnit <= 0x00FF){ 2418 if( target < targetLimit){ 2419 *(target++) = (unsigned char) targetByteUnit; 2420 if(offsets){ 2421 *(offsets++) = (int32_t)(source - args->source-1); 2422 } 2423 2424 }else{ 2425 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 2426 *err = U_BUFFER_OVERFLOW_ERROR; 2427 } 2428 }else{ 2429 if(target < targetLimit){ 2430 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 2431 if(offsets){ 2432 *(offsets++) = (int32_t)(source - args->source-1); 2433 } 2434 if(target < targetLimit){ 2435 *(target++) =(unsigned char) (targetByteUnit -0x80); 2436 if(offsets){ 2437 *(offsets++) = (int32_t)(source - args->source-1); 2438 } 2439 }else{ 2440 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 2441 *err = U_BUFFER_OVERFLOW_ERROR; 2442 } 2443 }else{ 2444 
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 2445 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 2446 *err = U_BUFFER_OVERFLOW_ERROR; 2447 } 2448 } 2449 2450 } 2451 else{ 2452 /* oops.. the code point is unassingned 2453 * set the error and reason 2454 */ 2455 2456 /*check if the char is a First surrogate*/ 2457 if(U16_IS_SURROGATE(sourceChar)) { 2458 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2459 getTrail: 2460 /*look ahead to find the trail surrogate*/ 2461 if(source < sourceLimit) { 2462 /* test the following code unit */ 2463 UChar trail=(UChar) *source; 2464 if(U16_IS_TRAIL(trail)) { 2465 source++; 2466 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2467 *err = U_INVALID_CHAR_FOUND; 2468 /* convert this surrogate code point */ 2469 /* exit this condition tree */ 2470 } else { 2471 /* this is an unmatched lead code unit (1st surrogate) */ 2472 /* callback(illegal) */ 2473 *err=U_ILLEGAL_CHAR_FOUND; 2474 } 2475 } else { 2476 /* no more input */ 2477 *err = U_ZERO_ERROR; 2478 } 2479 } else { 2480 /* this is an unmatched trail code unit (2nd surrogate) */ 2481 /* callback(illegal) */ 2482 *err=U_ILLEGAL_CHAR_FOUND; 2483 } 2484 } else { 2485 /* callback(unassigned) for a BMP code point */ 2486 *err = U_INVALID_CHAR_FOUND; 2487 } 2488 2489 args->converter->fromUChar32=sourceChar; 2490 break; 2491 } 2492 } /* end if(myTargetIndex<myTargetLength) */ 2493 else{ 2494 *err =U_BUFFER_OVERFLOW_ERROR; 2495 break; 2496 } 2497 2498 }/* end while(mySourceIndex<mySourceLength) */ 2499 2500 /* 2501 * the end of the input stream and detection of truncated input 2502 * are handled by the framework, but for ISO-2022-KR conversion 2503 * we need to be in ASCII mode at the very end 2504 * 2505 * conditions: 2506 * successful 2507 * not in ASCII mode 2508 * end of input and no truncated input 2509 */ 2510 if( U_SUCCESS(*err) && 2511 
isTargetByteDBCS && 2512 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2513 ) { 2514 int32_t sourceIndex; 2515 2516 /* we are switching to ASCII */ 2517 isTargetByteDBCS=FALSE; 2518 2519 /* get the source index of the last input character */ 2520 /* 2521 * TODO this would be simpler and more reliable if we used a pair 2522 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2523 * so that we could simply use the prevSourceIndex here; 2524 * this code gives an incorrect result for the rare case of an unmatched 2525 * trail surrogate that is alone in the last buffer of the text stream 2526 */ 2527 sourceIndex=(int32_t)(source-args->source); 2528 if(sourceIndex>0) { 2529 --sourceIndex; 2530 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2531 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2532 ) { 2533 --sourceIndex; 2534 } 2535 } else { 2536 sourceIndex=-1; 2537 } 2538 2539 fromUWriteUInt8( 2540 args->converter, 2541 SHIFT_IN_STR, 1, 2542 &target, (const char *)targetLimit, 2543 &offsets, sourceIndex, 2544 err); 2545 } 2546 2547 /*save the state and return */ 2548 args->source = source; 2549 args->target = (char*)target; 2550 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2551 } 2552 2553 /************************ To Unicode ***************************************/ 2554 2555 static void 2556 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2557 UErrorCode* err){ 2558 char const* sourceStart; 2559 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2560 2561 UConverterToUnicodeArgs subArgs; 2562 int32_t minArgsSize; 2563 2564 /* set up the subconverter arguments */ 2565 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2566 minArgsSize = args->size; 2567 } else { 2568 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2569 } 2570 2571 uprv_memcpy(&subArgs, args, minArgsSize); 2572 subArgs.size = (uint16_t)minArgsSize; 2573 subArgs.converter = 
myData->currentConverter; 2574 2575 /* remember the original start of the input for offsets */ 2576 sourceStart = args->source; 2577 2578 if(myData->key != 0) { 2579 /* continue with a partial escape sequence */ 2580 goto escape; 2581 } 2582 2583 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2584 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2585 subArgs.source = args->source; 2586 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2587 if(subArgs.source != subArgs.sourceLimit) { 2588 /* 2589 * get the current partial byte sequence 2590 * 2591 * it needs to be moved between the public and the subconverter 2592 * so that the conversion framework, which only sees the public 2593 * converter, can handle truncated and illegal input etc. 2594 */ 2595 if(args->converter->toULength > 0) { 2596 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2597 } 2598 subArgs.converter->toULength = args->converter->toULength; 2599 2600 /* 2601 * Convert up to the end of the input, or to before the next escape character. 2602 * Does not handle conversion extensions because the preToU[] state etc. 2603 * is not copied. 
2604 */ 2605 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2606 2607 if(args->offsets != NULL && sourceStart != args->source) { 2608 /* update offsets to base them on the actual start of the input */ 2609 int32_t *offsets = args->offsets; 2610 UChar *target = args->target; 2611 int32_t delta = (int32_t)(args->source - sourceStart); 2612 while(target < subArgs.target) { 2613 if(*offsets >= 0) { 2614 *offsets += delta; 2615 } 2616 ++offsets; 2617 ++target; 2618 } 2619 } 2620 args->source = subArgs.source; 2621 args->target = subArgs.target; 2622 args->offsets = subArgs.offsets; 2623 2624 /* copy input/error/overflow buffers */ 2625 if(subArgs.converter->toULength > 0) { 2626 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2627 } 2628 args->converter->toULength = subArgs.converter->toULength; 2629 2630 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2631 if(subArgs.converter->UCharErrorBufferLength > 0) { 2632 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2633 subArgs.converter->UCharErrorBufferLength); 2634 } 2635 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2636 subArgs.converter->UCharErrorBufferLength = 0; 2637 } 2638 } 2639 2640 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2641 return; 2642 } 2643 2644 escape: 2645 changeState_2022(args->converter, 2646 &(args->source), 2647 args->sourceLimit, 2648 ISO_2022_KR, 2649 err); 2650 } 2651 } 2652 2653 static void 2654 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2655 UErrorCode* err){ 2656 char tempBuf[2]; 2657 const char *mySource = ( char *) args->source; 2658 UChar *myTarget = args->target; 2659 const char *mySourceLimit = args->sourceLimit; 2660 UChar32 targetUniChar = 0x0000; 2661 UChar mySourceChar = 0x0000; 2662 UConverterDataISO2022* myData; 2663 UConverterSharedData* sharedData ; 2664 UBool useFallback; 2665 2666 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2667 if(myData->version==1){ 2668 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2669 return; 2670 } 2671 2672 /* initialize state */ 2673 sharedData = myData->currentConverter->sharedData; 2674 useFallback = args->converter->useFallback; 2675 2676 if(myData->key != 0) { 2677 /* continue with a partial escape sequence */ 2678 goto escape; 2679 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2680 /* continue with a partial double-byte character */ 2681 mySourceChar = args->converter->toUBytes[0]; 2682 args->converter->toULength = 0; 2683 goto getTrailByte; 2684 } 2685 2686 while(mySource< mySourceLimit){ 2687 2688 if(myTarget < args->targetLimit){ 2689 2690 mySourceChar= (unsigned char) *mySource++; 2691 2692 if(mySourceChar==UCNV_SI){ 2693 myData->toU2022State.g = 0; 2694 if (myData->isEmptySegment) { 2695 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2696 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2697 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2698 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2699 args->converter->toULength = 1; 2700 args->target = myTarget; 2701 args->source = mySource; 2702 return; 2703 } 2704 /*consume the source */ 2705 continue; 2706 }else if(mySourceChar==UCNV_SO){ 2707 myData->toU2022State.g = 1; 2708 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2709 /*consume the source */ 2710 continue; 2711 }else if(mySourceChar==ESC_2022){ 2712 mySource--; 2713 escape: 2714 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2715 changeState_2022(args->converter,&(mySource), 2716 mySourceLimit, ISO_2022_KR, err); 2717 if(U_FAILURE(*err)){ 2718 args->target = myTarget; 2719 args->source = mySource; 2720 return; 2721 } 2722 continue; 2723 } 2724 2725 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2726 if(myData->toU2022State.g == 1) { 2727 if(mySource < mySourceLimit) { 2728 int leadIsOk, trailIsOk; 2729 uint8_t trailByte; 2730 getTrailByte: 2731 targetUniChar = missingCharMarker; 2732 trailByte = (uint8_t)*mySource; 2733 /* 2734 * Ticket 5691: consistent illegal sequences: 2735 * - We include at least the first byte in the illegal sequence. 2736 * - If any of the non-initial bytes could be the start of a character, 2737 * we stop the illegal sequence before the first one of those. 2738 * 2739 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2740 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2741 * Otherwise we convert or report the pair of bytes. 2742 */ 2743 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2744 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2745 if (leadIsOk && trailIsOk) { 2746 ++mySource; 2747 tempBuf[0] = (char)(mySourceChar + 0x80); 2748 tempBuf[1] = (char)(trailByte + 0x80); 2749 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2750 mySourceChar = (mySourceChar << 8) | trailByte; 2751 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2752 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2753 ++mySource; 2754 /* add another bit so that the code below writes 2 bytes in case of error */ 2755 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2756 } 2757 } else { 2758 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2759 args->converter->toULength = 1; 2760 break; 2761 } 2762 } 2763 else if(mySourceChar <= 0x7f) { 2764 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2765 } else { 2766 targetUniChar = 0xffff; 2767 } 2768 if(targetUniChar < 0xfffe){ 2769 if(args->offsets) { 2770 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2771 } 2772 *(myTarget++)=(UChar)targetUniChar; 2773 } 2774 else { 2775 /* Call the callback function*/ 2776 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2777 break; 2778 } 2779 } 2780 else{ 2781 *err =U_BUFFER_OVERFLOW_ERROR; 2782 break; 2783 } 2784 } 2785 args->target = myTarget; 2786 args->source = mySource; 2787 } 2788 2789 /*************************** END ISO2022-KR *********************************/ 2790 2791 /*************************** ISO-2022-CN ********************************* 2792 * 2793 * Rules for ISO-2022-CN Encoding: 2794 * i) The designator sequence must appear once on a line before any instance 2795 * of character set it designates. 2796 * ii) If two lines contain characters from the same character set, both lines 2797 * must include the designator sequence. 2798 * iii) Once the designator sequence is known, a shifting sequence has to be found 2799 * to invoke the shifting 2800 * iv) All lines start in ASCII and end in ASCII. 2801 * v) Four shifting sequences are employed for this purpose: 2802 * 2803 * Sequcence ASCII Eq Charsets 2804 * ---------- ------- --------- 2805 * SI <SI> US-ASCII 2806 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2807 * SS2 <ESC>N CNS-11643-1992 Plane 2 2808 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2809 * 2810 * vi) 2811 * SOdesignator : ESC "$" ")" finalchar_for_SO 2812 * SS2designator : ESC "$" "*" finalchar_for_SS2 2813 * SS3designator : ESC "$" "+" finalchar_for_SS3 2814 * 2815 * ESC $ ) A Indicates the bytes following SO are Chinese 2816 * characters as defined in GB 2312-80, until 2817 * another SOdesignation appears 2818 * 2819 * 2820 * ESC $ ) E Indicates the bytes following SO are as defined 2821 * in ISO-IR-165 (for details, see section 2.1), 2822 * until another SOdesignation appears 2823 * 2824 * ESC $ ) G Indicates the bytes following SO are as defined 2825 * in CNS 11643-plane-1, until another 2826 * SOdesignation appears 2827 * 2828 * ESC $ * H Indicates the two 
bytes immediately following 2829 * SS2 is a Chinese character as defined in CNS 2830 * 11643-plane-2, until another SS2designation 2831 * appears 2832 * (Meaning <ESC>N must preceed every 2 byte 2833 * sequence.) 2834 * 2835 * ESC $ + I Indicates the immediate two bytes following SS3 2836 * is a Chinese character as defined in CNS 2837 * 11643-plane-3, until another SS3designation 2838 * appears 2839 * (Meaning <ESC>O must preceed every 2 byte 2840 * sequence.) 2841 * 2842 * ESC $ + J Indicates the immediate two bytes following SS3 2843 * is a Chinese character as defined in CNS 2844 * 11643-plane-4, until another SS3designation 2845 * appears 2846 * (In English: <ESC>O must preceed every 2 byte 2847 * sequence.) 2848 * 2849 * ESC $ + K Indicates the immediate two bytes following SS3 2850 * is a Chinese character as defined in CNS 2851 * 11643-plane-5, until another SS3designation 2852 * appears 2853 * 2854 * ESC $ + L Indicates the immediate two bytes following SS3 2855 * is a Chinese character as defined in CNS 2856 * 11643-plane-6, until another SS3designation 2857 * appears 2858 * 2859 * ESC $ + M Indicates the immediate two bytes following SS3 2860 * is a Chinese character as defined in CNS 2861 * 11643-plane-7, until another SS3designation 2862 * appears 2863 * 2864 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2865 * has its own designation information before any Chinese characters 2866 * appear 2867 * 2868 */ 2869 2870 /* The following are defined this way to make the strings truly readonly */ 2871 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2872 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2873 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2874 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2875 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2876 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2877 static const char 
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2878 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2879 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2880 2881 /********************** ISO2022-CN Data **************************/ 2882 static const char* const escSeqCharsCN[10] ={ 2883 SHIFT_IN_STR, /* 0 ASCII */ 2884 GB_2312_80_STR, /* 1 GB2312_1 */ 2885 ISO_IR_165_STR, /* 2 ISO_IR_165 */ 2886 CNS_11643_1992_Plane_1_STR, 2887 CNS_11643_1992_Plane_2_STR, 2888 CNS_11643_1992_Plane_3_STR, 2889 CNS_11643_1992_Plane_4_STR, 2890 CNS_11643_1992_Plane_5_STR, 2891 CNS_11643_1992_Plane_6_STR, 2892 CNS_11643_1992_Plane_7_STR 2893 }; 2894 2895 static void 2896 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2897 UConverter *cnv = args->converter; 2898 UConverterDataISO2022 *converterData; 2899 ISO2022State *pFromU2022State; 2900 uint8_t *target = (uint8_t *) args->target; 2901 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2902 const UChar* source = args->source; 2903 const UChar* sourceLimit = args->sourceLimit; 2904 int32_t* offsets = args->offsets; 2905 UChar32 sourceChar; 2906 char buffer[8]; 2907 int32_t len; 2908 int8_t choices[3]; 2909 int32_t choiceCount; 2910 uint32_t targetValue = 0; 2911 UBool useFallback; 2912 2913 /* set up the state */ 2914 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2915 pFromU2022State = &converterData->fromU2022State; 2916 2917 choiceCount = 0; 2918 2919 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2920 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2921 goto getTrail; 2922 } 2923 2924 while( source < sourceLimit){ 2925 if(target < targetLimit){ 2926 2927 sourceChar = *(source++); 2928 /*check if the char is a First surrogate*/ 2929 if(U16_IS_SURROGATE(sourceChar)) { 2930 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2931 getTrail: 2932 /*look ahead to find the trail 
surrogate*/ 2933 if(source < sourceLimit) { 2934 /* test the following code unit */ 2935 UChar trail=(UChar) *source; 2936 if(U16_IS_TRAIL(trail)) { 2937 source++; 2938 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2939 cnv->fromUChar32=0x00; 2940 /* convert this supplementary code point */ 2941 /* exit this condition tree */ 2942 } else { 2943 /* this is an unmatched lead code unit (1st surrogate) */ 2944 /* callback(illegal) */ 2945 *err=U_ILLEGAL_CHAR_FOUND; 2946 cnv->fromUChar32=sourceChar; 2947 break; 2948 } 2949 } else { 2950 /* no more input */ 2951 cnv->fromUChar32=sourceChar; 2952 break; 2953 } 2954 } else { 2955 /* this is an unmatched trail code unit (2nd surrogate) */ 2956 /* callback(illegal) */ 2957 *err=U_ILLEGAL_CHAR_FOUND; 2958 cnv->fromUChar32=sourceChar; 2959 break; 2960 } 2961 } 2962 2963 /* do the conversion */ 2964 if(sourceChar <= 0x007f ){ 2965 /* do not convert SO/SI/ESC */ 2966 if(IS_2022_CONTROL(sourceChar)) { 2967 /* callback(illegal) */ 2968 *err=U_ILLEGAL_CHAR_FOUND; 2969 cnv->fromUChar32=sourceChar; 2970 break; 2971 } 2972 2973 /* US-ASCII */ 2974 if(pFromU2022State->g == 0) { 2975 buffer[0] = (char)sourceChar; 2976 len = 1; 2977 } else { 2978 buffer[0] = UCNV_SI; 2979 buffer[1] = (char)sourceChar; 2980 len = 2; 2981 pFromU2022State->g = 0; 2982 choiceCount = 0; 2983 } 2984 if(sourceChar == CR || sourceChar == LF) { 2985 /* reset the state at the end of a line */ 2986 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2987 choiceCount = 0; 2988 } 2989 } 2990 else{ 2991 /* convert U+0080..U+10ffff */ 2992 int32_t i; 2993 int8_t cs, g; 2994 2995 if(choiceCount == 0) { 2996 /* try the current SO/G1 converter first */ 2997 choices[0] = pFromU2022State->cs[1]; 2998 2999 /* default to GB2312_1 if none is designated yet */ 3000 if(choices[0] == 0) { 3001 choices[0] = GB2312_1; 3002 } 3003 3004 if(converterData->version == 0) { 3005 /* ISO-2022-CN */ 3006 3007 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3008 if(choices[0] == GB2312_1) { 3009 choices[1] = (int8_t)CNS_11643_1; 3010 } else { 3011 choices[1] = (int8_t)GB2312_1; 3012 } 3013 3014 choiceCount = 2; 3015 } else if (converterData->version == 1) { 3016 /* ISO-2022-CN-EXT */ 3017 3018 /* try one of the other converters */ 3019 switch(choices[0]) { 3020 case GB2312_1: 3021 choices[1] = (int8_t)CNS_11643_1; 3022 choices[2] = (int8_t)ISO_IR_165; 3023 break; 3024 case ISO_IR_165: 3025 choices[1] = (int8_t)GB2312_1; 3026 choices[2] = (int8_t)CNS_11643_1; 3027 break; 3028 default: /* CNS_11643_x */ 3029 choices[1] = (int8_t)GB2312_1; 3030 choices[2] = (int8_t)ISO_IR_165; 3031 break; 3032 } 3033 3034 choiceCount = 3; 3035 } else { 3036 choices[0] = (int8_t)CNS_11643_1; 3037 choices[1] = (int8_t)GB2312_1; 3038 } 3039 } 3040 3041 cs = g = 0; 3042 /* 3043 * len==0: no mapping found yet 3044 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3045 * len>0: found a roundtrip result, done 3046 */ 3047 len = 0; 3048 /* 3049 * We will turn off useFallback after finding a fallback, 3050 * but we still get fallbacks from PUA code points as usual. 3051 * Therefore, we will also need to check that we don't overwrite 3052 * an early fallback with a later one. 
3053 */ 3054 useFallback = cnv->useFallback; 3055 3056 for(i = 0; i < choiceCount && len <= 0; ++i) { 3057 int8_t cs0 = choices[i]; 3058 if(cs0 > 0) { 3059 uint32_t value; 3060 int32_t len2; 3061 if(cs0 >= CNS_11643_0) { 3062 len2 = MBCS_FROM_UCHAR32_ISO2022( 3063 converterData->myConverterArray[CNS_11643], 3064 sourceChar, 3065 &value, 3066 useFallback, 3067 MBCS_OUTPUT_3); 3068 if(len2 == 3 || (len2 == -3 && len == 0)) { 3069 targetValue = value; 3070 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3071 if(len2 >= 0) { 3072 len = 2; 3073 } else { 3074 len = -2; 3075 useFallback = FALSE; 3076 } 3077 if(cs == CNS_11643_1) { 3078 g = 1; 3079 } else if(cs == CNS_11643_2) { 3080 g = 2; 3081 } else /* plane 3..7 */ if(converterData->version == 1) { 3082 g = 3; 3083 } else { 3084 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3085 len = 0; 3086 } 3087 } 3088 } else { 3089 /* GB2312_1 or ISO-IR-165 */ 3090 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3091 len2 = MBCS_FROM_UCHAR32_ISO2022( 3092 converterData->myConverterArray[cs0], 3093 sourceChar, 3094 &value, 3095 useFallback, 3096 MBCS_OUTPUT_2); 3097 if(len2 == 2 || (len2 == -2 && len == 0)) { 3098 targetValue = value; 3099 len = len2; 3100 cs = cs0; 3101 g = 1; 3102 useFallback = FALSE; 3103 } 3104 } 3105 } 3106 } 3107 3108 if(len != 0) { 3109 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3110 3111 /* write the designation sequence if necessary */ 3112 if(cs != pFromU2022State->cs[g]) { 3113 if(cs < CNS_11643) { 3114 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3115 } else { 3116 U_ASSERT(cs >= CNS_11643_1); 3117 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3118 } 3119 len = 4; 3120 pFromU2022State->cs[g] = cs; 3121 if(g == 1) { 3122 /* changing the SO/G1 charset invalidates the choices[] */ 3123 choiceCount = 0; 3124 } 3125 } 3126 3127 /* write the shift sequence if necessary */ 3128 if(g != pFromU2022State->g) { 3129 switch(g) { 3130 case 1: 3131 
buffer[len++] = UCNV_SO; 3132 3133 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3134 pFromU2022State->g = 1; 3135 break; 3136 case 2: 3137 buffer[len++] = 0x1b; 3138 buffer[len++] = 0x4e; 3139 break; 3140 default: /* case 3 */ 3141 buffer[len++] = 0x1b; 3142 buffer[len++] = 0x4f; 3143 break; 3144 } 3145 } 3146 3147 /* write the two output bytes */ 3148 buffer[len++] = (char)(targetValue >> 8); 3149 buffer[len++] = (char)targetValue; 3150 } else { 3151 /* if we cannot find the character after checking all codepages 3152 * then this is an error 3153 */ 3154 *err = U_INVALID_CHAR_FOUND; 3155 cnv->fromUChar32=sourceChar; 3156 break; 3157 } 3158 } 3159 3160 /* output len>0 bytes in buffer[] */ 3161 if(len == 1) { 3162 *target++ = buffer[0]; 3163 if(offsets) { 3164 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3165 } 3166 } else if(len == 2 && (target + 2) <= targetLimit) { 3167 *target++ = buffer[0]; 3168 *target++ = buffer[1]; 3169 if(offsets) { 3170 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3171 *offsets++ = sourceIndex; 3172 *offsets++ = sourceIndex; 3173 } 3174 } else { 3175 fromUWriteUInt8( 3176 cnv, 3177 buffer, len, 3178 &target, (const char *)targetLimit, 3179 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3180 err); 3181 if(U_FAILURE(*err)) { 3182 break; 3183 } 3184 } 3185 } /* end if(myTargetIndex<myTargetLength) */ 3186 else{ 3187 *err =U_BUFFER_OVERFLOW_ERROR; 3188 break; 3189 } 3190 3191 }/* end while(mySourceIndex<mySourceLength) */ 3192 3193 /* 3194 * the end of the input stream and detection of truncated input 3195 * are handled by the framework, but for ISO-2022-CN conversion 3196 * we need to be in ASCII mode at the very end 3197 * 3198 * conditions: 3199 * successful 3200 * not in ASCII mode 3201 * end of input and no truncated input 3202 */ 3203 if( U_SUCCESS(*err) && 3204 pFromU2022State->g!=0 && 3205 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3206 ) { 3207 int32_t sourceIndex; 3208 3209 /* we are switching to ASCII */ 3210 pFromU2022State->g=0; 3211 3212 /* get the source index of the last input character */ 3213 /* 3214 * TODO this would be simpler and more reliable if we used a pair 3215 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3216 * so that we could simply use the prevSourceIndex here; 3217 * this code gives an incorrect result for the rare case of an unmatched 3218 * trail surrogate that is alone in the last buffer of the text stream 3219 */ 3220 sourceIndex=(int32_t)(source-args->source); 3221 if(sourceIndex>0) { 3222 --sourceIndex; 3223 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3224 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3225 ) { 3226 --sourceIndex; 3227 } 3228 } else { 3229 sourceIndex=-1; 3230 } 3231 3232 fromUWriteUInt8( 3233 cnv, 3234 SHIFT_IN_STR, 1, 3235 &target, (const char *)targetLimit, 3236 &offsets, sourceIndex, 3237 err); 3238 } 3239 3240 /*save the state and return */ 3241 args->source = source; 3242 args->target = (char*)target; 3243 } 3244 3245 3246 static void 3247 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3248 UErrorCode* err){ 3249 char tempBuf[3]; 3250 const char *mySource = (char *) args->source; 3251 UChar *myTarget = args->target; 3252 const char *mySourceLimit = args->sourceLimit; 3253 uint32_t targetUniChar = 0x0000; 3254 uint32_t mySourceChar = 0x0000; 3255 UConverterDataISO2022* myData; 3256 ISO2022State *pToU2022State; 3257 3258 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3259 pToU2022State = &myData->toU2022State; 3260 3261 if(myData->key != 0) { 3262 /* continue with a partial escape sequence */ 3263 goto escape; 3264 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3265 /* continue with a partial double-byte character */ 3266 mySourceChar = args->converter->toUBytes[0]; 3267 
args->converter->toULength = 0; 3268 targetUniChar = missingCharMarker; 3269 goto getTrailByte; 3270 } 3271 3272 while(mySource < mySourceLimit){ 3273 3274 targetUniChar =missingCharMarker; 3275 3276 if(myTarget < args->targetLimit){ 3277 3278 mySourceChar= (unsigned char) *mySource++; 3279 3280 switch(mySourceChar){ 3281 case UCNV_SI: 3282 pToU2022State->g=0; 3283 if (myData->isEmptySegment) { 3284 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3285 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3286 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3287 args->converter->toUBytes[0] = mySourceChar; 3288 args->converter->toULength = 1; 3289 args->target = myTarget; 3290 args->source = mySource; 3291 return; 3292 } 3293 continue; 3294 3295 case UCNV_SO: 3296 if(pToU2022State->cs[1] != 0) { 3297 pToU2022State->g=1; 3298 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3299 continue; 3300 } else { 3301 /* illegal to have SO before a matching designator */ 3302 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3303 break; 3304 } 3305 3306 case ESC_2022: 3307 mySource--; 3308 escape: 3309 { 3310 const char * mySourceBefore = mySource; 3311 int8_t toULengthBefore = args->converter->toULength; 3312 3313 changeState_2022(args->converter,&(mySource), 3314 mySourceLimit, ISO_2022_CN,err); 3315 3316 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3317 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3318 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3319 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3320 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3321 } 3322 } 3323 3324 /* invalid or illegal escape sequence */ 3325 if(U_FAILURE(*err)){ 3326 args->target = myTarget; 3327 args->source = mySource; 3328 myData->isEmptySegment = FALSE; /* Reset to 
avoid future spurious errors */ 3329 return; 3330 } 3331 continue; 3332 3333 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3334 3335 case CR: 3336 /*falls through*/ 3337 case LF: 3338 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3339 /* falls through */ 3340 default: 3341 /* convert one or two bytes */ 3342 myData->isEmptySegment = FALSE; 3343 if(pToU2022State->g != 0) { 3344 if(mySource < mySourceLimit) { 3345 UConverterSharedData *cnv; 3346 StateEnum tempState; 3347 int32_t tempBufLen; 3348 int leadIsOk, trailIsOk; 3349 uint8_t trailByte; 3350 getTrailByte: 3351 trailByte = (uint8_t)*mySource; 3352 /* 3353 * Ticket 5691: consistent illegal sequences: 3354 * - We include at least the first byte in the illegal sequence. 3355 * - If any of the non-initial bytes could be the start of a character, 3356 * we stop the illegal sequence before the first one of those. 3357 * 3358 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3359 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3360 * Otherwise we convert or report the pair of bytes. 
3361 */ 3362 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3363 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3364 if (leadIsOk && trailIsOk) { 3365 ++mySource; 3366 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3367 if(tempState >= CNS_11643_0) { 3368 cnv = myData->myConverterArray[CNS_11643]; 3369 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3370 tempBuf[1] = (char) (mySourceChar); 3371 tempBuf[2] = (char) trailByte; 3372 tempBufLen = 3; 3373 3374 }else{ 3375 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3376 cnv = myData->myConverterArray[tempState]; 3377 tempBuf[0] = (char) (mySourceChar); 3378 tempBuf[1] = (char) trailByte; 3379 tempBufLen = 2; 3380 } 3381 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3382 mySourceChar = (mySourceChar << 8) | trailByte; 3383 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3384 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3385 ++mySource; 3386 /* add another bit so that the code below writes 2 bytes in case of error */ 3387 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3388 } 3389 if(pToU2022State->g>=2) { 3390 /* return from a single-shift state to the previous one */ 3391 pToU2022State->g=pToU2022State->prevG; 3392 } 3393 } else { 3394 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3395 args->converter->toULength = 1; 3396 goto endloop; 3397 } 3398 } 3399 else{ 3400 if(mySourceChar <= 0x7f) { 3401 targetUniChar = (UChar) mySourceChar; 3402 } 3403 } 3404 break; 3405 } 3406 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3407 if(args->offsets){ 3408 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3409 } 3410 *(myTarget++)=(UChar)targetUniChar; 3411 } 3412 else if(targetUniChar > missingCharMarker){ 3413 /* disassemble the surrogate pair and write to output*/ 3414 targetUniChar-=0x0010000; 3415 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3416 if(args->offsets){ 3417 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3418 } 3419 ++myTarget; 3420 if(myTarget< args->targetLimit){ 3421 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3422 if(args->offsets){ 3423 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3424 } 3425 ++myTarget; 3426 }else{ 3427 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3428 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3429 } 3430 3431 } 3432 else{ 3433 /* Call the callback function*/ 3434 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3435 break; 3436 } 3437 } 3438 else{ 3439 *err =U_BUFFER_OVERFLOW_ERROR; 3440 break; 3441 } 3442 } 3443 endloop: 3444 args->target = myTarget; 3445 args->source = mySource; 3446 } 3447 3448 static void 3449 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3450 UConverter *cnv = args->converter; 3451 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3452 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3453 char *p, *subchar; 3454 char buffer[8]; 3455 int32_t length; 3456 3457 subchar=(char *)cnv->subChars; 3458 length=cnv->subCharLen; /* assume length==1 for most variants */ 3459 3460 p = buffer; 3461 switch(myConverterData->locale[0]){ 3462 case 'j': 3463 { 3464 int8_t cs; 3465 3466 if(pFromU2022State->g == 1) { 3467 /* JIS7: switch from G1 to G0 */ 3468 pFromU2022State->g = 0; 3469 *p++ = UCNV_SI; 3470 } 3471 3472 cs = pFromU2022State->cs[0]; 3473 if(cs != ASCII && cs != JISX201) { 3474 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3475 pFromU2022State->cs[0] = (int8_t)ASCII; 3476 *p++ = '\x1b'; 3477 *p++ = '\x28'; 3478 *p++ = '\x42'; 3479 } 3480 3481 *p++ = subchar[0]; 3482 break; 3483 } 3484 case 'c': 3485 if(pFromU2022State->g != 0) { 3486 /* not in ASCII mode: switch to ASCII */ 3487 pFromU2022State->g = 0; 3488 *p++ = UCNV_SI; 3489 } 3490 *p++ = subchar[0]; 3491 break; 3492 case 'k': 3493 if(myConverterData->version == 0) { 3494 if(length == 1) { 3495 if((UBool)args->converter->fromUnicodeStatus) { 3496 /* in DBCS mode: switch to SBCS */ 3497 args->converter->fromUnicodeStatus = 0; 3498 *p++ = UCNV_SI; 3499 } 3500 *p++ = subchar[0]; 3501 } else /* length == 2*/ { 3502 if(!(UBool)args->converter->fromUnicodeStatus) { 3503 /* in SBCS mode: switch to DBCS */ 3504 args->converter->fromUnicodeStatus = 1; 3505 *p++ = UCNV_SO; 3506 } 3507 *p++ = subchar[0]; 3508 *p++ = subchar[1]; 3509 } 3510 break; 3511 } else { 3512 /* save the subconverter's substitution string */ 3513 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3514 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3515 3516 /* set our substitution string into the subconverter */ 3517 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3518 myConverterData->currentConverter->subCharLen = (int8_t)length; 3519 3520 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3521 args->converter = myConverterData->currentConverter; 3522 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3523 ucnv_cbFromUWriteSub(args, 0, err); 3524 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3525 args->converter = cnv; 3526 3527 /* restore the subconverter's substitution string */ 3528 myConverterData->currentConverter->subChars = currentSubChars; 3529 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3530 3531 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3532 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3533 uprv_memcpy( 3534 cnv->charErrorBuffer, 3535 myConverterData->currentConverter->charErrorBuffer, 3536 myConverterData->currentConverter->charErrorBufferLength); 3537 } 3538 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3539 myConverterData->currentConverter->charErrorBufferLength = 0; 3540 } 3541 return; 3542 } 3543 default: 3544 /* not expected */ 3545 break; 3546 } 3547 ucnv_cbFromUWriteBytes(args, 3548 buffer, (int32_t)(p - buffer), 3549 offsetIndex, err); 3550 } 3551 3552 /* 3553 * Structure for cloning an ISO 2022 converter into a single memory block. 3554 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3555 * and then ucnv_safeClone() of the sub-converter may additionally align 3556 * currentConverter inside the cloneStruct, for which we need the deadSpace 3557 * after currentConverter. 3558 * This is because UAlignedMemory may be larger than the actually 3559 * necessary alignment size for the platform. 3560 * The other cloneStruct fields will not be moved around, 3561 * and are aligned properly with cloneStruct's alignment. 
 */
struct cloneStruct
{
    UConverter cnv;
    UConverter currentConverter;
    UAlignedMemory deadSpace;
    UConverterDataISO2022 mydata;
};


/*
 * Completes the clone of an ISO 2022 converter inside a caller-supplied buffer.
 *
 * cnv          the converter being cloned
 * stackBuffer  memory that is treated as a struct cloneStruct
 * pBufferSize  in/out; a value of 0 is a preflight request: the required
 *              size is stored and NULL is returned
 * status       receives the error from cloning the sub-converter
 *
 * Returns a pointer to the cloned converter inside stackBuffer, or NULL for a
 * preflight request or when cloning the current sub-converter fails.
 * The shared sub-converter data in myConverterArray[] is not copied; its
 * reference counts are incremented instead.
 */
static UConverter *
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    struct cloneStruct * localClone;
    UConverterDataISO2022 *cnvData;
    int32_t i, size;

    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
        return NULL;
    }

    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
    localClone = (struct cloneStruct *)stackBuffer;

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
    localClone->cnv.isExtraLocal = TRUE;

    /* share the subconverters */

    if(cnvData->currentConverter != NULL) {
        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
        localClone->mydata.currentConverter =
            ucnv_safeClone(cnvData->currentConverter,
                            &localClone->currentConverter,
                            &size, status);
        if(U_FAILURE(*status)) {
            return NULL;
        }
    }

    /* the clone now also refers to every loaded sub-converter table */
    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
        if(cnvData->myConverterArray[i] != NULL) {
            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
        }
    }

    return &localClone->cnv;
}

/*
 * Adds to the set behind sa the Unicode code points that this ISO 2022
 * converter handles.
 *
 * cnv         the converter to describe
 * sa          set-adder callbacks and the set they operate on
 * which       which set is requested; passed on to the sub-converters and
 *             checked for UCNV_ROUNDTRIP_AND_FALLBACK_SET in the JP case
 * pErrorCode  in/out error code; nothing is done if it already indicates
 *             a failure
 *
 * The function first adds the ranges that are handled algorithmically for
 * the locale, then the filtered sets of all loaded sub-converters, and
 * finally removes SO, SI, ESC and the C1 controls.
 */
static void
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
                    const USetAdder *sa,
                    UConverterUnicodeSet which,
                    UErrorCode *pErrorCode)
{
    int32_t i;
    UConverterDataISO2022* cnvData;

    if (U_FAILURE(*pErrorCode)) {
        return;
    }
#ifdef U_ENABLE_GENERIC_ISO_2022
    if (cnv->sharedData == &_ISO2022Data) {
        /* We use UTF-8 in this case */
        sa->addRange(sa->set, 0, 0xd7FF);
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
        return;
    }
#endif

    cnvData = (UConverterDataISO2022*)cnv->extraInfo;

    /* open a set and initialize it with code points that are algorithmically round-tripped */
    switch(cnvData->locale[0]){
    case 'j':
        /* include JIS X 0201 which is hardcoded */
        sa->add(sa->set, 0xa5);
        sa->add(sa->set, 0x203e);
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
            /* include Latin-1 for some variants of JP */
            sa->addRange(sa->set, 0, 0xff);
        } else {
            /* include ASCII for JP */
            sa->addRange(sa->set, 0, 0x7f);
        }
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
            /*
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
             * use half-width Katakana.
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
             * half-width Katakana via the ESC ( I sequence.
             * However, we only emit (fromUnicode) half-width Katakana according to the
             * definition of each variant.
             *
             * When including fallbacks,
             * we need to include half-width Katakana Unicode code points for all JP variants because
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
             */
            /* include half-width Katakana for JP */
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
        }
        break;
    case 'c':
    case 'z':
        /* include ASCII for CN */
        sa->addRange(sa->set, 0, 0x7f);
        break;
    case 'k':
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
                cnvData->currentConverter, sa, which, pErrorCode);
        /* the loop over myConverterArray[] will simply not find another converter */
        break;
    default:
        break;
    }

#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                cnvData->version==0 && i==CNS_11643
            ) {
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
                ucnv_MBCSGetUnicodeSetForBytes(
                        cnvData->myConverterArray[i],
                        sa, UCNV_ROUNDTRIP_SET,
                        0, 0x81, 0x82,
                        pErrorCode);
            }
#endif

    /* add the (filtered) set of every sub-converter table that is loaded */
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
        UConverterSetFilter filter;
        if(cnvData->myConverterArray[i]!=NULL) {
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                cnvData->version==0 && i==CNS_11643
            ) {
                /*
                 * Version-specific for CN:
                 * CN version 0 does not map CNS planes 3..7 although
                 * they are all available in the CNS conversion table;
                 * CN version 1 (-EXT) does map them all.
                 * The two versions create different Unicode sets.
                 */
                filter=UCNV_SET_FILTER_2022_CN;
            } else if(cnvData->locale[0]=='j' && i==JISX208) {
                /*
                 * Only add code points that map to Shift-JIS codes
                 * corresponding to JIS X 0208.
                 */
                filter=UCNV_SET_FILTER_SJIS;
            } else if(i==KSC5601) {
                /*
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
                 * are broader than GR94.
                 */
                filter=UCNV_SET_FILTER_GR94DBCS;
            } else {
                filter=UCNV_SET_FILTER_NONE;
            }
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
        }
    }

    /*
     * ISO 2022 converters must not convert SO/SI/ESC despite what
     * sub-converters do by themselves.
     * Remove these characters from the set.
     */
    sa->remove(sa->set, 0x0e);
    sa->remove(sa->set, 0x0f);
    sa->remove(sa->set, 0x1b);

    /* ISO 2022 converters do not convert C1 controls either */
    sa->removeRange(sa->set, 0x80, 0x9f);
}

/*
 * Implementation table for the generic ISO-2022 converter.
 * The four conversion entry points are only filled in when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static data for the generic ISO-2022 converter. */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared data for the generic ISO-2022 converter; ties the static data to the implementation table. */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022StaticData,
    FALSE,
    &_ISO2022Impl,
    0, UCNV_MBCS_TABLE_INITIALIZER
};

/*************JP****************/
/* Implementation table for ISO-2022-JP: the same function serves both the plain and the offsets entry point in each direction. */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static data for ISO-2022-JP. */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-JP; kept in an unnamed namespace, so it is local to this file. */
const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022JPStaticData,
    FALSE,
    &_ISO2022JPImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace

/************* KR ***************/
/* Implementation table for ISO-2022-KR: the same function serves both the plain and the offsets entry point in each direction. */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static data for ISO-2022-KR. */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_KR",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-KR; kept in an unnamed namespace, so it is local to this file. */
const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022KRStaticData,
    FALSE,
    &_ISO2022KRImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace

/*************** CN ***************/
/* Implementation table for ISO-2022-CN: the same function serves both the plain and the offsets entry point in each direction. */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static data for ISO-2022-CN. */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_CN",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-CN; kept in an unnamed namespace, so it is local to this file. */
const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022CNStaticData,
    FALSE,
    &_ISO2022CNImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace

#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */