1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "unicode/utf16.h" 38 #include "ucnv_imp.h" 39 #include "ucnv_bld.h" 40 #include "ucnv_cnv.h" 41 #include "ucnvmbcs.h" 42 #include "cstring.h" 43 #include "cmemory.h" 44 #include "uassert.h" 45 46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 47 48 #ifdef U_ENABLE_GENERIC_ISO_2022 49 /* 50 * I am disabling the generic ISO-2022 converter after proposing to do so on 51 * the icu mailing list two days ago. 52 * 53 * Reasons: 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 55 * its designation sequences, single shifts with return to the previous state, 56 * switch-with-no-return to UTF-16BE or similar, etc. 57 * This is unlike the language-specific variants like ISO-2022-JP which 58 * require a much smaller repertoire of ISO-2022 features. 
59 * These variants continue to be supported. 60 * 2. I believe that no one is really using the generic ISO-2022 converter 61 * but rather always one of the language-specific variants. 62 * Note that ICU's generic ISO-2022 converter has always output one escape 63 * sequence followed by UTF-8 for the whole stream. 64 * 3. Switching between subcharsets is extremely slow, because each time 65 * the previous converter is closed and a new one opened, 66 * without any kind of caching, least-recently-used list, etc. 67 * 4. The code is currently buggy, and given the above it does not seem 68 * reasonable to spend the time on maintenance. 69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 70 * This means, for example, that when ISO-8859-7 is designated, the following 71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 72 * The ICU ISO-2022 converter does not handle this - and has no information 73 * about which subconverter would have to be shifted vs. which is designed 74 * for 7-bit ISO-2022. 75 * 76 * Markus Scherer 2003-dec-03 77 */ 78 #endif 79 80 static const char SHIFT_IN_STR[] = "\x0F"; 81 // static const char SHIFT_OUT_STR[] = "\x0E"; 82 83 #define CR 0x0D 84 #define LF 0x0A 85 #define H_TAB 0x09 86 #define V_TAB 0x0B 87 #define SPACE 0x20 88 89 enum { 90 HWKANA_START=0xff61, 91 HWKANA_END=0xff9f 92 }; 93 94 /* 95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 96 * as bytes 21..7E. (Subtract 0x80.) 97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 98 * as bytes 20..7F. (Subtract 0x80.) 99 * Do not encode C1 control codes with native bytes 80..9F 100 * as bytes 00..1F (C0 control codes). 101 */ 102 enum { 103 GR94_START=0xa1, 104 GR94_END=0xfe, 105 GR96_START=0xa0, 106 GR96_END=0xff 107 }; 108 109 /* 110 * ISO 2022 control codes must not be converted from Unicode 111 * because they would mess up the byte stream. 
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 113 * corresponding to SO, SI, and ESC. 114 */ 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 116 117 /* for ISO-2022-JP and -CN implementations */ 118 typedef enum { 119 /* shared values */ 120 INVALID_STATE=-1, 121 ASCII = 0, 122 123 SS2_STATE=0x10, 124 SS3_STATE, 125 126 /* JP */ 127 ISO8859_1 = 1 , 128 ISO8859_7 = 2 , 129 JISX201 = 3, 130 JISX208 = 4, 131 JISX212 = 5, 132 GB2312 =6, 133 KSC5601 =7, 134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 135 136 /* CN */ 137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 138 GB2312_1=1, 139 ISO_IR_165=2, 140 CNS_11643=3, 141 142 /* 143 * these are used in StateEnum and ISO2022State variables, 144 * but CNS_11643 must be used to index into myConverterArray[] 145 */ 146 CNS_11643_0=0x20, 147 CNS_11643_1, 148 CNS_11643_2, 149 CNS_11643_3, 150 CNS_11643_4, 151 CNS_11643_5, 152 CNS_11643_6, 153 CNS_11643_7 154 } StateEnum; 155 156 /* is the StateEnum charset value for a DBCS charset? */ 157 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 158 159 #define CSM(cs) ((uint16_t)1<<(cs)) 160 161 /* 162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 164 * 165 * Note: The converter uses some leniency: 166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 167 * all versions, not just JIS7 and JIS8. 168 * - ICU does not distinguish between different versions of JIS X 0208. 
169 */ 170 #if UCONFIG_NO_NON_HTML5_CONVERSION 171 enum { MAX_JA_VERSION=0 }; 172 #else 173 enum { MAX_JA_VERSION=4 }; 174 #endif 175 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 177 #if !UCONFIG_NO_NON_HTML5_CONVERSION 178 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 179 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 180 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 181 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 182 #endif 183 }; 184 185 typedef enum { 186 ASCII1=0, 187 LATIN1, 188 SBCS, 189 DBCS, 190 MBCS, 191 HWKANA 192 }Cnv2022Type; 193 194 typedef struct ISO2022State { 195 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 196 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 197 int8_t prevG; /* g before single shift (SS2 or SS3) */ 198 } ISO2022State; 199 200 #define UCNV_OPTIONS_VERSION_MASK 0xf 201 #define UCNV_2022_MAX_CONVERTERS 10 202 203 typedef struct{ 204 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 205 UConverter *currentConverter; 206 Cnv2022Type currentType; 207 ISO2022State toU2022State, fromU2022State; 208 uint32_t key; 209 uint32_t version; 210 #ifdef U_ENABLE_GENERIC_ISO_2022 211 UBool isFirstBuffer; 212 #endif 213 UBool isEmptySegment; 214 char name[30]; 215 char locale[3]; 216 }UConverterDataISO2022; 217 218 /* Protos */ 219 /* ISO-2022 ----------------------------------------------------------------- */ 220 221 /*Forward declaration */ 222 U_CFUNC void 223 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 224 UErrorCode * err); 225 U_CFUNC void 226 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 227 UErrorCode * err); 228 229 #define ESC_2022 0x1B 
/*ESC*/ 230 231 typedef enum 232 { 233 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 234 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ 235 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 236 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 237 } UCNV_TableStates_2022; 238 239 /* 240 * The way these state transition arrays work is: 241 * ex : ESC$B is the sequence for JISX208 242 * a) First Iteration: char is ESC 243 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 244 * int x = normalize_esq_chars_2022[27] which is equal to 1 245 * ii) Search for this value in escSeqStateTable_Key_2022[] 246 * value of x is stored at escSeqStateTable_Key_2022[0] 247 * iii) Save this index as offset 248 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 249 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 250 * b) Switch on this state and continue to next char 251 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 252 * which is normalize_esq_chars_2022[36] == 4 253 * ii) x is currently 1(from above) 254 * x<<=5 -- x is now 32 255 * x+=normalize_esq_chars_2022[36] 256 * now x is 36 257 * iii) Search for this value in escSeqStateTable_Key_2022[] 258 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 259 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 260 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 261 * c) Switch on this state and continue to next char 262 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 263 * ii) x is currently 36 (from above) 264 * x<<=5 -- x is now 1152 265 * x+=normalize_esq_chars_2022[66] 266 * now x is 1161 267 * iii) Search for this value in 
escSeqStateTable_Key_2022[] 268 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21 269 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21] 270 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 271 * v) Get the converter name form escSeqStateTable_Result_2022[21] which is JISX208 272 */ 273 274 275 /*Below are the 3 arrays depicting a state transition table*/ 276 static const int8_t normalize_esq_chars_2022[256] = { 277 /* 0 1 2 3 4 5 6 7 8 9 */ 278 279 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 280 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 282 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 283 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 285 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 286 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 287 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 288 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 304 ,0 ,0 ,0 ,0 ,0 ,0 305 }; 306 307 #ifdef U_ENABLE_GENERIC_ISO_2022 308 /* 309 * When the generic ISO-2022 converter is completely removed, not just disabled 310 * per #ifdef, then the following state table and the associated tables that are 311 * dimensioned with MAX_STATES_2022 should be trimmed. 312 * 313 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 314 * the associated escape sequences starting with ESC ( B should be removed. 315 * This includes the ones with key values 1097 and all of the ones above 1000000. 316 * 317 * For the latter, the tables can simply be truncated. 
318 * For the former, since the tables must be kept parallel, it is probably best 319 * to simply duplicate an adjacent table cell, parallel in all tables. 320 * 321 * It may make sense to restructure the tables, especially by using small search 322 * tables for the variants instead of indexing them parallel to the table here. 323 */ 324 #endif 325 326 #define MAX_STATES_2022 74 327 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 328 /* 0 1 2 3 4 5 6 7 8 9 */ 329 330 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 331 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 332 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 333 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 334 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 335 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 336 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 337 ,35947631 ,35947635 ,35947636 ,35947638 338 }; 339 340 #ifdef U_ENABLE_GENERIC_ISO_2022 341 342 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 343 /* 0 1 2 3 4 5 6 7 8 9 */ 344 345 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 346 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 347 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 348 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 349 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 350 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 351 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL 
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 352 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 353 }; 354 355 #endif 356 357 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 358 /* 0 1 2 3 4 5 6 7 8 9 */ 359 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 360 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 361 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 362 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 363 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 364 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 365 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 366 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 367 }; 368 369 370 /* Enable ISO-2022-{KR,CN,CN-Ext} for now. 
371 * TODO(jshin): Disable it when we know what to do about 'replacement' 372 * encodings. See http://crbug.com/277037 and 373 * https://codereview.chromium.org/145973021/ 374 */ 375 #ifndef U_ENABLE_ISO_2022_KR_CN 376 #define U_ENABLE_ISO_2022_KR_CN 1 377 #endif 378 379 /* Type def for refactoring changeState_2022 code*/ 380 typedef enum{ 381 #ifdef U_ENABLE_GENERIC_ISO_2022 382 ISO_2022=0, 383 #endif 384 ISO_2022_JP=1, 385 #ifdef U_ENABLE_ISO_2022_KR_CN 386 ISO_2022_KR=2, 387 ISO_2022_CN=3 388 #endif 389 } Variant2022; 390 391 /*********** ISO 2022 Converter Protos ***********/ 392 static void 393 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 394 395 static void 396 _ISO2022Close(UConverter *converter); 397 398 static void 399 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 400 401 static const char* 402 _ISO2022getName(const UConverter* cnv); 403 404 static void 405 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 406 407 static UConverter * 408 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 409 410 #ifdef U_ENABLE_GENERIC_ISO_2022 411 static void 412 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 413 #endif 414 415 namespace { 416 417 /*const UConverterSharedData _ISO2022Data;*/ 418 extern const UConverterSharedData _ISO2022JPData; 419 extern const UConverterSharedData _ISO2022KRData; 420 extern const UConverterSharedData _ISO2022CNData; 421 422 } // namespace 423 424 /*************** Converter implementations ******************/ 425 426 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 427 static inline void 428 fromUWriteUInt8(UConverter *cnv, 429 const char *bytes, int32_t length, 430 uint8_t **target, const char *targetLimit, 431 int32_t **offsets, 432 int32_t sourceIndex, 433 UErrorCode *pErrorCode) 434 { 435 char *targetChars = (char *)*target; 436 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 437 offsets, sourceIndex, pErrorCode); 438 *target = (uint8_t*)targetChars; 439 440 } 441 442 static inline void 443 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 444 if(myConverterData->version == 1) { 445 UConverter *cnv = myConverterData->currentConverter; 446 447 cnv->toUnicodeStatus=0; /* offset */ 448 cnv->mode=0; /* state */ 449 cnv->toULength=0; /* byteIndex */ 450 } 451 } 452 453 static inline void 454 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 455 /* in ISO-2022-KR the designator sequence appears only once 456 * in a file so we append it only once 457 */ 458 if( converter->charErrorBufferLength==0){ 459 460 converter->charErrorBufferLength = 4; 461 converter->charErrorBuffer[0] = 0x1b; 462 converter->charErrorBuffer[1] = 0x24; 463 converter->charErrorBuffer[2] = 0x29; 464 converter->charErrorBuffer[3] = 0x43; 465 } 466 if(myConverterData->version == 1) { 467 UConverter *cnv = myConverterData->currentConverter; 468 469 cnv->fromUChar32=0; 470 cnv->fromUnicodeStatus=1; /* prevLength */ 471 } 472 } 473 474 static void 475 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 476 477 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 478 479 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 480 if(cnv->extraInfo != NULL) { 481 UConverterNamePieces stackPieces; 482 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 483 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 484 uint32_t version; 485 486 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 487 
488 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 489 myConverterData->currentType = ASCII1; 490 cnv->fromUnicodeStatus =FALSE; 491 if(pArgs->locale){ 492 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 493 } 494 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 495 myConverterData->version = version; 496 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 497 (myLocale[2]=='_' || myLocale[2]=='\0')) 498 { 499 size_t len=0; 500 /* open the required converters and cache them */ 501 if(version>MAX_JA_VERSION) { 502 /* prevent indexing beyond jpCharsetMasks[] */ 503 myConverterData->version = version = 0; 504 } 505 #if !UCONFIG_NO_NON_HTML5_CONVERSION 506 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 507 myConverterData->myConverterArray[ISO8859_7] = 508 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 509 } 510 #endif 511 myConverterData->myConverterArray[JISX208] = 512 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 513 #if !UCONFIG_NO_NON_HTML5_CONVERSION 514 if(jpCharsetMasks[version]&CSM(JISX212)) { 515 myConverterData->myConverterArray[JISX212] = 516 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 517 } 518 if(jpCharsetMasks[version]&CSM(GB2312)) { 519 myConverterData->myConverterArray[GB2312] = 520 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 521 } 522 if(jpCharsetMasks[version]&CSM(KSC5601)) { 523 myConverterData->myConverterArray[KSC5601] = 524 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 525 } 526 #endif 527 528 /* set the function pointers to appropriate funtions */ 529 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 530 uprv_strcpy(myConverterData->locale,"ja"); 531 532 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 533 len = uprv_strlen(myConverterData->name); 534 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 535 
myConverterData->name[len+1]='\0'; 536 } 537 #ifdef U_ENABLE_ISO_2022_KR_CN 538 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 539 (myLocale[2]=='_' || myLocale[2]=='\0')) 540 { 541 const char *cnvName; 542 if(version==1) { 543 cnvName="icu-internal-25546"; 544 } else { 545 cnvName="ibm-949"; 546 myConverterData->version=version=0; 547 } 548 if(pArgs->onlyTestIsLoadable) { 549 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 550 uprv_free(cnv->extraInfo); 551 cnv->extraInfo=NULL; 552 return; 553 } else { 554 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 555 if (U_FAILURE(*errorCode)) { 556 _ISO2022Close(cnv); 557 return; 558 } 559 560 if(version==1) { 561 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 562 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 563 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 564 }else{ 565 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 566 } 567 568 /* initialize the state variables */ 569 setInitialStateToUnicodeKR(cnv, myConverterData); 570 setInitialStateFromUnicodeKR(cnv, myConverterData); 571 572 /* set the function pointers to appropriate funtions */ 573 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 574 uprv_strcpy(myConverterData->locale,"ko"); 575 } 576 } 577 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 578 (myLocale[2]=='_' || myLocale[2]=='\0')) 579 { 580 581 /* open the required converters and cache them */ 582 myConverterData->myConverterArray[GB2312_1] = 583 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 584 if(version==1) { 585 myConverterData->myConverterArray[ISO_IR_165] = 586 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 587 } 588 myConverterData->myConverterArray[CNS_11643] = 589 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, 
errorCode); 590 591 592 /* set the function pointers to appropriate funtions */ 593 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 594 uprv_strcpy(myConverterData->locale,"cn"); 595 596 if (version==0){ 597 myConverterData->version = 0; 598 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 599 }else if (version==1){ 600 myConverterData->version = 1; 601 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 602 }else { 603 myConverterData->version = 2; 604 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 605 } 606 } 607 #endif // U_ENABLE_ISO_2022_KR_CN 608 else{ 609 #ifdef U_ENABLE_GENERIC_ISO_2022 610 myConverterData->isFirstBuffer = TRUE; 611 612 /* append the UTF-8 escape sequence */ 613 cnv->charErrorBufferLength = 3; 614 cnv->charErrorBuffer[0] = 0x1b; 615 cnv->charErrorBuffer[1] = 0x25; 616 cnv->charErrorBuffer[2] = 0x42; 617 618 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 619 /* initialize the state variables */ 620 uprv_strcpy(myConverterData->name,"ISO_2022"); 621 #else 622 *errorCode = U_UNSUPPORTED_ERROR; 623 return; 624 #endif 625 } 626 627 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 628 629 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 630 _ISO2022Close(cnv); 631 } 632 } else { 633 *errorCode = U_MEMORY_ALLOCATION_ERROR; 634 } 635 } 636 637 638 static void 639 _ISO2022Close(UConverter *converter) { 640 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 641 UConverterSharedData **array = myData->myConverterArray; 642 int32_t i; 643 644 if (converter->extraInfo != NULL) { 645 /*close the array of converter pointers and free the memory*/ 646 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 647 if(array[i]!=NULL) { 648 ucnv_unloadSharedDataIfReady(array[i]); 649 } 650 } 651 652 ucnv_close(myData->currentConverter); 653 654 if(!converter->isExtraLocal){ 655 uprv_free (converter->extraInfo); 656 
converter->extraInfo = NULL; 657 } 658 } 659 } 660 661 static void 662 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 663 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 664 if(choice<=UCNV_RESET_TO_UNICODE) { 665 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 666 myConverterData->key = 0; 667 myConverterData->isEmptySegment = FALSE; 668 } 669 if(choice!=UCNV_RESET_TO_UNICODE) { 670 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 671 } 672 #ifdef U_ENABLE_GENERIC_ISO_2022 673 if(myConverterData->locale[0] == 0){ 674 if(choice<=UCNV_RESET_TO_UNICODE) { 675 myConverterData->isFirstBuffer = TRUE; 676 myConverterData->key = 0; 677 if (converter->mode == UCNV_SO){ 678 ucnv_close (myConverterData->currentConverter); 679 myConverterData->currentConverter=NULL; 680 } 681 converter->mode = UCNV_SI; 682 } 683 if(choice!=UCNV_RESET_TO_UNICODE) { 684 /* re-append UTF-8 escape sequence */ 685 converter->charErrorBufferLength = 3; 686 converter->charErrorBuffer[0] = 0x1b; 687 converter->charErrorBuffer[1] = 0x28; 688 converter->charErrorBuffer[2] = 0x42; 689 } 690 } 691 else 692 #endif 693 { 694 /* reset the state variables */ 695 if(myConverterData->locale[0] == 'k'){ 696 if(choice<=UCNV_RESET_TO_UNICODE) { 697 setInitialStateToUnicodeKR(converter, myConverterData); 698 } 699 if(choice!=UCNV_RESET_TO_UNICODE) { 700 setInitialStateFromUnicodeKR(converter, myConverterData); 701 } 702 } 703 } 704 } 705 706 static const char* 707 _ISO2022getName(const UConverter* cnv){ 708 if(cnv->extraInfo){ 709 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 710 return myData->name; 711 } 712 return NULL; 713 } 714 715 716 /*************** to unicode *******************/ 717 /**************************************************************************** 718 * Recognized escape sequences are 719 * <ESC>(B ASCII 720 * <ESC>.A ISO-8859-1 721 * <ESC>.F ISO-8859-7 722 * 
<ESC>(J JISX-201 723 * <ESC>(I JISX-201 724 * <ESC>$B JISX-208 725 * <ESC>$@ JISX-208 726 * <ESC>$(D JISX-212 727 * <ESC>$A GB2312 728 * <ESC>$(C KSC5601 729 */ 730 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 731 /* 0 1 2 3 4 5 6 7 8 9 */ 732 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 733 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 734 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 735 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 736 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 737 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 738 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 739 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 740 }; 741 742 /*************** to unicode *******************/ 743 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 744 /* 0 1 2 3 4 5 6 7 8 9 */ 745 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 746 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 747 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 748 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 749 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 750 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 751 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 752 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 753 }; 754 755 756 static UCNV_TableStates_2022 757 getKey_2022(char c,int32_t* key,int32_t* offset){ 758 int32_t togo; 759 int32_t low = 0; 760 int32_t hi = MAX_STATES_2022; 761 int32_t oldmid=0; 762 763 togo = normalize_esq_chars_2022[(uint8_t)c]; 764 if(togo == 0) { 765 /* not a valid character anywhere in an escape sequence */ 766 *key = 0; 767 *offset = 0; 768 return INVALID_2022; 769 } 770 togo = (*key << 5) + togo; 771 772 while (hi != low) /*binary search*/{ 773 774 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 775 776 if (mid == oldmid) 777 break; 778 779 if (escSeqStateTable_Key_2022[mid] > togo){ 780 hi = mid; 781 } 782 else if (escSeqStateTable_Key_2022[mid] < togo){ 783 low = mid; 784 } 785 else /*we found it*/{ 786 *key = togo; 787 *offset = mid; 788 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 789 } 790 oldmid = mid; 791 792 } 793 794 *key = 0; 795 *offset = 0; 796 return INVALID_2022; 797 } 798 799 /*runs through a state machine to determine the escape sequence - codepage correspondance 800 */ 801 static void 802 changeState_2022(UConverter* _this, 803 const char** source, 804 const char* sourceLimit, 805 Variant2022 var, 806 UErrorCode* err){ 807 UCNV_TableStates_2022 value; 808 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 809 uint32_t key = myData2022->key; 810 int32_t offset = 0; 811 
int8_t initialToULength = _this->toULength; 812 char c; 813 814 value = VALID_NON_TERMINAL_2022; 815 while (*source < sourceLimit) { 816 c = *(*source)++; 817 _this->toUBytes[_this->toULength++]=(uint8_t)c; 818 value = getKey_2022(c,(int32_t *) &key, &offset); 819 820 switch (value){ 821 822 case VALID_NON_TERMINAL_2022 : 823 /* continue with the loop */ 824 break; 825 826 case VALID_TERMINAL_2022: 827 key = 0; 828 goto DONE; 829 830 case INVALID_2022: 831 goto DONE; 832 833 case VALID_MAYBE_TERMINAL_2022: 834 #ifdef U_ENABLE_GENERIC_ISO_2022 835 /* ESC ( B is ambiguous only for ISO_2022 itself */ 836 if(var == ISO_2022) { 837 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 838 _this->toULength = 0; 839 840 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 841 842 /* continue with the loop */ 843 value = VALID_NON_TERMINAL_2022; 844 break; 845 } else 846 #endif 847 { 848 /* not ISO_2022 itself, finish here */ 849 value = VALID_TERMINAL_2022; 850 key = 0; 851 goto DONE; 852 } 853 } 854 } 855 856 DONE: 857 myData2022->key = key; 858 859 if (value == VALID_NON_TERMINAL_2022) { 860 /* indicate that the escape sequence is incomplete: key!=0 */ 861 return; 862 } else if (value == INVALID_2022 ) { 863 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 864 } else /* value == VALID_TERMINAL_2022 */ { 865 switch(var){ 866 #ifdef U_ENABLE_GENERIC_ISO_2022 867 case ISO_2022: 868 { 869 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 870 if(chosenConverterName == NULL) { 871 /* SS2 or SS3 */ 872 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 873 _this->toUCallbackReason = UCNV_UNASSIGNED; 874 return; 875 } 876 877 _this->mode = UCNV_SI; 878 ucnv_close(myData2022->currentConverter); 879 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 880 if(U_SUCCESS(*err)) { 881 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 882 _this->mode = 
UCNV_SO; 883 } 884 break; 885 } 886 #endif 887 case ISO_2022_JP: 888 { 889 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 890 switch(tempState) { 891 case INVALID_STATE: 892 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 893 break; 894 case SS2_STATE: 895 if(myData2022->toU2022State.cs[2]!=0) { 896 if(myData2022->toU2022State.g<2) { 897 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 898 } 899 myData2022->toU2022State.g=2; 900 } else { 901 /* illegal to have SS2 before a matching designator */ 902 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 903 } 904 break; 905 /* case SS3_STATE: not used in ISO-2022-JP-x */ 906 case ISO8859_1: 907 case ISO8859_7: 908 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 909 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 910 } else { 911 /* G2 charset for SS2 */ 912 myData2022->toU2022State.cs[2]=(int8_t)tempState; 913 } 914 break; 915 default: 916 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 917 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 918 } else { 919 /* G0 charset */ 920 myData2022->toU2022State.cs[0]=(int8_t)tempState; 921 } 922 break; 923 } 924 } 925 break; 926 case ISO_2022_CN: 927 { 928 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 929 switch(tempState) { 930 case INVALID_STATE: 931 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 932 break; 933 case SS2_STATE: 934 if(myData2022->toU2022State.cs[2]!=0) { 935 if(myData2022->toU2022State.g<2) { 936 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 937 } 938 myData2022->toU2022State.g=2; 939 } else { 940 /* illegal to have SS2 before a matching designator */ 941 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 942 } 943 break; 944 case SS3_STATE: 945 if(myData2022->toU2022State.cs[3]!=0) { 946 if(myData2022->toU2022State.g<2) { 947 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 948 } 949 myData2022->toU2022State.g=3; 950 } else { 951 /* illegal to have SS3 before a matching designator */ 952 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 953 } 954 break; 
955 case ISO_IR_165: 956 if(myData2022->version==0) { 957 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 958 break; 959 } 960 /*fall through*/ 961 case GB2312_1: 962 /*fall through*/ 963 case CNS_11643_1: 964 myData2022->toU2022State.cs[1]=(int8_t)tempState; 965 break; 966 case CNS_11643_2: 967 myData2022->toU2022State.cs[2]=(int8_t)tempState; 968 break; 969 default: 970 /* other CNS 11643 planes */ 971 if(myData2022->version==0) { 972 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 973 } else { 974 myData2022->toU2022State.cs[3]=(int8_t)tempState; 975 } 976 break; 977 } 978 } 979 break; 980 case ISO_2022_KR: 981 if(offset==0x30){ 982 /* nothing to be done, just accept this one escape sequence */ 983 } else { 984 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 985 } 986 break; 987 988 default: 989 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 990 break; 991 } 992 } 993 if(U_SUCCESS(*err)) { 994 _this->toULength = 0; 995 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 996 if(_this->toULength>1) { 997 /* 998 * Ticket 5691: consistent illegal sequences: 999 * - We include at least the first byte (ESC) in the illegal sequence. 1000 * - If any of the non-initial bytes could be the start of a character, 1001 * we stop the illegal sequence before the first one of those. 1002 * In escape sequences, all following bytes are "printable", that is, 1003 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 1004 * they are valid single/lead bytes. 1005 * For simplicity, we always only report the initial ESC byte as the 1006 * illegal sequence and back out all other bytes we looked at. 1007 */ 1008 /* Back out some bytes. */ 1009 int8_t backOutDistance=_this->toULength-1; 1010 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 1011 if(backOutDistance<=bytesFromThisBuffer) { 1012 /* same as initialToULength<=1 */ 1013 *source-=backOutDistance; 1014 } else { 1015 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 1016 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 1017 /* same as -(initialToULength-1) */ 1018 /* preToULength is negative! */ 1019 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 1020 *source-=bytesFromThisBuffer; 1021 } 1022 _this->toULength=1; 1023 } 1024 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 1025 _this->toUCallbackReason = UCNV_UNASSIGNED; 1026 } 1027 } 1028 1029 /*Checks the characters of the buffer against valid 2022 escape sequences 1030 *if the match we return a pointer to the initial start of the sequence otherwise 1031 *we return sourceLimit 1032 */ 1033 /*for 2022 looks ahead in the stream 1034 *to determine the longest possible convertible 1035 *data stream 1036 */ 1037 static inline const char* 1038 getEndOfBuffer_2022(const char** source, 1039 const char* sourceLimit, 1040 UBool /*flush*/){ 1041 1042 const char* mySource = *source; 1043 1044 #ifdef U_ENABLE_GENERIC_ISO_2022 1045 if (*source >= sourceLimit) 1046 return sourceLimit; 1047 1048 do{ 1049 1050 if (*mySource == ESC_2022){ 1051 int8_t i; 1052 int32_t key = 0; 1053 int32_t offset; 1054 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1055 1056 /* Kludge: I could not 1057 * figure out the reason for validating an escape sequence 1058 * twice - once here and once in changeState_2022(). 1059 * is it possible to have an ESC character in a ISO2022 1060 * byte stream which is valid in a code page? Is it legal? 
1061 */ 1062 for (i=0; 1063 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1064 i++) { 1065 value = getKey_2022(*(mySource+i), &key, &offset); 1066 } 1067 if (value > 0 || *mySource==ESC_2022) 1068 return mySource; 1069 1070 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1071 return sourceLimit; 1072 } 1073 }while (++mySource < sourceLimit); 1074 1075 return sourceLimit; 1076 #else 1077 while(mySource < sourceLimit && *mySource != ESC_2022) { 1078 ++mySource; 1079 } 1080 return mySource; 1081 #endif 1082 } 1083 1084 1085 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1086 * any future change in _MBCSFromUChar32() function should be reflected here. 1087 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1088 */ 1089 static inline int32_t 1090 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1091 UChar32 c, 1092 uint32_t* value, 1093 UBool useFallback, 1094 int outputType) 1095 { 1096 const int32_t *cx; 1097 const uint16_t *table; 1098 uint32_t stage2Entry; 1099 uint32_t myValue; 1100 int32_t length; 1101 const uint8_t *p; 1102 /* 1103 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1104 * Use internal version of ucnv_open() that verifies that the new structures are available, 1105 * else U_INTERNAL_PROGRAM_ERROR. 
1106 */ 1107 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1108 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1109 table=sharedData->mbcs.fromUnicodeTable; 1110 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1111 /* get the bytes and the length for the output */ 1112 if(outputType==MBCS_OUTPUT_2){ 1113 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1114 if(myValue<=0xff) { 1115 length=1; 1116 } else { 1117 length=2; 1118 } 1119 } else /* outputType==MBCS_OUTPUT_3 */ { 1120 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1121 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1122 if(myValue<=0xff) { 1123 length=1; 1124 } else if(myValue<=0xffff) { 1125 length=2; 1126 } else { 1127 length=3; 1128 } 1129 } 1130 /* is this code point assigned, or do we use fallbacks? */ 1131 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1132 /* assigned */ 1133 *value=myValue; 1134 return length; 1135 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1136 /* 1137 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1138 * There is no way with this data structure for fallback output 1139 * to be a zero byte. 1140 */ 1141 *value=myValue; 1142 return -length; 1143 } 1144 } 1145 1146 cx=sharedData->mbcs.extIndexes; 1147 if(cx!=NULL) { 1148 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1149 } 1150 1151 /* unassigned */ 1152 return 0; 1153 } 1154 1155 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1156 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1157 * @param retval pointer to output byte 1158 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1159 */ 1160 static inline int32_t 1161 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1162 UChar32 c, 1163 uint32_t* retval, 1164 UBool useFallback) 1165 { 1166 const uint16_t *table; 1167 int32_t value; 1168 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1169 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1170 return 0; 1171 } 1172 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1173 table=sharedData->mbcs.fromUnicodeTable; 1174 /* get the byte for the output */ 1175 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1176 /* is this code point assigned, or do we use fallbacks? */ 1177 *retval=(uint32_t)(value&0xff); 1178 if(value>=0xf00) { 1179 return 1; /* roundtrip */ 1180 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1181 return -1; /* fallback taken */ 1182 } else { 1183 return 0; /* no mapping */ 1184 } 1185 } 1186 1187 /* 1188 * Check that the result is a 2-byte value with each byte in the range A1..FE 1189 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1190 * to move it to the ISO 2022 range 21..7E. 1191 * Return 0 if out of range. 1192 */ 1193 static inline uint32_t 1194 _2022FromGR94DBCS(uint32_t value) { 1195 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1196 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1197 ) { 1198 return value - 0x8080; /* shift down to 21..7e byte range */ 1199 } else { 1200 return 0; /* not valid for ISO 2022 */ 1201 } 1202 } 1203 1204 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1205 /* 1206 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1207 * 2 byte value that is in the range A1..FE for each byte. 
#if 0   /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 *
 * Disabled (dead) code, kept for reference only.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*  Only compiled when U_ENABLE_GENERIC_ISO_2022 is defined.
*/

/*
 * toUnicode implementation for the generic ISO-2022 converter.
 *
 * Alternates between two steps until the input is consumed or an error occurs:
 *  1. convert the bytes up to the next ESC (as found by getEndOfBuffer_2022())
 *     with myData->currentConverter, opening an "ASCII" converter if none is set;
 *  2. let changeState_2022() consume the escape sequence.
 *
 * Overflow output and partial/invalid input bytes are copied from the
 * sub-converter back into the outer converter before returning, so that the
 * caller only ever looks at args->converter.
 *
 * @param args conversion arguments; source/target are advanced in place
 * @param err  in/out error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* key==0: we are NOT in the middle of an escape sequence */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    /* no designation seen yet: open an ASCII sub-converter */
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily swap in the sub-converter, restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    /* only flush the sub-converter if this run really reaches the end of the input */
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 *
                 * Note on grouping: && binds tighter than ||, so the third
                 * parenthesized operand reads as
                 *   (offsets requested && something was consumed or produced) ||
                 *   (stopped before the real limit && sub-converter holds partial input)
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC, or continuing a partial escape sequence (key!=0) */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending input in cnv->toUBytes/toULength (two bytes, high byte
 * first, if sourceChar>0xff; otherwise one byte) and sets the error code:
 * U_INVALID_CHAR_FOUND if targetUniChar is missingCharMarker-1,
 * U_ILLEGAL_CHAR_FOUND for any other value.
 *
 * @param cnv           converter whose toUBytes[]/toULength are overwritten
 * @param sourceChar    the input byte(s) that could not be converted
 * @param targetUniChar the lookup result that classifies the failure
 * @param err           out: always set to one of the two error codes above
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}
1366 * The converter iterates over each Unicode codepoint 1367 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is 1368 * processed one char at a time it would make sense to reduce the extra processing a canned converter 1369 * would do as far as possible. 1370 * 1371 * If the implementation of these macros or structure of sharedData struct change in the future, make 1372 * sure that ISO-2022 is also changed. 1373 *************************************************************************************************** 1374 */ 1375 1376 /*************************************************************************************************** 1377 * Rules for ISO-2022-jp encoding 1378 * (i) Escape sequences must be fully contained within a line they should not 1379 * span new lines or CRs 1380 * (ii) If the last character on a line is represented by two bytes then an ASCII or 1381 * JIS-Roman character escape sequence should follow before the line terminates 1382 * (iii) If the first character on the line is represented by two bytes then a two 1383 * byte character escape sequence should precede it 1384 * (iv) If no escape sequence is encountered then the characters are ASCII 1385 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, 1386 * and invoked with SS2 (ESC N). 1387 * (vi) If there is any G0 designation in text, there must be a switch to 1388 * ASCII or to JIS X 0201-Roman before a space character (but not 1389 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control 1390 * characters such as tab or CRLF. 
1391 * (vi) Supported encodings: 1392 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 1393 * 1394 * source : RFC-1554 1395 * 1396 * JISX201, JISX208,JISX212 : new .cnv data files created 1397 * KSC5601 : alias to ibm-949 mapping table 1398 * GB2312 : alias to ibm-1386 mapping table 1399 * ISO-8859-1 : Algorithmic implemented as LATIN1 case 1400 * ISO-8859-7 : alisas to ibm-9409 mapping table 1401 */ 1402 1403 /* preference order of JP charsets */ 1404 static const StateEnum jpCharsetPref[]={ 1405 ASCII, 1406 JISX201, 1407 ISO8859_1, 1408 ISO8859_7, 1409 JISX208, 1410 JISX212, 1411 GB2312, 1412 KSC5601, 1413 HWKANA_7BIT 1414 }; 1415 1416 /* 1417 * The escape sequences must be in order of the enum constants like JISX201 = 3, 1418 * not in order of jpCharsetPref[]! 1419 */ 1420 static const char escSeqChars[][6] ={ 1421 "\x1B\x28\x42", /* <ESC>(B ASCII */ 1422 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ 1423 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ 1424 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ 1425 "\x1B\x24\x42", /* <ESC>$B JISX-208 */ 1426 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ 1427 "\x1B\x24\x41", /* <ESC>$A GB2312 */ 1428 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ 1429 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ 1430 1431 }; 1432 static const int8_t escSeqCharsLen[] ={ 1433 3, /* length of <ESC>(B ASCII */ 1434 3, /* length of <ESC>.A ISO-8859-1 */ 1435 3, /* length of <ESC>.F ISO-8859-7 */ 1436 3, /* length of <ESC>(J JISX-201 */ 1437 3, /* length of <ESC>$B JISX-208 */ 1438 4, /* length of <ESC>$(D JISX-212 */ 1439 3, /* length of <ESC>$A GB2312 */ 1440 4, /* length of <ESC>$(C KSC5601 */ 1441 3 /* length of <ESC>(I HWKANA_7BIT */ 1442 }; 1443 1444 /* 1445 * The iteration over various code pages works this way: 1446 * i) Get the currentState from myConverterData->currentState 1447 * ii) Check if the character is mapped to a valid character in the currentState 1448 * Yes -> a) set the initIterState to currentState 1449 * b) remain in this 
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from ASCII: 5C is the yen sign and 7E the overline.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;    /* YEN SIGN */
    case 0x7e:
        return 0x203e;  /* OVERLINE */
    default:
        return value;
    }
}

/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte positions are taken
 * by U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trailByte;
    uint32_t jis;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trailByte = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF each cover two JIS rows */
    jis = value & 0xff00;
    jis -= (jis <= 0x9f00) ? 0x7000 : 0xb000;
    jis <<= 1;

    if(trailByte > 0x9e) {
        /* trail 9F..FC: second (even) row of the pair */
        return jis | (uint32_t)(trailByte - 0x7e);
    }

    /* trail up to 9E: first (odd) row; Shift-JIS skips 7F in this half */
    jis -= 0x100;
    return jis | (uint32_t)(trailByte - ((trailByte <= 0x7e) ? 0x1f : 0x20));
}

/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) != 0) {
        /* odd row: first half of the Shift-JIS trail byte range */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        lead = c1;
        if((uint8_t)(c2 - 0x21) <= (0x7e - 0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1569 */ 1570 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1571 0x2123, /* U+FF61 */ 1572 0x2156, 1573 0x2157, 1574 0x2122, 1575 0x2126, 1576 0x2572, 1577 0x2521, 1578 0x2523, 1579 0x2525, 1580 0x2527, 1581 0x2529, 1582 0x2563, 1583 0x2565, 1584 0x2567, 1585 0x2543, 1586 0x213C, /* U+FF70 */ 1587 0x2522, 1588 0x2524, 1589 0x2526, 1590 0x2528, 1591 0x252A, 1592 0x252B, 1593 0x252D, 1594 0x252F, 1595 0x2531, 1596 0x2533, 1597 0x2535, 1598 0x2537, 1599 0x2539, 1600 0x253B, 1601 0x253D, 1602 0x253F, /* U+FF80 */ 1603 0x2541, 1604 0x2544, 1605 0x2546, 1606 0x2548, 1607 0x254A, 1608 0x254B, 1609 0x254C, 1610 0x254D, 1611 0x254E, 1612 0x254F, 1613 0x2552, 1614 0x2555, 1615 0x2558, 1616 0x255B, 1617 0x255E, 1618 0x255F, /* U+FF90 */ 1619 0x2560, 1620 0x2561, 1621 0x2562, 1622 0x2564, 1623 0x2566, 1624 0x2568, 1625 0x2569, 1626 0x256A, 1627 0x256B, 1628 0x256C, 1629 0x256D, 1630 0x256F, 1631 0x2573, 1632 0x212B, 1633 0x212C /* U+FF9F */ 1634 }; 1635 1636 static void 1637 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1638 UConverter *cnv = args->converter; 1639 UConverterDataISO2022 *converterData; 1640 ISO2022State *pFromU2022State; 1641 uint8_t *target = (uint8_t *) args->target; 1642 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1643 const UChar* source = args->source; 1644 const UChar* sourceLimit = args->sourceLimit; 1645 int32_t* offsets = args->offsets; 1646 UChar32 sourceChar; 1647 char buffer[8]; 1648 int32_t len, outLen; 1649 int8_t choices[10]; 1650 int32_t choiceCount; 1651 uint32_t targetValue = 0; 1652 UBool useFallback; 1653 1654 int32_t i; 1655 int8_t cs, g; 1656 1657 /* set up the state */ 1658 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1659 pFromU2022State = &converterData->fromU2022State; 1660 1661 choiceCount = 0; 1662 1663 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1664 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1665 goto getTrail; 1666 } 1667 1668 while(source < sourceLimit) { 1669 if(target < targetLimit) { 1670 1671 sourceChar = *(source++); 1672 /*check if the char is a First surrogate*/ 1673 if(U16_IS_SURROGATE(sourceChar)) { 1674 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1675 getTrail: 1676 /*look ahead to find the trail surrogate*/ 1677 if(source < sourceLimit) { 1678 /* test the following code unit */ 1679 UChar trail=(UChar) *source; 1680 if(U16_IS_TRAIL(trail)) { 1681 source++; 1682 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1683 cnv->fromUChar32=0x00; 1684 /* convert this supplementary code point */ 1685 /* exit this condition tree */ 1686 } else { 1687 /* this is an unmatched lead code unit (1st surrogate) */ 1688 /* callback(illegal) */ 1689 *err=U_ILLEGAL_CHAR_FOUND; 1690 cnv->fromUChar32=sourceChar; 1691 break; 1692 } 1693 } else { 1694 /* no more input */ 1695 cnv->fromUChar32=sourceChar; 1696 break; 1697 } 1698 } else { 1699 /* this is an unmatched trail code unit (2nd surrogate) */ 1700 /* callback(illegal) */ 1701 *err=U_ILLEGAL_CHAR_FOUND; 1702 cnv->fromUChar32=sourceChar; 1703 break; 1704 } 1705 } 1706 1707 /* do not convert SO/SI/ESC */ 1708 if(IS_2022_CONTROL(sourceChar)) { 1709 /* callback(illegal) */ 1710 *err=U_ILLEGAL_CHAR_FOUND; 1711 cnv->fromUChar32=sourceChar; 1712 break; 1713 } 1714 1715 /* do the conversion */ 1716 1717 if(choiceCount == 0) { 1718 uint16_t csm; 1719 1720 /* 1721 * The csm variable keeps track of which charsets are allowed 1722 * and not used yet while building the choices[]. 1723 */ 1724 csm = jpCharsetMasks[converterData->version]; 1725 choiceCount = 0; 1726 1727 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1728 if(converterData->version == 3 || converterData->version == 4) { 1729 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1730 } 1731 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1732 csm &= ~CSM(HWKANA_7BIT); 1733 1734 /* try the current G0 charset */ 1735 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1736 csm &= ~CSM(cs); 1737 1738 /* try the current G2 charset */ 1739 if((cs = pFromU2022State->cs[2]) != 0) { 1740 choices[choiceCount++] = cs; 1741 csm &= ~CSM(cs); 1742 } 1743 1744 /* try all the other possible charsets */ 1745 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1746 cs = (int8_t)jpCharsetPref[i]; 1747 if(CSM(cs) & csm) { 1748 choices[choiceCount++] = cs; 1749 csm &= ~CSM(cs); 1750 } 1751 } 1752 } 1753 1754 cs = g = 0; 1755 /* 1756 * len==0: no mapping found yet 1757 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1758 * len>0: found a roundtrip result, done 1759 */ 1760 len = 0; 1761 /* 1762 * We will turn off useFallback after finding a fallback, 1763 * but we still get fallbacks from PUA code points as usual. 1764 * Therefore, we will also need to check that we don't overwrite 1765 * an early fallback with a later one. 1766 */ 1767 useFallback = cnv->useFallback; 1768 1769 for(i = 0; i < choiceCount && len <= 0; ++i) { 1770 uint32_t value; 1771 int32_t len2; 1772 int8_t cs0 = choices[i]; 1773 switch(cs0) { 1774 case ASCII: 1775 if(sourceChar <= 0x7f) { 1776 targetValue = (uint32_t)sourceChar; 1777 len = 1; 1778 cs = cs0; 1779 g = 0; 1780 } 1781 break; 1782 case ISO8859_1: 1783 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1784 targetValue = (uint32_t)sourceChar - 0x80; 1785 len = 1; 1786 cs = cs0; 1787 g = 2; 1788 } 1789 break; 1790 case HWKANA_7BIT: 1791 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1792 if(converterData->version==3) { 1793 /* JIS7: use G1 (SO) */ 1794 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1795 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1796 len = 1; 1797 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1798 g = 1; 1799 } else if(converterData->version==4) { 1800 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1801 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1802 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1803 len = 1; 1804 1805 cs = pFromU2022State->cs[0]; 1806 if(IS_JP_DBCS(cs)) { 1807 /* switch from a DBCS charset to JISX201 */ 1808 cs = (int8_t)JISX201; 1809 } 1810 /* else stay in the current G0 charset */ 1811 g = 0; 1812 } 1813 /* else do not use HWKANA_7BIT with other versions */ 1814 } 1815 break; 1816 case JISX201: 1817 /* G0 SBCS */ 1818 value = jisx201FromU(sourceChar); 1819 if(value <= 0x7f) { 1820 targetValue = value; 1821 len = 1; 1822 cs = cs0; 1823 g = 0; 1824 useFallback = FALSE; 1825 } 1826 break; 1827 case JISX208: 1828 /* G0 DBCS from Shift-JIS table */ 1829 len2 = MBCS_FROM_UCHAR32_ISO2022( 1830 converterData->myConverterArray[cs0], 1831 sourceChar, &value, 1832 useFallback, MBCS_OUTPUT_2); 1833 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1834 value = _2022FromSJIS(value); 1835 if(value != 0) { 1836 targetValue = value; 1837 len = len2; 1838 cs = cs0; 1839 g = 0; 1840 useFallback = FALSE; 1841 } 1842 } else if(len == 0 && useFallback && 1843 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1844 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1845 len = -2; 1846 cs = cs0; 1847 g = 0; 1848 useFallback = FALSE; 1849 } 1850 break; 1851 case ISO8859_7: 1852 /* G0 SBCS forced to 7-bit output */ 1853 len2 = MBCS_SINGLE_FROM_UCHAR32( 1854 converterData->myConverterArray[cs0], 1855 sourceChar, &value, 1856 useFallback); 1857 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1858 targetValue = value - 0x80; 1859 len = len2; 1860 cs = 
cs0; 1861 g = 2; 1862 useFallback = FALSE; 1863 } 1864 break; 1865 default: 1866 /* G0 DBCS */ 1867 len2 = MBCS_FROM_UCHAR32_ISO2022( 1868 converterData->myConverterArray[cs0], 1869 sourceChar, &value, 1870 useFallback, MBCS_OUTPUT_2); 1871 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1872 if(cs0 == KSC5601) { 1873 /* 1874 * Check for valid bytes for the encoding scheme. 1875 * This is necessary because the sub-converter (windows-949) 1876 * has a broader encoding scheme than is valid for 2022. 1877 */ 1878 value = _2022FromGR94DBCS(value); 1879 if(value == 0) { 1880 break; 1881 } 1882 } 1883 targetValue = value; 1884 len = len2; 1885 cs = cs0; 1886 g = 0; 1887 useFallback = FALSE; 1888 } 1889 break; 1890 } 1891 } 1892 1893 if(len != 0) { 1894 if(len < 0) { 1895 len = -len; /* fallback */ 1896 } 1897 outLen = 0; /* count output bytes */ 1898 1899 /* write SI if necessary (only for JIS7) */ 1900 if(pFromU2022State->g == 1 && g == 0) { 1901 buffer[outLen++] = UCNV_SI; 1902 pFromU2022State->g = 0; 1903 } 1904 1905 /* write the designation sequence if necessary */ 1906 if(cs != pFromU2022State->cs[g]) { 1907 int32_t escLen = escSeqCharsLen[cs]; 1908 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1909 outLen += escLen; 1910 pFromU2022State->cs[g] = cs; 1911 1912 /* invalidate the choices[] */ 1913 choiceCount = 0; 1914 } 1915 1916 /* write the shift sequence if necessary */ 1917 if(g != pFromU2022State->g) { 1918 switch(g) { 1919 /* case 0 handled before writing escapes */ 1920 case 1: 1921 buffer[outLen++] = UCNV_SO; 1922 pFromU2022State->g = 1; 1923 break; 1924 default: /* case 2 */ 1925 buffer[outLen++] = 0x1b; 1926 buffer[outLen++] = 0x4e; 1927 break; 1928 /* no case 3: no SS3 in ISO-2022-JP-x */ 1929 } 1930 } 1931 1932 /* write the output bytes */ 1933 if(len == 1) { 1934 buffer[outLen++] = (char)targetValue; 1935 } else /* len == 2 */ { 1936 buffer[outLen++] = (char)(targetValue >> 8); 1937 buffer[outLen++] = 
(char)targetValue; 1938 } 1939 } else { 1940 /* 1941 * if we cannot find the character after checking all codepages 1942 * then this is an error 1943 */ 1944 *err = U_INVALID_CHAR_FOUND; 1945 cnv->fromUChar32=sourceChar; 1946 break; 1947 } 1948 1949 if(sourceChar == CR || sourceChar == LF) { 1950 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1951 pFromU2022State->cs[2] = 0; 1952 choiceCount = 0; 1953 } 1954 1955 /* output outLen>0 bytes in buffer[] */ 1956 if(outLen == 1) { 1957 *target++ = buffer[0]; 1958 if(offsets) { 1959 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1960 } 1961 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1962 *target++ = buffer[0]; 1963 *target++ = buffer[1]; 1964 if(offsets) { 1965 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1966 *offsets++ = sourceIndex; 1967 *offsets++ = sourceIndex; 1968 } 1969 } else { 1970 fromUWriteUInt8( 1971 cnv, 1972 buffer, outLen, 1973 &target, (const char *)targetLimit, 1974 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1975 err); 1976 if(U_FAILURE(*err)) { 1977 break; 1978 } 1979 } 1980 } /* end if(myTargetIndex<myTargetLength) */ 1981 else{ 1982 *err =U_BUFFER_OVERFLOW_ERROR; 1983 break; 1984 } 1985 1986 }/* end while(mySourceIndex<mySourceLength) */ 1987 1988 /* 1989 * the end of the input stream and detection of truncated input 1990 * are handled by the framework, but for ISO-2022-JP conversion 1991 * we need to be in ASCII mode at the very end 1992 * 1993 * conditions: 1994 * successful 1995 * in SO mode or not in ASCII mode 1996 * end of input and no truncated input 1997 */ 1998 if( U_SUCCESS(*err) && 1999 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2000 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2001 ) { 2002 int32_t sourceIndex; 2003 2004 outLen = 0; 2005 2006 if(pFromU2022State->g != 0) { 2007 buffer[outLen++] = UCNV_SI; 
pFromU2022State->g = 0;
        }

        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}

/*************** to unicode *******************/

/*
 * Converts ISO-2022-JP family bytes in [args->source, args->sourceLimit)
 * to UTF-16 code units written at args->target, and records one source
 * offset per output unit when args->offsets is non-NULL.
 *
 * State carried between calls:
 * - myData->key != 0: a partial escape sequence is pending; conversion
 *   resumes inside the ESC handling (label "escape").
 * - args->converter->toULength == 1: the lead byte of a double-byte
 *   character is stored in toUBytes[0]; conversion resumes at "getTrailByte"
 *   (only when there is both input and output room).
 * - myData->toU2022State: the designated charsets cs[] and the active G set g.
 * - myData->isEmptySegment: TRUE right after a completed escape sequence;
 *   in version 0 a second escape sequence with nothing in between is
 *   reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 *
 * On return args->source and args->target are advanced past what was
 * consumed/produced. *err is set to U_BUFFER_OVERFLOW_ERROR when the target
 * is full; unconvertible input is handed to toUnicodeCallback(), and
 * escape-sequence errors come from changeState_2022().
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* stays at missingCharMarker unless one of the branches below converts the input */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* leaves the switch with targetUniChar unconverted, so the callback below is invoked */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* leaves the switch with targetUniChar unconverted, so the callback below is invoked */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* report the bytes consumed by this escape sequence (plus any held over from before) */
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        /* the 7-bit byte stands for the byte with the high bit set */
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek only; the trail byte is consumed below once we know it belongs to this character */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                /* the JIS X 0208 table is looked up with Shift-JIS bytes */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                    } else {
                        /* the trail byte is not available yet: remember the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    /* mySourceChar > 0xff means that two source bytes were consumed for this character */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}


/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
*      and SI to shift into single byte mode
*/
/*
 * Version-1 fromUnicode path: runs ucnv_MBCSFromUnicodeWithOffsets() with
 * args->converter temporarily replaced by the sub-converter
 * (myConverterData->currentConverter), carrying fromUChar32 across in both
 * directions and, on buffer overflow, moving the sub-converter's
 * charErrorBuffer contents into the public converter.
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){

    UConverter* saveConv = args->converter;
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
    args->converter=myConverterData->currentConverter;

    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;

    if(*err == U_BUFFER_OVERFLOW_ERROR) {
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
            uprv_memcpy(
                saveConv->charErrorBuffer,
                myConverterData->currentConverter->charErrorBuffer,
                myConverterData->currentConverter->charErrorBufferLength);
        }
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
myConverterData->currentConverter->charErrorBufferLength = 0; 2325 } 2326 args->converter=saveConv; 2327 } 2328 2329 static void 2330 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2331 2332 const UChar *source = args->source; 2333 const UChar *sourceLimit = args->sourceLimit; 2334 unsigned char *target = (unsigned char *) args->target; 2335 unsigned char *targetLimit = (unsigned char *) args->targetLimit; 2336 int32_t* offsets = args->offsets; 2337 uint32_t targetByteUnit = 0x0000; 2338 UChar32 sourceChar = 0x0000; 2339 UBool isTargetByteDBCS; 2340 UBool oldIsTargetByteDBCS; 2341 UConverterDataISO2022 *converterData; 2342 UConverterSharedData* sharedData; 2343 UBool useFallback; 2344 int32_t length =0; 2345 2346 converterData=(UConverterDataISO2022*)args->converter->extraInfo; 2347 /* if the version is 1 then the user is requesting 2348 * conversion with ibm-25546 pass the arguments to 2349 * MBCS converter and return 2350 */ 2351 if(converterData->version==1){ 2352 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2353 return; 2354 } 2355 2356 /* initialize data */ 2357 sharedData = converterData->currentConverter->sharedData; 2358 useFallback = args->converter->useFallback; 2359 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 2360 oldIsTargetByteDBCS = isTargetByteDBCS; 2361 2362 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; 2363 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 2364 goto getTrail; 2365 } 2366 while(source < sourceLimit){ 2367 2368 targetByteUnit = missingCharMarker; 2369 2370 if(target < (unsigned char*) args->targetLimit){ 2371 sourceChar = *source++; 2372 2373 /* do not convert SO/SI/ESC */ 2374 if(IS_2022_CONTROL(sourceChar)) { 2375 /* callback(illegal) */ 2376 *err=U_ILLEGAL_CHAR_FOUND; 2377 args->converter->fromUChar32=sourceChar; 2378 break; 2379 } 2380 2381 length = 
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 2382 if(length < 0) { 2383 length = -length; /* fallback */ 2384 } 2385 /* only DBCS or SBCS characters are expected*/ 2386 /* DB characters with high bit set to 1 are expected */ 2387 if( length > 2 || length==0 || 2388 (length == 1 && targetByteUnit > 0x7f) || 2389 (length == 2 && 2390 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 2391 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 2392 ) { 2393 targetByteUnit=missingCharMarker; 2394 } 2395 if (targetByteUnit != missingCharMarker){ 2396 2397 oldIsTargetByteDBCS = isTargetByteDBCS; 2398 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 2399 /* append the shift sequence */ 2400 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 2401 2402 if (isTargetByteDBCS) 2403 *target++ = UCNV_SO; 2404 else 2405 *target++ = UCNV_SI; 2406 if(offsets) 2407 *(offsets++) = (int32_t)(source - args->source-1); 2408 } 2409 /* write the targetUniChar to target */ 2410 if(targetByteUnit <= 0x00FF){ 2411 if( target < targetLimit){ 2412 *(target++) = (unsigned char) targetByteUnit; 2413 if(offsets){ 2414 *(offsets++) = (int32_t)(source - args->source-1); 2415 } 2416 2417 }else{ 2418 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 2419 *err = U_BUFFER_OVERFLOW_ERROR; 2420 } 2421 }else{ 2422 if(target < targetLimit){ 2423 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 2424 if(offsets){ 2425 *(offsets++) = (int32_t)(source - args->source-1); 2426 } 2427 if(target < targetLimit){ 2428 *(target++) =(unsigned char) (targetByteUnit -0x80); 2429 if(offsets){ 2430 *(offsets++) = (int32_t)(source - args->source-1); 2431 } 2432 }else{ 2433 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 2434 *err = U_BUFFER_OVERFLOW_ERROR; 2435 } 2436 }else{ 2437 
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 2438 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 2439 *err = U_BUFFER_OVERFLOW_ERROR; 2440 } 2441 } 2442 2443 } 2444 else{ 2445 /* oops.. the code point is unassingned 2446 * set the error and reason 2447 */ 2448 2449 /*check if the char is a First surrogate*/ 2450 if(U16_IS_SURROGATE(sourceChar)) { 2451 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2452 getTrail: 2453 /*look ahead to find the trail surrogate*/ 2454 if(source < sourceLimit) { 2455 /* test the following code unit */ 2456 UChar trail=(UChar) *source; 2457 if(U16_IS_TRAIL(trail)) { 2458 source++; 2459 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2460 *err = U_INVALID_CHAR_FOUND; 2461 /* convert this surrogate code point */ 2462 /* exit this condition tree */ 2463 } else { 2464 /* this is an unmatched lead code unit (1st surrogate) */ 2465 /* callback(illegal) */ 2466 *err=U_ILLEGAL_CHAR_FOUND; 2467 } 2468 } else { 2469 /* no more input */ 2470 *err = U_ZERO_ERROR; 2471 } 2472 } else { 2473 /* this is an unmatched trail code unit (2nd surrogate) */ 2474 /* callback(illegal) */ 2475 *err=U_ILLEGAL_CHAR_FOUND; 2476 } 2477 } else { 2478 /* callback(unassigned) for a BMP code point */ 2479 *err = U_INVALID_CHAR_FOUND; 2480 } 2481 2482 args->converter->fromUChar32=sourceChar; 2483 break; 2484 } 2485 } /* end if(myTargetIndex<myTargetLength) */ 2486 else{ 2487 *err =U_BUFFER_OVERFLOW_ERROR; 2488 break; 2489 } 2490 2491 }/* end while(mySourceIndex<mySourceLength) */ 2492 2493 /* 2494 * the end of the input stream and detection of truncated input 2495 * are handled by the framework, but for ISO-2022-KR conversion 2496 * we need to be in ASCII mode at the very end 2497 * 2498 * conditions: 2499 * successful 2500 * not in ASCII mode 2501 * end of input and no truncated input 2502 */ 2503 if( U_SUCCESS(*err) && 2504 
isTargetByteDBCS && 2505 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2506 ) { 2507 int32_t sourceIndex; 2508 2509 /* we are switching to ASCII */ 2510 isTargetByteDBCS=FALSE; 2511 2512 /* get the source index of the last input character */ 2513 /* 2514 * TODO this would be simpler and more reliable if we used a pair 2515 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2516 * so that we could simply use the prevSourceIndex here; 2517 * this code gives an incorrect result for the rare case of an unmatched 2518 * trail surrogate that is alone in the last buffer of the text stream 2519 */ 2520 sourceIndex=(int32_t)(source-args->source); 2521 if(sourceIndex>0) { 2522 --sourceIndex; 2523 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2524 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2525 ) { 2526 --sourceIndex; 2527 } 2528 } else { 2529 sourceIndex=-1; 2530 } 2531 2532 fromUWriteUInt8( 2533 args->converter, 2534 SHIFT_IN_STR, 1, 2535 &target, (const char *)targetLimit, 2536 &offsets, sourceIndex, 2537 err); 2538 } 2539 2540 /*save the state and return */ 2541 args->source = source; 2542 args->target = (char*)target; 2543 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2544 } 2545 2546 /************************ To Unicode ***************************************/ 2547 2548 static void 2549 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2550 UErrorCode* err){ 2551 char const* sourceStart; 2552 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2553 2554 UConverterToUnicodeArgs subArgs; 2555 int32_t minArgsSize; 2556 2557 /* set up the subconverter arguments */ 2558 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2559 minArgsSize = args->size; 2560 } else { 2561 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2562 } 2563 2564 uprv_memcpy(&subArgs, args, minArgsSize); 2565 subArgs.size = (uint16_t)minArgsSize; 2566 subArgs.converter = 
myData->currentConverter; 2567 2568 /* remember the original start of the input for offsets */ 2569 sourceStart = args->source; 2570 2571 if(myData->key != 0) { 2572 /* continue with a partial escape sequence */ 2573 goto escape; 2574 } 2575 2576 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2577 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2578 subArgs.source = args->source; 2579 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2580 if(subArgs.source != subArgs.sourceLimit) { 2581 /* 2582 * get the current partial byte sequence 2583 * 2584 * it needs to be moved between the public and the subconverter 2585 * so that the conversion framework, which only sees the public 2586 * converter, can handle truncated and illegal input etc. 2587 */ 2588 if(args->converter->toULength > 0) { 2589 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2590 } 2591 subArgs.converter->toULength = args->converter->toULength; 2592 2593 /* 2594 * Convert up to the end of the input, or to before the next escape character. 2595 * Does not handle conversion extensions because the preToU[] state etc. 2596 * is not copied. 
2597 */ 2598 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2599 2600 if(args->offsets != NULL && sourceStart != args->source) { 2601 /* update offsets to base them on the actual start of the input */ 2602 int32_t *offsets = args->offsets; 2603 UChar *target = args->target; 2604 int32_t delta = (int32_t)(args->source - sourceStart); 2605 while(target < subArgs.target) { 2606 if(*offsets >= 0) { 2607 *offsets += delta; 2608 } 2609 ++offsets; 2610 ++target; 2611 } 2612 } 2613 args->source = subArgs.source; 2614 args->target = subArgs.target; 2615 args->offsets = subArgs.offsets; 2616 2617 /* copy input/error/overflow buffers */ 2618 if(subArgs.converter->toULength > 0) { 2619 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2620 } 2621 args->converter->toULength = subArgs.converter->toULength; 2622 2623 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2624 if(subArgs.converter->UCharErrorBufferLength > 0) { 2625 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2626 subArgs.converter->UCharErrorBufferLength); 2627 } 2628 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2629 subArgs.converter->UCharErrorBufferLength = 0; 2630 } 2631 } 2632 2633 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2634 return; 2635 } 2636 2637 escape: 2638 changeState_2022(args->converter, 2639 &(args->source), 2640 args->sourceLimit, 2641 ISO_2022_KR, 2642 err); 2643 } 2644 } 2645 2646 static void 2647 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2648 UErrorCode* err){ 2649 char tempBuf[2]; 2650 const char *mySource = ( char *) args->source; 2651 UChar *myTarget = args->target; 2652 const char *mySourceLimit = args->sourceLimit; 2653 UChar32 targetUniChar = 0x0000; 2654 UChar mySourceChar = 0x0000; 2655 UConverterDataISO2022* myData; 2656 UConverterSharedData* sharedData ; 2657 UBool useFallback; 2658 2659 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2660 if(myData->version==1){ 2661 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2662 return; 2663 } 2664 2665 /* initialize state */ 2666 sharedData = myData->currentConverter->sharedData; 2667 useFallback = args->converter->useFallback; 2668 2669 if(myData->key != 0) { 2670 /* continue with a partial escape sequence */ 2671 goto escape; 2672 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2673 /* continue with a partial double-byte character */ 2674 mySourceChar = args->converter->toUBytes[0]; 2675 args->converter->toULength = 0; 2676 goto getTrailByte; 2677 } 2678 2679 while(mySource< mySourceLimit){ 2680 2681 if(myTarget < args->targetLimit){ 2682 2683 mySourceChar= (unsigned char) *mySource++; 2684 2685 if(mySourceChar==UCNV_SI){ 2686 myData->toU2022State.g = 0; 2687 if (myData->isEmptySegment) { 2688 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2689 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2690 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2691 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2692 args->converter->toULength = 1; 2693 args->target = myTarget; 2694 args->source = mySource; 2695 return; 2696 } 2697 /*consume the source */ 2698 continue; 2699 }else if(mySourceChar==UCNV_SO){ 2700 myData->toU2022State.g = 1; 2701 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2702 /*consume the source */ 2703 continue; 2704 }else if(mySourceChar==ESC_2022){ 2705 mySource--; 2706 escape: 2707 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2708 changeState_2022(args->converter,&(mySource), 2709 mySourceLimit, ISO_2022_KR, err); 2710 if(U_FAILURE(*err)){ 2711 args->target = myTarget; 2712 args->source = mySource; 2713 return; 2714 } 2715 continue; 2716 } 2717 2718 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2719 if(myData->toU2022State.g == 1) { 2720 if(mySource < mySourceLimit) { 2721 int leadIsOk, trailIsOk; 2722 uint8_t trailByte; 2723 getTrailByte: 2724 targetUniChar = missingCharMarker; 2725 trailByte = (uint8_t)*mySource; 2726 /* 2727 * Ticket 5691: consistent illegal sequences: 2728 * - We include at least the first byte in the illegal sequence. 2729 * - If any of the non-initial bytes could be the start of a character, 2730 * we stop the illegal sequence before the first one of those. 2731 * 2732 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2733 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2734 * Otherwise we convert or report the pair of bytes. 2735 */ 2736 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2737 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2738 if (leadIsOk && trailIsOk) { 2739 ++mySource; 2740 tempBuf[0] = (char)(mySourceChar + 0x80); 2741 tempBuf[1] = (char)(trailByte + 0x80); 2742 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2743 mySourceChar = (mySourceChar << 8) | trailByte; 2744 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2745 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2746 ++mySource; 2747 /* add another bit so that the code below writes 2 bytes in case of error */ 2748 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2749 } 2750 } else { 2751 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2752 args->converter->toULength = 1; 2753 break; 2754 } 2755 } 2756 else if(mySourceChar <= 0x7f) { 2757 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2758 } else { 2759 targetUniChar = 0xffff; 2760 } 2761 if(targetUniChar < 0xfffe){ 2762 if(args->offsets) { 2763 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2764 } 2765 *(myTarget++)=(UChar)targetUniChar; 2766 } 2767 else { 2768 /* Call the callback function*/ 2769 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2770 break; 2771 } 2772 } 2773 else{ 2774 *err =U_BUFFER_OVERFLOW_ERROR; 2775 break; 2776 } 2777 } 2778 args->target = myTarget; 2779 args->source = mySource; 2780 } 2781 2782 /*************************** END ISO2022-KR *********************************/ 2783 2784 /*************************** ISO-2022-CN ********************************* 2785 * 2786 * Rules for ISO-2022-CN Encoding: 2787 * i) The designator sequence must appear once on a line before any instance 2788 * of character set it designates. 2789 * ii) If two lines contain characters from the same character set, both lines 2790 * must include the designator sequence. 2791 * iii) Once the designator sequence is known, a shifting sequence has to be found 2792 * to invoke the shifting 2793 * iv) All lines start in ASCII and end in ASCII. 2794 * v) Four shifting sequences are employed for this purpose: 2795 * 2796 * Sequcence ASCII Eq Charsets 2797 * ---------- ------- --------- 2798 * SI <SI> US-ASCII 2799 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2800 * SS2 <ESC>N CNS-11643-1992 Plane 2 2801 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2802 * 2803 * vi) 2804 * SOdesignator : ESC "$" ")" finalchar_for_SO 2805 * SS2designator : ESC "$" "*" finalchar_for_SS2 2806 * SS3designator : ESC "$" "+" finalchar_for_SS3 2807 * 2808 * ESC $ ) A Indicates the bytes following SO are Chinese 2809 * characters as defined in GB 2312-80, until 2810 * another SOdesignation appears 2811 * 2812 * 2813 * ESC $ ) E Indicates the bytes following SO are as defined 2814 * in ISO-IR-165 (for details, see section 2.1), 2815 * until another SOdesignation appears 2816 * 2817 * ESC $ ) G Indicates the bytes following SO are as defined 2818 * in CNS 11643-plane-1, until another 2819 * SOdesignation appears 2820 * 2821 * ESC $ * H Indicates the two 
bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must precede every 2 byte
*        sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must precede every 2 byte
*        sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must precede every 2 byte
*        sequence.)
*
*      ESC $ + K       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-5, until another SS3designation
*       appears
*
*      ESC $ + L       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-6, until another SS3designation
*       appears
*
*      ESC $ + M       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-7, until another SS3designation
*       appears
*
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
*       has its own designation information before any Chinese characters
*       appear
*
*/

/* The following are defined this way to make the strings truly readonly */
/* Each designator below is the 4-byte sequence ESC '$' <')' | '*' | '+'> <final byte> from the table above. */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Byte sequences for the ISO-2022-CN charsets, in table order.
 * Entry 0 is the SI shift rather than a designator.
 * NOTE(review): presumably indexed by the charset state value used in
 * ISO2022State.cs[] - confirm against the StateEnum definition.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643-1992 plane 7 */
};

static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];
    int32_t len;
    int8_t choices[3];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while( source < sourceLimit){
        if(target < targetLimit){

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
             if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail
surrogate*/ 2926 if(source < sourceLimit) { 2927 /* test the following code unit */ 2928 UChar trail=(UChar) *source; 2929 if(U16_IS_TRAIL(trail)) { 2930 source++; 2931 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2932 cnv->fromUChar32=0x00; 2933 /* convert this supplementary code point */ 2934 /* exit this condition tree */ 2935 } else { 2936 /* this is an unmatched lead code unit (1st surrogate) */ 2937 /* callback(illegal) */ 2938 *err=U_ILLEGAL_CHAR_FOUND; 2939 cnv->fromUChar32=sourceChar; 2940 break; 2941 } 2942 } else { 2943 /* no more input */ 2944 cnv->fromUChar32=sourceChar; 2945 break; 2946 } 2947 } else { 2948 /* this is an unmatched trail code unit (2nd surrogate) */ 2949 /* callback(illegal) */ 2950 *err=U_ILLEGAL_CHAR_FOUND; 2951 cnv->fromUChar32=sourceChar; 2952 break; 2953 } 2954 } 2955 2956 /* do the conversion */ 2957 if(sourceChar <= 0x007f ){ 2958 /* do not convert SO/SI/ESC */ 2959 if(IS_2022_CONTROL(sourceChar)) { 2960 /* callback(illegal) */ 2961 *err=U_ILLEGAL_CHAR_FOUND; 2962 cnv->fromUChar32=sourceChar; 2963 break; 2964 } 2965 2966 /* US-ASCII */ 2967 if(pFromU2022State->g == 0) { 2968 buffer[0] = (char)sourceChar; 2969 len = 1; 2970 } else { 2971 buffer[0] = UCNV_SI; 2972 buffer[1] = (char)sourceChar; 2973 len = 2; 2974 pFromU2022State->g = 0; 2975 choiceCount = 0; 2976 } 2977 if(sourceChar == CR || sourceChar == LF) { 2978 /* reset the state at the end of a line */ 2979 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2980 choiceCount = 0; 2981 } 2982 } 2983 else{ 2984 /* convert U+0080..U+10ffff */ 2985 int32_t i; 2986 int8_t cs, g; 2987 2988 if(choiceCount == 0) { 2989 /* try the current SO/G1 converter first */ 2990 choices[0] = pFromU2022State->cs[1]; 2991 2992 /* default to GB2312_1 if none is designated yet */ 2993 if(choices[0] == 0) { 2994 choices[0] = GB2312_1; 2995 } 2996 2997 if(converterData->version == 0) { 2998 /* ISO-2022-CN */ 2999 3000 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3001 if(choices[0] == GB2312_1) { 3002 choices[1] = (int8_t)CNS_11643_1; 3003 } else { 3004 choices[1] = (int8_t)GB2312_1; 3005 } 3006 3007 choiceCount = 2; 3008 } else if (converterData->version == 1) { 3009 /* ISO-2022-CN-EXT */ 3010 3011 /* try one of the other converters */ 3012 switch(choices[0]) { 3013 case GB2312_1: 3014 choices[1] = (int8_t)CNS_11643_1; 3015 choices[2] = (int8_t)ISO_IR_165; 3016 break; 3017 case ISO_IR_165: 3018 choices[1] = (int8_t)GB2312_1; 3019 choices[2] = (int8_t)CNS_11643_1; 3020 break; 3021 default: /* CNS_11643_x */ 3022 choices[1] = (int8_t)GB2312_1; 3023 choices[2] = (int8_t)ISO_IR_165; 3024 break; 3025 } 3026 3027 choiceCount = 3; 3028 } else { 3029 choices[0] = (int8_t)CNS_11643_1; 3030 choices[1] = (int8_t)GB2312_1; 3031 } 3032 } 3033 3034 cs = g = 0; 3035 /* 3036 * len==0: no mapping found yet 3037 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3038 * len>0: found a roundtrip result, done 3039 */ 3040 len = 0; 3041 /* 3042 * We will turn off useFallback after finding a fallback, 3043 * but we still get fallbacks from PUA code points as usual. 3044 * Therefore, we will also need to check that we don't overwrite 3045 * an early fallback with a later one. 
3046 */ 3047 useFallback = cnv->useFallback; 3048 3049 for(i = 0; i < choiceCount && len <= 0; ++i) { 3050 int8_t cs0 = choices[i]; 3051 if(cs0 > 0) { 3052 uint32_t value; 3053 int32_t len2; 3054 if(cs0 >= CNS_11643_0) { 3055 len2 = MBCS_FROM_UCHAR32_ISO2022( 3056 converterData->myConverterArray[CNS_11643], 3057 sourceChar, 3058 &value, 3059 useFallback, 3060 MBCS_OUTPUT_3); 3061 if(len2 == 3 || (len2 == -3 && len == 0)) { 3062 targetValue = value; 3063 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3064 if(len2 >= 0) { 3065 len = 2; 3066 } else { 3067 len = -2; 3068 useFallback = FALSE; 3069 } 3070 if(cs == CNS_11643_1) { 3071 g = 1; 3072 } else if(cs == CNS_11643_2) { 3073 g = 2; 3074 } else /* plane 3..7 */ if(converterData->version == 1) { 3075 g = 3; 3076 } else { 3077 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3078 len = 0; 3079 } 3080 } 3081 } else { 3082 /* GB2312_1 or ISO-IR-165 */ 3083 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3084 len2 = MBCS_FROM_UCHAR32_ISO2022( 3085 converterData->myConverterArray[cs0], 3086 sourceChar, 3087 &value, 3088 useFallback, 3089 MBCS_OUTPUT_2); 3090 if(len2 == 2 || (len2 == -2 && len == 0)) { 3091 targetValue = value; 3092 len = len2; 3093 cs = cs0; 3094 g = 1; 3095 useFallback = FALSE; 3096 } 3097 } 3098 } 3099 } 3100 3101 if(len != 0) { 3102 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3103 3104 /* write the designation sequence if necessary */ 3105 if(cs != pFromU2022State->cs[g]) { 3106 if(cs < CNS_11643) { 3107 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3108 } else { 3109 U_ASSERT(cs >= CNS_11643_1); 3110 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3111 } 3112 len = 4; 3113 pFromU2022State->cs[g] = cs; 3114 if(g == 1) { 3115 /* changing the SO/G1 charset invalidates the choices[] */ 3116 choiceCount = 0; 3117 } 3118 } 3119 3120 /* write the shift sequence if necessary */ 3121 if(g != pFromU2022State->g) { 3122 switch(g) { 3123 case 1: 3124 
buffer[len++] = UCNV_SO; 3125 3126 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3127 pFromU2022State->g = 1; 3128 break; 3129 case 2: 3130 buffer[len++] = 0x1b; 3131 buffer[len++] = 0x4e; 3132 break; 3133 default: /* case 3 */ 3134 buffer[len++] = 0x1b; 3135 buffer[len++] = 0x4f; 3136 break; 3137 } 3138 } 3139 3140 /* write the two output bytes */ 3141 buffer[len++] = (char)(targetValue >> 8); 3142 buffer[len++] = (char)targetValue; 3143 } else { 3144 /* if we cannot find the character after checking all codepages 3145 * then this is an error 3146 */ 3147 *err = U_INVALID_CHAR_FOUND; 3148 cnv->fromUChar32=sourceChar; 3149 break; 3150 } 3151 } 3152 3153 /* output len>0 bytes in buffer[] */ 3154 if(len == 1) { 3155 *target++ = buffer[0]; 3156 if(offsets) { 3157 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3158 } 3159 } else if(len == 2 && (target + 2) <= targetLimit) { 3160 *target++ = buffer[0]; 3161 *target++ = buffer[1]; 3162 if(offsets) { 3163 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3164 *offsets++ = sourceIndex; 3165 *offsets++ = sourceIndex; 3166 } 3167 } else { 3168 fromUWriteUInt8( 3169 cnv, 3170 buffer, len, 3171 &target, (const char *)targetLimit, 3172 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3173 err); 3174 if(U_FAILURE(*err)) { 3175 break; 3176 } 3177 } 3178 } /* end if(myTargetIndex<myTargetLength) */ 3179 else{ 3180 *err =U_BUFFER_OVERFLOW_ERROR; 3181 break; 3182 } 3183 3184 }/* end while(mySourceIndex<mySourceLength) */ 3185 3186 /* 3187 * the end of the input stream and detection of truncated input 3188 * are handled by the framework, but for ISO-2022-CN conversion 3189 * we need to be in ASCII mode at the very end 3190 * 3191 * conditions: 3192 * successful 3193 * not in ASCII mode 3194 * end of input and no truncated input 3195 */ 3196 if( U_SUCCESS(*err) && 3197 pFromU2022State->g!=0 && 3198 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3199 ) { 3200 int32_t sourceIndex; 3201 3202 /* we are switching to ASCII */ 3203 pFromU2022State->g=0; 3204 3205 /* get the source index of the last input character */ 3206 /* 3207 * TODO this would be simpler and more reliable if we used a pair 3208 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3209 * so that we could simply use the prevSourceIndex here; 3210 * this code gives an incorrect result for the rare case of an unmatched 3211 * trail surrogate that is alone in the last buffer of the text stream 3212 */ 3213 sourceIndex=(int32_t)(source-args->source); 3214 if(sourceIndex>0) { 3215 --sourceIndex; 3216 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3217 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3218 ) { 3219 --sourceIndex; 3220 } 3221 } else { 3222 sourceIndex=-1; 3223 } 3224 3225 fromUWriteUInt8( 3226 cnv, 3227 SHIFT_IN_STR, 1, 3228 &target, (const char *)targetLimit, 3229 &offsets, sourceIndex, 3230 err); 3231 } 3232 3233 /*save the state and return */ 3234 args->source = source; 3235 args->target = (char*)target; 3236 } 3237 3238 3239 static void 3240 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3241 UErrorCode* err){ 3242 char tempBuf[3]; 3243 const char *mySource = (char *) args->source; 3244 UChar *myTarget = args->target; 3245 const char *mySourceLimit = args->sourceLimit; 3246 uint32_t targetUniChar = 0x0000; 3247 uint32_t mySourceChar = 0x0000; 3248 UConverterDataISO2022* myData; 3249 ISO2022State *pToU2022State; 3250 3251 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3252 pToU2022State = &myData->toU2022State; 3253 3254 if(myData->key != 0) { 3255 /* continue with a partial escape sequence */ 3256 goto escape; 3257 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3258 /* continue with a partial double-byte character */ 3259 mySourceChar = args->converter->toUBytes[0]; 3260 
args->converter->toULength = 0; 3261 targetUniChar = missingCharMarker; 3262 goto getTrailByte; 3263 } 3264 3265 while(mySource < mySourceLimit){ 3266 3267 targetUniChar =missingCharMarker; 3268 3269 if(myTarget < args->targetLimit){ 3270 3271 mySourceChar= (unsigned char) *mySource++; 3272 3273 switch(mySourceChar){ 3274 case UCNV_SI: 3275 pToU2022State->g=0; 3276 if (myData->isEmptySegment) { 3277 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3278 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3279 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3280 args->converter->toUBytes[0] = mySourceChar; 3281 args->converter->toULength = 1; 3282 args->target = myTarget; 3283 args->source = mySource; 3284 return; 3285 } 3286 continue; 3287 3288 case UCNV_SO: 3289 if(pToU2022State->cs[1] != 0) { 3290 pToU2022State->g=1; 3291 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3292 continue; 3293 } else { 3294 /* illegal to have SO before a matching designator */ 3295 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3296 break; 3297 } 3298 3299 case ESC_2022: 3300 mySource--; 3301 escape: 3302 { 3303 const char * mySourceBefore = mySource; 3304 int8_t toULengthBefore = args->converter->toULength; 3305 3306 changeState_2022(args->converter,&(mySource), 3307 mySourceLimit, ISO_2022_CN,err); 3308 3309 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3310 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3311 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3312 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3313 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3314 } 3315 } 3316 3317 /* invalid or illegal escape sequence */ 3318 if(U_FAILURE(*err)){ 3319 args->target = myTarget; 3320 args->source = mySource; 3321 myData->isEmptySegment = FALSE; /* Reset to 
avoid future spurious errors */ 3322 return; 3323 } 3324 continue; 3325 3326 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3327 3328 case CR: 3329 /*falls through*/ 3330 case LF: 3331 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3332 /* falls through */ 3333 default: 3334 /* convert one or two bytes */ 3335 myData->isEmptySegment = FALSE; 3336 if(pToU2022State->g != 0) { 3337 if(mySource < mySourceLimit) { 3338 UConverterSharedData *cnv; 3339 StateEnum tempState; 3340 int32_t tempBufLen; 3341 int leadIsOk, trailIsOk; 3342 uint8_t trailByte; 3343 getTrailByte: 3344 trailByte = (uint8_t)*mySource; 3345 /* 3346 * Ticket 5691: consistent illegal sequences: 3347 * - We include at least the first byte in the illegal sequence. 3348 * - If any of the non-initial bytes could be the start of a character, 3349 * we stop the illegal sequence before the first one of those. 3350 * 3351 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3352 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3353 * Otherwise we convert or report the pair of bytes. 
3354 */ 3355 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3356 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3357 if (leadIsOk && trailIsOk) { 3358 ++mySource; 3359 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3360 if(tempState >= CNS_11643_0) { 3361 cnv = myData->myConverterArray[CNS_11643]; 3362 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3363 tempBuf[1] = (char) (mySourceChar); 3364 tempBuf[2] = (char) trailByte; 3365 tempBufLen = 3; 3366 3367 }else{ 3368 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3369 cnv = myData->myConverterArray[tempState]; 3370 tempBuf[0] = (char) (mySourceChar); 3371 tempBuf[1] = (char) trailByte; 3372 tempBufLen = 2; 3373 } 3374 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3375 mySourceChar = (mySourceChar << 8) | trailByte; 3376 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3377 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3378 ++mySource; 3379 /* add another bit so that the code below writes 2 bytes in case of error */ 3380 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3381 } 3382 if(pToU2022State->g>=2) { 3383 /* return from a single-shift state to the previous one */ 3384 pToU2022State->g=pToU2022State->prevG; 3385 } 3386 } else { 3387 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3388 args->converter->toULength = 1; 3389 goto endloop; 3390 } 3391 } 3392 else{ 3393 if(mySourceChar <= 0x7f) { 3394 targetUniChar = (UChar) mySourceChar; 3395 } 3396 } 3397 break; 3398 } 3399 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3400 if(args->offsets){ 3401 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3402 } 3403 *(myTarget++)=(UChar)targetUniChar; 3404 } 3405 else if(targetUniChar > missingCharMarker){ 3406 /* disassemble the surrogate pair and write to output*/ 3407 targetUniChar-=0x0010000; 3408 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3409 if(args->offsets){ 3410 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3411 } 3412 ++myTarget; 3413 if(myTarget< args->targetLimit){ 3414 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3415 if(args->offsets){ 3416 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3417 } 3418 ++myTarget; 3419 }else{ 3420 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3421 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3422 } 3423 3424 } 3425 else{ 3426 /* Call the callback function*/ 3427 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3428 break; 3429 } 3430 } 3431 else{ 3432 *err =U_BUFFER_OVERFLOW_ERROR; 3433 break; 3434 } 3435 } 3436 endloop: 3437 args->target = myTarget; 3438 args->source = mySource; 3439 } 3440 3441 static void 3442 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3443 UConverter *cnv = args->converter; 3444 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3445 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3446 char *p, *subchar; 3447 char buffer[8]; 3448 int32_t length; 3449 3450 subchar=(char *)cnv->subChars; 3451 length=cnv->subCharLen; /* assume length==1 for most variants */ 3452 3453 p = buffer; 3454 switch(myConverterData->locale[0]){ 3455 case 'j': 3456 { 3457 int8_t cs; 3458 3459 if(pFromU2022State->g == 1) { 3460 /* JIS7: switch from G1 to G0 */ 3461 pFromU2022State->g = 0; 3462 *p++ = UCNV_SI; 3463 } 3464 3465 cs = pFromU2022State->cs[0]; 3466 if(cs != ASCII && cs != JISX201) { 3467 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3468 pFromU2022State->cs[0] = (int8_t)ASCII; 3469 *p++ = '\x1b'; 3470 *p++ = '\x28'; 3471 *p++ = '\x42'; 3472 } 3473 3474 *p++ = subchar[0]; 3475 break; 3476 } 3477 case 'c': 3478 if(pFromU2022State->g != 0) { 3479 /* not in ASCII mode: switch to ASCII */ 3480 pFromU2022State->g = 0; 3481 *p++ = UCNV_SI; 3482 } 3483 *p++ = subchar[0]; 3484 break; 3485 case 'k': 3486 if(myConverterData->version == 0) { 3487 if(length == 1) { 3488 if((UBool)args->converter->fromUnicodeStatus) { 3489 /* in DBCS mode: switch to SBCS */ 3490 args->converter->fromUnicodeStatus = 0; 3491 *p++ = UCNV_SI; 3492 } 3493 *p++ = subchar[0]; 3494 } else /* length == 2*/ { 3495 if(!(UBool)args->converter->fromUnicodeStatus) { 3496 /* in SBCS mode: switch to DBCS */ 3497 args->converter->fromUnicodeStatus = 1; 3498 *p++ = UCNV_SO; 3499 } 3500 *p++ = subchar[0]; 3501 *p++ = subchar[1]; 3502 } 3503 break; 3504 } else { 3505 /* save the subconverter's substitution string */ 3506 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3507 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3508 3509 /* set our substitution string into the subconverter */ 3510 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3511 myConverterData->currentConverter->subCharLen = (int8_t)length; 3512 3513 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3514 args->converter = myConverterData->currentConverter; 3515 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3516 ucnv_cbFromUWriteSub(args, 0, err); 3517 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3518 args->converter = cnv; 3519 3520 /* restore the subconverter's substitution string */ 3521 myConverterData->currentConverter->subChars = currentSubChars; 3522 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3523 3524 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3525 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3526 uprv_memcpy( 3527 cnv->charErrorBuffer, 3528 myConverterData->currentConverter->charErrorBuffer, 3529 myConverterData->currentConverter->charErrorBufferLength); 3530 } 3531 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3532 myConverterData->currentConverter->charErrorBufferLength = 0; 3533 } 3534 return; 3535 } 3536 default: 3537 /* not expected */ 3538 break; 3539 } 3540 ucnv_cbFromUWriteBytes(args, 3541 buffer, (int32_t)(p - buffer), 3542 offsetIndex, err); 3543 } 3544 3545 /* 3546 * Structure for cloning an ISO 2022 converter into a single memory block. 3547 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3548 * and then ucnv_safeClone() of the sub-converter may additionally align 3549 * currentConverter inside the cloneStruct, for which we need the deadSpace 3550 * after currentConverter. 3551 * This is because UAlignedMemory may be larger than the actually 3552 * necessary alignment size for the platform. 3553 * The other cloneStruct fields will not be moved around, 3554 * and are aligned properly with cloneStruct's alignment. 
3555 */ 3556 struct cloneStruct 3557 { 3558 UConverter cnv; 3559 UConverter currentConverter; 3560 UAlignedMemory deadSpace; 3561 UConverterDataISO2022 mydata; 3562 }; 3563 3564 3565 static UConverter * 3566 _ISO_2022_SafeClone( 3567 const UConverter *cnv, 3568 void *stackBuffer, 3569 int32_t *pBufferSize, 3570 UErrorCode *status) 3571 { 3572 struct cloneStruct * localClone; 3573 UConverterDataISO2022 *cnvData; 3574 int32_t i, size; 3575 3576 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3577 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3578 return NULL; 3579 } 3580 3581 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3582 localClone = (struct cloneStruct *)stackBuffer; 3583 3584 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3585 3586 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3587 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3588 localClone->cnv.isExtraLocal = TRUE; 3589 3590 /* share the subconverters */ 3591 3592 if(cnvData->currentConverter != NULL) { 3593 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3594 localClone->mydata.currentConverter = 3595 ucnv_safeClone(cnvData->currentConverter, 3596 &localClone->currentConverter, 3597 &size, status); 3598 if(U_FAILURE(*status)) { 3599 return NULL; 3600 } 3601 } 3602 3603 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3604 if(cnvData->myConverterArray[i] != NULL) { 3605 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3606 } 3607 } 3608 3609 return &localClone->cnv; 3610 } 3611 3612 static void 3613 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3614 const USetAdder *sa, 3615 UConverterUnicodeSet which, 3616 UErrorCode *pErrorCode) 3617 { 3618 int32_t i; 3619 UConverterDataISO2022* cnvData; 3620 3621 if (U_FAILURE(*pErrorCode)) { 3622 return; 3623 } 3624 #ifdef U_ENABLE_GENERIC_ISO_2022 3625 if (cnv->sharedData == &_ISO2022Data) { 
3626 /* We use UTF-8 in this case */ 3627 sa->addRange(sa->set, 0, 0xd7FF); 3628 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3629 return; 3630 } 3631 #endif 3632 3633 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3634 3635 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3636 switch(cnvData->locale[0]){ 3637 case 'j': 3638 /* include JIS X 0201 which is hardcoded */ 3639 sa->add(sa->set, 0xa5); 3640 sa->add(sa->set, 0x203e); 3641 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3642 /* include Latin-1 for some variants of JP */ 3643 sa->addRange(sa->set, 0, 0xff); 3644 } else { 3645 /* include ASCII for JP */ 3646 sa->addRange(sa->set, 0, 0x7f); 3647 } 3648 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3649 /* 3650 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3651 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3652 * use half-width Katakana. 3653 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3654 * half-width Katakana via the ESC ( I sequence. 3655 * However, we only emit (fromUnicode) half-width Katakana according to the 3656 * definition of each variant. 3657 * 3658 * When including fallbacks, 3659 * we need to include half-width Katakana Unicode code points for all JP variants because 3660 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3661 */ 3662 /* include half-width Katakana for JP */ 3663 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3664 } 3665 break; 3666 case 'c': 3667 case 'z': 3668 /* include ASCII for CN */ 3669 sa->addRange(sa->set, 0, 0x7f); 3670 break; 3671 case 'k': 3672 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3673 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3674 cnvData->currentConverter, sa, which, pErrorCode); 3675 /* the loop over myConverterArray[] will simply not find another converter */ 3676 break; 3677 default: 3678 break; 3679 } 3680 3681 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3682 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3683 cnvData->version==0 && i==CNS_11643 3684 ) { 3685 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3686 ucnv_MBCSGetUnicodeSetForBytes( 3687 cnvData->myConverterArray[i], 3688 sa, UCNV_ROUNDTRIP_SET, 3689 0, 0x81, 0x82, 3690 pErrorCode); 3691 } 3692 #endif 3693 3694 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3695 UConverterSetFilter filter; 3696 if(cnvData->myConverterArray[i]!=NULL) { 3697 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3698 cnvData->version==0 && i==CNS_11643 3699 ) { 3700 /* 3701 * Version-specific for CN: 3702 * CN version 0 does not map CNS planes 3..7 although 3703 * they are all available in the CNS conversion table; 3704 * CN version 1 (-EXT) does map them all. 3705 * The two versions create different Unicode sets. 3706 */ 3707 filter=UCNV_SET_FILTER_2022_CN; 3708 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3709 /* 3710 * Only add code points that map to Shift-JIS codes 3711 * corresponding to JIS X 0208. 
3712 */ 3713 filter=UCNV_SET_FILTER_SJIS; 3714 } else if(i==KSC5601) { 3715 /* 3716 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3717 * are broader than GR94. 3718 */ 3719 filter=UCNV_SET_FILTER_GR94DBCS; 3720 } else { 3721 filter=UCNV_SET_FILTER_NONE; 3722 } 3723 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3724 } 3725 } 3726 3727 /* 3728 * ISO 2022 converters must not convert SO/SI/ESC despite what 3729 * sub-converters do by themselves. 3730 * Remove these characters from the set. 3731 */ 3732 sa->remove(sa->set, 0x0e); 3733 sa->remove(sa->set, 0x0f); 3734 sa->remove(sa->set, 0x1b); 3735 3736 /* ISO 2022 converters do not convert C1 controls either */ 3737 sa->removeRange(sa->set, 0x80, 0x9f); 3738 } 3739 3740 static const UConverterImpl _ISO2022Impl={ 3741 UCNV_ISO_2022, 3742 3743 NULL, 3744 NULL, 3745 3746 _ISO2022Open, 3747 _ISO2022Close, 3748 _ISO2022Reset, 3749 3750 #ifdef U_ENABLE_GENERIC_ISO_2022 3751 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3752 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3753 ucnv_fromUnicode_UTF8, 3754 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3755 #else 3756 NULL, 3757 NULL, 3758 NULL, 3759 NULL, 3760 #endif 3761 NULL, 3762 3763 NULL, 3764 _ISO2022getName, 3765 _ISO_2022_WriteSub, 3766 _ISO_2022_SafeClone, 3767 _ISO_2022_GetUnicodeSet, 3768 3769 NULL, 3770 NULL 3771 }; 3772 static const UConverterStaticData _ISO2022StaticData={ 3773 sizeof(UConverterStaticData), 3774 "ISO_2022", 3775 2022, 3776 UCNV_IBM, 3777 UCNV_ISO_2022, 3778 1, 3779 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3780 { 0x1a, 0, 0, 0 }, 3781 1, 3782 FALSE, 3783 FALSE, 3784 0, 3785 0, 3786 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3787 }; 3788 const UConverterSharedData _ISO2022Data={ 3789 sizeof(UConverterSharedData), 3790 ~((uint32_t) 0), 3791 NULL, 3792 NULL, 3793 &_ISO2022StaticData, 3794 FALSE, 3795 &_ISO2022Impl, 3796 0, 
UCNV_MBCS_TABLE_INITIALIZER 3797 }; 3798 3799 /*************JP****************/ 3800 static const UConverterImpl _ISO2022JPImpl={ 3801 UCNV_ISO_2022, 3802 3803 NULL, 3804 NULL, 3805 3806 _ISO2022Open, 3807 _ISO2022Close, 3808 _ISO2022Reset, 3809 3810 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3811 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3812 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3813 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3814 NULL, 3815 3816 NULL, 3817 _ISO2022getName, 3818 _ISO_2022_WriteSub, 3819 _ISO_2022_SafeClone, 3820 _ISO_2022_GetUnicodeSet, 3821 3822 NULL, 3823 NULL 3824 }; 3825 static const UConverterStaticData _ISO2022JPStaticData={ 3826 sizeof(UConverterStaticData), 3827 "ISO_2022_JP", 3828 0, 3829 UCNV_IBM, 3830 UCNV_ISO_2022, 3831 1, 3832 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3833 { 0x1a, 0, 0, 0 }, 3834 1, 3835 FALSE, 3836 FALSE, 3837 0, 3838 0, 3839 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3840 }; 3841 3842 namespace { 3843 3844 const UConverterSharedData _ISO2022JPData={ 3845 sizeof(UConverterSharedData), 3846 ~((uint32_t) 0), 3847 NULL, 3848 NULL, 3849 &_ISO2022JPStaticData, 3850 FALSE, 3851 &_ISO2022JPImpl, 3852 0, UCNV_MBCS_TABLE_INITIALIZER 3853 }; 3854 3855 } // namespace 3856 3857 /************* KR ***************/ 3858 static const UConverterImpl _ISO2022KRImpl={ 3859 UCNV_ISO_2022, 3860 3861 NULL, 3862 NULL, 3863 3864 _ISO2022Open, 3865 _ISO2022Close, 3866 _ISO2022Reset, 3867 3868 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3869 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3870 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3871 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3872 NULL, 3873 3874 NULL, 3875 _ISO2022getName, 3876 _ISO_2022_WriteSub, 3877 _ISO_2022_SafeClone, 3878 _ISO_2022_GetUnicodeSet, 3879 3880 NULL, 3881 NULL 3882 }; 3883 static const UConverterStaticData _ISO2022KRStaticData={ 3884 sizeof(UConverterStaticData), 3885 "ISO_2022_KR", 
3886 0, 3887 UCNV_IBM, 3888 UCNV_ISO_2022, 3889 1, 3890 3, /* max 3 bytes per UChar: SO+DBCS */ 3891 { 0x1a, 0, 0, 0 }, 3892 1, 3893 FALSE, 3894 FALSE, 3895 0, 3896 0, 3897 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3898 }; 3899 3900 namespace { 3901 3902 const UConverterSharedData _ISO2022KRData={ 3903 sizeof(UConverterSharedData), 3904 ~((uint32_t) 0), 3905 NULL, 3906 NULL, 3907 &_ISO2022KRStaticData, 3908 FALSE, 3909 &_ISO2022KRImpl, 3910 0, UCNV_MBCS_TABLE_INITIALIZER 3911 }; 3912 3913 } // namespace 3914 3915 /*************** CN ***************/ 3916 static const UConverterImpl _ISO2022CNImpl={ 3917 3918 UCNV_ISO_2022, 3919 3920 NULL, 3921 NULL, 3922 3923 _ISO2022Open, 3924 _ISO2022Close, 3925 _ISO2022Reset, 3926 3927 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3928 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3929 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3930 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3931 NULL, 3932 3933 NULL, 3934 _ISO2022getName, 3935 _ISO_2022_WriteSub, 3936 _ISO_2022_SafeClone, 3937 _ISO_2022_GetUnicodeSet, 3938 3939 NULL, 3940 NULL 3941 }; 3942 static const UConverterStaticData _ISO2022CNStaticData={ 3943 sizeof(UConverterStaticData), 3944 "ISO_2022_CN", 3945 0, 3946 UCNV_IBM, 3947 UCNV_ISO_2022, 3948 1, 3949 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3950 { 0x1a, 0, 0, 0 }, 3951 1, 3952 FALSE, 3953 FALSE, 3954 0, 3955 0, 3956 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3957 }; 3958 3959 namespace { 3960 3961 const UConverterSharedData _ISO2022CNData={ 3962 sizeof(UConverterSharedData), 3963 ~((uint32_t) 0), 3964 NULL, 3965 NULL, 3966 &_ISO2022CNStaticData, 3967 FALSE, 3968 &_ISO2022CNImpl, 3969 0, UCNV_MBCS_TABLE_INITIALIZER 3970 }; 3971 3972 } // namespace 3973 3974 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 3975