1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "unicode/utf16.h" 38 #include "ucnv_imp.h" 39 #include "ucnv_bld.h" 40 #include "ucnv_cnv.h" 41 #include "ucnvmbcs.h" 42 #include "cstring.h" 43 #include "cmemory.h" 44 #include "uassert.h" 45 46 #ifdef U_ENABLE_GENERIC_ISO_2022 47 /* 48 * I am disabling the generic ISO-2022 converter after proposing to do so on 49 * the icu mailing list two days ago. 50 * 51 * Reasons: 52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 53 * its designation sequences, single shifts with return to the previous state, 54 * switch-with-no-return to UTF-16BE or similar, etc. 55 * This is unlike the language-specific variants like ISO-2022-JP which 56 * require a much smaller repertoire of ISO-2022 features. 57 * These variants continue to be supported. 58 * 2. 
I believe that no one is really using the generic ISO-2022 converter 59 * but rather always one of the language-specific variants. 60 * Note that ICU's generic ISO-2022 converter has always output one escape 61 * sequence followed by UTF-8 for the whole stream. 62 * 3. Switching between subcharsets is extremely slow, because each time 63 * the previous converter is closed and a new one opened, 64 * without any kind of caching, least-recently-used list, etc. 65 * 4. The code is currently buggy, and given the above it does not seem 66 * reasonable to spend the time on maintenance. 67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 68 * This means, for example, that when ISO-8859-7 is designated, the following 69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 70 * The ICU ISO-2022 converter does not handle this - and has no information 71 * about which subconverter would have to be shifted vs. which is designed 72 * for 7-bit ISO-2022. 73 * 74 * Markus Scherer 2003-dec-03 75 */ 76 #endif 77 78 #if !UCONFIG_ONLY_HTML_CONVERSION 79 static const char SHIFT_IN_STR[] = "\x0F"; 80 // static const char SHIFT_OUT_STR[] = "\x0E"; 81 #endif 82 83 #define CR 0x0D 84 #define LF 0x0A 85 #define H_TAB 0x09 86 #define V_TAB 0x0B 87 #define SPACE 0x20 88 89 enum { 90 HWKANA_START=0xff61, 91 HWKANA_END=0xff9f 92 }; 93 94 /* 95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 96 * as bytes 21..7E. (Subtract 0x80.) 97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 98 * as bytes 20..7F. (Subtract 0x80.) 99 * Do not encode C1 control codes with native bytes 80..9F 100 * as bytes 00..1F (C0 control codes). 101 */ 102 enum { 103 GR94_START=0xa1, 104 GR94_END=0xfe, 105 GR96_START=0xa0, 106 GR96_END=0xff 107 }; 108 109 /* 110 * ISO 2022 control codes must not be converted from Unicode 111 * because they would mess up the byte stream. 
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 113 * corresponding to SO, SI, and ESC. 114 */ 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 116 117 /* for ISO-2022-JP and -CN implementations */ 118 typedef enum { 119 /* shared values */ 120 INVALID_STATE=-1, 121 ASCII = 0, 122 123 SS2_STATE=0x10, 124 SS3_STATE, 125 126 /* JP */ 127 ISO8859_1 = 1 , 128 ISO8859_7 = 2 , 129 JISX201 = 3, 130 JISX208 = 4, 131 JISX212 = 5, 132 GB2312 =6, 133 KSC5601 =7, 134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 135 136 /* CN */ 137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 138 GB2312_1=1, 139 ISO_IR_165=2, 140 CNS_11643=3, 141 142 /* 143 * these are used in StateEnum and ISO2022State variables, 144 * but CNS_11643 must be used to index into myConverterArray[] 145 */ 146 CNS_11643_0=0x20, 147 CNS_11643_1, 148 CNS_11643_2, 149 CNS_11643_3, 150 CNS_11643_4, 151 CNS_11643_5, 152 CNS_11643_6, 153 CNS_11643_7 154 } StateEnum; 155 156 /* is the StateEnum charset value for a DBCS charset? */ 157 #if UCONFIG_ONLY_HTML_CONVERSION 158 #define IS_JP_DBCS(cs) (JISX208==(cs)) 159 #else 160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 161 #endif 162 163 #define CSM(cs) ((uint16_t)1<<(cs)) 164 165 /* 166 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 167 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 168 * 169 * Note: The converter uses some leniency: 170 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 171 * all versions, not just JIS7 and JIS8. 172 * - ICU does not distinguish between different versions of JIS X 0208. 
173 */ 174 #if UCONFIG_ONLY_HTML_CONVERSION 175 enum { MAX_JA_VERSION=0 }; 176 #else 177 enum { MAX_JA_VERSION=4 }; 178 #endif 179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 180 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 181 #if !UCONFIG_ONLY_HTML_CONVERSION 182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 183 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 186 #endif 187 }; 188 189 typedef enum { 190 ASCII1=0, 191 LATIN1, 192 SBCS, 193 DBCS, 194 MBCS, 195 HWKANA 196 }Cnv2022Type; 197 198 typedef struct ISO2022State { 199 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 200 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 201 int8_t prevG; /* g before single shift (SS2 or SS3) */ 202 } ISO2022State; 203 204 #define UCNV_OPTIONS_VERSION_MASK 0xf 205 #define UCNV_2022_MAX_CONVERTERS 10 206 207 typedef struct{ 208 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 209 UConverter *currentConverter; 210 Cnv2022Type currentType; 211 ISO2022State toU2022State, fromU2022State; 212 uint32_t key; 213 uint32_t version; 214 #ifdef U_ENABLE_GENERIC_ISO_2022 215 UBool isFirstBuffer; 216 #endif 217 UBool isEmptySegment; 218 char name[30]; 219 char locale[3]; 220 }UConverterDataISO2022; 221 222 /* Protos */ 223 /* ISO-2022 ----------------------------------------------------------------- */ 224 225 /*Forward declaration */ 226 U_CFUNC void 227 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 228 UErrorCode * err); 229 U_CFUNC void 230 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 231 UErrorCode * err); 232 233 #define ESC_2022 0x1B /*ESC*/ 234 
235 typedef enum 236 { 237 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 238 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ 239 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 240 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 241 } UCNV_TableStates_2022; 242 243 /* 244 * The way these state transition arrays work is: 245 * ex : ESC$B is the sequence for JISX208 246 * a) First Iteration: char is ESC 247 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 248 * int x = normalize_esq_chars_2022[27] which is equal to 1 249 * ii) Search for this value in escSeqStateTable_Key_2022[] 250 * value of x is stored at escSeqStateTable_Key_2022[0] 251 * iii) Save this index as offset 252 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 253 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 254 * b) Switch on this state and continue to next char 255 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 256 * which is normalize_esq_chars_2022[36] == 4 257 * ii) x is currently 1(from above) 258 * x<<=5 -- x is now 32 259 * x+=normalize_esq_chars_2022[36] 260 * now x is 36 261 * iii) Search for this value in escSeqStateTable_Key_2022[] 262 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 263 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 264 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 265 * c) Switch on this state and continue to next char 266 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 267 * ii) x is currently 36 (from above) 268 * x<<=5 -- x is now 1152 269 * x+=normalize_esq_chars_2022[66] 270 * now x is 1161 271 * iii) Search for this value in 
escSeqStateTable_Key_2022[] 272 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21 273 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21] 274 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 275 * v) Get the converter name form escSeqStateTable_Result_2022[21] which is JISX208 276 */ 277 278 279 /*Below are the 3 arrays depicting a state transition table*/ 280 static const int8_t normalize_esq_chars_2022[256] = { 281 /* 0 1 2 3 4 5 6 7 8 9 */ 282 283 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 286 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 287 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 289 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 290 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 291 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 292 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 308 ,0 ,0 ,0 ,0 ,0 ,0 309 }; 310 311 #ifdef U_ENABLE_GENERIC_ISO_2022 312 /* 313 * When the generic ISO-2022 converter is completely removed, not just disabled 314 * per #ifdef, then the following state table and the associated tables that are 315 * dimensioned with MAX_STATES_2022 should be trimmed. 316 * 317 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 318 * the associated escape sequences starting with ESC ( B should be removed. 319 * This includes the ones with key values 1097 and all of the ones above 1000000. 320 * 321 * For the latter, the tables can simply be truncated. 
322 * For the former, since the tables must be kept parallel, it is probably best 323 * to simply duplicate an adjacent table cell, parallel in all tables. 324 * 325 * It may make sense to restructure the tables, especially by using small search 326 * tables for the variants instead of indexing them parallel to the table here. 327 */ 328 #endif 329 330 #define MAX_STATES_2022 74 331 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 332 /* 0 1 2 3 4 5 6 7 8 9 */ 333 334 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 335 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 336 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 337 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 338 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 339 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 340 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 341 ,35947631 ,35947635 ,35947636 ,35947638 342 }; 343 344 #ifdef U_ENABLE_GENERIC_ISO_2022 345 346 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 347 /* 0 1 2 3 4 5 6 7 8 9 */ 348 349 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 350 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 351 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 352 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 353 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 354 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 355 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL 
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 356 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 357 }; 358 359 #endif 360 361 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 362 /* 0 1 2 3 4 5 6 7 8 9 */ 363 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 364 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 365 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 366 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 367 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 371 }; 372 373 /* Type def for refactoring changeState_2022 code*/ 374 typedef enum{ 375 #ifdef 
U_ENABLE_GENERIC_ISO_2022 376 ISO_2022=0, 377 #endif 378 ISO_2022_JP=1, 379 #if !UCONFIG_ONLY_HTML_CONVERSION 380 ISO_2022_KR=2, 381 ISO_2022_CN=3 382 #endif 383 } Variant2022; 384 385 /*********** ISO 2022 Converter Protos ***********/ 386 static void 387 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 388 389 static void 390 _ISO2022Close(UConverter *converter); 391 392 static void 393 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 394 395 static const char* 396 _ISO2022getName(const UConverter* cnv); 397 398 static void 399 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 400 401 static UConverter * 402 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 403 404 #ifdef U_ENABLE_GENERIC_ISO_2022 405 static void 406 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 407 #endif 408 409 namespace { 410 411 /*const UConverterSharedData _ISO2022Data;*/ 412 extern const UConverterSharedData _ISO2022JPData; 413 414 #if !UCONFIG_ONLY_HTML_CONVERSION 415 extern const UConverterSharedData _ISO2022KRData; 416 extern const UConverterSharedData _ISO2022CNData; 417 #endif 418 419 } // namespace 420 421 /*************** Converter implementations ******************/ 422 423 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 424 static inline void 425 fromUWriteUInt8(UConverter *cnv, 426 const char *bytes, int32_t length, 427 uint8_t **target, const char *targetLimit, 428 int32_t **offsets, 429 int32_t sourceIndex, 430 UErrorCode *pErrorCode) 431 { 432 char *targetChars = (char *)*target; 433 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 434 offsets, sourceIndex, pErrorCode); 435 *target = (uint8_t*)targetChars; 436 437 } 438 439 static inline void 440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 441 if(myConverterData->version == 1) { 442 UConverter *cnv = myConverterData->currentConverter; 443 444 cnv->toUnicodeStatus=0; /* offset */ 445 cnv->mode=0; /* state */ 446 cnv->toULength=0; /* byteIndex */ 447 } 448 } 449 450 static inline void 451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 452 /* in ISO-2022-KR the designator sequence appears only once 453 * in a file so we append it only once 454 */ 455 if( converter->charErrorBufferLength==0){ 456 457 converter->charErrorBufferLength = 4; 458 converter->charErrorBuffer[0] = 0x1b; 459 converter->charErrorBuffer[1] = 0x24; 460 converter->charErrorBuffer[2] = 0x29; 461 converter->charErrorBuffer[3] = 0x43; 462 } 463 if(myConverterData->version == 1) { 464 UConverter *cnv = myConverterData->currentConverter; 465 466 cnv->fromUChar32=0; 467 cnv->fromUnicodeStatus=1; /* prevLength */ 468 } 469 } 470 471 static void 472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 473 474 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 475 476 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 477 if(cnv->extraInfo != NULL) { 478 UConverterNamePieces stackPieces; 479 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 480 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 481 uint32_t version; 482 483 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 484 
485 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 486 myConverterData->currentType = ASCII1; 487 cnv->fromUnicodeStatus =FALSE; 488 if(pArgs->locale){ 489 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 490 } 491 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 492 myConverterData->version = version; 493 /* Begin Google-specific change. */ 494 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */ 495 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */ 496 if((myLocale[0]=='j' && 497 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' || 498 myLocale[1]=='s') && 499 (myLocale[2]=='_' || myLocale[2]=='\0'))) 500 { 501 /* open the required converters and cache them */ 502 if(version>MAX_JA_VERSION) { 503 // ICU 55 fails to open a converter for an unsupported version. 504 // Previously, it fell back to version 0, but that would yield 505 // unexpected behavior. 506 *errorCode = U_MISSING_RESOURCE_ERROR; 507 return; 508 } 509 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 510 myConverterData->myConverterArray[ISO8859_7] = 511 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 512 } 513 if (myLocale[1]=='k') { /* Use KDDI's version. */ 514 myConverterData->myConverterArray[JISX208] = 515 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 516 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */ 517 myConverterData->myConverterArray[JISX208] = 518 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode); 519 } else { 520 /* 521 * Change for http://b/issue?id=937017 : 522 * Restore JIS X 0208 ISO-2022-JP mappings from before 523 * sharing the table with the Shift-JIS converter 524 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797). 525 * TODO(mscherer): Create and use a new, unified Google Shift-JIS 526 * table for both Shift-JIS and ISO-2022-JP. 
527 */ 528 myConverterData->myConverterArray[JISX208] = 529 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode); 530 } 531 /* End Google-specific change. */ 532 if(jpCharsetMasks[version]&CSM(JISX212)) { 533 myConverterData->myConverterArray[JISX212] = 534 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 535 } 536 if(jpCharsetMasks[version]&CSM(GB2312)) { 537 myConverterData->myConverterArray[GB2312] = 538 /* BEGIN android-changed */ 539 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 540 /* END android-changed */ 541 } 542 if(jpCharsetMasks[version]&CSM(KSC5601)) { 543 myConverterData->myConverterArray[KSC5601] = 544 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 545 } 546 547 /* set the function pointers to appropriate funtions */ 548 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 549 uprv_strcpy(myConverterData->locale,"ja"); 550 551 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 552 size_t len = uprv_strlen(myConverterData->name); 553 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 554 myConverterData->name[len+1]='\0'; 555 } 556 #if !UCONFIG_ONLY_HTML_CONVERSION 557 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 558 (myLocale[2]=='_' || myLocale[2]=='\0')) 559 { 560 if(version>1) { 561 // ICU 55 fails to open a converter for an unsupported version. 562 // Previously, it fell back to version 0, but that would yield 563 // unexpected behavior. 
564 *errorCode = U_MISSING_RESOURCE_ERROR; 565 return; 566 } 567 const char *cnvName; 568 if(version==1) { 569 cnvName="icu-internal-25546"; 570 } else { 571 /* BEGIN android-changed */ 572 cnvName="ksc_5601"; 573 /* END android-changed */ 574 myConverterData->version=version=0; 575 } 576 if(pArgs->onlyTestIsLoadable) { 577 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 578 uprv_free(cnv->extraInfo); 579 cnv->extraInfo=NULL; 580 return; 581 } else { 582 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 583 if (U_FAILURE(*errorCode)) { 584 _ISO2022Close(cnv); 585 return; 586 } 587 588 if(version==1) { 589 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 590 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 591 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 592 }else{ 593 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 594 } 595 596 /* initialize the state variables */ 597 setInitialStateToUnicodeKR(cnv, myConverterData); 598 setInitialStateFromUnicodeKR(cnv, myConverterData); 599 600 /* set the function pointers to appropriate funtions */ 601 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 602 uprv_strcpy(myConverterData->locale,"ko"); 603 } 604 } 605 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 606 (myLocale[2]=='_' || myLocale[2]=='\0')) 607 { 608 if(version>2) { 609 // ICU 55 fails to open a converter for an unsupported version. 610 // Previously, it fell back to version 0, but that would yield 611 // unexpected behavior. 
612 *errorCode = U_MISSING_RESOURCE_ERROR; 613 return; 614 } 615 616 /* open the required converters and cache them */ 617 /* BEGIN android-changed */ 618 myConverterData->myConverterArray[GB2312_1] = 619 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 620 if(version==1) { 621 myConverterData->myConverterArray[ISO_IR_165] = 622 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 623 } 624 myConverterData->myConverterArray[CNS_11643] = 625 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 626 /* END android-changed */ 627 628 629 /* set the function pointers to appropriate funtions */ 630 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 631 uprv_strcpy(myConverterData->locale,"cn"); 632 633 if (version==0){ 634 myConverterData->version = 0; 635 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 636 }else if (version==1){ 637 myConverterData->version = 1; 638 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 639 }else { 640 myConverterData->version = 2; 641 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 642 } 643 } 644 #endif // !UCONFIG_ONLY_HTML_CONVERSION 645 else{ 646 #ifdef U_ENABLE_GENERIC_ISO_2022 647 myConverterData->isFirstBuffer = TRUE; 648 649 /* append the UTF-8 escape sequence */ 650 cnv->charErrorBufferLength = 3; 651 cnv->charErrorBuffer[0] = 0x1b; 652 cnv->charErrorBuffer[1] = 0x25; 653 cnv->charErrorBuffer[2] = 0x42; 654 655 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 656 /* initialize the state variables */ 657 uprv_strcpy(myConverterData->name,"ISO_2022"); 658 #else 659 *errorCode = U_MISSING_RESOURCE_ERROR; 660 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard 661 // data loading error code. 
662 return; 663 #endif 664 } 665 666 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 667 668 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 669 _ISO2022Close(cnv); 670 } 671 } else { 672 *errorCode = U_MEMORY_ALLOCATION_ERROR; 673 } 674 } 675 676 677 static void 678 _ISO2022Close(UConverter *converter) { 679 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 680 UConverterSharedData **array = myData->myConverterArray; 681 int32_t i; 682 683 if (converter->extraInfo != NULL) { 684 /*close the array of converter pointers and free the memory*/ 685 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 686 if(array[i]!=NULL) { 687 ucnv_unloadSharedDataIfReady(array[i]); 688 } 689 } 690 691 ucnv_close(myData->currentConverter); 692 693 if(!converter->isExtraLocal){ 694 uprv_free (converter->extraInfo); 695 converter->extraInfo = NULL; 696 } 697 } 698 } 699 700 static void 701 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 702 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 703 if(choice<=UCNV_RESET_TO_UNICODE) { 704 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 705 myConverterData->key = 0; 706 myConverterData->isEmptySegment = FALSE; 707 } 708 if(choice!=UCNV_RESET_TO_UNICODE) { 709 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 710 } 711 #ifdef U_ENABLE_GENERIC_ISO_2022 712 if(myConverterData->locale[0] == 0){ 713 if(choice<=UCNV_RESET_TO_UNICODE) { 714 myConverterData->isFirstBuffer = TRUE; 715 myConverterData->key = 0; 716 if (converter->mode == UCNV_SO){ 717 ucnv_close (myConverterData->currentConverter); 718 myConverterData->currentConverter=NULL; 719 } 720 converter->mode = UCNV_SI; 721 } 722 if(choice!=UCNV_RESET_TO_UNICODE) { 723 /* re-append UTF-8 escape sequence */ 724 converter->charErrorBufferLength = 3; 725 converter->charErrorBuffer[0] = 0x1b; 726 converter->charErrorBuffer[1] = 0x28; 727 
/* NOTE(review): everything down to the first closing brace in column 0 is the tail of a
 * function that starts before this chunk; the code is reproduced unchanged. */
            converter->charErrorBuffer[2] = 0x42;
        }
    }
    else
#endif
    {
        /* reset the state variables */
        /* only converters whose locale starts with 'k' are re-initialized here
         * (presumably ISO-2022-KR -- confirm against the part of the function not shown) */
        if(myConverterData->locale[0] == 'k'){
            if(choice<=UCNV_RESET_TO_UNICODE) {
                setInitialStateToUnicodeKR(converter, myConverterData);
            }
            if(choice!=UCNV_RESET_TO_UNICODE) {
                setInitialStateFromUnicodeKR(converter, myConverterData);
            }
        }
    }
}

/*
 * Returns the name stored in the converter's ISO-2022 private data (cnv->extraInfo),
 * or NULL when the converter has no private data attached.
 */
static const char*
_ISO2022getName(const UConverter* cnv){
    if(cnv->extraInfo){
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
        return myData->name;
    }
    return NULL;
}


/*************** to unicode *******************/
/****************************************************************************
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201
 * <ESC>$B  JISX-208
 * <ESC>$@  JISX-208
 * <ESC>$(D JISX-212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 *
 * The table is indexed with the offset that getKey_2022() reports for a
 * complete escape sequence; changeState_2022() casts the entry to StateEnum.
 * INVALID_STATE marks sequences that ISO-2022-JP does not support.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};

#if !UCONFIG_ONLY_HTML_CONVERSION
/*************** to unicode *******************/
/*
 * Same indexing as nextStateToUnicodeJP[], used by changeState_2022() for the
 * ISO_2022_CN variant.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
#endif


/*
 * Advances the escape-sequence recognizer by one byte.
 *
 * @param c      next byte of the candidate escape sequence
 * @param key    in: key accumulated from the bytes seen so far (0 at the start);
 *               out: the new combined key on a match, 0 otherwise
 * @param offset out: index of the matching entry in the escSeqStateTable_*_2022
 *               tables on a match, 0 otherwise
 * @return the escSeqStateTable_Value_2022[] entry for the combined key, or
 *         INVALID_2022 when c cannot occur in an escape sequence or the
 *         combined key is not in escSeqStateTable_Key_2022[]
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    int32_t togo;
    int32_t low = 0;
    int32_t hi = MAX_STATES_2022;
    int32_t oldmid=0;

    togo = normalize_esq_chars_2022[(uint8_t)c];
    if(togo == 0) {
        /* not a valid character anywhere in an escape sequence */
        *key = 0;
        *offset = 0;
        return INVALID_2022;
    }
    /* combine the previous key with the normalized value of this byte */
    togo = (*key << 5) + togo;

    while (hi != low)  /*binary search*/{

        int32_t mid = (hi+low) >> 1; /*Finds median*/

        /* low is set to mid (not mid+1), so a repeated midpoint means the range is exhausted */
        if (mid == oldmid)
            break;

        if (escSeqStateTable_Key_2022[mid] > togo){
            hi = mid;
        }
        else if (escSeqStateTable_Key_2022[mid] < togo){
            low = mid;
        }
        else /*we found it*/{
            *key = togo;
            *offset = mid;
            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
        }
        oldmid = mid;

    }

    *key = 0;
    *offset = 0;
    return INVALID_2022;
}

/*
 * Runs through a state machine to determine the escape sequence - codepage correspondence.
 *
 * Bytes are read from *source (up to sourceLimit), appended to _this->toUBytes[] and fed
 * to getKey_2022() until the sequence is complete, invalid, or the input runs out.
 * The accumulated key is stored back into the converter data so that an incomplete
 * sequence (key!=0) can be continued with the next buffer.
 *
 * @param _this       converter whose extraInfo holds the UConverterDataISO2022 state
 * @param source      in/out: read position; advanced over the consumed bytes and moved
 *                    back again when an illegal sequence is backed out
 * @param sourceLimit end of the input
 * @param var         ISO-2022 variant that decides how a complete sequence is applied
 * @param err         out: U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE on
 *                    failure; not modified otherwise
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of bytes already collected from a previous buffer */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /* NOTE(review): myUConverter is not declared in this function; this disabled
             * branch presumably no longer compiles -- confirm before enabling the macro. */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the locking-shift state (G0/G1) to return to after the single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* the charset must be enabled for this ISO-2022-JP version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected for version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these three are designated to G1 */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* designated to G2 */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* designated to G3 */
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif  // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}

#if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * Looks ahead in the stream to determine the longest run of bytes that can be
 * converted without interpreting an escape sequence.
 *
 * In the default build (U_ENABLE_GENERIC_ISO_2022 not defined) this returns a pointer
 * to the first ESC_2022 byte at or after *source, or sourceLimit if there is none.
 * *source itself is not modified.
 *
 * NOTE(review): the third parameter is unnamed, yet the U_ENABLE_GENERIC_ISO_2022
 * branch refers to `flush`; that branch presumably does not compile as written.
 */
static inline const char*
getEndOfBuffer_2022(const char** source,
                   const char* sourceLimit,
                   UBool /*flush*/){

    const char* mySource = *source;

#ifdef U_ENABLE_GENERIC_ISO_2022
    if (*source >= sourceLimit)
        return sourceLimit;

    do{

        if (*mySource == ESC_2022){
            int8_t i;
            int32_t key = 0;
            int32_t offset;
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;

            /* Kludge: I could not
             * figure out the reason for validating an escape sequence
             * twice - once here and once in changeState_2022().
             * is it possible to have an ESC character in a ISO2022
             * byte stream which is valid in a code page? Is it legal?
             */
            for (i=0;
                (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
                i++) {
                value =  getKey_2022(*(mySource+i), &key, &offset);
            }
            if (value > 0 || *mySource==ESC_2022)
                return mySource;

            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
                return sourceLimit;
        }
    }while (++mySource < sourceLimit);

    return sourceLimit;
#else
    while(mySource < sourceLimit && *mySource != ESC_2022) {
        ++mySource;
    }
    return mySource;
#endif
}
#endif

/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSFromUChar32() function should be reflected here.
 *
 * @param sharedData  shared data of the MBCS sub-converter to look the code point up in
 * @param c           Unicode code point to convert
 * @param value       out: the codepage bytes packed into one integer; only written
 *                    when a mapping is found
 * @param useFallback passed to FROM_U_USE_FALLBACK() and to the extension lookup
 * @param outputType  MBCS_OUTPUT_2 reads 2-byte table values; any other value is
 *                    treated as MBCS_OUTPUT_3
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
 */
static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
                                         UChar32 c,
                                         uint32_t* value,
                                         UBool useFallback,
                                         int outputType)
{
    const int32_t *cx;
    const uint16_t *table;
    uint32_t stage2Entry;
    uint32_t myValue;
    int32_t length;
    const uint8_t *p;
    /*
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
     * Use internal version of ucnv_open() that verifies that the new structures are available,
     * else U_INTERNAL_PROGRAM_ERROR.
     */
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        table=sharedData->mbcs.fromUnicodeTable;
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
        /* get the bytes and the length for the output */
        if(outputType==MBCS_OUTPUT_2){
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            if(myValue<=0xff) {
                length=1;
            } else {
                length=2;
            }
        } else /* outputType==MBCS_OUTPUT_3 */ {
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
            if(myValue<=0xff) {
                length=1;
            } else if(myValue<=0xffff) {
                length=2;
            } else {
                length=3;
            }
        }
        /* is this code point assigned, or do we use fallbacks? */
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
            /* assigned */
            *value=myValue;
            return length;
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
            /*
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
             * There is no way with this data structure for fallback output
             * to be a zero byte.
             */
            *value=myValue;
            return -length;
        }
    }

    /* not found in the base table: try the extension table, if there is one */
    cx=sharedData->mbcs.extIndexes;
    if(cx!=NULL) {
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
    }

    /* unassigned */
    return 0;
}

/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1201 * @param retval pointer to output byte 1202 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1203 */ 1204 static inline int32_t 1205 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1206 UChar32 c, 1207 uint32_t* retval, 1208 UBool useFallback) 1209 { 1210 const uint16_t *table; 1211 int32_t value; 1212 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1213 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1214 return 0; 1215 } 1216 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1217 table=sharedData->mbcs.fromUnicodeTable; 1218 /* get the byte for the output */ 1219 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1220 /* is this code point assigned, or do we use fallbacks? */ 1221 *retval=(uint32_t)(value&0xff); 1222 if(value>=0xf00) { 1223 return 1; /* roundtrip */ 1224 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1225 return -1; /* fallback taken */ 1226 } else { 1227 return 0; /* no mapping */ 1228 } 1229 } 1230 1231 /* 1232 * Check that the result is a 2-byte value with each byte in the range A1..FE 1233 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1234 * to move it to the ISO 2022 range 21..7E. 1235 * Return 0 if out of range. 1236 */ 1237 static inline uint32_t 1238 _2022FromGR94DBCS(uint32_t value) { 1239 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1240 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1241 ) { 1242 return value - 0x8080; /* shift down to 21..7e byte range */ 1243 } else { 1244 return 0; /* not valid for ISO 2022 */ 1245 } 1246 } 1247 1248 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1249 /* 1250 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1251 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter (only compiled when
 * U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Alternates between two steps until the source is exhausted or a return condition
 * is met: converting the bytes up to the next ESC with the currently selected
 * sub-converter, and handing the escape sequence to changeState_2022().
 *
 * @param args  conversion arguments; source/target pointers are advanced in place
 * @param err   in/out error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        /* key==0: not in the middle of an escape sequence, so plain bytes may follow */
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter selected yet: open "ASCII" as the initial one */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in args; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* NOTE(review): the last two conditions share one pair of parentheses and
                 * group as (offsets!=NULL && moved) || (stopped early with pending bytes),
                 * because && binds tighter than ||. */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an incomplete escape sequence */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending input in cnv->toUBytes[] (two bytes, high byte first, when
 * sourceChar>0xff; one byte otherwise) and sets the error code:
 * U_INVALID_CHAR_FOUND when targetUniChar equals missingCharMarker-1 (0xfffe),
 * U_ILLEGAL_CHAR_FOUND for any other value.
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}

/**************************************ISO-2022-JP*************************************************/

/************************************** IMPORTANT **************************************************
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
* The converter iterates over each Unicode code point
* to obtain the equivalent code points from the supported codepages. Since the source buffer is
* processed one char at a time, it makes sense to avoid, as far as possible, the extra processing
* that a canned converter would do.
*
* If the implementation of these macros or the structure of the sharedData struct changes in the
* future, make sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they must not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmically implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/

/* preference order of JP charsets */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * The from-Unicode code indexes this table directly with the charset constant.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* escSeqCharsLen[cs] is the number of bytes of escSeqChars[cs] that gets copied to the output */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this
/* ...       state until an invalid character is found
 *      No  ->  a) go to the next code page and find the character
 * iii) Before changing the state increment the current state check if the current state
 *      is equal to the initIterState
 *      Yes ->  A character that cannot be represented in any of the supported encodings
 *              break and return a U_INVALID_CHARACTER error
 *      No  ->  Continue and find the character in next code page
 *
 *
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */

/*
 * JIS X 0201 (Roman half) to Unicode.
 * Only two positions differ from the identity mapping: 0x5C yields U+00A5 and
 * 0x7E yields U+203E. Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * Unicode to JIS X 0201 (Roman half), the inverse of jisx201ToU().
 * U+00A5 and U+203E map to 0x5C and 0x7E; U+005C and U+007E themselves have no
 * mapping. The result is 0xFFFE for every code point that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte positions are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value <= 0x7f ? value : 0xfffe;
    }
}
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint8_t low;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;
    lead = value & 0xff00;
    /* lead bytes up to 9F and lead bytes E0..EF use different offsets; then double */
    lead = (lead - (lead <= 0x9f00 ? 0x7000 : 0xb000)) << 1;

    if(low > 0x9e) {
        /* trail bytes 9F..FC stay with the doubled lead byte */
        return lead | (uint32_t)(low - 0x7e);
    }
    /* trail bytes up to 9E belong to the preceding lead value; 7F is skipped in Shift-JIS */
    lead -= 0x100;
    return lead | (uint32_t)(low - (low <= 0x7e ? 0x1f : 0x20));
}

/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead, trail;

    if((c1 & 1) == 0) {
        /* even first byte: second byte 21..7E moves up by 0x7e */
        trail = ((uint8_t)(c2 - 0x21) <= (0x7e - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    } else {
        /* odd first byte: round up before halving (8-bit arithmetic, as with ++) */
        c1 = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead = (uint8_t)(c1 >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}

/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 */
1613 */ 1614 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1615 0x2123, /* U+FF61 */ 1616 0x2156, 1617 0x2157, 1618 0x2122, 1619 0x2126, 1620 0x2572, 1621 0x2521, 1622 0x2523, 1623 0x2525, 1624 0x2527, 1625 0x2529, 1626 0x2563, 1627 0x2565, 1628 0x2567, 1629 0x2543, 1630 0x213C, /* U+FF70 */ 1631 0x2522, 1632 0x2524, 1633 0x2526, 1634 0x2528, 1635 0x252A, 1636 0x252B, 1637 0x252D, 1638 0x252F, 1639 0x2531, 1640 0x2533, 1641 0x2535, 1642 0x2537, 1643 0x2539, 1644 0x253B, 1645 0x253D, 1646 0x253F, /* U+FF80 */ 1647 0x2541, 1648 0x2544, 1649 0x2546, 1650 0x2548, 1651 0x254A, 1652 0x254B, 1653 0x254C, 1654 0x254D, 1655 0x254E, 1656 0x254F, 1657 0x2552, 1658 0x2555, 1659 0x2558, 1660 0x255B, 1661 0x255E, 1662 0x255F, /* U+FF90 */ 1663 0x2560, 1664 0x2561, 1665 0x2562, 1666 0x2564, 1667 0x2566, 1668 0x2568, 1669 0x2569, 1670 0x256A, 1671 0x256B, 1672 0x256C, 1673 0x256D, 1674 0x256F, 1675 0x2573, 1676 0x212B, 1677 0x212C /* U+FF9F */ 1678 }; 1679 1680 static void 1681 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1682 UConverter *cnv = args->converter; 1683 UConverterDataISO2022 *converterData; 1684 ISO2022State *pFromU2022State; 1685 uint8_t *target = (uint8_t *) args->target; 1686 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1687 const UChar* source = args->source; 1688 const UChar* sourceLimit = args->sourceLimit; 1689 int32_t* offsets = args->offsets; 1690 UChar32 sourceChar; 1691 char buffer[8]; 1692 int32_t len, outLen; 1693 int8_t choices[10]; 1694 int32_t choiceCount; 1695 uint32_t targetValue = 0; 1696 UBool useFallback; 1697 1698 int32_t i; 1699 int8_t cs, g; 1700 1701 /* set up the state */ 1702 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1703 pFromU2022State = &converterData->fromU2022State; 1704 1705 choiceCount = 0; 1706 1707 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1708 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1709 goto getTrail; 1710 } 1711 1712 while(source < sourceLimit) { 1713 if(target < targetLimit) { 1714 1715 sourceChar = *(source++); 1716 /*check if the char is a First surrogate*/ 1717 if(U16_IS_SURROGATE(sourceChar)) { 1718 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1719 getTrail: 1720 /*look ahead to find the trail surrogate*/ 1721 if(source < sourceLimit) { 1722 /* test the following code unit */ 1723 UChar trail=(UChar) *source; 1724 if(U16_IS_TRAIL(trail)) { 1725 source++; 1726 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1727 cnv->fromUChar32=0x00; 1728 /* convert this supplementary code point */ 1729 /* exit this condition tree */ 1730 } else { 1731 /* this is an unmatched lead code unit (1st surrogate) */ 1732 /* callback(illegal) */ 1733 *err=U_ILLEGAL_CHAR_FOUND; 1734 cnv->fromUChar32=sourceChar; 1735 break; 1736 } 1737 } else { 1738 /* no more input */ 1739 cnv->fromUChar32=sourceChar; 1740 break; 1741 } 1742 } else { 1743 /* this is an unmatched trail code unit (2nd surrogate) */ 1744 /* callback(illegal) */ 1745 *err=U_ILLEGAL_CHAR_FOUND; 1746 cnv->fromUChar32=sourceChar; 1747 break; 1748 } 1749 } 1750 1751 /* do not convert SO/SI/ESC */ 1752 if(IS_2022_CONTROL(sourceChar)) { 1753 /* callback(illegal) */ 1754 *err=U_ILLEGAL_CHAR_FOUND; 1755 cnv->fromUChar32=sourceChar; 1756 break; 1757 } 1758 1759 /* do the conversion */ 1760 1761 if(choiceCount == 0) { 1762 uint16_t csm; 1763 1764 /* 1765 * The csm variable keeps track of which charsets are allowed 1766 * and not used yet while building the choices[]. 1767 */ 1768 csm = jpCharsetMasks[converterData->version]; 1769 choiceCount = 0; 1770 1771 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1772 if(converterData->version == 3 || converterData->version == 4) { 1773 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1774 } 1775 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1776 csm &= ~CSM(HWKANA_7BIT); 1777 1778 /* try the current G0 charset */ 1779 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1780 csm &= ~CSM(cs); 1781 1782 /* try the current G2 charset */ 1783 if((cs = pFromU2022State->cs[2]) != 0) { 1784 choices[choiceCount++] = cs; 1785 csm &= ~CSM(cs); 1786 } 1787 1788 /* try all the other possible charsets */ 1789 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { 1790 cs = (int8_t)jpCharsetPref[i]; 1791 if(CSM(cs) & csm) { 1792 choices[choiceCount++] = cs; 1793 csm &= ~CSM(cs); 1794 } 1795 } 1796 } 1797 1798 cs = g = 0; 1799 /* 1800 * len==0: no mapping found yet 1801 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1802 * len>0: found a roundtrip result, done 1803 */ 1804 len = 0; 1805 /* 1806 * We will turn off useFallback after finding a fallback, 1807 * but we still get fallbacks from PUA code points as usual. 1808 * Therefore, we will also need to check that we don't overwrite 1809 * an early fallback with a later one. 1810 */ 1811 useFallback = cnv->useFallback; 1812 1813 for(i = 0; i < choiceCount && len <= 0; ++i) { 1814 uint32_t value; 1815 int32_t len2; 1816 int8_t cs0 = choices[i]; 1817 switch(cs0) { 1818 case ASCII: 1819 if(sourceChar <= 0x7f) { 1820 targetValue = (uint32_t)sourceChar; 1821 len = 1; 1822 cs = cs0; 1823 g = 0; 1824 } 1825 break; 1826 case ISO8859_1: 1827 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1828 targetValue = (uint32_t)sourceChar - 0x80; 1829 len = 1; 1830 cs = cs0; 1831 g = 2; 1832 } 1833 break; 1834 case HWKANA_7BIT: 1835 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1836 if(converterData->version==3) { 1837 /* JIS7: use G1 (SO) */ 1838 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1839 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1840 len = 1; 1841 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1842 g = 1; 1843 } else if(converterData->version==4) { 1844 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1845 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1846 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1847 len = 1; 1848 1849 cs = pFromU2022State->cs[0]; 1850 if(IS_JP_DBCS(cs)) { 1851 /* switch from a DBCS charset to JISX201 */ 1852 cs = (int8_t)JISX201; 1853 } 1854 /* else stay in the current G0 charset */ 1855 g = 0; 1856 } 1857 /* else do not use HWKANA_7BIT with other versions */ 1858 } 1859 break; 1860 case JISX201: 1861 /* G0 SBCS */ 1862 value = jisx201FromU(sourceChar); 1863 if(value <= 0x7f) { 1864 targetValue = value; 1865 len = 1; 1866 cs = cs0; 1867 g = 0; 1868 useFallback = FALSE; 1869 } 1870 break; 1871 case JISX208: 1872 /* G0 DBCS from Shift-JIS table */ 1873 len2 = MBCS_FROM_UCHAR32_ISO2022( 1874 converterData->myConverterArray[cs0], 1875 sourceChar, &value, 1876 useFallback, MBCS_OUTPUT_2); 1877 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1878 value = _2022FromSJIS(value); 1879 if(value != 0) { 1880 targetValue = value; 1881 len = len2; 1882 cs = cs0; 1883 g = 0; 1884 useFallback = FALSE; 1885 } 1886 } else if(len == 0 && useFallback && 1887 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1888 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1889 len = -2; 1890 cs = cs0; 1891 g = 0; 1892 useFallback = FALSE; 1893 } 1894 break; 1895 case ISO8859_7: 1896 /* G0 SBCS forced to 7-bit output */ 1897 len2 = MBCS_SINGLE_FROM_UCHAR32( 1898 converterData->myConverterArray[cs0], 1899 sourceChar, &value, 1900 useFallback); 1901 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1902 targetValue = value - 0x80; 1903 len = len2; 1904 cs = 
cs0; 1905 g = 2; 1906 useFallback = FALSE; 1907 } 1908 break; 1909 default: 1910 /* G0 DBCS */ 1911 len2 = MBCS_FROM_UCHAR32_ISO2022( 1912 converterData->myConverterArray[cs0], 1913 sourceChar, &value, 1914 useFallback, MBCS_OUTPUT_2); 1915 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1916 if(cs0 == KSC5601) { 1917 /* 1918 * Check for valid bytes for the encoding scheme. 1919 * This is necessary because the sub-converter (windows-949) 1920 * has a broader encoding scheme than is valid for 2022. 1921 */ 1922 value = _2022FromGR94DBCS(value); 1923 if(value == 0) { 1924 break; 1925 } 1926 } 1927 targetValue = value; 1928 len = len2; 1929 cs = cs0; 1930 g = 0; 1931 useFallback = FALSE; 1932 } 1933 break; 1934 } 1935 } 1936 1937 if(len != 0) { 1938 if(len < 0) { 1939 len = -len; /* fallback */ 1940 } 1941 outLen = 0; /* count output bytes */ 1942 1943 /* write SI if necessary (only for JIS7) */ 1944 if(pFromU2022State->g == 1 && g == 0) { 1945 buffer[outLen++] = UCNV_SI; 1946 pFromU2022State->g = 0; 1947 } 1948 1949 /* write the designation sequence if necessary */ 1950 if(cs != pFromU2022State->cs[g]) { 1951 int32_t escLen = escSeqCharsLen[cs]; 1952 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1953 outLen += escLen; 1954 pFromU2022State->cs[g] = cs; 1955 1956 /* invalidate the choices[] */ 1957 choiceCount = 0; 1958 } 1959 1960 /* write the shift sequence if necessary */ 1961 if(g != pFromU2022State->g) { 1962 switch(g) { 1963 /* case 0 handled before writing escapes */ 1964 case 1: 1965 buffer[outLen++] = UCNV_SO; 1966 pFromU2022State->g = 1; 1967 break; 1968 default: /* case 2 */ 1969 buffer[outLen++] = 0x1b; 1970 buffer[outLen++] = 0x4e; 1971 break; 1972 /* no case 3: no SS3 in ISO-2022-JP-x */ 1973 } 1974 } 1975 1976 /* write the output bytes */ 1977 if(len == 1) { 1978 buffer[outLen++] = (char)targetValue; 1979 } else /* len == 2 */ { 1980 buffer[outLen++] = (char)(targetValue >> 8); 1981 buffer[outLen++] = 
(char)targetValue; 1982 } 1983 } else { 1984 /* 1985 * if we cannot find the character after checking all codepages 1986 * then this is an error 1987 */ 1988 *err = U_INVALID_CHAR_FOUND; 1989 cnv->fromUChar32=sourceChar; 1990 break; 1991 } 1992 1993 if(sourceChar == CR || sourceChar == LF) { 1994 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1995 pFromU2022State->cs[2] = 0; 1996 choiceCount = 0; 1997 } 1998 1999 /* output outLen>0 bytes in buffer[] */ 2000 if(outLen == 1) { 2001 *target++ = buffer[0]; 2002 if(offsets) { 2003 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 2004 } 2005 } else if(outLen == 2 && (target + 2) <= targetLimit) { 2006 *target++ = buffer[0]; 2007 *target++ = buffer[1]; 2008 if(offsets) { 2009 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 2010 *offsets++ = sourceIndex; 2011 *offsets++ = sourceIndex; 2012 } 2013 } else { 2014 fromUWriteUInt8( 2015 cnv, 2016 buffer, outLen, 2017 &target, (const char *)targetLimit, 2018 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 2019 err); 2020 if(U_FAILURE(*err)) { 2021 break; 2022 } 2023 } 2024 } /* end if(myTargetIndex<myTargetLength) */ 2025 else{ 2026 *err =U_BUFFER_OVERFLOW_ERROR; 2027 break; 2028 } 2029 2030 }/* end while(mySourceIndex<mySourceLength) */ 2031 2032 /* 2033 * the end of the input stream and detection of truncated input 2034 * are handled by the framework, but for ISO-2022-JP conversion 2035 * we need to be in ASCII mode at the very end 2036 * 2037 * conditions: 2038 * successful 2039 * in SO mode or not in ASCII mode 2040 * end of input and no truncated input 2041 */ 2042 if( U_SUCCESS(*err) && 2043 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2044 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2045 ) { 2046 int32_t sourceIndex; 2047 2048 outLen = 0; 2049 2050 if(pFromU2022State->g != 0) { 2051 buffer[outLen++] = UCNV_SI; 
            pFromU2022State->g = 0;
        }

        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}

/*************** to unicode *******************/

/*
 * Converts ISO-2022-JP family bytes in [args->source, args->sourceLimit) to
 * UTF-16 written at args->target, filling args->offsets when it is non-NULL.
 *
 * Conversion state is kept in the converter's extraInfo (UConverterDataISO2022):
 *  - a partially read escape sequence (key != 0) is resumed at the "escape" label;
 *  - a DBCS lead byte saved in toUBytes[0] (toULength == 1) is resumed at
 *    the "getTrailByte" label.
 *
 * On return, args->source and args->target are advanced past what was consumed
 * and produced. *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full
 * and to U_ILLEGAL_ESCAPE_SEQUENCE when plain ISO-2022-JP (version 0) completes
 * an escape sequence after an empty segment; bytes that do not convert are
 * handed to toUnicodeCallback(), which is expected to set *err.
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    /* 32 bits wide: holds one byte, a byte pair, or a pair tagged with 0x10000 (see below) */
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                /* the braces keep these initialized locals out of the scope that "goto escape" jumps into */
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                    } else {
                        /* the trail byte is not in this buffer: save the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                } /* End of inner switch */
                break;
            } /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    /* mySourceChar > 0xff means that two source bytes were consumed for this character */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}


#if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
*      and SI to shift into single byte mode
*/
/*
 * IBM variant (used by the ISO-2022-KR fromUnicode function when version==1):
 * runs the conversion through the sub-converter myConverterData->currentConverter
 * with ucnv_MBCSFromUnicodeWithOffsets(). args->converter is swapped to the
 * sub-converter for the call and restored afterwards; the pending fromUChar32
 * is passed in and out, and on U_BUFFER_OVERFLOW_ERROR the sub-converter's
 * charErrorBuffer contents are moved to the public converter.
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){

    UConverter* saveConv = args->converter;
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
    args->converter=myConverterData->currentConverter;

    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;

    if(*err == U_BUFFER_OVERFLOW_ERROR) {
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
            uprv_memcpy(
                saveConv->charErrorBuffer,
                myConverterData->currentConverter->charErrorBuffer,
                myConverterData->currentConverter->charErrorBufferLength);
        }
        saveConv->charErrorBufferLength =
myConverterData->currentConverter->charErrorBufferLength; 2369 myConverterData->currentConverter->charErrorBufferLength = 0; 2370 } 2371 args->converter=saveConv; 2372 } 2373 2374 static void 2375 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2376 2377 const UChar *source = args->source; 2378 const UChar *sourceLimit = args->sourceLimit; 2379 unsigned char *target = (unsigned char *) args->target; 2380 unsigned char *targetLimit = (unsigned char *) args->targetLimit; 2381 int32_t* offsets = args->offsets; 2382 uint32_t targetByteUnit = 0x0000; 2383 UChar32 sourceChar = 0x0000; 2384 UBool isTargetByteDBCS; 2385 UBool oldIsTargetByteDBCS; 2386 UConverterDataISO2022 *converterData; 2387 UConverterSharedData* sharedData; 2388 UBool useFallback; 2389 int32_t length =0; 2390 2391 converterData=(UConverterDataISO2022*)args->converter->extraInfo; 2392 /* if the version is 1 then the user is requesting 2393 * conversion with ibm-25546 pass the arguments to 2394 * MBCS converter and return 2395 */ 2396 if(converterData->version==1){ 2397 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2398 return; 2399 } 2400 2401 /* initialize data */ 2402 sharedData = converterData->currentConverter->sharedData; 2403 useFallback = args->converter->useFallback; 2404 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; 2405 oldIsTargetByteDBCS = isTargetByteDBCS; 2406 2407 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; 2408 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { 2409 goto getTrail; 2410 } 2411 while(source < sourceLimit){ 2412 2413 targetByteUnit = missingCharMarker; 2414 2415 if(target < (unsigned char*) args->targetLimit){ 2416 sourceChar = *source++; 2417 2418 /* do not convert SO/SI/ESC */ 2419 if(IS_2022_CONTROL(sourceChar)) { 2420 /* callback(illegal) */ 2421 *err=U_ILLEGAL_CHAR_FOUND; 2422 args->converter->fromUChar32=sourceChar; 2423 break; 2424 } 2425 
2426 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); 2427 if(length < 0) { 2428 length = -length; /* fallback */ 2429 } 2430 /* only DBCS or SBCS characters are expected*/ 2431 /* DB characters with high bit set to 1 are expected */ 2432 if( length > 2 || length==0 || 2433 (length == 1 && targetByteUnit > 0x7f) || 2434 (length == 2 && 2435 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 2436 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 2437 ) { 2438 targetByteUnit=missingCharMarker; 2439 } 2440 if (targetByteUnit != missingCharMarker){ 2441 2442 oldIsTargetByteDBCS = isTargetByteDBCS; 2443 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); 2444 /* append the shift sequence */ 2445 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ 2446 2447 if (isTargetByteDBCS) 2448 *target++ = UCNV_SO; 2449 else 2450 *target++ = UCNV_SI; 2451 if(offsets) 2452 *(offsets++) = (int32_t)(source - args->source-1); 2453 } 2454 /* write the targetUniChar to target */ 2455 if(targetByteUnit <= 0x00FF){ 2456 if( target < targetLimit){ 2457 *(target++) = (unsigned char) targetByteUnit; 2458 if(offsets){ 2459 *(offsets++) = (int32_t)(source - args->source-1); 2460 } 2461 2462 }else{ 2463 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); 2464 *err = U_BUFFER_OVERFLOW_ERROR; 2465 } 2466 }else{ 2467 if(target < targetLimit){ 2468 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); 2469 if(offsets){ 2470 *(offsets++) = (int32_t)(source - args->source-1); 2471 } 2472 if(target < targetLimit){ 2473 *(target++) =(unsigned char) (targetByteUnit -0x80); 2474 if(offsets){ 2475 *(offsets++) = (int32_t)(source - args->source-1); 2476 } 2477 }else{ 2478 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); 2479 *err = U_BUFFER_OVERFLOW_ERROR; 2480 } 2481 }else{ 2482 
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); 2483 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); 2484 *err = U_BUFFER_OVERFLOW_ERROR; 2485 } 2486 } 2487 2488 } 2489 else{ 2490 /* oops.. the code point is unassingned 2491 * set the error and reason 2492 */ 2493 2494 /*check if the char is a First surrogate*/ 2495 if(U16_IS_SURROGATE(sourceChar)) { 2496 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2497 getTrail: 2498 /*look ahead to find the trail surrogate*/ 2499 if(source < sourceLimit) { 2500 /* test the following code unit */ 2501 UChar trail=(UChar) *source; 2502 if(U16_IS_TRAIL(trail)) { 2503 source++; 2504 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2505 *err = U_INVALID_CHAR_FOUND; 2506 /* convert this surrogate code point */ 2507 /* exit this condition tree */ 2508 } else { 2509 /* this is an unmatched lead code unit (1st surrogate) */ 2510 /* callback(illegal) */ 2511 *err=U_ILLEGAL_CHAR_FOUND; 2512 } 2513 } else { 2514 /* no more input */ 2515 *err = U_ZERO_ERROR; 2516 } 2517 } else { 2518 /* this is an unmatched trail code unit (2nd surrogate) */ 2519 /* callback(illegal) */ 2520 *err=U_ILLEGAL_CHAR_FOUND; 2521 } 2522 } else { 2523 /* callback(unassigned) for a BMP code point */ 2524 *err = U_INVALID_CHAR_FOUND; 2525 } 2526 2527 args->converter->fromUChar32=sourceChar; 2528 break; 2529 } 2530 } /* end if(myTargetIndex<myTargetLength) */ 2531 else{ 2532 *err =U_BUFFER_OVERFLOW_ERROR; 2533 break; 2534 } 2535 2536 }/* end while(mySourceIndex<mySourceLength) */ 2537 2538 /* 2539 * the end of the input stream and detection of truncated input 2540 * are handled by the framework, but for ISO-2022-KR conversion 2541 * we need to be in ASCII mode at the very end 2542 * 2543 * conditions: 2544 * successful 2545 * not in ASCII mode 2546 * end of input and no truncated input 2547 */ 2548 if( U_SUCCESS(*err) && 2549 
isTargetByteDBCS && 2550 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 2551 ) { 2552 int32_t sourceIndex; 2553 2554 /* we are switching to ASCII */ 2555 isTargetByteDBCS=FALSE; 2556 2557 /* get the source index of the last input character */ 2558 /* 2559 * TODO this would be simpler and more reliable if we used a pair 2560 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2561 * so that we could simply use the prevSourceIndex here; 2562 * this code gives an incorrect result for the rare case of an unmatched 2563 * trail surrogate that is alone in the last buffer of the text stream 2564 */ 2565 sourceIndex=(int32_t)(source-args->source); 2566 if(sourceIndex>0) { 2567 --sourceIndex; 2568 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2569 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2570 ) { 2571 --sourceIndex; 2572 } 2573 } else { 2574 sourceIndex=-1; 2575 } 2576 2577 fromUWriteUInt8( 2578 args->converter, 2579 SHIFT_IN_STR, 1, 2580 &target, (const char *)targetLimit, 2581 &offsets, sourceIndex, 2582 err); 2583 } 2584 2585 /*save the state and return */ 2586 args->source = source; 2587 args->target = (char*)target; 2588 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; 2589 } 2590 2591 /************************ To Unicode ***************************************/ 2592 2593 static void 2594 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2595 UErrorCode* err){ 2596 char const* sourceStart; 2597 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2598 2599 UConverterToUnicodeArgs subArgs; 2600 int32_t minArgsSize; 2601 2602 /* set up the subconverter arguments */ 2603 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2604 minArgsSize = args->size; 2605 } else { 2606 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); 2607 } 2608 2609 uprv_memcpy(&subArgs, args, minArgsSize); 2610 subArgs.size = (uint16_t)minArgsSize; 2611 subArgs.converter = 
myData->currentConverter; 2612 2613 /* remember the original start of the input for offsets */ 2614 sourceStart = args->source; 2615 2616 if(myData->key != 0) { 2617 /* continue with a partial escape sequence */ 2618 goto escape; 2619 } 2620 2621 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2622 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2623 subArgs.source = args->source; 2624 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2625 if(subArgs.source != subArgs.sourceLimit) { 2626 /* 2627 * get the current partial byte sequence 2628 * 2629 * it needs to be moved between the public and the subconverter 2630 * so that the conversion framework, which only sees the public 2631 * converter, can handle truncated and illegal input etc. 2632 */ 2633 if(args->converter->toULength > 0) { 2634 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2635 } 2636 subArgs.converter->toULength = args->converter->toULength; 2637 2638 /* 2639 * Convert up to the end of the input, or to before the next escape character. 2640 * Does not handle conversion extensions because the preToU[] state etc. 2641 * is not copied. 
2642 */ 2643 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2644 2645 if(args->offsets != NULL && sourceStart != args->source) { 2646 /* update offsets to base them on the actual start of the input */ 2647 int32_t *offsets = args->offsets; 2648 UChar *target = args->target; 2649 int32_t delta = (int32_t)(args->source - sourceStart); 2650 while(target < subArgs.target) { 2651 if(*offsets >= 0) { 2652 *offsets += delta; 2653 } 2654 ++offsets; 2655 ++target; 2656 } 2657 } 2658 args->source = subArgs.source; 2659 args->target = subArgs.target; 2660 args->offsets = subArgs.offsets; 2661 2662 /* copy input/error/overflow buffers */ 2663 if(subArgs.converter->toULength > 0) { 2664 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2665 } 2666 args->converter->toULength = subArgs.converter->toULength; 2667 2668 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2669 if(subArgs.converter->UCharErrorBufferLength > 0) { 2670 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2671 subArgs.converter->UCharErrorBufferLength); 2672 } 2673 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2674 subArgs.converter->UCharErrorBufferLength = 0; 2675 } 2676 } 2677 2678 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2679 return; 2680 } 2681 2682 escape: 2683 changeState_2022(args->converter, 2684 &(args->source), 2685 args->sourceLimit, 2686 ISO_2022_KR, 2687 err); 2688 } 2689 } 2690 2691 static void 2692 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2693 UErrorCode* err){ 2694 char tempBuf[2]; 2695 const char *mySource = ( char *) args->source; 2696 UChar *myTarget = args->target; 2697 const char *mySourceLimit = args->sourceLimit; 2698 UChar32 targetUniChar = 0x0000; 2699 UChar mySourceChar = 0x0000; 2700 UConverterDataISO2022* myData; 2701 UConverterSharedData* sharedData ; 2702 UBool useFallback; 2703 2704 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2705 if(myData->version==1){ 2706 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2707 return; 2708 } 2709 2710 /* initialize state */ 2711 sharedData = myData->currentConverter->sharedData; 2712 useFallback = args->converter->useFallback; 2713 2714 if(myData->key != 0) { 2715 /* continue with a partial escape sequence */ 2716 goto escape; 2717 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2718 /* continue with a partial double-byte character */ 2719 mySourceChar = args->converter->toUBytes[0]; 2720 args->converter->toULength = 0; 2721 goto getTrailByte; 2722 } 2723 2724 while(mySource< mySourceLimit){ 2725 2726 if(myTarget < args->targetLimit){ 2727 2728 mySourceChar= (unsigned char) *mySource++; 2729 2730 if(mySourceChar==UCNV_SI){ 2731 myData->toU2022State.g = 0; 2732 if (myData->isEmptySegment) { 2733 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2734 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2735 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2736 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2737 args->converter->toULength = 1; 2738 args->target = myTarget; 2739 args->source = mySource; 2740 return; 2741 } 2742 /*consume the source */ 2743 continue; 2744 }else if(mySourceChar==UCNV_SO){ 2745 myData->toU2022State.g = 1; 2746 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2747 /*consume the source */ 2748 continue; 2749 }else if(mySourceChar==ESC_2022){ 2750 mySource--; 2751 escape: 2752 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2753 changeState_2022(args->converter,&(mySource), 2754 mySourceLimit, ISO_2022_KR, err); 2755 if(U_FAILURE(*err)){ 2756 args->target = myTarget; 2757 args->source = mySource; 2758 return; 2759 } 2760 continue; 2761 } 2762 2763 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2764 if(myData->toU2022State.g == 1) { 2765 if(mySource < mySourceLimit) { 2766 int leadIsOk, trailIsOk; 2767 uint8_t trailByte; 2768 getTrailByte: 2769 targetUniChar = missingCharMarker; 2770 trailByte = (uint8_t)*mySource; 2771 /* 2772 * Ticket 5691: consistent illegal sequences: 2773 * - We include at least the first byte in the illegal sequence. 2774 * - If any of the non-initial bytes could be the start of a character, 2775 * we stop the illegal sequence before the first one of those. 2776 * 2777 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2778 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2779 * Otherwise we convert or report the pair of bytes. 2780 */ 2781 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2782 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2783 if (leadIsOk && trailIsOk) { 2784 ++mySource; 2785 tempBuf[0] = (char)(mySourceChar + 0x80); 2786 tempBuf[1] = (char)(trailByte + 0x80); 2787 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2788 mySourceChar = (mySourceChar << 8) | trailByte; 2789 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2790 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2791 ++mySource; 2792 /* add another bit so that the code below writes 2 bytes in case of error */ 2793 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2794 } 2795 } else { 2796 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2797 args->converter->toULength = 1; 2798 break; 2799 } 2800 } 2801 else if(mySourceChar <= 0x7f) { 2802 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2803 } else { 2804 targetUniChar = 0xffff; 2805 } 2806 if(targetUniChar < 0xfffe){ 2807 if(args->offsets) { 2808 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2809 } 2810 *(myTarget++)=(UChar)targetUniChar; 2811 } 2812 else { 2813 /* Call the callback function*/ 2814 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2815 break; 2816 } 2817 } 2818 else{ 2819 *err =U_BUFFER_OVERFLOW_ERROR; 2820 break; 2821 } 2822 } 2823 args->target = myTarget; 2824 args->source = mySource; 2825 } 2826 2827 /*************************** END ISO2022-KR *********************************/ 2828 2829 /*************************** ISO-2022-CN ********************************* 2830 * 2831 * Rules for ISO-2022-CN Encoding: 2832 * i) The designator sequence must appear once on a line before any instance 2833 * of character set it designates. 2834 * ii) If two lines contain characters from the same character set, both lines 2835 * must include the designator sequence. 2836 * iii) Once the designator sequence is known, a shifting sequence has to be found 2837 * to invoke the shifting 2838 * iv) All lines start in ASCII and end in ASCII. 2839 * v) Four shifting sequences are employed for this purpose: 2840 * 2841 * Sequcence ASCII Eq Charsets 2842 * ---------- ------- --------- 2843 * SI <SI> US-ASCII 2844 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2845 * SS2 <ESC>N CNS-11643-1992 Plane 2 2846 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2847 * 2848 * vi) 2849 * SOdesignator : ESC "$" ")" finalchar_for_SO 2850 * SS2designator : ESC "$" "*" finalchar_for_SS2 2851 * SS3designator : ESC "$" "+" finalchar_for_SS3 2852 * 2853 * ESC $ ) A Indicates the bytes following SO are Chinese 2854 * characters as defined in GB 2312-80, until 2855 * another SOdesignation appears 2856 * 2857 * 2858 * ESC $ ) E Indicates the bytes following SO are as defined 2859 * in ISO-IR-165 (for details, see section 2.1), 2860 * until another SOdesignation appears 2861 * 2862 * ESC $ ) G Indicates the bytes following SO are as defined 2863 * in CNS 11643-plane-1, until another 2864 * SOdesignation appears 2865 * 2866 * ESC $ * H Indicates the two 
bytes immediately following 2867 * SS2 is a Chinese character as defined in CNS 2868 * 11643-plane-2, until another SS2designation 2869 * appears 2870 * (Meaning <ESC>N must preceed every 2 byte 2871 * sequence.) 2872 * 2873 * ESC $ + I Indicates the immediate two bytes following SS3 2874 * is a Chinese character as defined in CNS 2875 * 11643-plane-3, until another SS3designation 2876 * appears 2877 * (Meaning <ESC>O must preceed every 2 byte 2878 * sequence.) 2879 * 2880 * ESC $ + J Indicates the immediate two bytes following SS3 2881 * is a Chinese character as defined in CNS 2882 * 11643-plane-4, until another SS3designation 2883 * appears 2884 * (In English: <ESC>O must preceed every 2 byte 2885 * sequence.) 2886 * 2887 * ESC $ + K Indicates the immediate two bytes following SS3 2888 * is a Chinese character as defined in CNS 2889 * 11643-plane-5, until another SS3designation 2890 * appears 2891 * 2892 * ESC $ + L Indicates the immediate two bytes following SS3 2893 * is a Chinese character as defined in CNS 2894 * 11643-plane-6, until another SS3designation 2895 * appears 2896 * 2897 * ESC $ + M Indicates the immediate two bytes following SS3 2898 * is a Chinese character as defined in CNS 2899 * 11643-plane-7, until another SS3designation 2900 * appears 2901 * 2902 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2903 * has its own designation information before any Chinese characters 2904 * appear 2905 * 2906 */ 2907 2908 /* The following are defined this way to make the strings truly readonly */ 2909 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2910 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2911 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2912 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2913 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2914 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2915 static const char 
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2916 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2917 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2918 2919 /********************** ISO2022-CN Data **************************/ 2920 static const char* const escSeqCharsCN[10] ={ 2921 SHIFT_IN_STR, /* 0 ASCII */ 2922 GB_2312_80_STR, /* 1 GB2312_1 */ 2923 ISO_IR_165_STR, /* 2 ISO_IR_165 */ 2924 CNS_11643_1992_Plane_1_STR, 2925 CNS_11643_1992_Plane_2_STR, 2926 CNS_11643_1992_Plane_3_STR, 2927 CNS_11643_1992_Plane_4_STR, 2928 CNS_11643_1992_Plane_5_STR, 2929 CNS_11643_1992_Plane_6_STR, 2930 CNS_11643_1992_Plane_7_STR 2931 }; 2932 2933 static void 2934 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2935 UConverter *cnv = args->converter; 2936 UConverterDataISO2022 *converterData; 2937 ISO2022State *pFromU2022State; 2938 uint8_t *target = (uint8_t *) args->target; 2939 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2940 const UChar* source = args->source; 2941 const UChar* sourceLimit = args->sourceLimit; 2942 int32_t* offsets = args->offsets; 2943 UChar32 sourceChar; 2944 char buffer[8]; 2945 int32_t len; 2946 int8_t choices[3]; 2947 int32_t choiceCount; 2948 uint32_t targetValue = 0; 2949 UBool useFallback; 2950 2951 /* set up the state */ 2952 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2953 pFromU2022State = &converterData->fromU2022State; 2954 2955 choiceCount = 0; 2956 2957 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2958 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2959 goto getTrail; 2960 } 2961 2962 while( source < sourceLimit){ 2963 if(target < targetLimit){ 2964 2965 sourceChar = *(source++); 2966 /*check if the char is a First surrogate*/ 2967 if(U16_IS_SURROGATE(sourceChar)) { 2968 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2969 getTrail: 2970 /*look ahead to find the trail 
surrogate*/ 2971 if(source < sourceLimit) { 2972 /* test the following code unit */ 2973 UChar trail=(UChar) *source; 2974 if(U16_IS_TRAIL(trail)) { 2975 source++; 2976 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2977 cnv->fromUChar32=0x00; 2978 /* convert this supplementary code point */ 2979 /* exit this condition tree */ 2980 } else { 2981 /* this is an unmatched lead code unit (1st surrogate) */ 2982 /* callback(illegal) */ 2983 *err=U_ILLEGAL_CHAR_FOUND; 2984 cnv->fromUChar32=sourceChar; 2985 break; 2986 } 2987 } else { 2988 /* no more input */ 2989 cnv->fromUChar32=sourceChar; 2990 break; 2991 } 2992 } else { 2993 /* this is an unmatched trail code unit (2nd surrogate) */ 2994 /* callback(illegal) */ 2995 *err=U_ILLEGAL_CHAR_FOUND; 2996 cnv->fromUChar32=sourceChar; 2997 break; 2998 } 2999 } 3000 3001 /* do the conversion */ 3002 if(sourceChar <= 0x007f ){ 3003 /* do not convert SO/SI/ESC */ 3004 if(IS_2022_CONTROL(sourceChar)) { 3005 /* callback(illegal) */ 3006 *err=U_ILLEGAL_CHAR_FOUND; 3007 cnv->fromUChar32=sourceChar; 3008 break; 3009 } 3010 3011 /* US-ASCII */ 3012 if(pFromU2022State->g == 0) { 3013 buffer[0] = (char)sourceChar; 3014 len = 1; 3015 } else { 3016 buffer[0] = UCNV_SI; 3017 buffer[1] = (char)sourceChar; 3018 len = 2; 3019 pFromU2022State->g = 0; 3020 choiceCount = 0; 3021 } 3022 if(sourceChar == CR || sourceChar == LF) { 3023 /* reset the state at the end of a line */ 3024 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 3025 choiceCount = 0; 3026 } 3027 } 3028 else{ 3029 /* convert U+0080..U+10ffff */ 3030 int32_t i; 3031 int8_t cs, g; 3032 3033 if(choiceCount == 0) { 3034 /* try the current SO/G1 converter first */ 3035 choices[0] = pFromU2022State->cs[1]; 3036 3037 /* default to GB2312_1 if none is designated yet */ 3038 if(choices[0] == 0) { 3039 choices[0] = GB2312_1; 3040 } 3041 3042 if(converterData->version == 0) { 3043 /* ISO-2022-CN */ 3044 3045 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3046 if(choices[0] == GB2312_1) { 3047 choices[1] = (int8_t)CNS_11643_1; 3048 } else { 3049 choices[1] = (int8_t)GB2312_1; 3050 } 3051 3052 choiceCount = 2; 3053 } else if (converterData->version == 1) { 3054 /* ISO-2022-CN-EXT */ 3055 3056 /* try one of the other converters */ 3057 switch(choices[0]) { 3058 case GB2312_1: 3059 choices[1] = (int8_t)CNS_11643_1; 3060 choices[2] = (int8_t)ISO_IR_165; 3061 break; 3062 case ISO_IR_165: 3063 choices[1] = (int8_t)GB2312_1; 3064 choices[2] = (int8_t)CNS_11643_1; 3065 break; 3066 default: /* CNS_11643_x */ 3067 choices[1] = (int8_t)GB2312_1; 3068 choices[2] = (int8_t)ISO_IR_165; 3069 break; 3070 } 3071 3072 choiceCount = 3; 3073 } else { 3074 choices[0] = (int8_t)CNS_11643_1; 3075 choices[1] = (int8_t)GB2312_1; 3076 } 3077 } 3078 3079 cs = g = 0; 3080 /* 3081 * len==0: no mapping found yet 3082 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3083 * len>0: found a roundtrip result, done 3084 */ 3085 len = 0; 3086 /* 3087 * We will turn off useFallback after finding a fallback, 3088 * but we still get fallbacks from PUA code points as usual. 3089 * Therefore, we will also need to check that we don't overwrite 3090 * an early fallback with a later one. 
3091 */ 3092 useFallback = cnv->useFallback; 3093 3094 for(i = 0; i < choiceCount && len <= 0; ++i) { 3095 int8_t cs0 = choices[i]; 3096 if(cs0 > 0) { 3097 uint32_t value; 3098 int32_t len2; 3099 if(cs0 >= CNS_11643_0) { 3100 len2 = MBCS_FROM_UCHAR32_ISO2022( 3101 converterData->myConverterArray[CNS_11643], 3102 sourceChar, 3103 &value, 3104 useFallback, 3105 MBCS_OUTPUT_3); 3106 if(len2 == 3 || (len2 == -3 && len == 0)) { 3107 targetValue = value; 3108 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3109 if(len2 >= 0) { 3110 len = 2; 3111 } else { 3112 len = -2; 3113 useFallback = FALSE; 3114 } 3115 if(cs == CNS_11643_1) { 3116 g = 1; 3117 } else if(cs == CNS_11643_2) { 3118 g = 2; 3119 } else /* plane 3..7 */ if(converterData->version == 1) { 3120 g = 3; 3121 } else { 3122 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3123 len = 0; 3124 } 3125 } 3126 } else { 3127 /* GB2312_1 or ISO-IR-165 */ 3128 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3129 len2 = MBCS_FROM_UCHAR32_ISO2022( 3130 converterData->myConverterArray[cs0], 3131 sourceChar, 3132 &value, 3133 useFallback, 3134 MBCS_OUTPUT_2); 3135 if(len2 == 2 || (len2 == -2 && len == 0)) { 3136 targetValue = value; 3137 len = len2; 3138 cs = cs0; 3139 g = 1; 3140 useFallback = FALSE; 3141 } 3142 } 3143 } 3144 } 3145 3146 if(len != 0) { 3147 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3148 3149 /* write the designation sequence if necessary */ 3150 if(cs != pFromU2022State->cs[g]) { 3151 if(cs < CNS_11643) { 3152 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3153 } else { 3154 U_ASSERT(cs >= CNS_11643_1); 3155 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3156 } 3157 len = 4; 3158 pFromU2022State->cs[g] = cs; 3159 if(g == 1) { 3160 /* changing the SO/G1 charset invalidates the choices[] */ 3161 choiceCount = 0; 3162 } 3163 } 3164 3165 /* write the shift sequence if necessary */ 3166 if(g != pFromU2022State->g) { 3167 switch(g) { 3168 case 1: 3169 
buffer[len++] = UCNV_SO; 3170 3171 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3172 pFromU2022State->g = 1; 3173 break; 3174 case 2: 3175 buffer[len++] = 0x1b; 3176 buffer[len++] = 0x4e; 3177 break; 3178 default: /* case 3 */ 3179 buffer[len++] = 0x1b; 3180 buffer[len++] = 0x4f; 3181 break; 3182 } 3183 } 3184 3185 /* write the two output bytes */ 3186 buffer[len++] = (char)(targetValue >> 8); 3187 buffer[len++] = (char)targetValue; 3188 } else { 3189 /* if we cannot find the character after checking all codepages 3190 * then this is an error 3191 */ 3192 *err = U_INVALID_CHAR_FOUND; 3193 cnv->fromUChar32=sourceChar; 3194 break; 3195 } 3196 } 3197 3198 /* output len>0 bytes in buffer[] */ 3199 if(len == 1) { 3200 *target++ = buffer[0]; 3201 if(offsets) { 3202 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3203 } 3204 } else if(len == 2 && (target + 2) <= targetLimit) { 3205 *target++ = buffer[0]; 3206 *target++ = buffer[1]; 3207 if(offsets) { 3208 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3209 *offsets++ = sourceIndex; 3210 *offsets++ = sourceIndex; 3211 } 3212 } else { 3213 fromUWriteUInt8( 3214 cnv, 3215 buffer, len, 3216 &target, (const char *)targetLimit, 3217 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3218 err); 3219 if(U_FAILURE(*err)) { 3220 break; 3221 } 3222 } 3223 } /* end if(myTargetIndex<myTargetLength) */ 3224 else{ 3225 *err =U_BUFFER_OVERFLOW_ERROR; 3226 break; 3227 } 3228 3229 }/* end while(mySourceIndex<mySourceLength) */ 3230 3231 /* 3232 * the end of the input stream and detection of truncated input 3233 * are handled by the framework, but for ISO-2022-CN conversion 3234 * we need to be in ASCII mode at the very end 3235 * 3236 * conditions: 3237 * successful 3238 * not in ASCII mode 3239 * end of input and no truncated input 3240 */ 3241 if( U_SUCCESS(*err) && 3242 pFromU2022State->g!=0 && 3243 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3244 ) { 3245 int32_t sourceIndex; 3246 3247 /* we are switching to ASCII */ 3248 pFromU2022State->g=0; 3249 3250 /* get the source index of the last input character */ 3251 /* 3252 * TODO this would be simpler and more reliable if we used a pair 3253 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3254 * so that we could simply use the prevSourceIndex here; 3255 * this code gives an incorrect result for the rare case of an unmatched 3256 * trail surrogate that is alone in the last buffer of the text stream 3257 */ 3258 sourceIndex=(int32_t)(source-args->source); 3259 if(sourceIndex>0) { 3260 --sourceIndex; 3261 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3262 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3263 ) { 3264 --sourceIndex; 3265 } 3266 } else { 3267 sourceIndex=-1; 3268 } 3269 3270 fromUWriteUInt8( 3271 cnv, 3272 SHIFT_IN_STR, 1, 3273 &target, (const char *)targetLimit, 3274 &offsets, sourceIndex, 3275 err); 3276 } 3277 3278 /*save the state and return */ 3279 args->source = source; 3280 args->target = (char*)target; 3281 } 3282 3283 3284 static void 3285 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3286 UErrorCode* err){ 3287 char tempBuf[3]; 3288 const char *mySource = (char *) args->source; 3289 UChar *myTarget = args->target; 3290 const char *mySourceLimit = args->sourceLimit; 3291 uint32_t targetUniChar = 0x0000; 3292 uint32_t mySourceChar = 0x0000; 3293 UConverterDataISO2022* myData; 3294 ISO2022State *pToU2022State; 3295 3296 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3297 pToU2022State = &myData->toU2022State; 3298 3299 if(myData->key != 0) { 3300 /* continue with a partial escape sequence */ 3301 goto escape; 3302 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3303 /* continue with a partial double-byte character */ 3304 mySourceChar = args->converter->toUBytes[0]; 3305 
args->converter->toULength = 0; 3306 targetUniChar = missingCharMarker; 3307 goto getTrailByte; 3308 } 3309 3310 while(mySource < mySourceLimit){ 3311 3312 targetUniChar =missingCharMarker; 3313 3314 if(myTarget < args->targetLimit){ 3315 3316 mySourceChar= (unsigned char) *mySource++; 3317 3318 switch(mySourceChar){ 3319 case UCNV_SI: 3320 pToU2022State->g=0; 3321 if (myData->isEmptySegment) { 3322 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3323 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3324 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3325 args->converter->toUBytes[0] = mySourceChar; 3326 args->converter->toULength = 1; 3327 args->target = myTarget; 3328 args->source = mySource; 3329 return; 3330 } 3331 continue; 3332 3333 case UCNV_SO: 3334 if(pToU2022State->cs[1] != 0) { 3335 pToU2022State->g=1; 3336 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3337 continue; 3338 } else { 3339 /* illegal to have SO before a matching designator */ 3340 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3341 break; 3342 } 3343 3344 case ESC_2022: 3345 mySource--; 3346 escape: 3347 { 3348 const char * mySourceBefore = mySource; 3349 int8_t toULengthBefore = args->converter->toULength; 3350 3351 changeState_2022(args->converter,&(mySource), 3352 mySourceLimit, ISO_2022_CN,err); 3353 3354 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3355 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3356 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3357 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3358 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3359 } 3360 } 3361 3362 /* invalid or illegal escape sequence */ 3363 if(U_FAILURE(*err)){ 3364 args->target = myTarget; 3365 args->source = mySource; 3366 myData->isEmptySegment = FALSE; /* Reset to 
avoid future spurious errors */ 3367 return; 3368 } 3369 continue; 3370 3371 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3372 3373 case CR: 3374 /*falls through*/ 3375 case LF: 3376 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3377 /* falls through */ 3378 default: 3379 /* convert one or two bytes */ 3380 myData->isEmptySegment = FALSE; 3381 if(pToU2022State->g != 0) { 3382 if(mySource < mySourceLimit) { 3383 UConverterSharedData *cnv; 3384 StateEnum tempState; 3385 int32_t tempBufLen; 3386 int leadIsOk, trailIsOk; 3387 uint8_t trailByte; 3388 getTrailByte: 3389 trailByte = (uint8_t)*mySource; 3390 /* 3391 * Ticket 5691: consistent illegal sequences: 3392 * - We include at least the first byte in the illegal sequence. 3393 * - If any of the non-initial bytes could be the start of a character, 3394 * we stop the illegal sequence before the first one of those. 3395 * 3396 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3397 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3398 * Otherwise we convert or report the pair of bytes. 
3399 */ 3400 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3401 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3402 if (leadIsOk && trailIsOk) { 3403 ++mySource; 3404 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3405 if(tempState >= CNS_11643_0) { 3406 cnv = myData->myConverterArray[CNS_11643]; 3407 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3408 tempBuf[1] = (char) (mySourceChar); 3409 tempBuf[2] = (char) trailByte; 3410 tempBufLen = 3; 3411 3412 }else{ 3413 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3414 cnv = myData->myConverterArray[tempState]; 3415 tempBuf[0] = (char) (mySourceChar); 3416 tempBuf[1] = (char) trailByte; 3417 tempBufLen = 2; 3418 } 3419 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3420 mySourceChar = (mySourceChar << 8) | trailByte; 3421 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3422 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3423 ++mySource; 3424 /* add another bit so that the code below writes 2 bytes in case of error */ 3425 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3426 } 3427 if(pToU2022State->g>=2) { 3428 /* return from a single-shift state to the previous one */ 3429 pToU2022State->g=pToU2022State->prevG; 3430 } 3431 } else { 3432 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3433 args->converter->toULength = 1; 3434 goto endloop; 3435 } 3436 } 3437 else{ 3438 if(mySourceChar <= 0x7f) { 3439 targetUniChar = (UChar) mySourceChar; 3440 } 3441 } 3442 break; 3443 } 3444 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3445 if(args->offsets){ 3446 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3447 } 3448 *(myTarget++)=(UChar)targetUniChar; 3449 } 3450 else if(targetUniChar > missingCharMarker){ 3451 /* disassemble the surrogate pair and write to output*/ 3452 targetUniChar-=0x0010000; 3453 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3454 if(args->offsets){ 3455 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3456 } 3457 ++myTarget; 3458 if(myTarget< args->targetLimit){ 3459 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3460 if(args->offsets){ 3461 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3462 } 3463 ++myTarget; 3464 }else{ 3465 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3466 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3467 } 3468 3469 } 3470 else{ 3471 /* Call the callback function*/ 3472 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3473 break; 3474 } 3475 } 3476 else{ 3477 *err =U_BUFFER_OVERFLOW_ERROR; 3478 break; 3479 } 3480 } 3481 endloop: 3482 args->target = myTarget; 3483 args->source = mySource; 3484 } 3485 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ 3486 3487 static void 3488 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3489 UConverter *cnv = args->converter; 3490 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3491 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3492 char *p, *subchar; 3493 char buffer[8]; 3494 int32_t length; 3495 3496 subchar=(char *)cnv->subChars; 3497 length=cnv->subCharLen; /* assume length==1 for most variants */ 3498 3499 p = buffer; 3500 switch(myConverterData->locale[0]){ 3501 case 'j': 3502 { 3503 int8_t cs; 3504 3505 if(pFromU2022State->g == 1) { 3506 /* JIS7: switch from G1 to G0 */ 3507 pFromU2022State->g = 0; 3508 *p++ = UCNV_SI; 3509 } 3510 3511 cs = pFromU2022State->cs[0]; 3512 if(cs != ASCII && 
cs != JISX201) { 3513 /* not in ASCII or JIS X 0201: switch to ASCII */ 3514 pFromU2022State->cs[0] = (int8_t)ASCII; 3515 *p++ = '\x1b'; 3516 *p++ = '\x28'; 3517 *p++ = '\x42'; 3518 } 3519 3520 *p++ = subchar[0]; 3521 break; 3522 } 3523 case 'c': 3524 if(pFromU2022State->g != 0) { 3525 /* not in ASCII mode: switch to ASCII */ 3526 pFromU2022State->g = 0; 3527 *p++ = UCNV_SI; 3528 } 3529 *p++ = subchar[0]; 3530 break; 3531 case 'k': 3532 if(myConverterData->version == 0) { 3533 if(length == 1) { 3534 if((UBool)args->converter->fromUnicodeStatus) { 3535 /* in DBCS mode: switch to SBCS */ 3536 args->converter->fromUnicodeStatus = 0; 3537 *p++ = UCNV_SI; 3538 } 3539 *p++ = subchar[0]; 3540 } else /* length == 2*/ { 3541 if(!(UBool)args->converter->fromUnicodeStatus) { 3542 /* in SBCS mode: switch to DBCS */ 3543 args->converter->fromUnicodeStatus = 1; 3544 *p++ = UCNV_SO; 3545 } 3546 *p++ = subchar[0]; 3547 *p++ = subchar[1]; 3548 } 3549 break; 3550 } else { 3551 /* save the subconverter's substitution string */ 3552 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3553 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3554 3555 /* set our substitution string into the subconverter */ 3556 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3557 myConverterData->currentConverter->subCharLen = (int8_t)length; 3558 3559 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3560 args->converter = myConverterData->currentConverter; 3561 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3562 ucnv_cbFromUWriteSub(args, 0, err); 3563 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3564 args->converter = cnv; 3565 3566 /* restore the subconverter's substitution string */ 3567 myConverterData->currentConverter->subChars = currentSubChars; 3568 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3569 3570 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3571 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3572 uprv_memcpy( 3573 cnv->charErrorBuffer, 3574 myConverterData->currentConverter->charErrorBuffer, 3575 myConverterData->currentConverter->charErrorBufferLength); 3576 } 3577 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3578 myConverterData->currentConverter->charErrorBufferLength = 0; 3579 } 3580 return; 3581 } 3582 default: 3583 /* not expected */ 3584 break; 3585 } 3586 ucnv_cbFromUWriteBytes(args, 3587 buffer, (int32_t)(p - buffer), 3588 offsetIndex, err); 3589 } 3590 3591 /* 3592 * Structure for cloning an ISO 2022 converter into a single memory block. 3593 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3594 * and then ucnv_safeClone() of the sub-converter may additionally align 3595 * currentConverter inside the cloneStruct, for which we need the deadSpace 3596 * after currentConverter. 3597 * This is because UAlignedMemory may be larger than the actually 3598 * necessary alignment size for the platform. 3599 * The other cloneStruct fields will not be moved around, 3600 * and are aligned properly with cloneStruct's alignment. 
3601 */ 3602 struct cloneStruct 3603 { 3604 UConverter cnv; 3605 UConverter currentConverter; 3606 UAlignedMemory deadSpace; 3607 UConverterDataISO2022 mydata; 3608 }; 3609 3610 3611 static UConverter * 3612 _ISO_2022_SafeClone( 3613 const UConverter *cnv, 3614 void *stackBuffer, 3615 int32_t *pBufferSize, 3616 UErrorCode *status) 3617 { 3618 struct cloneStruct * localClone; 3619 UConverterDataISO2022 *cnvData; 3620 int32_t i, size; 3621 3622 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3623 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3624 return NULL; 3625 } 3626 3627 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3628 localClone = (struct cloneStruct *)stackBuffer; 3629 3630 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3631 3632 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3633 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3634 localClone->cnv.isExtraLocal = TRUE; 3635 3636 /* share the subconverters */ 3637 3638 if(cnvData->currentConverter != NULL) { 3639 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3640 localClone->mydata.currentConverter = 3641 ucnv_safeClone(cnvData->currentConverter, 3642 &localClone->currentConverter, 3643 &size, status); 3644 if(U_FAILURE(*status)) { 3645 return NULL; 3646 } 3647 } 3648 3649 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3650 if(cnvData->myConverterArray[i] != NULL) { 3651 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3652 } 3653 } 3654 3655 return &localClone->cnv; 3656 } 3657 3658 static void 3659 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3660 const USetAdder *sa, 3661 UConverterUnicodeSet which, 3662 UErrorCode *pErrorCode) 3663 { 3664 int32_t i; 3665 UConverterDataISO2022* cnvData; 3666 3667 if (U_FAILURE(*pErrorCode)) { 3668 return; 3669 } 3670 #ifdef U_ENABLE_GENERIC_ISO_2022 3671 if (cnv->sharedData == &_ISO2022Data) { 
3672 /* We use UTF-8 in this case */ 3673 sa->addRange(sa->set, 0, 0xd7FF); 3674 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3675 return; 3676 } 3677 #endif 3678 3679 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3680 3681 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3682 switch(cnvData->locale[0]){ 3683 case 'j': 3684 /* include JIS X 0201 which is hardcoded */ 3685 sa->add(sa->set, 0xa5); 3686 sa->add(sa->set, 0x203e); 3687 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3688 /* include Latin-1 for some variants of JP */ 3689 sa->addRange(sa->set, 0, 0xff); 3690 } else { 3691 /* include ASCII for JP */ 3692 sa->addRange(sa->set, 0, 0x7f); 3693 } 3694 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3695 /* 3696 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3697 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3698 * use half-width Katakana. 3699 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3700 * half-width Katakana via the ESC ( I sequence. 3701 * However, we only emit (fromUnicode) half-width Katakana according to the 3702 * definition of each variant. 3703 * 3704 * When including fallbacks, 3705 * we need to include half-width Katakana Unicode code points for all JP variants because 3706 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3707 */ 3708 /* include half-width Katakana for JP */ 3709 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3710 } 3711 break; 3712 #if !UCONFIG_ONLY_HTML_CONVERSION 3713 case 'c': 3714 case 'z': 3715 /* include ASCII for CN */ 3716 sa->addRange(sa->set, 0, 0x7f); 3717 break; 3718 case 'k': 3719 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3720 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3721 cnvData->currentConverter, sa, which, pErrorCode); 3722 /* the loop over myConverterArray[] will simply not find another converter */ 3723 break; 3724 #endif 3725 default: 3726 break; 3727 } 3728 3729 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3730 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3731 cnvData->version==0 && i==CNS_11643 3732 ) { 3733 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3734 ucnv_MBCSGetUnicodeSetForBytes( 3735 cnvData->myConverterArray[i], 3736 sa, UCNV_ROUNDTRIP_SET, 3737 0, 0x81, 0x82, 3738 pErrorCode); 3739 } 3740 #endif 3741 3742 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3743 UConverterSetFilter filter; 3744 if(cnvData->myConverterArray[i]!=NULL) { 3745 if(cnvData->locale[0]=='j' && i==JISX208) { 3746 /* 3747 * Only add code points that map to Shift-JIS codes 3748 * corresponding to JIS X 0208. 3749 */ 3750 filter=UCNV_SET_FILTER_SJIS; 3751 #if !UCONFIG_ONLY_HTML_CONVERSION 3752 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3753 cnvData->version==0 && i==CNS_11643) { 3754 /* 3755 * Version-specific for CN: 3756 * CN version 0 does not map CNS planes 3..7 although 3757 * they are all available in the CNS conversion table; 3758 * CN version 1 (-EXT) does map them all. 3759 * The two versions create different Unicode sets. 
3760 */ 3761 filter=UCNV_SET_FILTER_2022_CN; 3762 } else if(i==KSC5601) { 3763 /* 3764 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3765 * are broader than GR94. 3766 */ 3767 filter=UCNV_SET_FILTER_GR94DBCS; 3768 #endif 3769 } else { 3770 filter=UCNV_SET_FILTER_NONE; 3771 } 3772 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3773 } 3774 } 3775 3776 /* 3777 * ISO 2022 converters must not convert SO/SI/ESC despite what 3778 * sub-converters do by themselves. 3779 * Remove these characters from the set. 3780 */ 3781 sa->remove(sa->set, 0x0e); 3782 sa->remove(sa->set, 0x0f); 3783 sa->remove(sa->set, 0x1b); 3784 3785 /* ISO 2022 converters do not convert C1 controls either */ 3786 sa->removeRange(sa->set, 0x80, 0x9f); 3787 } 3788 3789 static const UConverterImpl _ISO2022Impl={ 3790 UCNV_ISO_2022, 3791 3792 NULL, 3793 NULL, 3794 3795 _ISO2022Open, 3796 _ISO2022Close, 3797 _ISO2022Reset, 3798 3799 #ifdef U_ENABLE_GENERIC_ISO_2022 3800 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3801 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3802 ucnv_fromUnicode_UTF8, 3803 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3804 #else 3805 NULL, 3806 NULL, 3807 NULL, 3808 NULL, 3809 #endif 3810 NULL, 3811 3812 NULL, 3813 _ISO2022getName, 3814 _ISO_2022_WriteSub, 3815 _ISO_2022_SafeClone, 3816 _ISO_2022_GetUnicodeSet, 3817 3818 NULL, 3819 NULL 3820 }; 3821 static const UConverterStaticData _ISO2022StaticData={ 3822 sizeof(UConverterStaticData), 3823 "ISO_2022", 3824 2022, 3825 UCNV_IBM, 3826 UCNV_ISO_2022, 3827 1, 3828 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3829 { 0x1a, 0, 0, 0 }, 3830 1, 3831 FALSE, 3832 FALSE, 3833 0, 3834 0, 3835 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3836 }; 3837 const UConverterSharedData _ISO2022Data={ 3838 sizeof(UConverterSharedData), 3839 ~((uint32_t) 0), 3840 NULL, 3841 NULL, 3842 &_ISO2022StaticData, 3843 FALSE, 3844 
&_ISO2022Impl, 3845 0, UCNV_MBCS_TABLE_INITIALIZER 3846 }; 3847 3848 /*************JP****************/ 3849 static const UConverterImpl _ISO2022JPImpl={ 3850 UCNV_ISO_2022, 3851 3852 NULL, 3853 NULL, 3854 3855 _ISO2022Open, 3856 _ISO2022Close, 3857 _ISO2022Reset, 3858 3859 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3860 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3861 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3862 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3863 NULL, 3864 3865 NULL, 3866 _ISO2022getName, 3867 _ISO_2022_WriteSub, 3868 _ISO_2022_SafeClone, 3869 _ISO_2022_GetUnicodeSet, 3870 3871 NULL, 3872 NULL 3873 }; 3874 static const UConverterStaticData _ISO2022JPStaticData={ 3875 sizeof(UConverterStaticData), 3876 "ISO_2022_JP", 3877 0, 3878 UCNV_IBM, 3879 UCNV_ISO_2022, 3880 1, 3881 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ 3882 { 0x1a, 0, 0, 0 }, 3883 1, 3884 FALSE, 3885 FALSE, 3886 0, 3887 0, 3888 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3889 }; 3890 3891 namespace { 3892 3893 const UConverterSharedData _ISO2022JPData={ 3894 sizeof(UConverterSharedData), 3895 ~((uint32_t) 0), 3896 NULL, 3897 NULL, 3898 &_ISO2022JPStaticData, 3899 FALSE, 3900 &_ISO2022JPImpl, 3901 0, UCNV_MBCS_TABLE_INITIALIZER 3902 }; 3903 3904 } // namespace 3905 3906 #if !UCONFIG_ONLY_HTML_CONVERSION 3907 /************* KR ***************/ 3908 static const UConverterImpl _ISO2022KRImpl={ 3909 UCNV_ISO_2022, 3910 3911 NULL, 3912 NULL, 3913 3914 _ISO2022Open, 3915 _ISO2022Close, 3916 _ISO2022Reset, 3917 3918 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3919 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3920 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3921 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3922 NULL, 3923 3924 NULL, 3925 _ISO2022getName, 3926 _ISO_2022_WriteSub, 3927 _ISO_2022_SafeClone, 3928 _ISO_2022_GetUnicodeSet, 3929 3930 NULL, 3931 NULL 3932 }; 3933 static const UConverterStaticData 
_ISO2022KRStaticData={ 3934 sizeof(UConverterStaticData), 3935 "ISO_2022_KR", 3936 0, 3937 UCNV_IBM, 3938 UCNV_ISO_2022, 3939 1, 3940 3, /* max 3 bytes per UChar: SO+DBCS */ 3941 { 0x1a, 0, 0, 0 }, 3942 1, 3943 FALSE, 3944 FALSE, 3945 0, 3946 0, 3947 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3948 }; 3949 3950 namespace { 3951 3952 const UConverterSharedData _ISO2022KRData={ 3953 sizeof(UConverterSharedData), 3954 ~((uint32_t) 0), 3955 NULL, 3956 NULL, 3957 &_ISO2022KRStaticData, 3958 FALSE, 3959 &_ISO2022KRImpl, 3960 0, UCNV_MBCS_TABLE_INITIALIZER 3961 }; 3962 3963 } // namespace 3964 3965 /*************** CN ***************/ 3966 static const UConverterImpl _ISO2022CNImpl={ 3967 3968 UCNV_ISO_2022, 3969 3970 NULL, 3971 NULL, 3972 3973 _ISO2022Open, 3974 _ISO2022Close, 3975 _ISO2022Reset, 3976 3977 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3978 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3979 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3980 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3981 NULL, 3982 3983 NULL, 3984 _ISO2022getName, 3985 _ISO_2022_WriteSub, 3986 _ISO_2022_SafeClone, 3987 _ISO_2022_GetUnicodeSet, 3988 3989 NULL, 3990 NULL 3991 }; 3992 static const UConverterStaticData _ISO2022CNStaticData={ 3993 sizeof(UConverterStaticData), 3994 "ISO_2022_CN", 3995 0, 3996 UCNV_IBM, 3997 UCNV_ISO_2022, 3998 1, 3999 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 4000 { 0x1a, 0, 0, 0 }, 4001 1, 4002 FALSE, 4003 FALSE, 4004 0, 4005 0, 4006 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 4007 }; 4008 4009 namespace { 4010 4011 const UConverterSharedData _ISO2022CNData={ 4012 sizeof(UConverterSharedData), 4013 ~((uint32_t) 0), 4014 NULL, 4015 NULL, 4016 &_ISO2022CNStaticData, 4017 FALSE, 4018 &_ISO2022CNImpl, 4019 0, UCNV_MBCS_TABLE_INITIALIZER 4020 }; 4021 4022 } // namespace 4023 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ 4024 4025 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 4026