1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2000-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv2022.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000feb03 14 * created by: Markus W. Scherer 15 * 16 * Change history: 17 * 18 * 06/29/2000 helena Major rewrite of the callback APIs. 19 * 08/08/2000 Ram Included support for ISO-2022-JP-2 20 * Changed implementation of toUnicode 21 * function 22 * 08/21/2000 Ram Added support for ISO-2022-KR 23 * 08/29/2000 Ram Seperated implementation of EBCDIC to 24 * ucnvebdc.c 25 * 09/20/2000 Ram Added support for ISO-2022-CN 26 * Added implementations for getNextUChar() 27 * for specific 2022 country variants. 28 * 10/31/2000 Ram Implemented offsets logic functions 29 */ 30 31 #include "unicode/utypes.h" 32 33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 34 35 #include "unicode/ucnv.h" 36 #include "unicode/uset.h" 37 #include "unicode/ucnv_err.h" 38 #include "unicode/ucnv_cb.h" 39 #include "unicode/utf16.h" 40 #include "ucnv_imp.h" 41 #include "ucnv_bld.h" 42 #include "ucnv_cnv.h" 43 #include "ucnvmbcs.h" 44 #include "cstring.h" 45 #include "cmemory.h" 46 #include "uassert.h" 47 48 #ifdef U_ENABLE_GENERIC_ISO_2022 49 /* 50 * I am disabling the generic ISO-2022 converter after proposing to do so on 51 * the icu mailing list two days ago. 52 * 53 * Reasons: 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 55 * its designation sequences, single shifts with return to the previous state, 56 * switch-with-no-return to UTF-16BE or similar, etc. 
57 * This is unlike the language-specific variants like ISO-2022-JP which 58 * require a much smaller repertoire of ISO-2022 features. 59 * These variants continue to be supported. 60 * 2. I believe that no one is really using the generic ISO-2022 converter 61 * but rather always one of the language-specific variants. 62 * Note that ICU's generic ISO-2022 converter has always output one escape 63 * sequence followed by UTF-8 for the whole stream. 64 * 3. Switching between subcharsets is extremely slow, because each time 65 * the previous converter is closed and a new one opened, 66 * without any kind of caching, least-recently-used list, etc. 67 * 4. The code is currently buggy, and given the above it does not seem 68 * reasonable to spend the time on maintenance. 69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. 70 * This means, for example, that when ISO-8859-7 is designated, the following 71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. 72 * The ICU ISO-2022 converter does not handle this - and has no information 73 * about which subconverter would have to be shifted vs. which is designed 74 * for 7-bit ISO-2022. 75 * 76 * Markus Scherer 2003-dec-03 77 */ 78 #endif 79 80 #if !UCONFIG_ONLY_HTML_CONVERSION 81 static const char SHIFT_IN_STR[] = "\x0F"; 82 // static const char SHIFT_OUT_STR[] = "\x0E"; 83 #endif 84 85 #define CR 0x0D 86 #define LF 0x0A 87 #define H_TAB 0x09 88 #define V_TAB 0x0B 89 #define SPACE 0x20 90 91 enum { 92 HWKANA_START=0xff61, 93 HWKANA_END=0xff9f 94 }; 95 96 /* 97 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 98 * as bytes 21..7E. (Subtract 0x80.) 99 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 100 * as bytes 20..7F. (Subtract 0x80.) 101 * Do not encode C1 control codes with native bytes 80..9F 102 * as bytes 00..1F (C0 control codes). 
103 */ 104 enum { 105 GR94_START=0xa1, 106 GR94_END=0xfe, 107 GR96_START=0xa0, 108 GR96_END=0xff 109 }; 110 111 /* 112 * ISO 2022 control codes must not be converted from Unicode 113 * because they would mess up the byte stream. 114 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b 115 * corresponding to SO, SI, and ESC. 116 */ 117 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) 118 119 /* for ISO-2022-JP and -CN implementations */ 120 typedef enum { 121 /* shared values */ 122 INVALID_STATE=-1, 123 ASCII = 0, 124 125 SS2_STATE=0x10, 126 SS3_STATE, 127 128 /* JP */ 129 ISO8859_1 = 1 , 130 ISO8859_7 = 2 , 131 JISX201 = 3, 132 JISX208 = 4, 133 JISX212 = 5, 134 GB2312 =6, 135 KSC5601 =7, 136 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ 137 138 /* CN */ 139 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ 140 GB2312_1=1, 141 ISO_IR_165=2, 142 CNS_11643=3, 143 144 /* 145 * these are used in StateEnum and ISO2022State variables, 146 * but CNS_11643 must be used to index into myConverterArray[] 147 */ 148 CNS_11643_0=0x20, 149 CNS_11643_1, 150 CNS_11643_2, 151 CNS_11643_3, 152 CNS_11643_4, 153 CNS_11643_5, 154 CNS_11643_6, 155 CNS_11643_7 156 } StateEnum; 157 158 /* is the StateEnum charset value for a DBCS charset? */ 159 #if UCONFIG_ONLY_HTML_CONVERSION 160 #define IS_JP_DBCS(cs) (JISX208==(cs)) 161 #else 162 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) 163 #endif 164 165 #define CSM(cs) ((uint16_t)1<<(cs)) 166 167 /* 168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence 169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x 170 * 171 * Note: The converter uses some leniency: 172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in 173 * all versions, not just JIS7 and JIS8. 
174 * - ICU does not distinguish between different versions of JIS X 0208. 175 */ 176 #if UCONFIG_ONLY_HTML_CONVERSION 177 enum { MAX_JA_VERSION=0 }; 178 #else 179 enum { MAX_JA_VERSION=4 }; 180 #endif 181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ 182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), 183 #if !UCONFIG_ONLY_HTML_CONVERSION 184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), 185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), 187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) 188 #endif 189 }; 190 191 typedef enum { 192 ASCII1=0, 193 LATIN1, 194 SBCS, 195 DBCS, 196 MBCS, 197 HWKANA 198 }Cnv2022Type; 199 200 typedef struct ISO2022State { 201 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ 202 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ 203 int8_t prevG; /* g before single shift (SS2 or SS3) */ 204 } ISO2022State; 205 206 #define UCNV_OPTIONS_VERSION_MASK 0xf 207 #define UCNV_2022_MAX_CONVERTERS 10 208 209 typedef struct{ 210 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; 211 UConverter *currentConverter; 212 Cnv2022Type currentType; 213 ISO2022State toU2022State, fromU2022State; 214 uint32_t key; 215 uint32_t version; 216 #ifdef U_ENABLE_GENERIC_ISO_2022 217 UBool isFirstBuffer; 218 #endif 219 UBool isEmptySegment; 220 char name[30]; 221 char locale[3]; 222 }UConverterDataISO2022; 223 224 /* Protos */ 225 /* ISO-2022 ----------------------------------------------------------------- */ 226 227 /*Forward declaration */ 228 U_CFUNC void U_CALLCONV 229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, 230 UErrorCode * err); 231 U_CFUNC void U_CALLCONV 232 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, 233 UErrorCode * err); 234 235 #define ESC_2022 0x1B /*ESC*/ 236 237 typedef enum 238 { 239 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ 240 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ 241 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ 242 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/ 243 } UCNV_TableStates_2022; 244 245 /* 246 * The way these state transition arrays work is: 247 * ex : ESC$B is the sequence for JISX208 248 * a) First Iteration: char is ESC 249 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index 250 * int x = normalize_esq_chars_2022[27] which is equal to 1 251 * ii) Search for this value in escSeqStateTable_Key_2022[] 252 * value of x is stored at escSeqStateTable_Key_2022[0] 253 * iii) Save this index as offset 254 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 255 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 256 * b) Switch on this state and continue to next char 257 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index 258 * which is normalize_esq_chars_2022[36] == 4 259 * ii) x is currently 1(from above) 260 * x<<=5 -- x is now 32 261 * x+=normalize_esq_chars_2022[36] 262 * now x is 36 263 * iii) Search for this value in escSeqStateTable_Key_2022[] 264 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 265 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] 266 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 267 * c) Switch on this state and continue to next char 268 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index 269 * ii) x is currently 36 (from above) 270 * 
x<<=5 -- x is now 1152 271 * x+=normalize_esq_chars_2022[66] 272 * now x is 1161 273 * iii) Search for this value in escSeqStateTable_Key_2022[] 274 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21 275 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21] 276 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 277 * v) Get the converter name form escSeqStateTable_Result_2022[21] which is JISX208 278 */ 279 280 281 /*Below are the 3 arrays depicting a state transition table*/ 282 static const int8_t normalize_esq_chars_2022[256] = { 283 /* 0 1 2 3 4 5 6 7 8 9 */ 284 285 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 288 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 289 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 291 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 292 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 293 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 294 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 308 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 309 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 310 ,0 ,0 ,0 ,0 ,0 ,0 311 }; 312 313 #ifdef U_ENABLE_GENERIC_ISO_2022 314 /* 315 * When the generic ISO-2022 converter is completely removed, not just disabled 316 * per #ifdef, then the following state table and the associated tables that are 317 * dimensioned with MAX_STATES_2022 should be trimmed. 318 * 319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of 320 * the associated escape sequences starting with ESC ( B should be removed. 
321 * This includes the ones with key values 1097 and all of the ones above 1000000. 322 * 323 * For the latter, the tables can simply be truncated. 324 * For the former, since the tables must be kept parallel, it is probably best 325 * to simply duplicate an adjacent table cell, parallel in all tables. 326 * 327 * It may make sense to restructure the tables, especially by using small search 328 * tables for the variants instead of indexing them parallel to the table here. 329 */ 330 #endif 331 332 #define MAX_STATES_2022 74 333 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { 334 /* 0 1 2 3 4 5 6 7 8 9 */ 335 336 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 337 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 338 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 339 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 340 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 341 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 342 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 343 ,35947631 ,35947635 ,35947636 ,35947638 344 }; 345 346 #ifdef U_ENABLE_GENERIC_ISO_2022 347 348 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { 349 /* 0 1 2 3 4 5 6 7 8 9 */ 350 351 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" 352 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" 353 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" 354 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" 355 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" 356 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" 
,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" 357 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" 358 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" 359 }; 360 361 #endif 362 363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { 364 /* 0 1 2 3 4 5 6 7 8 9 */ 365 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 366 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 367 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 373 }; 374 375 /* Type def for refactoring changeState_2022 code*/ 376 typedef enum{ 377 #ifdef U_ENABLE_GENERIC_ISO_2022 378 ISO_2022=0, 379 #endif 380 ISO_2022_JP=1, 381 #if !UCONFIG_ONLY_HTML_CONVERSION 382 ISO_2022_KR=2, 383 ISO_2022_CN=3 384 #endif 385 } Variant2022; 386 387 /*********** ISO 2022 Converter Protos ***********/ 388 static void U_CALLCONV 389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); 390 391 static void U_CALLCONV 392 _ISO2022Close(UConverter *converter); 393 394 static void U_CALLCONV 395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); 396 397 U_CDECL_BEGIN 398 static const char * U_CALLCONV 399 _ISO2022getName(const UConverter* cnv); 400 U_CDECL_END 401 402 static void U_CALLCONV 403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); 404 405 U_CDECL_BEGIN 406 static UConverter * U_CALLCONV 407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); 408 409 U_CDECL_END 410 411 #ifdef U_ENABLE_GENERIC_ISO_2022 412 static void U_CALLCONV 413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); 414 #endif 415 416 namespace { 417 418 /*const UConverterSharedData _ISO2022Data;*/ 419 extern const UConverterSharedData _ISO2022JPData; 420 421 #if !UCONFIG_ONLY_HTML_CONVERSION 422 extern const UConverterSharedData _ISO2022KRData; 423 extern const UConverterSharedData _ISO2022CNData; 424 #endif 425 426 } // namespace 427 428 /*************** Converter implementations ******************/ 429 430 /* The purpose of this function is to get around gcc compiler warnings. 
*/ 431 static inline void 432 fromUWriteUInt8(UConverter *cnv, 433 const char *bytes, int32_t length, 434 uint8_t **target, const char *targetLimit, 435 int32_t **offsets, 436 int32_t sourceIndex, 437 UErrorCode *pErrorCode) 438 { 439 char *targetChars = (char *)*target; 440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 441 offsets, sourceIndex, pErrorCode); 442 *target = (uint8_t*)targetChars; 443 444 } 445 446 static inline void 447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 448 if(myConverterData->version == 1) { 449 UConverter *cnv = myConverterData->currentConverter; 450 451 cnv->toUnicodeStatus=0; /* offset */ 452 cnv->mode=0; /* state */ 453 cnv->toULength=0; /* byteIndex */ 454 } 455 } 456 457 static inline void 458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 459 /* in ISO-2022-KR the designator sequence appears only once 460 * in a file so we append it only once 461 */ 462 if( converter->charErrorBufferLength==0){ 463 464 converter->charErrorBufferLength = 4; 465 converter->charErrorBuffer[0] = 0x1b; 466 converter->charErrorBuffer[1] = 0x24; 467 converter->charErrorBuffer[2] = 0x29; 468 converter->charErrorBuffer[3] = 0x43; 469 } 470 if(myConverterData->version == 1) { 471 UConverter *cnv = myConverterData->currentConverter; 472 473 cnv->fromUChar32=0; 474 cnv->fromUnicodeStatus=1; /* prevLength */ 475 } 476 } 477 478 static void U_CALLCONV 479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 480 481 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 482 483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 484 if(cnv->extraInfo != NULL) { 485 UConverterNamePieces stackPieces; 486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 487 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 488 uint32_t version; 489 490 stackArgs.onlyTestIsLoadable = 
pArgs->onlyTestIsLoadable; 491 492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 493 myConverterData->currentType = ASCII1; 494 cnv->fromUnicodeStatus =FALSE; 495 if(pArgs->locale){ 496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 497 } 498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 499 myConverterData->version = version; 500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 501 (myLocale[2]=='_' || myLocale[2]=='\0')) 502 { 503 /* open the required converters and cache them */ 504 if(version>MAX_JA_VERSION) { 505 // ICU 55 fails to open a converter for an unsupported version. 506 // Previously, it fell back to version 0, but that would yield 507 // unexpected behavior. 508 *errorCode = U_MISSING_RESOURCE_ERROR; 509 return; 510 } 511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 512 myConverterData->myConverterArray[ISO8859_7] = 513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 514 } 515 myConverterData->myConverterArray[JISX208] = 516 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 517 if(jpCharsetMasks[version]&CSM(JISX212)) { 518 myConverterData->myConverterArray[JISX212] = 519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 520 } 521 if(jpCharsetMasks[version]&CSM(GB2312)) { 522 myConverterData->myConverterArray[GB2312] = 523 /* BEGIN android-changed */ 524 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 525 /* END android-changed */ 526 } 527 if(jpCharsetMasks[version]&CSM(KSC5601)) { 528 myConverterData->myConverterArray[KSC5601] = 529 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 530 } 531 532 /* set the function pointers to appropriate funtions */ 533 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 534 uprv_strcpy(myConverterData->locale,"ja"); 535 536 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 537 size_t len = 
uprv_strlen(myConverterData->name); 538 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 539 myConverterData->name[len+1]='\0'; 540 } 541 #if !UCONFIG_ONLY_HTML_CONVERSION 542 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 543 (myLocale[2]=='_' || myLocale[2]=='\0')) 544 { 545 if(version>1) { 546 // ICU 55 fails to open a converter for an unsupported version. 547 // Previously, it fell back to version 0, but that would yield 548 // unexpected behavior. 549 *errorCode = U_MISSING_RESOURCE_ERROR; 550 return; 551 } 552 const char *cnvName; 553 if(version==1) { 554 cnvName="icu-internal-25546"; 555 } else { 556 /* BEGIN android-changed */ 557 cnvName="ksc_5601"; 558 /* END android-changed */ 559 myConverterData->version=version=0; 560 } 561 if(pArgs->onlyTestIsLoadable) { 562 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 563 uprv_free(cnv->extraInfo); 564 cnv->extraInfo=NULL; 565 return; 566 } else { 567 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 568 if (U_FAILURE(*errorCode)) { 569 _ISO2022Close(cnv); 570 return; 571 } 572 573 if(version==1) { 574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 575 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 576 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 577 }else{ 578 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 579 } 580 581 /* initialize the state variables */ 582 setInitialStateToUnicodeKR(cnv, myConverterData); 583 setInitialStateFromUnicodeKR(cnv, myConverterData); 584 585 /* set the function pointers to appropriate funtions */ 586 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 587 uprv_strcpy(myConverterData->locale,"ko"); 588 } 589 } 590 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 591 (myLocale[2]=='_' || myLocale[2]=='\0')) 592 { 593 if(version>2) { 594 // ICU 55 fails to 
open a converter for an unsupported version. 595 // Previously, it fell back to version 0, but that would yield 596 // unexpected behavior. 597 *errorCode = U_MISSING_RESOURCE_ERROR; 598 return; 599 } 600 601 /* open the required converters and cache them */ 602 /* BEGIN android-changed */ 603 myConverterData->myConverterArray[GB2312_1] = 604 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 605 if(version==1) { 606 myConverterData->myConverterArray[ISO_IR_165] = 607 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 608 } 609 myConverterData->myConverterArray[CNS_11643] = 610 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 611 /* END android-changed */ 612 613 614 /* set the function pointers to appropriate funtions */ 615 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 616 uprv_strcpy(myConverterData->locale,"cn"); 617 618 if (version==0){ 619 myConverterData->version = 0; 620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 621 }else if (version==1){ 622 myConverterData->version = 1; 623 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 624 }else { 625 myConverterData->version = 2; 626 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 627 } 628 } 629 #endif // !UCONFIG_ONLY_HTML_CONVERSION 630 else{ 631 #ifdef U_ENABLE_GENERIC_ISO_2022 632 myConverterData->isFirstBuffer = TRUE; 633 634 /* append the UTF-8 escape sequence */ 635 cnv->charErrorBufferLength = 3; 636 cnv->charErrorBuffer[0] = 0x1b; 637 cnv->charErrorBuffer[1] = 0x25; 638 cnv->charErrorBuffer[2] = 0x42; 639 640 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 641 /* initialize the state variables */ 642 uprv_strcpy(myConverterData->name,"ISO_2022"); 643 #else 644 *errorCode = U_MISSING_RESOURCE_ERROR; 645 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard 646 // data loading error code. 
647 return; 648 #endif 649 } 650 651 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 652 653 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 654 _ISO2022Close(cnv); 655 } 656 } else { 657 *errorCode = U_MEMORY_ALLOCATION_ERROR; 658 } 659 } 660 661 662 static void U_CALLCONV 663 _ISO2022Close(UConverter *converter) { 664 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 665 UConverterSharedData **array = myData->myConverterArray; 666 int32_t i; 667 668 if (converter->extraInfo != NULL) { 669 /*close the array of converter pointers and free the memory*/ 670 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 671 if(array[i]!=NULL) { 672 ucnv_unloadSharedDataIfReady(array[i]); 673 } 674 } 675 676 ucnv_close(myData->currentConverter); 677 678 if(!converter->isExtraLocal){ 679 uprv_free (converter->extraInfo); 680 converter->extraInfo = NULL; 681 } 682 } 683 } 684 685 static void U_CALLCONV 686 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 687 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 688 if(choice<=UCNV_RESET_TO_UNICODE) { 689 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 690 myConverterData->key = 0; 691 myConverterData->isEmptySegment = FALSE; 692 } 693 if(choice!=UCNV_RESET_TO_UNICODE) { 694 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 695 } 696 #ifdef U_ENABLE_GENERIC_ISO_2022 697 if(myConverterData->locale[0] == 0){ 698 if(choice<=UCNV_RESET_TO_UNICODE) { 699 myConverterData->isFirstBuffer = TRUE; 700 myConverterData->key = 0; 701 if (converter->mode == UCNV_SO){ 702 ucnv_close (myConverterData->currentConverter); 703 myConverterData->currentConverter=NULL; 704 } 705 converter->mode = UCNV_SI; 706 } 707 if(choice!=UCNV_RESET_TO_UNICODE) { 708 /* re-append UTF-8 escape sequence */ 709 converter->charErrorBufferLength = 3; 710 converter->charErrorBuffer[0] = 0x1b; 711 
converter->charErrorBuffer[1] = 0x28; 712 converter->charErrorBuffer[2] = 0x42; 713 } 714 } 715 else 716 #endif 717 { 718 /* reset the state variables */ 719 if(myConverterData->locale[0] == 'k'){ 720 if(choice<=UCNV_RESET_TO_UNICODE) { 721 setInitialStateToUnicodeKR(converter, myConverterData); 722 } 723 if(choice!=UCNV_RESET_TO_UNICODE) { 724 setInitialStateFromUnicodeKR(converter, myConverterData); 725 } 726 } 727 } 728 } 729 730 U_CDECL_BEGIN 731 732 static const char * U_CALLCONV 733 _ISO2022getName(const UConverter* cnv){ 734 if(cnv->extraInfo){ 735 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 736 return myData->name; 737 } 738 return NULL; 739 } 740 741 U_CDECL_END 742 743 744 /*************** to unicode *******************/ 745 /**************************************************************************** 746 * Recognized escape sequences are 747 * <ESC>(B ASCII 748 * <ESC>.A ISO-8859-1 749 * <ESC>.F ISO-8859-7 750 * <ESC>(J JISX-201 751 * <ESC>(I JISX-201 752 * <ESC>$B JISX-208 753 * <ESC>$@ JISX-208 754 * <ESC>$(D JISX-212 755 * <ESC>$A GB2312 756 * <ESC>$(C KSC5601 757 */ 758 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 759 /* 0 1 2 3 4 5 6 7 8 9 */ 760 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 761 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 762 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 763 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 764 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 765 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 766 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 767 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 768 }; 769 770 #if !UCONFIG_ONLY_HTML_CONVERSION 771 /*************** to unicode *******************/ 772 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 773 /* 0 1 2 3 4 5 6 7 8 9 */ 774 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 776 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 777 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 778 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 779 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 780 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 781 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 782 }; 783 #endif 784 785 786 static UCNV_TableStates_2022 787 getKey_2022(char c,int32_t* key,int32_t* offset){ 788 int32_t togo; 789 int32_t low = 0; 790 int32_t hi = MAX_STATES_2022; 791 int32_t oldmid=0; 792 793 togo = normalize_esq_chars_2022[(uint8_t)c]; 794 if(togo == 0) { 795 /* not a valid character anywhere in an escape sequence */ 796 *key = 0; 797 *offset = 0; 798 return 
INVALID_2022;
    }
    /* combine the key accumulated so far with togo to form the lookup key */
    togo = (*key << 5) + togo;

    while (hi != low)  /*binary search*/{

        int32_t mid = (hi+low) >> 1; /*Finds median*/

        /* the interval can no longer shrink: the key is not in the table */
        if (mid == oldmid)
            break;

        if (escSeqStateTable_Key_2022[mid] > togo){
            hi = mid;
        }
        else if (escSeqStateTable_Key_2022[mid] < togo){
            low = mid;
        }
        else /*we found it*/{
            *key = togo;
            *offset = mid;
            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
        }
        oldmid = mid;

    }

    /* not found: reset the key so that the caller starts over */
    *key = 0;
    *offset = 0;
    return INVALID_2022;
}

/*
 * Runs through a state machine to determine the escape sequence - codepage correspondence.
 *
 * Consumes bytes from *source (advancing it), appends each one to _this->toUBytes[]
 * and feeds it to getKey_2022() until that reports a terminal or invalid state,
 * or until sourceLimit is reached.
 *
 * @param _this       the converter; _this->extraInfo is used as a UConverterDataISO2022
 * @param source      in/out: current read position in the input
 * @param sourceLimit end of the input
 * @param var         the ISO-2022 variant whose state tables are applied
 * @param err         out: U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE on failure
 *
 * Outcomes:
 * - incomplete sequence: returns with myData2022->key!=0 and the bytes kept in toUBytes[]
 * - success: the toUnicode state (charset designations / single shift) is updated
 *   and toULength is reset to 0
 * - illegal sequence: only the first byte is left in toUBytes[]; all other bytes are backed out
 * - unsupported sequence: toUCallbackReason is set to UCNV_UNASSIGNED
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of bytes already stored in toUBytes[] before this call */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* remember the key so that a partial sequence can be continued with the next buffer */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /*
             * NOTE(review): myUConverter is not declared in this function; unless it is
             * declared elsewhere in the file, this disabled branch does not compile - confirm
             * before enabling U_ENABLE_GENERIC_ISO_2022.
             */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the locking-shift state (G0/G1) to return to after the single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* only accept charsets that this converter version allows */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected by version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    U_FALLTHROUGH;
                case GB2312_1:
                    U_FALLTHROUGH;
                case CNS_11643_1:
                    /* these charsets are designated to G1 */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* designated to G2 */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif  // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}

#if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * Looks ahead in the stream to determine the longest run of input
 * that can be converted without handling an escape sequence.
 *
 * In the default build (U_ENABLE_GENERIC_ISO_2022 not defined) this returns a pointer
 * to the first ESC_2022 byte at or after *source, or sourceLimit if there is none.
 * *source itself is not modified.
 *
 * NOTE(review): the name of the third parameter is commented out, but the
 * U_ENABLE_GENERIC_ISO_2022 branch below reads `flush`; that configuration
 * does not compile as written.
 */
static inline const char*
getEndOfBuffer_2022(const char** source,
                   const char* sourceLimit,
                   UBool /*flush*/){

    const char* mySource = *source;

#ifdef U_ENABLE_GENERIC_ISO_2022
    if (*source >= sourceLimit)
        return sourceLimit;

    do{

        if (*mySource == ESC_2022){
            int8_t i;
            int32_t key = 0;
            int32_t offset;
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;

            /* Kludge: I could not
            * figure out the reason for validating an escape sequence
            * twice - once here and once in changeState_2022().
            * is it possible to have an ESC character in a ISO2022
            * byte stream which is valid in a code page? Is it legal?
            */
            for (i=0;
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
            i++) {
                value =  getKey_2022(*(mySource+i), &key, &offset);
            }
            /* this condition is always true here because *mySource==ESC_2022 was just tested */
            if (value > 0 || *mySource==ESC_2022)
                return mySource;

            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
                return sourceLimit;
        }
    }while (++mySource < sourceLimit);

    return sourceLimit;
#else
    /* stop at the first ESC byte or at the end of the input */
    while(mySource < sourceLimit && *mySource != ESC_2022) {
        ++mySource;
    }
    return mySource;
#endif
}
#endif

/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSFromUChar32() function should be reflected here.
 *
 * @param sharedData  shared data of the MBCS converter whose tables are searched
 * @param c           the code point to convert
 * @param value       out: the result bytes, packed into one integer
 * @param useFallback passed to FROM_U_USE_FALLBACK() and to the extension lookup
 * @param outputType  MBCS_OUTPUT_2; any other value is handled as MBCS_OUTPUT_3
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
 */
static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
                                         UChar32 c,
                                         uint32_t* value,
                                         UBool useFallback,
                                         int outputType)
{
    const int32_t *cx;
    const uint16_t *table;
    uint32_t stage2Entry;
    uint32_t myValue;
    int32_t length;
    const uint8_t *p;
    /*
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
     * Use internal version of ucnv_open() that verifies that the new structures are available,
     * else U_INTERNAL_PROGRAM_ERROR.
     */
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        table=sharedData->mbcs.fromUnicodeTable;
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
        /* get the bytes and the length for the output */
        if(outputType==MBCS_OUTPUT_2){
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            if(myValue<=0xff) {
                length=1;
            } else {
                length=2;
            }
        } else /* outputType==MBCS_OUTPUT_3 */ {
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
            if(myValue<=0xff) {
                length=1;
            } else if(myValue<=0xffff) {
                length=2;
            } else {
                length=3;
            }
        }
        /* is this code point assigned, or do we use fallbacks? */
        /*
         * NOTE(review): for (c&0xf)==15 the expression 1<<31 shifts into the sign bit of int;
         * an unsigned constant would avoid relying on how the compiler treats that - confirm.
         */
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
            /* assigned */
            *value=myValue;
            return length;
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
            /*
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
             * There is no way with this data structure for fallback output
             * to be a zero byte.
             */
            *value=myValue;
            return -length;
        }
    }

    /* no result from the base table: try the extension table, if there is one */
    cx=sharedData->mbcs.extIndexes;
    if(cx!=NULL) {
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
    }

    /* unassigned */
    return 0;
}

/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
 */
1190 * @param retval pointer to output byte 1191 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1192 */ 1193 static inline int32_t 1194 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1195 UChar32 c, 1196 uint32_t* retval, 1197 UBool useFallback) 1198 { 1199 const uint16_t *table; 1200 int32_t value; 1201 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1202 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1203 return 0; 1204 } 1205 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1206 table=sharedData->mbcs.fromUnicodeTable; 1207 /* get the byte for the output */ 1208 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1209 /* is this code point assigned, or do we use fallbacks? */ 1210 *retval=(uint32_t)(value&0xff); 1211 if(value>=0xf00) { 1212 return 1; /* roundtrip */ 1213 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1214 return -1; /* fallback taken */ 1215 } else { 1216 return 0; /* no mapping */ 1217 } 1218 } 1219 1220 /* 1221 * Check that the result is a 2-byte value with each byte in the range A1..FE 1222 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1223 * to move it to the ISO 2022 range 21..7E. 1224 * Return 0 if out of range. 1225 */ 1226 static inline uint32_t 1227 _2022FromGR94DBCS(uint32_t value) { 1228 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1229 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1230 ) { 1231 return value - 0x8080; /* shift down to 21..7e byte range */ 1232 } else { 1233 return 0; /* not valid for ISO 2022 */ 1234 } 1235 } 1236 1237 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1238 /* 1239 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1240 * 2 byte value that is in the range A1..FE for each byte. 
/*
 * Otherwise it returns the 2022 code point unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode for the generic ISO-2022 converter
 * (compiled only if U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Alternates between two steps until the input is used up or the function must return:
 * - convert the bytes up to the next escape sequence with myData->currentConverter
 * - let changeState_2022() consume the escape sequence
 *
 * @param args conversion arguments; source, target and converter are used and updated
 * @param err  in/out error code
 */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no converter selected yet: open an ASCII converter */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily make the sub-converter the converter in args */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* in the third line below, && binds tighter than ||: the condition is (A && B) || C */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or in the middle of an escape sequence: run the state machine */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending input in cnv->toUBytes[] (two bytes if sourceChar>0xff, else one)
 * and sets the error code:
 * U_INVALID_CHAR_FOUND if targetUniChar is missingCharMarker-1 (0xfffe),
 * U_ILLEGAL_CHAR_FOUND otherwise.
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}

/**************************************ISO-2022-JP*************************************************/

/************************************** IMPORTANT **************************************************
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
*/
/*
 * The converter iterates over each Unicode code point
 * to obtain the equivalent code points from the supported codepages. Since the source buffer is
 * processed one char at a time, it makes sense to reduce, as far as possible, the extra
 * processing that a canned converter would do.
 *
 * If the implementation of these macros or the structure of the sharedData struct changes
 * in the future, make sure that ISO-2022 is also changed.
 ***************************************************************************************************
 */

/***************************************************************************************************
 * Rules for ISO-2022-jp encoding
 * (i)   Escape sequences must be fully contained within a line; they should not
 *       span new lines or CRs
 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
 *       JIS-Roman character escape sequence should follow before the line terminates
 * (iii) If the first character on the line is represented by two bytes then a two
 *       byte character escape sequence should precede it
 * (iv)  If no escape sequence is encountered then the characters are ASCII
 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
 *       and invoked with SS2 (ESC N).
 * (vi)  If there is any G0 designation in text, there must be a switch to
 *       ASCII or to JIS X 0201-Roman before a space character (but not
 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
 *       characters such as tab or CRLF.
 */
1424 * (vi) Supported encodings: 1425 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 1426 * 1427 * source : RFC-1554 1428 * 1429 * JISX201, JISX208,JISX212 : new .cnv data files created 1430 * KSC5601 : alias to ibm-949 mapping table 1431 * GB2312 : alias to ibm-1386 mapping table 1432 * ISO-8859-1 : Algorithmic implemented as LATIN1 case 1433 * ISO-8859-7 : alisas to ibm-9409 mapping table 1434 */ 1435 1436 /* preference order of JP charsets */ 1437 static const StateEnum jpCharsetPref[]={ 1438 ASCII, 1439 JISX201, 1440 ISO8859_1, 1441 JISX208, 1442 ISO8859_7, 1443 JISX212, 1444 GB2312, 1445 KSC5601, 1446 HWKANA_7BIT 1447 }; 1448 1449 /* 1450 * The escape sequences must be in order of the enum constants like JISX201 = 3, 1451 * not in order of jpCharsetPref[]! 1452 */ 1453 static const char escSeqChars[][6] ={ 1454 "\x1B\x28\x42", /* <ESC>(B ASCII */ 1455 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ 1456 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ 1457 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ 1458 "\x1B\x24\x42", /* <ESC>$B JISX-208 */ 1459 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ 1460 "\x1B\x24\x41", /* <ESC>$A GB2312 */ 1461 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ 1462 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ 1463 1464 }; 1465 static const int8_t escSeqCharsLen[] ={ 1466 3, /* length of <ESC>(B ASCII */ 1467 3, /* length of <ESC>.A ISO-8859-1 */ 1468 3, /* length of <ESC>.F ISO-8859-7 */ 1469 3, /* length of <ESC>(J JISX-201 */ 1470 3, /* length of <ESC>$B JISX-208 */ 1471 4, /* length of <ESC>$(D JISX-212 */ 1472 3, /* length of <ESC>$A GB2312 */ 1473 4, /* length of <ESC>$(C KSC5601 */ 1474 3 /* length of <ESC>(I HWKANA_7BIT */ 1475 }; 1476 1477 /* 1478 * The iteration over various code pages works this way: 1479 * i) Get the currentState from myConverterData->currentState 1480 * ii) Check if the character is mapped to a valid character in the currentState 1481 * Yes -> a) set the initIterState to currentState 1482 * b) remain in this 
/*
 *                 state until an invalid character is found
 *      No  ->  a) go to the next code page and find the character
 * iii) Before changing the state increment the current state check if the current state
 *      is equal to the initIterState
 *      Yes ->  A character that cannot be represented in any of the supported encodings
 *              break and return a U_INVALID_CHAR_FOUND error
 *      No  ->  Continue and find the character in next code page
 *
 *
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */

/*
 * JIS X 0201 (Roman half) to Unicode for bytes 00..7F.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * Unicode to JIS X 0201 (Roman half), result in 00..7F.
 * U+005C and U+007E have no mapping because their byte values are taken
 * by U+00A5 and U+203E. Returns U+FFFE for everything that is unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}

/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
/*
 * _2022FromSJIS(): Shift-JIS byte pair to ISO 2022 (JIS X 0208) byte pair.
 * The input is expected to be a valid Shift-JIS double-byte value;
 * values above 0xEFFC are beyond JIS X 0208 and yield 0.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = (uint8_t)value;
    uint32_t lead = value & 0xff00;  /* lead byte, still in bits 15..8 */

    /* the lead bytes come in two blocks, up to 9F and E0..EF */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two rows */
    lead <<= 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc: the second of the two rows */
        return lead | (uint32_t)(trail - 0x7e);
    }

    /* the first of the two rows; the trail byte range has a gap after 7E */
    lead -= 0x100;
    if(trail <= 0x7e) {
        return lead | (uint32_t)(trail - 0x1f);
    }
    return lead | (uint32_t)(trail - 0x20);
}

/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E the result is made invalid for Shift-JIS
 * (a 0 byte) so that the converter catches it, except where an invalid input
 * byte already turns into an equally invalid Shift-JIS byte without a test.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) != 0) {
        /* odd lead byte: round up, and use the lower range of trail bytes */
        c1 = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            trail = 0;  /* invalid */
        } else if(c2 > 0x5f) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = (uint8_t)(c2 + 0x1f);
        }
    } else if((uint8_t)(c2 - 0x21) > ((0x7e) - 0x21)) {
        trail = 0;  /* invalid */
    } else {
        /* even lead byte: use the upper range of trail bytes */
        trail = (uint8_t)(c2 + 0x7e);
    }

    /* two rows share one Shift-JIS lead byte */
    lead = (uint8_t)(c1 >> 1);
    if(lead > 0x3f) {
        lead = 0;  /* invalid */
    } else if(lead > 0x2f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = (uint8_t)(lead + 0x70);
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}

/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 */
1602 */ 1603 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1604 0x2123, /* U+FF61 */ 1605 0x2156, 1606 0x2157, 1607 0x2122, 1608 0x2126, 1609 0x2572, 1610 0x2521, 1611 0x2523, 1612 0x2525, 1613 0x2527, 1614 0x2529, 1615 0x2563, 1616 0x2565, 1617 0x2567, 1618 0x2543, 1619 0x213C, /* U+FF70 */ 1620 0x2522, 1621 0x2524, 1622 0x2526, 1623 0x2528, 1624 0x252A, 1625 0x252B, 1626 0x252D, 1627 0x252F, 1628 0x2531, 1629 0x2533, 1630 0x2535, 1631 0x2537, 1632 0x2539, 1633 0x253B, 1634 0x253D, 1635 0x253F, /* U+FF80 */ 1636 0x2541, 1637 0x2544, 1638 0x2546, 1639 0x2548, 1640 0x254A, 1641 0x254B, 1642 0x254C, 1643 0x254D, 1644 0x254E, 1645 0x254F, 1646 0x2552, 1647 0x2555, 1648 0x2558, 1649 0x255B, 1650 0x255E, 1651 0x255F, /* U+FF90 */ 1652 0x2560, 1653 0x2561, 1654 0x2562, 1655 0x2564, 1656 0x2566, 1657 0x2568, 1658 0x2569, 1659 0x256A, 1660 0x256B, 1661 0x256C, 1662 0x256D, 1663 0x256F, 1664 0x2573, 1665 0x212B, 1666 0x212C /* U+FF9F */ 1667 }; 1668 1669 static void U_CALLCONV 1670 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1671 UConverter *cnv = args->converter; 1672 UConverterDataISO2022 *converterData; 1673 ISO2022State *pFromU2022State; 1674 uint8_t *target = (uint8_t *) args->target; 1675 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1676 const UChar* source = args->source; 1677 const UChar* sourceLimit = args->sourceLimit; 1678 int32_t* offsets = args->offsets; 1679 UChar32 sourceChar; 1680 char buffer[8]; 1681 int32_t len, outLen; 1682 int8_t choices[10]; 1683 int32_t choiceCount; 1684 uint32_t targetValue = 0; 1685 UBool useFallback; 1686 1687 int32_t i; 1688 int8_t cs, g; 1689 1690 /* set up the state */ 1691 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1692 pFromU2022State = &converterData->fromU2022State; 1693 1694 choiceCount = 0; 1695 1696 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1697 if((sourceChar = cnv->fromUChar32)!=0 
&& target< targetLimit) { 1698 goto getTrail; 1699 } 1700 1701 while(source < sourceLimit) { 1702 if(target < targetLimit) { 1703 1704 sourceChar = *(source++); 1705 /*check if the char is a First surrogate*/ 1706 if(U16_IS_SURROGATE(sourceChar)) { 1707 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1708 getTrail: 1709 /*look ahead to find the trail surrogate*/ 1710 if(source < sourceLimit) { 1711 /* test the following code unit */ 1712 UChar trail=(UChar) *source; 1713 if(U16_IS_TRAIL(trail)) { 1714 source++; 1715 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1716 cnv->fromUChar32=0x00; 1717 /* convert this supplementary code point */ 1718 /* exit this condition tree */ 1719 } else { 1720 /* this is an unmatched lead code unit (1st surrogate) */ 1721 /* callback(illegal) */ 1722 *err=U_ILLEGAL_CHAR_FOUND; 1723 cnv->fromUChar32=sourceChar; 1724 break; 1725 } 1726 } else { 1727 /* no more input */ 1728 cnv->fromUChar32=sourceChar; 1729 break; 1730 } 1731 } else { 1732 /* this is an unmatched trail code unit (2nd surrogate) */ 1733 /* callback(illegal) */ 1734 *err=U_ILLEGAL_CHAR_FOUND; 1735 cnv->fromUChar32=sourceChar; 1736 break; 1737 } 1738 } 1739 1740 /* do not convert SO/SI/ESC */ 1741 if(IS_2022_CONTROL(sourceChar)) { 1742 /* callback(illegal) */ 1743 *err=U_ILLEGAL_CHAR_FOUND; 1744 cnv->fromUChar32=sourceChar; 1745 break; 1746 } 1747 1748 /* do the conversion */ 1749 1750 if(choiceCount == 0) { 1751 uint16_t csm; 1752 1753 /* 1754 * The csm variable keeps track of which charsets are allowed 1755 * and not used yet while building the choices[]. 1756 */ 1757 csm = jpCharsetMasks[converterData->version]; 1758 choiceCount = 0; 1759 1760 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1761 if(converterData->version == 3 || converterData->version == 4) { 1762 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1763 } 1764 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1765 csm &= ~CSM(HWKANA_7BIT); 1766 1767 /* try the current G0 charset */ 1768 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1769 csm &= ~CSM(cs); 1770 1771 /* try the current G2 charset */ 1772 if((cs = pFromU2022State->cs[2]) != 0) { 1773 choices[choiceCount++] = cs; 1774 csm &= ~CSM(cs); 1775 } 1776 1777 /* try all the other possible charsets */ 1778 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { 1779 cs = (int8_t)jpCharsetPref[i]; 1780 if(CSM(cs) & csm) { 1781 choices[choiceCount++] = cs; 1782 csm &= ~CSM(cs); 1783 } 1784 } 1785 } 1786 1787 cs = g = 0; 1788 /* 1789 * len==0: no mapping found yet 1790 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1791 * len>0: found a roundtrip result, done 1792 */ 1793 len = 0; 1794 /* 1795 * We will turn off useFallback after finding a fallback, 1796 * but we still get fallbacks from PUA code points as usual. 1797 * Therefore, we will also need to check that we don't overwrite 1798 * an early fallback with a later one. 1799 */ 1800 useFallback = cnv->useFallback; 1801 1802 for(i = 0; i < choiceCount && len <= 0; ++i) { 1803 uint32_t value; 1804 int32_t len2; 1805 int8_t cs0 = choices[i]; 1806 switch(cs0) { 1807 case ASCII: 1808 if(sourceChar <= 0x7f) { 1809 targetValue = (uint32_t)sourceChar; 1810 len = 1; 1811 cs = cs0; 1812 g = 0; 1813 } 1814 break; 1815 case ISO8859_1: 1816 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1817 targetValue = (uint32_t)sourceChar - 0x80; 1818 len = 1; 1819 cs = cs0; 1820 g = 2; 1821 } 1822 break; 1823 case HWKANA_7BIT: 1824 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1825 if(converterData->version==3) { 1826 /* JIS7: use G1 (SO) */ 1827 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1828 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1829 len = 1; 1830 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1831 g = 1; 1832 } else if(converterData->version==4) { 1833 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1834 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1835 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1836 len = 1; 1837 1838 cs = pFromU2022State->cs[0]; 1839 if(IS_JP_DBCS(cs)) { 1840 /* switch from a DBCS charset to JISX201 */ 1841 cs = (int8_t)JISX201; 1842 } 1843 /* else stay in the current G0 charset */ 1844 g = 0; 1845 } 1846 /* else do not use HWKANA_7BIT with other versions */ 1847 } 1848 break; 1849 case JISX201: 1850 /* G0 SBCS */ 1851 value = jisx201FromU(sourceChar); 1852 if(value <= 0x7f) { 1853 targetValue = value; 1854 len = 1; 1855 cs = cs0; 1856 g = 0; 1857 useFallback = FALSE; 1858 } 1859 break; 1860 case JISX208: 1861 /* G0 DBCS from Shift-JIS table */ 1862 len2 = MBCS_FROM_UCHAR32_ISO2022( 1863 converterData->myConverterArray[cs0], 1864 sourceChar, &value, 1865 useFallback, MBCS_OUTPUT_2); 1866 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1867 value = _2022FromSJIS(value); 1868 if(value != 0) { 1869 targetValue = value; 1870 len = len2; 1871 cs = cs0; 1872 g = 0; 1873 useFallback = FALSE; 1874 } 1875 } else if(len == 0 && useFallback && 1876 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1877 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1878 len = -2; 1879 cs = cs0; 1880 g = 0; 1881 useFallback = FALSE; 1882 } 1883 break; 1884 case ISO8859_7: 1885 /* G0 SBCS forced to 7-bit output */ 1886 len2 = MBCS_SINGLE_FROM_UCHAR32( 1887 converterData->myConverterArray[cs0], 1888 sourceChar, &value, 1889 useFallback); 1890 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1891 targetValue = value - 0x80; 1892 len = len2; 1893 cs = 
cs0; 1894 g = 2; 1895 useFallback = FALSE; 1896 } 1897 break; 1898 default: 1899 /* G0 DBCS */ 1900 len2 = MBCS_FROM_UCHAR32_ISO2022( 1901 converterData->myConverterArray[cs0], 1902 sourceChar, &value, 1903 useFallback, MBCS_OUTPUT_2); 1904 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1905 if(cs0 == KSC5601) { 1906 /* 1907 * Check for valid bytes for the encoding scheme. 1908 * This is necessary because the sub-converter (windows-949) 1909 * has a broader encoding scheme than is valid for 2022. 1910 */ 1911 value = _2022FromGR94DBCS(value); 1912 if(value == 0) { 1913 break; 1914 } 1915 } 1916 targetValue = value; 1917 len = len2; 1918 cs = cs0; 1919 g = 0; 1920 useFallback = FALSE; 1921 } 1922 break; 1923 } 1924 } 1925 1926 if(len != 0) { 1927 if(len < 0) { 1928 len = -len; /* fallback */ 1929 } 1930 outLen = 0; /* count output bytes */ 1931 1932 /* write SI if necessary (only for JIS7) */ 1933 if(pFromU2022State->g == 1 && g == 0) { 1934 buffer[outLen++] = UCNV_SI; 1935 pFromU2022State->g = 0; 1936 } 1937 1938 /* write the designation sequence if necessary */ 1939 if(cs != pFromU2022State->cs[g]) { 1940 int32_t escLen = escSeqCharsLen[cs]; 1941 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1942 outLen += escLen; 1943 pFromU2022State->cs[g] = cs; 1944 1945 /* invalidate the choices[] */ 1946 choiceCount = 0; 1947 } 1948 1949 /* write the shift sequence if necessary */ 1950 if(g != pFromU2022State->g) { 1951 switch(g) { 1952 /* case 0 handled before writing escapes */ 1953 case 1: 1954 buffer[outLen++] = UCNV_SO; 1955 pFromU2022State->g = 1; 1956 break; 1957 default: /* case 2 */ 1958 buffer[outLen++] = 0x1b; 1959 buffer[outLen++] = 0x4e; 1960 break; 1961 /* no case 3: no SS3 in ISO-2022-JP-x */ 1962 } 1963 } 1964 1965 /* write the output bytes */ 1966 if(len == 1) { 1967 buffer[outLen++] = (char)targetValue; 1968 } else /* len == 2 */ { 1969 buffer[outLen++] = (char)(targetValue >> 8); 1970 buffer[outLen++] = 
(char)targetValue; 1971 } 1972 } else { 1973 /* 1974 * if we cannot find the character after checking all codepages 1975 * then this is an error 1976 */ 1977 *err = U_INVALID_CHAR_FOUND; 1978 cnv->fromUChar32=sourceChar; 1979 break; 1980 } 1981 1982 if(sourceChar == CR || sourceChar == LF) { 1983 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1984 pFromU2022State->cs[2] = 0; 1985 choiceCount = 0; 1986 } 1987 1988 /* output outLen>0 bytes in buffer[] */ 1989 if(outLen == 1) { 1990 *target++ = buffer[0]; 1991 if(offsets) { 1992 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1993 } 1994 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1995 *target++ = buffer[0]; 1996 *target++ = buffer[1]; 1997 if(offsets) { 1998 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1999 *offsets++ = sourceIndex; 2000 *offsets++ = sourceIndex; 2001 } 2002 } else { 2003 fromUWriteUInt8( 2004 cnv, 2005 buffer, outLen, 2006 &target, (const char *)targetLimit, 2007 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 2008 err); 2009 if(U_FAILURE(*err)) { 2010 break; 2011 } 2012 } 2013 } /* end if(myTargetIndex<myTargetLength) */ 2014 else{ 2015 *err =U_BUFFER_OVERFLOW_ERROR; 2016 break; 2017 } 2018 2019 }/* end while(mySourceIndex<mySourceLength) */ 2020 2021 /* 2022 * the end of the input stream and detection of truncated input 2023 * are handled by the framework, but for ISO-2022-JP conversion 2024 * we need to be in ASCII mode at the very end 2025 * 2026 * conditions: 2027 * successful 2028 * in SO mode or not in ASCII mode 2029 * end of input and no truncated input 2030 */ 2031 if( U_SUCCESS(*err) && 2032 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2033 args->flush && source>=sourceLimit && cnv->fromUChar32==0 2034 ) { 2035 int32_t sourceIndex; 2036 2037 outLen = 0; 2038 2039 if(pFromU2022State->g != 0) { 2040 buffer[outLen++] = UCNV_SI; 
/*
 * Convert a buffer of ISO-2022-JP family bytes to UTF-16.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit.  When args->offsets is non-NULL, one
 * source offset is stored for every UChar written.  On return, args->source
 * and args->target are advanced to where conversion stopped.
 *
 * State kept between calls:
 *  - myData->key != 0: a partial escape sequence is pending; processing
 *    resumes at the "escape" label.
 *  - args->converter->toULength == 1: the lead byte of a double-byte
 *    character was stored in toUBytes[0]; processing resumes at
 *    "getTrailByte" (only when both input and output space are available).
 *  - pToU2022State->cs[] / g / prevG: designated charsets and the active G set.
 *  - myData->isEmptySegment: set after a completed escape sequence and cleared
 *    by the next data byte; for version 0 a completed escape sequence that
 *    follows an empty segment is reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 *
 * Errors set in *err by this function: U_BUFFER_OVERFLOW_ERROR when the
 * target is full, U_ILLEGAL_ESCAPE_SEQUENCE as described above, plus whatever
 * changeState_2022() and toUnicodeCallback() set (both are defined outside
 * this part of the file).
 */
static void U_CALLCONV
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "no mapping" until one of the branches below finds one */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    /* JIS7: shift back to G0 */
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* targetUniChar is still missingCharMarker, so the byte goes to the callback below */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* targetUniChar is still missingCharMarker, so the byte goes to the callback below */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    /* remembered so that an "empty segment" error can report the bytes just consumed */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* the CR/LF byte itself is still converted by the default case */
                U_FALLTHROUGH;
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        /* the 7-bit byte stands for the code point 0x80 higher */
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek at the trail byte; it is consumed only in the branches below that ++mySource */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                /* the JISX208 lookup goes through Shift-JIS bytes written into tempBuf */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        /* else: only the lead byte is reported; targetUniChar is still missingCharMarker */
                    } else {
                        /* input ends after the lead byte: save it and resume at getTrailByte on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar; the offset points at the first byte of the 1- or 2-byte character */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the positions reached so the caller can continue from here */
    args->target = myTarget;
    args->source = mySource;
}
/*
 * ISO-2022-KR fromUnicode for converter version 1 (described by the caller's
 * comment as "conversion with ibm-25546"): delegate to the MBCS converter.
 *
 * args->converter is temporarily replaced by myConverterData->currentConverter
 * for the call to ucnv_MBCSFromUnicodeWithOffsets() and restored before
 * returning.  The pending code point (fromUChar32) is copied into the
 * subconverter before the call and back afterwards.  On
 * U_BUFFER_OVERFLOW_ERROR the subconverter's charErrorBuffer contents are
 * moved to the public converter and the subconverter's buffer length is reset.
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){

    UConverter* saveConv = args->converter;
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
    args->converter=myConverterData->currentConverter;

    /* hand the pending code point to the subconverter and take it back afterwards */
    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;

    if(*err == U_BUFFER_OVERFLOW_ERROR) {
        /* move the overflow bytes from the subconverter to the public converter */
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
            uprv_memcpy(
                saveConv->charErrorBuffer,
                myConverterData->currentConverter->charErrorBuffer,
                myConverterData->currentConverter->charErrorBufferLength);
        }
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
        myConverterData->currentConverter->charErrorBufferLength = 0;
    }
    args->converter=saveConv;
}

/*
 * Convert UTF-16 to ISO-2022-KR.
 *
 * Version 1 converters are handed to the _IBM variant above.  Otherwise each
 * UTF-16 code unit is looked up with MBCS_FROM_UCHAR32_ISO2022():
 *  - a 1-byte result <= 0x7f is written as is;
 *  - a 2-byte result with both bytes in A1..FE is written with 0x80
 *    subtracted from each byte;
 *  - anything else is treated as unmappable.
 * UCNV_SO / UCNV_SI is written whenever output switches between double-byte
 * and single-byte mode.  The current mode is kept in
 * args->converter->fromUnicodeStatus between calls.
 *
 * Bytes that do not fit into the target go to args->converter->charErrorBuffer
 * and *err becomes U_BUFFER_OVERFLOW_ERROR.  Unmappable input or surrogates
 * set U_INVALID_CHAR_FOUND / U_ILLEGAL_CHAR_FOUND and store the code point in
 * args->converter->fromUChar32.  A lead surrogate at the end of the input is
 * stored in fromUChar32 with *err == U_ZERO_ERROR so that the next call can
 * resume at getTrail.  At the flushed end of input in double-byte mode a final
 * SI is written.
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): redundant; the same value was already read a few lines above */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code point left pending by the previous call: continue with its trail surrogate */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* there is room for this byte: target < targetLimit was checked at the top of the loop */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetUniChar  to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        /* target is full: keep the byte in the overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte output: each byte is written with 0x80 subtracted */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            /* only the second byte overflows */
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        /* both bytes overflow */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* a complete supplementary code point is always reported as unmappable here */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the code point (or the pending lead surrogate) for the caller / the next call */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* emit the final SI */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
subArgs.converter = myData->currentConverter; 2600 2601 /* remember the original start of the input for offsets */ 2602 sourceStart = args->source; 2603 2604 if(myData->key != 0) { 2605 /* continue with a partial escape sequence */ 2606 goto escape; 2607 } 2608 2609 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2610 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2611 subArgs.source = args->source; 2612 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2613 if(subArgs.source != subArgs.sourceLimit) { 2614 /* 2615 * get the current partial byte sequence 2616 * 2617 * it needs to be moved between the public and the subconverter 2618 * so that the conversion framework, which only sees the public 2619 * converter, can handle truncated and illegal input etc. 2620 */ 2621 if(args->converter->toULength > 0) { 2622 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2623 } 2624 subArgs.converter->toULength = args->converter->toULength; 2625 2626 /* 2627 * Convert up to the end of the input, or to before the next escape character. 2628 * Does not handle conversion extensions because the preToU[] state etc. 2629 * is not copied. 
2630 */ 2631 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2632 2633 if(args->offsets != NULL && sourceStart != args->source) { 2634 /* update offsets to base them on the actual start of the input */ 2635 int32_t *offsets = args->offsets; 2636 UChar *target = args->target; 2637 int32_t delta = (int32_t)(args->source - sourceStart); 2638 while(target < subArgs.target) { 2639 if(*offsets >= 0) { 2640 *offsets += delta; 2641 } 2642 ++offsets; 2643 ++target; 2644 } 2645 } 2646 args->source = subArgs.source; 2647 args->target = subArgs.target; 2648 args->offsets = subArgs.offsets; 2649 2650 /* copy input/error/overflow buffers */ 2651 if(subArgs.converter->toULength > 0) { 2652 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2653 } 2654 args->converter->toULength = subArgs.converter->toULength; 2655 2656 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2657 if(subArgs.converter->UCharErrorBufferLength > 0) { 2658 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2659 subArgs.converter->UCharErrorBufferLength); 2660 } 2661 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2662 subArgs.converter->UCharErrorBufferLength = 0; 2663 } 2664 } 2665 2666 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2667 return; 2668 } 2669 2670 escape: 2671 changeState_2022(args->converter, 2672 &(args->source), 2673 args->sourceLimit, 2674 ISO_2022_KR, 2675 err); 2676 } 2677 } 2678 2679 static void U_CALLCONV 2680 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2681 UErrorCode* err){ 2682 char tempBuf[2]; 2683 const char *mySource = ( char *) args->source; 2684 UChar *myTarget = args->target; 2685 const char *mySourceLimit = args->sourceLimit; 2686 UChar32 targetUniChar = 0x0000; 2687 UChar mySourceChar = 0x0000; 2688 UConverterDataISO2022* myData; 2689 UConverterSharedData* sharedData ; 2690 UBool useFallback; 2691 2692 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2693 if(myData->version==1){ 2694 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2695 return; 2696 } 2697 2698 /* initialize state */ 2699 sharedData = myData->currentConverter->sharedData; 2700 useFallback = args->converter->useFallback; 2701 2702 if(myData->key != 0) { 2703 /* continue with a partial escape sequence */ 2704 goto escape; 2705 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2706 /* continue with a partial double-byte character */ 2707 mySourceChar = args->converter->toUBytes[0]; 2708 args->converter->toULength = 0; 2709 goto getTrailByte; 2710 } 2711 2712 while(mySource< mySourceLimit){ 2713 2714 if(myTarget < args->targetLimit){ 2715 2716 mySourceChar= (unsigned char) *mySource++; 2717 2718 if(mySourceChar==UCNV_SI){ 2719 myData->toU2022State.g = 0; 2720 if (myData->isEmptySegment) { 2721 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2722 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2723 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2724 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2725 args->converter->toULength = 1; 2726 args->target = myTarget; 2727 args->source = mySource; 2728 return; 2729 } 2730 /*consume the source */ 2731 continue; 2732 }else if(mySourceChar==UCNV_SO){ 2733 myData->toU2022State.g = 1; 2734 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2735 /*consume the source */ 2736 continue; 2737 }else if(mySourceChar==ESC_2022){ 2738 mySource--; 2739 escape: 2740 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2741 changeState_2022(args->converter,&(mySource), 2742 mySourceLimit, ISO_2022_KR, err); 2743 if(U_FAILURE(*err)){ 2744 args->target = myTarget; 2745 args->source = mySource; 2746 return; 2747 } 2748 continue; 2749 } 2750 2751 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2752 if(myData->toU2022State.g == 1) { 2753 if(mySource < mySourceLimit) { 2754 int leadIsOk, trailIsOk; 2755 uint8_t trailByte; 2756 getTrailByte: 2757 targetUniChar = missingCharMarker; 2758 trailByte = (uint8_t)*mySource; 2759 /* 2760 * Ticket 5691: consistent illegal sequences: 2761 * - We include at least the first byte in the illegal sequence. 2762 * - If any of the non-initial bytes could be the start of a character, 2763 * we stop the illegal sequence before the first one of those. 2764 * 2765 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2766 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2767 * Otherwise we convert or report the pair of bytes. 2768 */ 2769 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2770 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2771 if (leadIsOk && trailIsOk) { 2772 ++mySource; 2773 tempBuf[0] = (char)(mySourceChar + 0x80); 2774 tempBuf[1] = (char)(trailByte + 0x80); 2775 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2776 mySourceChar = (mySourceChar << 8) | trailByte; 2777 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2778 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2779 ++mySource; 2780 /* add another bit so that the code below writes 2 bytes in case of error */ 2781 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2782 } 2783 } else { 2784 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2785 args->converter->toULength = 1; 2786 break; 2787 } 2788 } 2789 else if(mySourceChar <= 0x7f) { 2790 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2791 } else { 2792 targetUniChar = 0xffff; 2793 } 2794 if(targetUniChar < 0xfffe){ 2795 if(args->offsets) { 2796 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2797 } 2798 *(myTarget++)=(UChar)targetUniChar; 2799 } 2800 else { 2801 /* Call the callback function*/ 2802 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2803 break; 2804 } 2805 } 2806 else{ 2807 *err =U_BUFFER_OVERFLOW_ERROR; 2808 break; 2809 } 2810 } 2811 args->target = myTarget; 2812 args->source = mySource; 2813 } 2814 2815 /*************************** END ISO2022-KR *********************************/ 2816 2817 /*************************** ISO-2022-CN ********************************* 2818 * 2819 * Rules for ISO-2022-CN Encoding: 2820 * i) The designator sequence must appear once on a line before any instance 2821 * of character set it designates. 2822 * ii) If two lines contain characters from the same character set, both lines 2823 * must include the designator sequence. 2824 * iii) Once the designator sequence is known, a shifting sequence has to be found 2825 * to invoke the shifting 2826 * iv) All lines start in ASCII and end in ASCII. 2827 * v) Four shifting sequences are employed for this purpose: 2828 * 2829 * Sequcence ASCII Eq Charsets 2830 * ---------- ------- --------- 2831 * SI <SI> US-ASCII 2832 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2833 * SS2 <ESC>N CNS-11643-1992 Plane 2 2834 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2835 * 2836 * vi) 2837 * SOdesignator : ESC "$" ")" finalchar_for_SO 2838 * SS2designator : ESC "$" "*" finalchar_for_SS2 2839 * SS3designator : ESC "$" "+" finalchar_for_SS3 2840 * 2841 * ESC $ ) A Indicates the bytes following SO are Chinese 2842 * characters as defined in GB 2312-80, until 2843 * another SOdesignation appears 2844 * 2845 * 2846 * ESC $ ) E Indicates the bytes following SO are as defined 2847 * in ISO-IR-165 (for details, see section 2.1), 2848 * until another SOdesignation appears 2849 * 2850 * ESC $ ) G Indicates the bytes following SO are as defined 2851 * in CNS 11643-plane-1, until another 2852 * SOdesignation appears 2853 * 2854 * ESC $ * H Indicates the two 
bytes immediately following 2855 * SS2 is a Chinese character as defined in CNS 2856 * 11643-plane-2, until another SS2designation 2857 * appears 2858 * (Meaning <ESC>N must preceed every 2 byte 2859 * sequence.) 2860 * 2861 * ESC $ + I Indicates the immediate two bytes following SS3 2862 * is a Chinese character as defined in CNS 2863 * 11643-plane-3, until another SS3designation 2864 * appears 2865 * (Meaning <ESC>O must preceed every 2 byte 2866 * sequence.) 2867 * 2868 * ESC $ + J Indicates the immediate two bytes following SS3 2869 * is a Chinese character as defined in CNS 2870 * 11643-plane-4, until another SS3designation 2871 * appears 2872 * (In English: <ESC>O must preceed every 2 byte 2873 * sequence.) 2874 * 2875 * ESC $ + K Indicates the immediate two bytes following SS3 2876 * is a Chinese character as defined in CNS 2877 * 11643-plane-5, until another SS3designation 2878 * appears 2879 * 2880 * ESC $ + L Indicates the immediate two bytes following SS3 2881 * is a Chinese character as defined in CNS 2882 * 11643-plane-6, until another SS3designation 2883 * appears 2884 * 2885 * ESC $ + M Indicates the immediate two bytes following SS3 2886 * is a Chinese character as defined in CNS 2887 * 11643-plane-7, until another SS3designation 2888 * appears 2889 * 2890 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2891 * has its own designation information before any Chinese characters 2892 * appear 2893 * 2894 */ 2895 2896 /* The following are defined this way to make the strings truly readonly */ 2897 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2898 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2899 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2900 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2901 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2902 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2903 static const char 
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2904 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2905 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2906 2907 /********************** ISO2022-CN Data **************************/ 2908 static const char* const escSeqCharsCN[10] ={ 2909 SHIFT_IN_STR, /* 0 ASCII */ 2910 GB_2312_80_STR, /* 1 GB2312_1 */ 2911 ISO_IR_165_STR, /* 2 ISO_IR_165 */ 2912 CNS_11643_1992_Plane_1_STR, 2913 CNS_11643_1992_Plane_2_STR, 2914 CNS_11643_1992_Plane_3_STR, 2915 CNS_11643_1992_Plane_4_STR, 2916 CNS_11643_1992_Plane_5_STR, 2917 CNS_11643_1992_Plane_6_STR, 2918 CNS_11643_1992_Plane_7_STR 2919 }; 2920 2921 static void U_CALLCONV 2922 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2923 UConverter *cnv = args->converter; 2924 UConverterDataISO2022 *converterData; 2925 ISO2022State *pFromU2022State; 2926 uint8_t *target = (uint8_t *) args->target; 2927 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2928 const UChar* source = args->source; 2929 const UChar* sourceLimit = args->sourceLimit; 2930 int32_t* offsets = args->offsets; 2931 UChar32 sourceChar; 2932 char buffer[8]; 2933 int32_t len; 2934 int8_t choices[3]; 2935 int32_t choiceCount; 2936 uint32_t targetValue = 0; 2937 UBool useFallback; 2938 2939 /* set up the state */ 2940 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2941 pFromU2022State = &converterData->fromU2022State; 2942 2943 choiceCount = 0; 2944 2945 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2946 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2947 goto getTrail; 2948 } 2949 2950 while( source < sourceLimit){ 2951 if(target < targetLimit){ 2952 2953 sourceChar = *(source++); 2954 /*check if the char is a First surrogate*/ 2955 if(U16_IS_SURROGATE(sourceChar)) { 2956 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2957 getTrail: 2958 /*look ahead to find the trail 
surrogate*/ 2959 if(source < sourceLimit) { 2960 /* test the following code unit */ 2961 UChar trail=(UChar) *source; 2962 if(U16_IS_TRAIL(trail)) { 2963 source++; 2964 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2965 cnv->fromUChar32=0x00; 2966 /* convert this supplementary code point */ 2967 /* exit this condition tree */ 2968 } else { 2969 /* this is an unmatched lead code unit (1st surrogate) */ 2970 /* callback(illegal) */ 2971 *err=U_ILLEGAL_CHAR_FOUND; 2972 cnv->fromUChar32=sourceChar; 2973 break; 2974 } 2975 } else { 2976 /* no more input */ 2977 cnv->fromUChar32=sourceChar; 2978 break; 2979 } 2980 } else { 2981 /* this is an unmatched trail code unit (2nd surrogate) */ 2982 /* callback(illegal) */ 2983 *err=U_ILLEGAL_CHAR_FOUND; 2984 cnv->fromUChar32=sourceChar; 2985 break; 2986 } 2987 } 2988 2989 /* do the conversion */ 2990 if(sourceChar <= 0x007f ){ 2991 /* do not convert SO/SI/ESC */ 2992 if(IS_2022_CONTROL(sourceChar)) { 2993 /* callback(illegal) */ 2994 *err=U_ILLEGAL_CHAR_FOUND; 2995 cnv->fromUChar32=sourceChar; 2996 break; 2997 } 2998 2999 /* US-ASCII */ 3000 if(pFromU2022State->g == 0) { 3001 buffer[0] = (char)sourceChar; 3002 len = 1; 3003 } else { 3004 buffer[0] = UCNV_SI; 3005 buffer[1] = (char)sourceChar; 3006 len = 2; 3007 pFromU2022State->g = 0; 3008 choiceCount = 0; 3009 } 3010 if(sourceChar == CR || sourceChar == LF) { 3011 /* reset the state at the end of a line */ 3012 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 3013 choiceCount = 0; 3014 } 3015 } 3016 else{ 3017 /* convert U+0080..U+10ffff */ 3018 int32_t i; 3019 int8_t cs, g; 3020 3021 if(choiceCount == 0) { 3022 /* try the current SO/G1 converter first */ 3023 choices[0] = pFromU2022State->cs[1]; 3024 3025 /* default to GB2312_1 if none is designated yet */ 3026 if(choices[0] == 0) { 3027 choices[0] = GB2312_1; 3028 } 3029 3030 if(converterData->version == 0) { 3031 /* ISO-2022-CN */ 3032 3033 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any 
plane */ 3034 if(choices[0] == GB2312_1) { 3035 choices[1] = (int8_t)CNS_11643_1; 3036 } else { 3037 choices[1] = (int8_t)GB2312_1; 3038 } 3039 3040 choiceCount = 2; 3041 } else if (converterData->version == 1) { 3042 /* ISO-2022-CN-EXT */ 3043 3044 /* try one of the other converters */ 3045 switch(choices[0]) { 3046 case GB2312_1: 3047 choices[1] = (int8_t)CNS_11643_1; 3048 choices[2] = (int8_t)ISO_IR_165; 3049 break; 3050 case ISO_IR_165: 3051 choices[1] = (int8_t)GB2312_1; 3052 choices[2] = (int8_t)CNS_11643_1; 3053 break; 3054 default: /* CNS_11643_x */ 3055 choices[1] = (int8_t)GB2312_1; 3056 choices[2] = (int8_t)ISO_IR_165; 3057 break; 3058 } 3059 3060 choiceCount = 3; 3061 } else { 3062 choices[0] = (int8_t)CNS_11643_1; 3063 choices[1] = (int8_t)GB2312_1; 3064 } 3065 } 3066 3067 cs = g = 0; 3068 /* 3069 * len==0: no mapping found yet 3070 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3071 * len>0: found a roundtrip result, done 3072 */ 3073 len = 0; 3074 /* 3075 * We will turn off useFallback after finding a fallback, 3076 * but we still get fallbacks from PUA code points as usual. 3077 * Therefore, we will also need to check that we don't overwrite 3078 * an early fallback with a later one. 
3079 */ 3080 useFallback = cnv->useFallback; 3081 3082 for(i = 0; i < choiceCount && len <= 0; ++i) { 3083 int8_t cs0 = choices[i]; 3084 if(cs0 > 0) { 3085 uint32_t value; 3086 int32_t len2; 3087 if(cs0 >= CNS_11643_0) { 3088 len2 = MBCS_FROM_UCHAR32_ISO2022( 3089 converterData->myConverterArray[CNS_11643], 3090 sourceChar, 3091 &value, 3092 useFallback, 3093 MBCS_OUTPUT_3); 3094 if(len2 == 3 || (len2 == -3 && len == 0)) { 3095 targetValue = value; 3096 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3097 if(len2 >= 0) { 3098 len = 2; 3099 } else { 3100 len = -2; 3101 useFallback = FALSE; 3102 } 3103 if(cs == CNS_11643_1) { 3104 g = 1; 3105 } else if(cs == CNS_11643_2) { 3106 g = 2; 3107 } else /* plane 3..7 */ if(converterData->version == 1) { 3108 g = 3; 3109 } else { 3110 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3111 len = 0; 3112 } 3113 } 3114 } else { 3115 /* GB2312_1 or ISO-IR-165 */ 3116 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3117 len2 = MBCS_FROM_UCHAR32_ISO2022( 3118 converterData->myConverterArray[cs0], 3119 sourceChar, 3120 &value, 3121 useFallback, 3122 MBCS_OUTPUT_2); 3123 if(len2 == 2 || (len2 == -2 && len == 0)) { 3124 targetValue = value; 3125 len = len2; 3126 cs = cs0; 3127 g = 1; 3128 useFallback = FALSE; 3129 } 3130 } 3131 } 3132 } 3133 3134 if(len != 0) { 3135 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3136 3137 /* write the designation sequence if necessary */ 3138 if(cs != pFromU2022State->cs[g]) { 3139 if(cs < CNS_11643) { 3140 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3141 } else { 3142 U_ASSERT(cs >= CNS_11643_1); 3143 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3144 } 3145 len = 4; 3146 pFromU2022State->cs[g] = cs; 3147 if(g == 1) { 3148 /* changing the SO/G1 charset invalidates the choices[] */ 3149 choiceCount = 0; 3150 } 3151 } 3152 3153 /* write the shift sequence if necessary */ 3154 if(g != pFromU2022State->g) { 3155 switch(g) { 3156 case 1: 3157 
buffer[len++] = UCNV_SO; 3158 3159 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3160 pFromU2022State->g = 1; 3161 break; 3162 case 2: 3163 buffer[len++] = 0x1b; 3164 buffer[len++] = 0x4e; 3165 break; 3166 default: /* case 3 */ 3167 buffer[len++] = 0x1b; 3168 buffer[len++] = 0x4f; 3169 break; 3170 } 3171 } 3172 3173 /* write the two output bytes */ 3174 buffer[len++] = (char)(targetValue >> 8); 3175 buffer[len++] = (char)targetValue; 3176 } else { 3177 /* if we cannot find the character after checking all codepages 3178 * then this is an error 3179 */ 3180 *err = U_INVALID_CHAR_FOUND; 3181 cnv->fromUChar32=sourceChar; 3182 break; 3183 } 3184 } 3185 3186 /* output len>0 bytes in buffer[] */ 3187 if(len == 1) { 3188 *target++ = buffer[0]; 3189 if(offsets) { 3190 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3191 } 3192 } else if(len == 2 && (target + 2) <= targetLimit) { 3193 *target++ = buffer[0]; 3194 *target++ = buffer[1]; 3195 if(offsets) { 3196 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3197 *offsets++ = sourceIndex; 3198 *offsets++ = sourceIndex; 3199 } 3200 } else { 3201 fromUWriteUInt8( 3202 cnv, 3203 buffer, len, 3204 &target, (const char *)targetLimit, 3205 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3206 err); 3207 if(U_FAILURE(*err)) { 3208 break; 3209 } 3210 } 3211 } /* end if(myTargetIndex<myTargetLength) */ 3212 else{ 3213 *err =U_BUFFER_OVERFLOW_ERROR; 3214 break; 3215 } 3216 3217 }/* end while(mySourceIndex<mySourceLength) */ 3218 3219 /* 3220 * the end of the input stream and detection of truncated input 3221 * are handled by the framework, but for ISO-2022-CN conversion 3222 * we need to be in ASCII mode at the very end 3223 * 3224 * conditions: 3225 * successful 3226 * not in ASCII mode 3227 * end of input and no truncated input 3228 */ 3229 if( U_SUCCESS(*err) && 3230 pFromU2022State->g!=0 && 3231 args->flush && 
source>=sourceLimit && cnv->fromUChar32==0 3232 ) { 3233 int32_t sourceIndex; 3234 3235 /* we are switching to ASCII */ 3236 pFromU2022State->g=0; 3237 3238 /* get the source index of the last input character */ 3239 /* 3240 * TODO this would be simpler and more reliable if we used a pair 3241 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3242 * so that we could simply use the prevSourceIndex here; 3243 * this code gives an incorrect result for the rare case of an unmatched 3244 * trail surrogate that is alone in the last buffer of the text stream 3245 */ 3246 sourceIndex=(int32_t)(source-args->source); 3247 if(sourceIndex>0) { 3248 --sourceIndex; 3249 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3250 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3251 ) { 3252 --sourceIndex; 3253 } 3254 } else { 3255 sourceIndex=-1; 3256 } 3257 3258 fromUWriteUInt8( 3259 cnv, 3260 SHIFT_IN_STR, 1, 3261 &target, (const char *)targetLimit, 3262 &offsets, sourceIndex, 3263 err); 3264 } 3265 3266 /*save the state and return */ 3267 args->source = source; 3268 args->target = (char*)target; 3269 } 3270 3271 3272 static void U_CALLCONV 3273 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3274 UErrorCode* err){ 3275 char tempBuf[3]; 3276 const char *mySource = (char *) args->source; 3277 UChar *myTarget = args->target; 3278 const char *mySourceLimit = args->sourceLimit; 3279 uint32_t targetUniChar = 0x0000; 3280 uint32_t mySourceChar = 0x0000; 3281 UConverterDataISO2022* myData; 3282 ISO2022State *pToU2022State; 3283 3284 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3285 pToU2022State = &myData->toU2022State; 3286 3287 if(myData->key != 0) { 3288 /* continue with a partial escape sequence */ 3289 goto escape; 3290 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3291 /* continue with a partial double-byte character */ 3292 mySourceChar = args->converter->toUBytes[0]; 
3293 args->converter->toULength = 0; 3294 targetUniChar = missingCharMarker; 3295 goto getTrailByte; 3296 } 3297 3298 while(mySource < mySourceLimit){ 3299 3300 targetUniChar =missingCharMarker; 3301 3302 if(myTarget < args->targetLimit){ 3303 3304 mySourceChar= (unsigned char) *mySource++; 3305 3306 switch(mySourceChar){ 3307 case UCNV_SI: 3308 pToU2022State->g=0; 3309 if (myData->isEmptySegment) { 3310 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3311 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3312 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3313 args->converter->toUBytes[0] = mySourceChar; 3314 args->converter->toULength = 1; 3315 args->target = myTarget; 3316 args->source = mySource; 3317 return; 3318 } 3319 continue; 3320 3321 case UCNV_SO: 3322 if(pToU2022State->cs[1] != 0) { 3323 pToU2022State->g=1; 3324 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3325 continue; 3326 } else { 3327 /* illegal to have SO before a matching designator */ 3328 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3329 break; 3330 } 3331 3332 case ESC_2022: 3333 mySource--; 3334 escape: 3335 { 3336 const char * mySourceBefore = mySource; 3337 int8_t toULengthBefore = args->converter->toULength; 3338 3339 changeState_2022(args->converter,&(mySource), 3340 mySourceLimit, ISO_2022_CN,err); 3341 3342 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3343 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3344 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3345 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3346 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); 3347 } 3348 } 3349 3350 /* invalid or illegal escape sequence */ 3351 if(U_FAILURE(*err)){ 3352 args->target = myTarget; 3353 args->source = mySource; 3354 myData->isEmptySegment = FALSE; /* Reset 
to avoid future spurious errors */ 3355 return; 3356 } 3357 continue; 3358 3359 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3360 3361 case CR: 3362 case LF: 3363 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3364 U_FALLTHROUGH; 3365 default: 3366 /* convert one or two bytes */ 3367 myData->isEmptySegment = FALSE; 3368 if(pToU2022State->g != 0) { 3369 if(mySource < mySourceLimit) { 3370 UConverterSharedData *cnv; 3371 StateEnum tempState; 3372 int32_t tempBufLen; 3373 int leadIsOk, trailIsOk; 3374 uint8_t trailByte; 3375 getTrailByte: 3376 trailByte = (uint8_t)*mySource; 3377 /* 3378 * Ticket 5691: consistent illegal sequences: 3379 * - We include at least the first byte in the illegal sequence. 3380 * - If any of the non-initial bytes could be the start of a character, 3381 * we stop the illegal sequence before the first one of those. 3382 * 3383 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3384 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3385 * Otherwise we convert or report the pair of bytes. 
3386 */ 3387 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3388 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3389 if (leadIsOk && trailIsOk) { 3390 ++mySource; 3391 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3392 if(tempState >= CNS_11643_0) { 3393 cnv = myData->myConverterArray[CNS_11643]; 3394 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3395 tempBuf[1] = (char) (mySourceChar); 3396 tempBuf[2] = (char) trailByte; 3397 tempBufLen = 3; 3398 3399 }else{ 3400 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3401 cnv = myData->myConverterArray[tempState]; 3402 tempBuf[0] = (char) (mySourceChar); 3403 tempBuf[1] = (char) trailByte; 3404 tempBufLen = 2; 3405 } 3406 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3407 mySourceChar = (mySourceChar << 8) | trailByte; 3408 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3409 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3410 ++mySource; 3411 /* add another bit so that the code below writes 2 bytes in case of error */ 3412 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3413 } 3414 if(pToU2022State->g>=2) { 3415 /* return from a single-shift state to the previous one */ 3416 pToU2022State->g=pToU2022State->prevG; 3417 } 3418 } else { 3419 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3420 args->converter->toULength = 1; 3421 goto endloop; 3422 } 3423 } 3424 else{ 3425 if(mySourceChar <= 0x7f) { 3426 targetUniChar = (UChar) mySourceChar; 3427 } 3428 } 3429 break; 3430 } 3431 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3432 if(args->offsets){ 3433 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3434 } 3435 *(myTarget++)=(UChar)targetUniChar; 3436 } 3437 else if(targetUniChar > missingCharMarker){ 3438 /* disassemble the surrogate pair and write to output*/ 3439 targetUniChar-=0x0010000; 3440 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3441 if(args->offsets){ 3442 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3443 } 3444 ++myTarget; 3445 if(myTarget< args->targetLimit){ 3446 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3447 if(args->offsets){ 3448 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3449 } 3450 ++myTarget; 3451 }else{ 3452 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3453 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3454 } 3455 3456 } 3457 else{ 3458 /* Call the callback function*/ 3459 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3460 break; 3461 } 3462 } 3463 else{ 3464 *err =U_BUFFER_OVERFLOW_ERROR; 3465 break; 3466 } 3467 } 3468 endloop: 3469 args->target = myTarget; 3470 args->source = mySource; 3471 } 3472 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ 3473 3474 static void U_CALLCONV 3475 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3476 UConverter *cnv = args->converter; 3477 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3478 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3479 char *p, *subchar; 3480 char buffer[8]; 3481 int32_t length; 3482 3483 subchar=(char *)cnv->subChars; 3484 length=cnv->subCharLen; /* assume length==1 for most variants */ 3485 3486 p = buffer; 3487 switch(myConverterData->locale[0]){ 3488 case 'j': 3489 { 3490 int8_t cs; 3491 3492 if(pFromU2022State->g == 1) { 3493 /* JIS7: switch from G1 to G0 */ 3494 pFromU2022State->g = 0; 3495 *p++ = UCNV_SI; 3496 } 3497 3498 cs = pFromU2022State->cs[0]; 3499 if(cs 
!= ASCII && cs != JISX201) { 3500 /* not in ASCII or JIS X 0201: switch to ASCII */ 3501 pFromU2022State->cs[0] = (int8_t)ASCII; 3502 *p++ = '\x1b'; 3503 *p++ = '\x28'; 3504 *p++ = '\x42'; 3505 } 3506 3507 *p++ = subchar[0]; 3508 break; 3509 } 3510 case 'c': 3511 if(pFromU2022State->g != 0) { 3512 /* not in ASCII mode: switch to ASCII */ 3513 pFromU2022State->g = 0; 3514 *p++ = UCNV_SI; 3515 } 3516 *p++ = subchar[0]; 3517 break; 3518 case 'k': 3519 if(myConverterData->version == 0) { 3520 if(length == 1) { 3521 if((UBool)args->converter->fromUnicodeStatus) { 3522 /* in DBCS mode: switch to SBCS */ 3523 args->converter->fromUnicodeStatus = 0; 3524 *p++ = UCNV_SI; 3525 } 3526 *p++ = subchar[0]; 3527 } else /* length == 2*/ { 3528 if(!(UBool)args->converter->fromUnicodeStatus) { 3529 /* in SBCS mode: switch to DBCS */ 3530 args->converter->fromUnicodeStatus = 1; 3531 *p++ = UCNV_SO; 3532 } 3533 *p++ = subchar[0]; 3534 *p++ = subchar[1]; 3535 } 3536 break; 3537 } else { 3538 /* save the subconverter's substitution string */ 3539 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3540 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3541 3542 /* set our substitution string into the subconverter */ 3543 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3544 myConverterData->currentConverter->subCharLen = (int8_t)length; 3545 3546 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3547 args->converter = myConverterData->currentConverter; 3548 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3549 ucnv_cbFromUWriteSub(args, 0, err); 3550 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3551 args->converter = cnv; 3552 3553 /* restore the subconverter's substitution string */ 3554 myConverterData->currentConverter->subChars = currentSubChars; 3555 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3556 3557 if(*err == 
U_BUFFER_OVERFLOW_ERROR) { 3558 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3559 uprv_memcpy( 3560 cnv->charErrorBuffer, 3561 myConverterData->currentConverter->charErrorBuffer, 3562 myConverterData->currentConverter->charErrorBufferLength); 3563 } 3564 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3565 myConverterData->currentConverter->charErrorBufferLength = 0; 3566 } 3567 return; 3568 } 3569 default: 3570 /* not expected */ 3571 break; 3572 } 3573 ucnv_cbFromUWriteBytes(args, 3574 buffer, (int32_t)(p - buffer), 3575 offsetIndex, err); 3576 } 3577 3578 /* 3579 * Structure for cloning an ISO 2022 converter into a single memory block. 3580 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3581 * and then ucnv_safeClone() of the sub-converter may additionally align 3582 * currentConverter inside the cloneStruct, for which we need the deadSpace 3583 * after currentConverter. 3584 * This is because UAlignedMemory may be larger than the actually 3585 * necessary alignment size for the platform. 3586 * The other cloneStruct fields will not be moved around, 3587 * and are aligned properly with cloneStruct's alignment. 
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned main converter; must stay first, its address is what SafeClone returns */
    UConverter currentConverter;    /* storage for the cloned sub-converter, if there is one */
    UAlignedMemory deadSpace;       /* slack so the sub-converter clone can be re-aligned */
    UConverterDataISO2022 mydata;   /* private copy of the ISO 2022 state (cnv.extraInfo points here) */
};


U_CDECL_BEGIN

/*
 * Completes the clone of an ISO 2022 converter inside stackBuffer, which is
 * used as a struct cloneStruct.
 *
 * If *pBufferSize is 0 this is a preflighting call: the required size is
 * stored in *pBufferSize and NULL is returned.
 * Otherwise the extra data is copied, the current sub-converter (if any) is
 * cloned into the block, and the reference count of every non-NULL entry of
 * myConverterArray[] is incremented because those are shared with the original.
 *
 * Returns the cloned converter, or NULL if cloning the sub-converter failed
 * (*status then holds the error).
 */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    struct cloneStruct * localClone;
    UConverterDataISO2022 *cnvData;
    int32_t i, size;

    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
        return NULL;
    }

    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
    localClone = (struct cloneStruct *)stackBuffer;

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
    localClone->cnv.isExtraLocal = TRUE;

    /* share the subconverters */

    if(cnvData->currentConverter != NULL) {
        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
        localClone->mydata.currentConverter =
            ucnv_safeClone(cnvData->currentConverter,
                            &localClone->currentConverter,
                            &size, status);
        if(U_FAILURE(*status)) {
            return NULL;
        }
    }

    /* the clone now references the same shared tables as the original */
    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
        if(cnvData->myConverterArray[i] != NULL) {
            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
        }
    }

    return &localClone->cnv;
}

U_CDECL_END

/*
 * Adds to the set behind sa the code points this converter can convert
 * from Unicode.
 *
 * First the ranges that are handled in code rather than through a table are
 * added, depending on the first letter of the converter's locale. Then the
 * set of every loaded table in myConverterArray[] is added, filtered where
 * the ISO 2022 variant only uses part of the table. Finally SO, SI, ESC and
 * the C1 controls U+0080..U+009F are removed again.
 *
 * Does nothing if *pErrorCode already indicates a failure.
 */
static void U_CALLCONV
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
                    const USetAdder *sa,
                    UConverterUnicodeSet which,
                    UErrorCode *pErrorCode)
{
    int32_t i;
    UConverterDataISO2022* cnvData;

    if (U_FAILURE(*pErrorCode)) {
        return;
    }
#ifdef U_ENABLE_GENERIC_ISO_2022
    if (cnv->sharedData == &_ISO2022Data) {
        /* We use UTF-8 in this case */
        sa->addRange(sa->set, 0, 0xd7FF);
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
        return;
    }
#endif

    cnvData = (UConverterDataISO2022*)cnv->extraInfo;

    /* open a set and initialize it with code points that are algorithmically round-tripped */
    switch(cnvData->locale[0]){
    case 'j':
        /* include JIS X 0201 which is hardcoded */
        sa->add(sa->set, 0xa5);
        sa->add(sa->set, 0x203e);
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
            /* include Latin-1 for some variants of JP */
            sa->addRange(sa->set, 0, 0xff);
        } else {
            /* include ASCII for JP */
            sa->addRange(sa->set, 0, 0x7f);
        }
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
            /*
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
             * use half-width Katakana.
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
             * half-width Katakana via the ESC ( I sequence.
             * However, we only emit (fromUnicode) half-width Katakana according to the
             * definition of each variant.
             *
             * When including fallbacks,
             * we need to include half-width Katakana Unicode code points for all JP variants because
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
             */
            /* include half-width Katakana for JP */
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
        }
        break;
#if !UCONFIG_ONLY_HTML_CONVERSION
    case 'c':
    case 'z':
        /* include ASCII for CN */
        sa->addRange(sa->set, 0, 0x7f);
        break;
    case 'k':
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
                cnvData->currentConverter, sa, which, pErrorCode);
        /* the loop over myConverterArray[] will simply not find another converter */
        break;
#endif
    default:
        break;
    }

#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                cnvData->version==0 && i==CNS_11643
            ) {
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
                ucnv_MBCSGetUnicodeSetForBytes(
                        cnvData->myConverterArray[i],
                        sa, UCNV_ROUNDTRIP_SET,
                        0, 0x81, 0x82,
                        pErrorCode);
            }
#endif

    /* add the contents of every loaded sub-charset table */
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
        UConverterSetFilter filter;
        if(cnvData->myConverterArray[i]!=NULL) {
            if(cnvData->locale[0]=='j' && i==JISX208) {
                /*
                 * Only add code points that map to Shift-JIS codes
                 * corresponding to JIS X 0208.
                 */
                filter=UCNV_SET_FILTER_SJIS;
#if !UCONFIG_ONLY_HTML_CONVERSION
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                       cnvData->version==0 && i==CNS_11643) {
                /*
                 * Version-specific for CN:
                 * CN version 0 does not map CNS planes 3..7 although
                 * they are all available in the CNS conversion table;
                 * CN version 1 (-EXT) does map them all.
                 * The two versions create different Unicode sets.
                 */
                filter=UCNV_SET_FILTER_2022_CN;
            } else if(i==KSC5601) {
                /*
                 * Some of the KSC 5601 tables (convrtrs.txt has these aliases on multiple tables)
                 * are broader than GR94.
                 */
                filter=UCNV_SET_FILTER_GR94DBCS;
#endif
            } else {
                filter=UCNV_SET_FILTER_NONE;
            }
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
        }
    }

    /*
     * ISO 2022 converters must not convert SO/SI/ESC despite what
     * sub-converters do by themselves.
     * Remove these characters from the set.
     */
    sa->remove(sa->set, 0x0e);
    sa->remove(sa->set, 0x0f);
    sa->remove(sa->set, 0x1b);

    /* ISO 2022 converters do not convert C1 controls either */
    sa->removeRange(sa->set, 0x80, 0x9f);
}

/*
 * Implementation table for the generic ISO_2022 converter.
 * The four conversion entries are only filled in when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static description of the generic ISO_2022 converter. */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared data for the generic ISO_2022 converter; not file-local, unlike the JP/KR/CN variants. */
const UConverterSharedData _ISO2022Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);

/*************JP****************/
/*
 * Implementation table for ISO-2022-JP.
 * The same function is listed for the plain and the offsets-tracking
 * variant of each conversion direction.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static description of the ISO-2022-JP converter. */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-JP; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022JPData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);

}  // namespace

#if !UCONFIG_ONLY_HTML_CONVERSION
/************* KR ***************/
/*
 * Implementation table for ISO-2022-KR.
 * The same function is listed for the plain and the offsets-tracking
 * variant of each conversion direction.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static description of the ISO-2022-KR converter. */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_KR",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-KR; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022KRData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);

}  // namespace

/*************** CN ***************/
/*
 * Implementation table for ISO-2022-CN.
 * The same function is listed for the plain and the offsets-tracking
 * variant of each conversion direction.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/* Static description of the ISO-2022-CN converter. */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_CN",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

namespace {

/* Shared data for ISO-2022-CN; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022CNData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);

}  // namespace
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */

#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */