1 /* 2 ********************************************************************** 3 * Copyright (C) 2000-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv2022.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2000feb03 12 * created by: Markus W. Scherer 13 * 14 * Change history: 15 * 16 * 06/29/2000 helena Major rewrite of the callback APIs. 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 18 * Changed implementation of toUnicode 19 * function 20 * 08/21/2000 Ram Added support for ISO-2022-KR 21 * 08/29/2000 Ram Seperated implementation of EBCDIC to 22 * ucnvebdc.c 23 * 09/20/2000 Ram Added support for ISO-2022-CN 24 * Added implementations for getNextUChar() 25 * for specific 2022 country variants. 26 * 10/31/2000 Ram Implemented offsets logic functions 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 32 33 #include "unicode/ucnv.h" 34 #include "unicode/uset.h" 35 #include "unicode/ucnv_err.h" 36 #include "unicode/ucnv_cb.h" 37 #include "ucnv_imp.h" 38 #include "ucnv_bld.h" 39 #include "ucnv_cnv.h" 40 #include "ucnvmbcs.h" 41 #include "cstring.h" 42 #include "cmemory.h" 43 44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 45 46 #ifdef U_ENABLE_GENERIC_ISO_2022 47 /* 48 * I am disabling the generic ISO-2022 converter after proposing to do so on 49 * the icu mailing list two days ago. 50 * 51 * Reasons: 52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 53 * its designation sequences, single shifts with return to the previous state, 54 * switch-with-no-return to UTF-16BE or similar, etc. 55 * This is unlike the language-specific variants like ISO-2022-JP which 56 * require a much smaller repertoire of ISO-2022 features. 57 * These variants continue to be supported. 58 * 2. 
I believe that no one is really using the generic ISO-2022 converter
 *    but rather always one of the language-specific variants.
 *    Note that ICU's generic ISO-2022 converter has always output one escape
 *    sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 *    the previous converter is closed and a new one opened,
 *    without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 *    reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 *    This means, for example, that when ISO-8859-7 is designated, the following
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 *    The ICU ISO-2022 converter does not handle this - and has no information
 *    about which subconverter would have to be shifted vs. which is designed
 *    for 7-bit ISO-2022.
 *
 *  Markus Scherer 2003-dec-03
 */
#endif

/* One-byte strings holding the shift controls: SI is 0x0F, SO is 0x0E. */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Code values of the control and whitespace characters named below. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/*
 * Inclusive code point range U+FF61..U+FF9F.
 * NOTE(review): presumably the Unicode halfwidth Katakana range handled via
 * HWKANA_7BIT; the code that uses these limits is outside this excerpt.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1,
    GR94_END=0xfe,
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The macro evaluates its argument twice, so pass an expression without
 * side effects. The (c)<0x20 test runs first and keeps the shift count
 * below 32.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * for ISO-2022-JP and -CN implementations
 *
 * Charset/state numbers. The JP and the CN constants deliberately reuse the
 * same small values (1, 2, 3, ...): the JP constants are used with
 * nextStateToUnicodeJP[] and jpCharsetMasks[], the CN constants with
 * nextStateToUnicodeCN[]. The values are stored in int8_t tables and in
 * ISO2022State.cs[], and several of them index myConverterArray[], so they
 * must not be renumbered.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        /* single-shift markers; SS3_STATE is implicitly 0x11 */
        SS2_STATE=0x10,
        SS3_STATE,

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? */
/* True for JISX208..KSC5601, i.e. the JP values 4..7; evaluates (cs) twice. */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* Charset mask: the single bit for charset number cs, as used in jpCharsetMasks[]. */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
*/
/* Highest supported value of the "version" option for locale=ja. */
enum { MAX_JA_VERSION=4 };
/* Indexed by version (0..MAX_JA_VERSION); the entries for versions 2, 3 and 4 are identical. */
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};

/* Type tag stored in UConverterDataISO2022.currentType; _ISO2022Open() initializes it to ASCII1. */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;

/*
 * Designation and shift state for one conversion direction.
 * _ISO2022Reset() clears it to all zeros, i.e. ASCII in every cs[] slot and G0 selected.
 */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The low four bits of the converter options carry the variant version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Capacity of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter state; allocated by _ISO2022Open() and stored in UConverter.extraInfo. */
typedef struct{
    /* shared data of the sub-charsets, indexed by StateEnum values; unloaded in _ISO2022Close() */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() for locale=ko (and by the generic variant); closed in _ISO2022Close() */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    ISO2022State toU2022State, fromU2022State;
    /* key of a partially read escape sequence (see getKey_2022()); 0 when none is pending */
    uint32_t key;
    /* options & UCNV_OPTIONS_VERSION_MASK, normalized per locale in _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* cleared by _ISO2022Reset(); NOTE(review): its users are outside this excerpt */
    UBool isEmptySegment;
    /* "ISO_2022,locale=xx,version=n"; returned by _ISO2022getName() */
    char name[30];
    /* "ja", "ko" or "cn" (zh is stored as "cn"); stays empty for the generic variant */
    char locale[3];
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

/* Classification of an escape-sequence prefix, stored per entry in escSeqStateTable_Value_2022[]. */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
* The way these state transition arrays work is:
* ex : ESC$B is the sequence for JISX208
*      a) First Iteration: char is ESC
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
*             int x = normalize_esq_chars_2022[27] which is equal to 1
*         ii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[0]
*        iii) Save this index as offset
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     b) Switch on this state and continue to next char
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
*             which is normalize_esq_chars_2022[36] == 4
*         ii) x is currently 1(from above)
*               x<<=5 -- x is now 32
*               x+=normalize_esq_chars_2022[36]
*               now x is 36
*        iii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
*     c) Switch on this state and continue to next char
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
*        ii) x is currently 36 (from above)
*            x<<=5 -- x is now 1152
*            x+=normalize_esq_chars_2022[66]
*            now x is 1161
*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
*/


/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small nonzero code (1..29) that getKey_2022()
 * appends to the running key as a 5-bit digit; 0 marks a byte that is not
 * valid anywhere in an escape sequence.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
* For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape-sequence prefixes. A key is built by
 * getKey_2022() as key = (key << 5) + normalize_esq_chars_2022[byte].
 * The table is binary-searched, so it must stay sorted in ascending order,
 * and the index found is used for the parallel tables below.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Generic variant only: name of the converter to open for the escape sequence
 * at the same index in escSeqStateTable_Key_2022[]. changeState_2022()
 * reports a NULL entry as an unsupported escape sequence.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif

/*
 * Parallel to escSeqStateTable_Key_2022[]: whether the prefix with that key
 * is an incomplete, a complete, or a possibly complete escape sequence.
 * getKey_2022() returns these values cast to UCNV_TableStates_2022.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};


/* Type def for refactoring changeState_2022 code*/
/* Selects which variant's next-state table changeState_2022() consults. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;

/*********** ISO 2022 Converter Protos ***********/
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

static void
 _ISO2022Close(UConverter *converter);

static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Forward declarations of the per-variant shared data; _ISO2022Open() stores their addresses in cnv->sharedData. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;

/*************** Converter implementations ******************/

/* The purpose of this function is to get around gcc compiler warnings.
*/ 404 static U_INLINE void 405 fromUWriteUInt8(UConverter *cnv, 406 const char *bytes, int32_t length, 407 uint8_t **target, const char *targetLimit, 408 int32_t **offsets, 409 int32_t sourceIndex, 410 UErrorCode *pErrorCode) 411 { 412 char *targetChars = (char *)*target; 413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 414 offsets, sourceIndex, pErrorCode); 415 *target = (uint8_t*)targetChars; 416 417 } 418 419 static U_INLINE void 420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){ 421 if(myConverterData->version == 1) { 422 UConverter *cnv = myConverterData->currentConverter; 423 424 cnv->toUnicodeStatus=0; /* offset */ 425 cnv->mode=0; /* state */ 426 cnv->toULength=0; /* byteIndex */ 427 } 428 } 429 430 static U_INLINE void 431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 432 /* in ISO-2022-KR the designator sequence appears only once 433 * in a file so we append it only once 434 */ 435 if( converter->charErrorBufferLength==0){ 436 437 converter->charErrorBufferLength = 4; 438 converter->charErrorBuffer[0] = 0x1b; 439 converter->charErrorBuffer[1] = 0x24; 440 converter->charErrorBuffer[2] = 0x29; 441 converter->charErrorBuffer[3] = 0x43; 442 } 443 if(myConverterData->version == 1) { 444 UConverter *cnv = myConverterData->currentConverter; 445 446 cnv->fromUChar32=0; 447 cnv->fromUnicodeStatus=1; /* prevLength */ 448 } 449 } 450 451 static void 452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 453 454 char myLocale[6]={' ',' ',' ',' ',' ',' '}; 455 456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 457 if(cnv->extraInfo != NULL) { 458 UConverterNamePieces stackPieces; 459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; 460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 461 uint32_t version; 462 463 stackArgs.onlyTestIsLoadable = 
pArgs->onlyTestIsLoadable; 464 465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 466 myConverterData->currentType = ASCII1; 467 cnv->fromUnicodeStatus =FALSE; 468 if(pArgs->locale){ 469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); 470 } 471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 472 myConverterData->version = version; 473 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 474 (myLocale[2]=='_' || myLocale[2]=='\0')) 475 { 476 size_t len=0; 477 /* open the required converters and cache them */ 478 if(version>MAX_JA_VERSION) { 479 /* prevent indexing beyond jpCharsetMasks[] */ 480 myConverterData->version = version = 0; 481 } 482 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 483 myConverterData->myConverterArray[ISO8859_7] = 484 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 485 } 486 myConverterData->myConverterArray[JISX208] = 487 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 488 if(jpCharsetMasks[version]&CSM(JISX212)) { 489 myConverterData->myConverterArray[JISX212] = 490 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 491 } 492 if(jpCharsetMasks[version]&CSM(GB2312)) { 493 myConverterData->myConverterArray[GB2312] = 494 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 495 } 496 if(jpCharsetMasks[version]&CSM(KSC5601)) { 497 myConverterData->myConverterArray[KSC5601] = 498 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 499 } 500 501 /* set the function pointers to appropriate funtions */ 502 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); 503 uprv_strcpy(myConverterData->locale,"ja"); 504 505 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 506 len = uprv_strlen(myConverterData->name); 507 myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); 508 myConverterData->name[len+1]='\0'; 509 } 510 else if(myLocale[0]=='k' && 
(myLocale[1]=='o'|| myLocale[1]=='r') && 511 (myLocale[2]=='_' || myLocale[2]=='\0')) 512 { 513 const char *cnvName; 514 if(version==1) { 515 cnvName="icu-internal-25546"; 516 } else { 517 cnvName="ksc_5601"; 518 myConverterData->version=version=0; 519 } 520 if(pArgs->onlyTestIsLoadable) { 521 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 522 uprv_free(cnv->extraInfo); 523 cnv->extraInfo=NULL; 524 return; 525 } else { 526 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 527 if (U_FAILURE(*errorCode)) { 528 _ISO2022Close(cnv); 529 return; 530 } 531 532 if(version==1) { 533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 534 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 535 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 536 }else{ 537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 538 } 539 540 /* initialize the state variables */ 541 setInitialStateToUnicodeKR(cnv, myConverterData); 542 setInitialStateFromUnicodeKR(cnv, myConverterData); 543 544 /* set the function pointers to appropriate funtions */ 545 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; 546 uprv_strcpy(myConverterData->locale,"ko"); 547 } 548 } 549 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 550 (myLocale[2]=='_' || myLocale[2]=='\0')) 551 { 552 553 /* open the required converters and cache them */ 554 myConverterData->myConverterArray[GB2312_1] = 555 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); 556 if(version==1) { 557 myConverterData->myConverterArray[ISO_IR_165] = 558 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode); 559 } 560 myConverterData->myConverterArray[CNS_11643] = 561 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode); 562 563 564 /* set the function pointers to appropriate funtions */ 565 
cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; 566 uprv_strcpy(myConverterData->locale,"cn"); 567 568 if (version==1){ 569 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 570 }else{ 571 myConverterData->version = 0; 572 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 573 } 574 } 575 else{ 576 #ifdef U_ENABLE_GENERIC_ISO_2022 577 myConverterData->isFirstBuffer = TRUE; 578 579 /* append the UTF-8 escape sequence */ 580 cnv->charErrorBufferLength = 3; 581 cnv->charErrorBuffer[0] = 0x1b; 582 cnv->charErrorBuffer[1] = 0x25; 583 cnv->charErrorBuffer[2] = 0x42; 584 585 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 586 /* initialize the state variables */ 587 uprv_strcpy(myConverterData->name,"ISO_2022"); 588 #else 589 *errorCode = U_UNSUPPORTED_ERROR; 590 return; 591 #endif 592 } 593 594 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 595 596 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 597 _ISO2022Close(cnv); 598 } 599 } else { 600 *errorCode = U_MEMORY_ALLOCATION_ERROR; 601 } 602 } 603 604 605 static void 606 _ISO2022Close(UConverter *converter) { 607 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); 608 UConverterSharedData **array = myData->myConverterArray; 609 int32_t i; 610 611 if (converter->extraInfo != NULL) { 612 /*close the array of converter pointers and free the memory*/ 613 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 614 if(array[i]!=NULL) { 615 ucnv_unloadSharedDataIfReady(array[i]); 616 } 617 } 618 619 ucnv_close(myData->currentConverter); 620 621 if(!converter->isExtraLocal){ 622 uprv_free (converter->extraInfo); 623 converter->extraInfo = NULL; 624 } 625 } 626 } 627 628 static void 629 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 630 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); 631 if(choice<=UCNV_RESET_TO_UNICODE) { 632 
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 633 myConverterData->key = 0; 634 myConverterData->isEmptySegment = FALSE; 635 } 636 if(choice!=UCNV_RESET_TO_UNICODE) { 637 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 638 } 639 #ifdef U_ENABLE_GENERIC_ISO_2022 640 if(myConverterData->locale[0] == 0){ 641 if(choice<=UCNV_RESET_TO_UNICODE) { 642 myConverterData->isFirstBuffer = TRUE; 643 myConverterData->key = 0; 644 if (converter->mode == UCNV_SO){ 645 ucnv_close (myConverterData->currentConverter); 646 myConverterData->currentConverter=NULL; 647 } 648 converter->mode = UCNV_SI; 649 } 650 if(choice!=UCNV_RESET_TO_UNICODE) { 651 /* re-append UTF-8 escape sequence */ 652 converter->charErrorBufferLength = 3; 653 converter->charErrorBuffer[0] = 0x1b; 654 converter->charErrorBuffer[1] = 0x28; 655 converter->charErrorBuffer[2] = 0x42; 656 } 657 } 658 else 659 #endif 660 { 661 /* reset the state variables */ 662 if(myConverterData->locale[0] == 'k'){ 663 if(choice<=UCNV_RESET_TO_UNICODE) { 664 setInitialStateToUnicodeKR(converter, myConverterData); 665 } 666 if(choice!=UCNV_RESET_TO_UNICODE) { 667 setInitialStateFromUnicodeKR(converter, myConverterData); 668 } 669 } 670 } 671 } 672 673 static const char* 674 _ISO2022getName(const UConverter* cnv){ 675 if(cnv->extraInfo){ 676 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 677 return myData->name; 678 } 679 return NULL; 680 } 681 682 683 /*************** to unicode *******************/ 684 /**************************************************************************** 685 * Recognized escape sequences are 686 * <ESC>(B ASCII 687 * <ESC>.A ISO-8859-1 688 * <ESC>.F ISO-8859-7 689 * <ESC>(J JISX-201 690 * <ESC>(I JISX-201 691 * <ESC>$B JISX-208 692 * <ESC>$@ JISX-208 693 * <ESC>$(D JISX-212 694 * <ESC>$A GB2312 695 * <ESC>$(C KSC5601 696 */ 697 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 698 /* 0 1 2 3 4 5 6 7 8 9 */ 699 INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 700 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 701 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 702 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 703 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 704 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 705 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 706 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 707 }; 708 709 /*************** to unicode *******************/ 710 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { 711 /* 0 1 2 3 4 5 6 7 8 9 */ 712 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 713 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 714 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 715 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 716 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 717 ,CNS_11643_1 ,CNS_11643_2 
,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 718 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 719 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 720 }; 721 722 723 static UCNV_TableStates_2022 724 getKey_2022(char c,int32_t* key,int32_t* offset){ 725 int32_t togo; 726 int32_t low = 0; 727 int32_t hi = MAX_STATES_2022; 728 int32_t oldmid=0; 729 730 togo = normalize_esq_chars_2022[(uint8_t)c]; 731 if(togo == 0) { 732 /* not a valid character anywhere in an escape sequence */ 733 *key = 0; 734 *offset = 0; 735 return INVALID_2022; 736 } 737 togo = (*key << 5) + togo; 738 739 while (hi != low) /*binary search*/{ 740 741 register int32_t mid = (hi+low) >> 1; /*Finds median*/ 742 743 if (mid == oldmid) 744 break; 745 746 if (escSeqStateTable_Key_2022[mid] > togo){ 747 hi = mid; 748 } 749 else if (escSeqStateTable_Key_2022[mid] < togo){ 750 low = mid; 751 } 752 else /*we found it*/{ 753 *key = togo; 754 *offset = mid; 755 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; 756 } 757 oldmid = mid; 758 759 } 760 761 *key = 0; 762 *offset = 0; 763 return INVALID_2022; 764 } 765 766 /*runs through a state machine to determine the escape sequence - codepage correspondance 767 */ 768 static void 769 changeState_2022(UConverter* _this, 770 const char** source, 771 const char* sourceLimit, 772 Variant2022 var, 773 UErrorCode* err){ 774 UCNV_TableStates_2022 value; 775 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); 776 uint32_t key = myData2022->key; 777 int32_t offset = 0; 778 int8_t initialToULength = _this->toULength; 779 char c; 780 781 value = VALID_NON_TERMINAL_2022; 782 while (*source < sourceLimit) { 783 c = *(*source)++; 784 _this->toUBytes[_this->toULength++]=(uint8_t)c; 785 value = getKey_2022(c,(int32_t *) &key, &offset); 786 787 switch (value){ 
788 789 case VALID_NON_TERMINAL_2022 : 790 /* continue with the loop */ 791 break; 792 793 case VALID_TERMINAL_2022: 794 key = 0; 795 goto DONE; 796 797 case INVALID_2022: 798 goto DONE; 799 800 case VALID_MAYBE_TERMINAL_2022: 801 #ifdef U_ENABLE_GENERIC_ISO_2022 802 /* ESC ( B is ambiguous only for ISO_2022 itself */ 803 if(var == ISO_2022) { 804 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ 805 _this->toULength = 0; 806 807 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ 808 809 /* continue with the loop */ 810 value = VALID_NON_TERMINAL_2022; 811 break; 812 } else 813 #endif 814 { 815 /* not ISO_2022 itself, finish here */ 816 value = VALID_TERMINAL_2022; 817 key = 0; 818 goto DONE; 819 } 820 } 821 } 822 823 DONE: 824 myData2022->key = key; 825 826 if (value == VALID_NON_TERMINAL_2022) { 827 /* indicate that the escape sequence is incomplete: key!=0 */ 828 return; 829 } else if (value == INVALID_2022 ) { 830 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 831 } else /* value == VALID_TERMINAL_2022 */ { 832 switch(var){ 833 #ifdef U_ENABLE_GENERIC_ISO_2022 834 case ISO_2022: 835 { 836 const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; 837 if(chosenConverterName == NULL) { 838 /* SS2 or SS3 */ 839 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 840 _this->toUCallbackReason = UCNV_UNASSIGNED; 841 return; 842 } 843 844 _this->mode = UCNV_SI; 845 ucnv_close(myData2022->currentConverter); 846 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); 847 if(U_SUCCESS(*err)) { 848 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; 849 _this->mode = UCNV_SO; 850 } 851 break; 852 } 853 #endif 854 case ISO_2022_JP: 855 { 856 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; 857 switch(tempState) { 858 case INVALID_STATE: 859 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 860 break; 861 case SS2_STATE: 862 
if(myData2022->toU2022State.cs[2]!=0) { 863 if(myData2022->toU2022State.g<2) { 864 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 865 } 866 myData2022->toU2022State.g=2; 867 } else { 868 /* illegal to have SS2 before a matching designator */ 869 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 870 } 871 break; 872 /* case SS3_STATE: not used in ISO-2022-JP-x */ 873 case ISO8859_1: 874 case ISO8859_7: 875 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 876 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 877 } else { 878 /* G2 charset for SS2 */ 879 myData2022->toU2022State.cs[2]=(int8_t)tempState; 880 } 881 break; 882 default: 883 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { 884 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 885 } else { 886 /* G0 charset */ 887 myData2022->toU2022State.cs[0]=(int8_t)tempState; 888 } 889 break; 890 } 891 } 892 break; 893 case ISO_2022_CN: 894 { 895 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; 896 switch(tempState) { 897 case INVALID_STATE: 898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 899 break; 900 case SS2_STATE: 901 if(myData2022->toU2022State.cs[2]!=0) { 902 if(myData2022->toU2022State.g<2) { 903 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 904 } 905 myData2022->toU2022State.g=2; 906 } else { 907 /* illegal to have SS2 before a matching designator */ 908 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 909 } 910 break; 911 case SS3_STATE: 912 if(myData2022->toU2022State.cs[3]!=0) { 913 if(myData2022->toU2022State.g<2) { 914 myData2022->toU2022State.prevG=myData2022->toU2022State.g; 915 } 916 myData2022->toU2022State.g=3; 917 } else { 918 /* illegal to have SS3 before a matching designator */ 919 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 920 } 921 break; 922 case ISO_IR_165: 923 if(myData2022->version==0) { 924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 925 break; 926 } 927 /*fall through*/ 928 case GB2312_1: 929 /*fall through*/ 930 case CNS_11643_1: 931 myData2022->toU2022State.cs[1]=(int8_t)tempState; 932 break; 
933 case CNS_11643_2: 934 myData2022->toU2022State.cs[2]=(int8_t)tempState; 935 break; 936 default: 937 /* other CNS 11643 planes */ 938 if(myData2022->version==0) { 939 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 940 } else { 941 myData2022->toU2022State.cs[3]=(int8_t)tempState; 942 } 943 break; 944 } 945 } 946 break; 947 case ISO_2022_KR: 948 if(offset==0x30){ 949 /* nothing to be done, just accept this one escape sequence */ 950 } else { 951 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; 952 } 953 break; 954 955 default: 956 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 957 break; 958 } 959 } 960 if(U_SUCCESS(*err)) { 961 _this->toULength = 0; 962 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 963 if(_this->toULength>1) { 964 /* 965 * Ticket 5691: consistent illegal sequences: 966 * - We include at least the first byte (ESC) in the illegal sequence. 967 * - If any of the non-initial bytes could be the start of a character, 968 * we stop the illegal sequence before the first one of those. 969 * In escape sequences, all following bytes are "printable", that is, 970 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), 971 * they are valid single/lead bytes. 972 * For simplicity, we always only report the initial ESC byte as the 973 * illegal sequence and back out all other bytes we looked at. 974 */ 975 /* Back out some bytes. */ 976 int8_t backOutDistance=_this->toULength-1; 977 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; 978 if(backOutDistance<=bytesFromThisBuffer) { 979 /* same as initialToULength<=1 */ 980 *source-=backOutDistance; 981 } else { 982 /* Back out bytes from the previous buffer: Need to replay them. */ 983 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 984 /* same as -(initialToULength-1) */ 985 /* preToULength is negative! 
*/ 986 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); 987 *source-=bytesFromThisBuffer; 988 } 989 _this->toULength=1; 990 } 991 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { 992 _this->toUCallbackReason = UCNV_UNASSIGNED; 993 } 994 } 995 996 /*Checks the characters of the buffer against valid 2022 escape sequences 997 *if the match we return a pointer to the initial start of the sequence otherwise 998 *we return sourceLimit 999 */ 1000 /*for 2022 looks ahead in the stream 1001 *to determine the longest possible convertible 1002 *data stream 1003 */ 1004 static U_INLINE const char* 1005 getEndOfBuffer_2022(const char** source, 1006 const char* sourceLimit, 1007 UBool flush){ 1008 1009 const char* mySource = *source; 1010 1011 #ifdef U_ENABLE_GENERIC_ISO_2022 1012 if (*source >= sourceLimit) 1013 return sourceLimit; 1014 1015 do{ 1016 1017 if (*mySource == ESC_2022){ 1018 int8_t i; 1019 int32_t key = 0; 1020 int32_t offset; 1021 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; 1022 1023 /* Kludge: I could not 1024 * figure out the reason for validating an escape sequence 1025 * twice - once here and once in changeState_2022(). 1026 * is it possible to have an ESC character in a ISO2022 1027 * byte stream which is valid in a code page? Is it legal? 
1028 */ 1029 for (i=0; 1030 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); 1031 i++) { 1032 value = getKey_2022(*(mySource+i), &key, &offset); 1033 } 1034 if (value > 0 || *mySource==ESC_2022) 1035 return mySource; 1036 1037 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) 1038 return sourceLimit; 1039 } 1040 }while (++mySource < sourceLimit); 1041 1042 return sourceLimit; 1043 #else 1044 while(mySource < sourceLimit && *mySource != ESC_2022) { 1045 ++mySource; 1046 } 1047 return mySource; 1048 #endif 1049 } 1050 1051 1052 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c 1053 * any future change in _MBCSFromUChar32() function should be reflected here. 1054 * @return number of bytes in *value; negative number if fallback; 0 if no mapping 1055 */ 1056 static U_INLINE int32_t 1057 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, 1058 UChar32 c, 1059 uint32_t* value, 1060 UBool useFallback, 1061 int outputType) 1062 { 1063 const int32_t *cx; 1064 const uint16_t *table; 1065 uint32_t stage2Entry; 1066 uint32_t myValue; 1067 int32_t length; 1068 const uint8_t *p; 1069 /* 1070 * TODO(markus): Use and require new, faster MBCS conversion table structures. 1071 * Use internal version of ucnv_open() that verifies that the new structures are available, 1072 * else U_INTERNAL_PROGRAM_ERROR. 
1073 */ 1074 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1075 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1076 table=sharedData->mbcs.fromUnicodeTable; 1077 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 1078 /* get the bytes and the length for the output */ 1079 if(outputType==MBCS_OUTPUT_2){ 1080 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1081 if(myValue<=0xff) { 1082 length=1; 1083 } else { 1084 length=2; 1085 } 1086 } else /* outputType==MBCS_OUTPUT_3 */ { 1087 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 1088 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 1089 if(myValue<=0xff) { 1090 length=1; 1091 } else if(myValue<=0xffff) { 1092 length=2; 1093 } else { 1094 length=3; 1095 } 1096 } 1097 /* is this code point assigned, or do we use fallbacks? */ 1098 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { 1099 /* assigned */ 1100 *value=myValue; 1101 return length; 1102 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { 1103 /* 1104 * We allow a 0 byte output if the "assigned" bit is set for this entry. 1105 * There is no way with this data structure for fallback output 1106 * to be a zero byte. 1107 */ 1108 *value=myValue; 1109 return -length; 1110 } 1111 } 1112 1113 cx=sharedData->mbcs.extIndexes; 1114 if(cx!=NULL) { 1115 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); 1116 } 1117 1118 /* unassigned */ 1119 return 0; 1120 } 1121 1122 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c 1123 * any future change in _MBCSSingleFromUChar32() function should be reflected here. 
1124 * @param retval pointer to output byte 1125 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1126 */ 1127 static U_INLINE int32_t 1128 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1129 UChar32 c, 1130 uint32_t* retval, 1131 UBool useFallback) 1132 { 1133 const uint16_t *table; 1134 int32_t value; 1135 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1136 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1137 return 0; 1138 } 1139 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1140 table=sharedData->mbcs.fromUnicodeTable; 1141 /* get the byte for the output */ 1142 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1143 /* is this code point assigned, or do we use fallbacks? */ 1144 *retval=(uint32_t)(value&0xff); 1145 if(value>=0xf00) { 1146 return 1; /* roundtrip */ 1147 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1148 return -1; /* fallback taken */ 1149 } else { 1150 return 0; /* no mapping */ 1151 } 1152 } 1153 1154 /* 1155 * Check that the result is a 2-byte value with each byte in the range A1..FE 1156 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1157 * to move it to the ISO 2022 range 21..7E. 1158 * Return 0 if out of range. 1159 */ 1160 static U_INLINE uint32_t 1161 _2022FromGR94DBCS(uint32_t value) { 1162 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1163 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1164 ) { 1165 return value - 0x8080; /* shift down to 21..7e byte range */ 1166 } else { 1167 return 0; /* not valid for ISO 2022 */ 1168 } 1169 } 1170 1171 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1172 /* 1173 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the 1174 * 2 byte value that is in the range A1..FE for each byte. 
Otherwise it returns the 2022 code point
 * unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode implementation of the generic ISO-2022 converter.
 * Only compiled when U_ENABLE_GENERIC_ISO_2022 is defined.
 *
 * The loop alternates between two steps until the source is used up or a
 * return condition is hit:
 *  1. When no escape sequence is pending (myData->key == 0), the bytes up to
 *     the limit returned by getEndOfBuffer_2022() are converted with
 *     ucnv_toUnicode() on myData->currentConverter. An "ASCII" converter is
 *     opened on demand if there is no current converter yet.
 *  2. changeState_2022() is called with variant ISO_2022 on whatever follows.
 *
 * @param args converter arguments; args->source and args->target are advanced,
 *             args->converter is temporarily replaced during step 1 and restored
 * @param err  in/out error code; the function returns as soon as it is a failure
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    /* remember the ISO-2022 converter itself; args->converter is swapped below */
    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter selected yet: open an ASCII converter on demand */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                /* flush is only forwarded when this segment reaches the real end of the input */
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a pending escape sequence (key != 0) */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Records the offending input in cnv->toUBytes/toULength and sets the error code.
 * A sourceChar above 0xff is stored as two bytes (high byte first), anything
 * else as a single byte.
 *
 * @param cnv           converter whose toUBytes[] and toULength are overwritten
 * @param sourceChar    the input byte or byte pair that could not be converted
 * @param targetUniChar the conversion result; the value missingCharMarker-1 (0xfffe)
 *                      selects U_INVALID_CHAR_FOUND, any other value
 *                      selects U_ILLEGAL_CHAR_FOUND
 * @param err           output error code, always set by this function
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
        cnv->toUBytes[1] = (uint8_t)sourceChar;
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] =(char) sourceChar;
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}

/**************************************ISO-2022-JP*************************************************/

/************************************** IMPORTANT **************************************************
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1333 * The converter iterates over each Unicode codepoint 1334 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is 1335 * processed one char at a time it would make sense to reduce the extra processing a canned converter 1336 * would do as far as possible. 1337 * 1338 * If the implementation of these macros or structure of sharedData struct change in the future, make 1339 * sure that ISO-2022 is also changed. 1340 *************************************************************************************************** 1341 */ 1342 1343 /*************************************************************************************************** 1344 * Rules for ISO-2022-jp encoding 1345 * (i) Escape sequences must be fully contained within a line they should not 1346 * span new lines or CRs 1347 * (ii) If the last character on a line is represented by two bytes then an ASCII or 1348 * JIS-Roman character escape sequence should follow before the line terminates 1349 * (iii) If the first character on the line is represented by two bytes then a two 1350 * byte character escape sequence should precede it 1351 * (iv) If no escape sequence is encountered then the characters are ASCII 1352 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, 1353 * and invoked with SS2 (ESC N). 1354 * (vi) If there is any G0 designation in text, there must be a switch to 1355 * ASCII or to JIS X 0201-Roman before a space character (but not 1356 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control 1357 * characters such as tab or CRLF. 
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*          source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/

/*
 * preference order of JP charsets
 *
 * The ISO-2022-JP fromUnicode code walks this list in order when it appends
 * the charsets that are allowed but not yet tried to its choices[] list.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 *
 * The table is indexed with the charset value (escSeqChars[cs]); each row holds
 * the designation sequence for that charset. The longest sequences are 4 bytes.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Number of bytes of each escape sequence in escSeqChars[], in the same order
 * and indexed the same way (escSeqCharsLen[cs]). Keep both tables in sync.
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this
state until an invalid character is found 1417 * No -> a) go to the next code page and find the character 1418 * iii) Before changing the state increment the current state check if the current state 1419 * is equal to the intitIteration state 1420 * Yes -> A character that cannot be represented in any of the supported encodings 1421 * break and return a U_INVALID_CHARACTER error 1422 * No -> Continue and find the character in next code page 1423 * 1424 * 1425 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages 1426 */ 1427 1428 /* Map 00..7F to Unicode according to JIS X 0201. */ 1429 static U_INLINE uint32_t 1430 jisx201ToU(uint32_t value) { 1431 if(value < 0x5c) { 1432 return value; 1433 } else if(value == 0x5c) { 1434 return 0xa5; 1435 } else if(value == 0x7e) { 1436 return 0x203e; 1437 } else /* value <= 0x7f */ { 1438 return value; 1439 } 1440 } 1441 1442 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ 1443 static U_INLINE uint32_t 1444 jisx201FromU(uint32_t value) { 1445 if(value<=0x7f) { 1446 if(value!=0x5c && value!=0x7e) { 1447 return value; 1448 } 1449 } else if(value==0xa5) { 1450 return 0x5c; 1451 } else if(value==0x203e) { 1452 return 0x7e; 1453 } 1454 return 0xfffe; 1455 } 1456 1457 /* 1458 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding 1459 * to JIS X 0208, and convert it to a pair of 21..7E bytes. 1460 * Return 0 if the byte pair is out of range. 
1461 */ 1462 static U_INLINE uint32_t 1463 _2022FromSJIS(uint32_t value) { 1464 uint8_t trail; 1465 1466 if(value > 0xEFFC) { 1467 return 0; /* beyond JIS X 0208 */ 1468 } 1469 1470 trail = (uint8_t)value; 1471 1472 value &= 0xff00; /* lead byte */ 1473 if(value <= 0x9f00) { 1474 value -= 0x7000; 1475 } else /* 0xe000 <= value <= 0xef00 */ { 1476 value -= 0xb000; 1477 } 1478 value <<= 1; 1479 1480 if(trail <= 0x9e) { 1481 value -= 0x100; 1482 if(trail <= 0x7e) { 1483 value |= trail - 0x1f; 1484 } else { 1485 value |= trail - 0x20; 1486 } 1487 } else /* trail <= 0xfc */ { 1488 value |= trail - 0x7e; 1489 } 1490 return value; 1491 } 1492 1493 /* 1494 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1495 * If either byte is outside 21..7E make sure that the result is not valid 1496 * for Shift-JIS so that the converter catches it. 1497 * Some invalid byte values already turn into equally invalid Shift-JIS 1498 * byte values and need not be tested explicitly. 1499 */ 1500 static U_INLINE void 1501 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1502 if(c1&1) { 1503 ++c1; 1504 if(c2 <= 0x5f) { 1505 c2 += 0x1f; 1506 } else if(c2 <= 0x7e) { 1507 c2 += 0x20; 1508 } else { 1509 c2 = 0; /* invalid */ 1510 } 1511 } else { 1512 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { 1513 c2 += 0x7e; 1514 } else { 1515 c2 = 0; /* invalid */ 1516 } 1517 } 1518 c1 >>= 1; 1519 if(c1 <= 0x2f) { 1520 c1 += 0x70; 1521 } else if(c1 <= 0x3f) { 1522 c1 += 0xb0; 1523 } else { 1524 c1 = 0; /* invalid */ 1525 } 1526 bytes[0] = (char)c1; 1527 bytes[1] = (char)c2; 1528 } 1529 1530 /* 1531 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1532 * Katakana. 1533 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1534 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1535 * These were the only fallbacks in ICU's jisx-208.ucm file. 
1536 */ 1537 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1538 0x2123, /* U+FF61 */ 1539 0x2156, 1540 0x2157, 1541 0x2122, 1542 0x2126, 1543 0x2572, 1544 0x2521, 1545 0x2523, 1546 0x2525, 1547 0x2527, 1548 0x2529, 1549 0x2563, 1550 0x2565, 1551 0x2567, 1552 0x2543, 1553 0x213C, /* U+FF70 */ 1554 0x2522, 1555 0x2524, 1556 0x2526, 1557 0x2528, 1558 0x252A, 1559 0x252B, 1560 0x252D, 1561 0x252F, 1562 0x2531, 1563 0x2533, 1564 0x2535, 1565 0x2537, 1566 0x2539, 1567 0x253B, 1568 0x253D, 1569 0x253F, /* U+FF80 */ 1570 0x2541, 1571 0x2544, 1572 0x2546, 1573 0x2548, 1574 0x254A, 1575 0x254B, 1576 0x254C, 1577 0x254D, 1578 0x254E, 1579 0x254F, 1580 0x2552, 1581 0x2555, 1582 0x2558, 1583 0x255B, 1584 0x255E, 1585 0x255F, /* U+FF90 */ 1586 0x2560, 1587 0x2561, 1588 0x2562, 1589 0x2564, 1590 0x2566, 1591 0x2568, 1592 0x2569, 1593 0x256A, 1594 0x256B, 1595 0x256C, 1596 0x256D, 1597 0x256F, 1598 0x2573, 1599 0x212B, 1600 0x212C /* U+FF9F */ 1601 }; 1602 1603 static void 1604 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1605 UConverter *cnv = args->converter; 1606 UConverterDataISO2022 *converterData; 1607 ISO2022State *pFromU2022State; 1608 uint8_t *target = (uint8_t *) args->target; 1609 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 1610 const UChar* source = args->source; 1611 const UChar* sourceLimit = args->sourceLimit; 1612 int32_t* offsets = args->offsets; 1613 UChar32 sourceChar; 1614 char buffer[8]; 1615 int32_t len, outLen; 1616 int8_t choices[10]; 1617 int32_t choiceCount; 1618 uint32_t targetValue = 0; 1619 UBool useFallback; 1620 1621 int32_t i; 1622 int8_t cs, g; 1623 1624 /* set up the state */ 1625 converterData = (UConverterDataISO2022*)cnv->extraInfo; 1626 pFromU2022State = &converterData->fromU2022State; 1627 1628 choiceCount = 0; 1629 1630 /* check if the last codepoint of previous buffer was a lead surrogate*/ 1631 if((sourceChar = cnv->fromUChar32)!=0 && target< 
targetLimit) { 1632 goto getTrail; 1633 } 1634 1635 while(source < sourceLimit) { 1636 if(target < targetLimit) { 1637 1638 sourceChar = *(source++); 1639 /*check if the char is a First surrogate*/ 1640 if(UTF_IS_SURROGATE(sourceChar)) { 1641 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 1642 getTrail: 1643 /*look ahead to find the trail surrogate*/ 1644 if(source < sourceLimit) { 1645 /* test the following code unit */ 1646 UChar trail=(UChar) *source; 1647 if(UTF_IS_SECOND_SURROGATE(trail)) { 1648 source++; 1649 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 1650 cnv->fromUChar32=0x00; 1651 /* convert this supplementary code point */ 1652 /* exit this condition tree */ 1653 } else { 1654 /* this is an unmatched lead code unit (1st surrogate) */ 1655 /* callback(illegal) */ 1656 *err=U_ILLEGAL_CHAR_FOUND; 1657 cnv->fromUChar32=sourceChar; 1658 break; 1659 } 1660 } else { 1661 /* no more input */ 1662 cnv->fromUChar32=sourceChar; 1663 break; 1664 } 1665 } else { 1666 /* this is an unmatched trail code unit (2nd surrogate) */ 1667 /* callback(illegal) */ 1668 *err=U_ILLEGAL_CHAR_FOUND; 1669 cnv->fromUChar32=sourceChar; 1670 break; 1671 } 1672 } 1673 1674 /* do not convert SO/SI/ESC */ 1675 if(IS_2022_CONTROL(sourceChar)) { 1676 /* callback(illegal) */ 1677 *err=U_ILLEGAL_CHAR_FOUND; 1678 cnv->fromUChar32=sourceChar; 1679 break; 1680 } 1681 1682 /* do the conversion */ 1683 1684 if(choiceCount == 0) { 1685 uint16_t csm; 1686 1687 /* 1688 * The csm variable keeps track of which charsets are allowed 1689 * and not used yet while building the choices[]. 1690 */ 1691 csm = jpCharsetMasks[converterData->version]; 1692 choiceCount = 0; 1693 1694 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1695 if(converterData->version == 3 || converterData->version == 4) { 1696 choices[choiceCount++] = (int8_t)HWKANA_7BIT; 1697 } 1698 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1699 csm &= ~CSM(HWKANA_7BIT); 1700 1701 /* try the current G0 charset */ 1702 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1703 csm &= ~CSM(cs); 1704 1705 /* try the current G2 charset */ 1706 if((cs = pFromU2022State->cs[2]) != 0) { 1707 choices[choiceCount++] = cs; 1708 csm &= ~CSM(cs); 1709 } 1710 1711 /* try all the other possible charsets */ 1712 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { 1713 cs = (int8_t)jpCharsetPref[i]; 1714 if(CSM(cs) & csm) { 1715 choices[choiceCount++] = cs; 1716 csm &= ~CSM(cs); 1717 } 1718 } 1719 } 1720 1721 cs = g = 0; 1722 /* 1723 * len==0: no mapping found yet 1724 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1725 * len>0: found a roundtrip result, done 1726 */ 1727 len = 0; 1728 /* 1729 * We will turn off useFallback after finding a fallback, 1730 * but we still get fallbacks from PUA code points as usual. 1731 * Therefore, we will also need to check that we don't overwrite 1732 * an early fallback with a later one. 1733 */ 1734 useFallback = cnv->useFallback; 1735 1736 for(i = 0; i < choiceCount && len <= 0; ++i) { 1737 uint32_t value; 1738 int32_t len2; 1739 int8_t cs0 = choices[i]; 1740 switch(cs0) { 1741 case ASCII: 1742 if(sourceChar <= 0x7f) { 1743 targetValue = (uint32_t)sourceChar; 1744 len = 1; 1745 cs = cs0; 1746 g = 0; 1747 } 1748 break; 1749 case ISO8859_1: 1750 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1751 targetValue = (uint32_t)sourceChar - 0x80; 1752 len = 1; 1753 cs = cs0; 1754 g = 2; 1755 } 1756 break; 1757 case HWKANA_7BIT: 1758 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1759 if(converterData->version==3) { 1760 /* JIS7: use G1 (SO) */ 1761 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1762 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); 1763 len = 1; 1764 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1765 g = 1; 1766 } else if(converterData->version==4) { 1767 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1768 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1769 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); 1770 len = 1; 1771 1772 cs = pFromU2022State->cs[0]; 1773 if(IS_JP_DBCS(cs)) { 1774 /* switch from a DBCS charset to JISX201 */ 1775 cs = (int8_t)JISX201; 1776 } 1777 /* else stay in the current G0 charset */ 1778 g = 0; 1779 } 1780 /* else do not use HWKANA_7BIT with other versions */ 1781 } 1782 break; 1783 case JISX201: 1784 /* G0 SBCS */ 1785 value = jisx201FromU(sourceChar); 1786 if(value <= 0x7f) { 1787 targetValue = value; 1788 len = 1; 1789 cs = cs0; 1790 g = 0; 1791 useFallback = FALSE; 1792 } 1793 break; 1794 case JISX208: 1795 /* G0 DBCS from Shift-JIS table */ 1796 len2 = MBCS_FROM_UCHAR32_ISO2022( 1797 converterData->myConverterArray[cs0], 1798 sourceChar, &value, 1799 useFallback, MBCS_OUTPUT_2); 1800 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1801 value = _2022FromSJIS(value); 1802 if(value != 0) { 1803 targetValue = value; 1804 len = len2; 1805 cs = cs0; 1806 g = 0; 1807 useFallback = FALSE; 1808 } 1809 } else if(len == 0 && useFallback && 1810 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1811 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1812 len = -2; 1813 cs = cs0; 1814 g = 0; 1815 useFallback = FALSE; 1816 } 1817 break; 1818 case ISO8859_7: 1819 /* G0 SBCS forced to 7-bit output */ 1820 len2 = MBCS_SINGLE_FROM_UCHAR32( 1821 converterData->myConverterArray[cs0], 1822 sourceChar, &value, 1823 useFallback); 1824 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1825 targetValue = value - 0x80; 1826 len = len2; 1827 cs = 
cs0; 1828 g = 2; 1829 useFallback = FALSE; 1830 } 1831 break; 1832 default: 1833 /* G0 DBCS */ 1834 len2 = MBCS_FROM_UCHAR32_ISO2022( 1835 converterData->myConverterArray[cs0], 1836 sourceChar, &value, 1837 useFallback, MBCS_OUTPUT_2); 1838 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1839 if(cs0 == KSC5601) { 1840 /* 1841 * Check for valid bytes for the encoding scheme. 1842 * This is necessary because the sub-converter (windows-949) 1843 * has a broader encoding scheme than is valid for 2022. 1844 */ 1845 value = _2022FromGR94DBCS(value); 1846 if(value == 0) { 1847 break; 1848 } 1849 } 1850 targetValue = value; 1851 len = len2; 1852 cs = cs0; 1853 g = 0; 1854 useFallback = FALSE; 1855 } 1856 break; 1857 } 1858 } 1859 1860 if(len != 0) { 1861 if(len < 0) { 1862 len = -len; /* fallback */ 1863 } 1864 outLen = 0; /* count output bytes */ 1865 1866 /* write SI if necessary (only for JIS7) */ 1867 if(pFromU2022State->g == 1 && g == 0) { 1868 buffer[outLen++] = UCNV_SI; 1869 pFromU2022State->g = 0; 1870 } 1871 1872 /* write the designation sequence if necessary */ 1873 if(cs != pFromU2022State->cs[g]) { 1874 int32_t escLen = escSeqCharsLen[cs]; 1875 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1876 outLen += escLen; 1877 pFromU2022State->cs[g] = cs; 1878 1879 /* invalidate the choices[] */ 1880 choiceCount = 0; 1881 } 1882 1883 /* write the shift sequence if necessary */ 1884 if(g != pFromU2022State->g) { 1885 switch(g) { 1886 /* case 0 handled before writing escapes */ 1887 case 1: 1888 buffer[outLen++] = UCNV_SO; 1889 pFromU2022State->g = 1; 1890 break; 1891 default: /* case 2 */ 1892 buffer[outLen++] = 0x1b; 1893 buffer[outLen++] = 0x4e; 1894 break; 1895 /* no case 3: no SS3 in ISO-2022-JP-x */ 1896 } 1897 } 1898 1899 /* write the output bytes */ 1900 if(len == 1) { 1901 buffer[outLen++] = (char)targetValue; 1902 } else /* len == 2 */ { 1903 buffer[outLen++] = (char)(targetValue >> 8); 1904 buffer[outLen++] = 
(char)targetValue; 1905 } 1906 } else { 1907 /* 1908 * if we cannot find the character after checking all codepages 1909 * then this is an error 1910 */ 1911 *err = U_INVALID_CHAR_FOUND; 1912 cnv->fromUChar32=sourceChar; 1913 break; 1914 } 1915 1916 if(sourceChar == CR || sourceChar == LF) { 1917 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1918 pFromU2022State->cs[2] = 0; 1919 choiceCount = 0; 1920 } 1921 1922 /* output outLen>0 bytes in buffer[] */ 1923 if(outLen == 1) { 1924 *target++ = buffer[0]; 1925 if(offsets) { 1926 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 1927 } 1928 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1929 *target++ = buffer[0]; 1930 *target++ = buffer[1]; 1931 if(offsets) { 1932 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 1933 *offsets++ = sourceIndex; 1934 *offsets++ = sourceIndex; 1935 } 1936 } else { 1937 fromUWriteUInt8( 1938 cnv, 1939 buffer, outLen, 1940 &target, (const char *)targetLimit, 1941 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 1942 err); 1943 if(U_FAILURE(*err)) { 1944 break; 1945 } 1946 } 1947 } /* end if(myTargetIndex<myTargetLength) */ 1948 else{ 1949 *err =U_BUFFER_OVERFLOW_ERROR; 1950 break; 1951 } 1952 1953 }/* end while(mySourceIndex<mySourceLength) */ 1954 1955 /* 1956 * the end of the input stream and detection of truncated input 1957 * are handled by the framework, but for ISO-2022-JP conversion 1958 * we need to be in ASCII mode at the very end 1959 * 1960 * conditions: 1961 * successful 1962 * in SO mode or not in ASCII mode 1963 * end of input and no truncated input 1964 */ 1965 if( U_SUCCESS(*err) && 1966 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 1967 args->flush && source>=sourceLimit && cnv->fromUChar32==0 1968 ) { 1969 int32_t sourceIndex; 1970 1971 outLen = 0; 1972 1973 if(pFromU2022State->g != 0) { 1974 buffer[outLen++] = UCNV_SI; 
/*
 * Converts ISO-2022-JP family bytes from args->source into UTF-16 code units
 * in args->target. When args->offsets is non-NULL, the source index of the
 * first byte of each converted character is stored for every UChar written.
 *
 * State that is carried across calls and consulted on entry:
 *  - myData->key != 0: an escape sequence was left incomplete by the previous
 *    call; processing resumes at the "escape" label.
 *  - args->converter->toULength == 1: the lead byte of a double-byte character
 *    is parked in toUBytes[0]; processing resumes at "getTrailByte".
 *  - pToU2022State->cs[], ->g, ->prevG: the designated charsets and the
 *    currently invoked G set.
 *
 * On return args->source and args->target reflect what was consumed/produced.
 * *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full while input
 * remains, and to U_ILLEGAL_ESCAPE_SEQUENCE for an empty segment when
 * version==0; other failures come from changeState_2022() and
 * toUnicodeCallback() (both defined outside this block).
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    uint32_t tmpSourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        /* re-derive the active charset because we jump past the code that normally sets cs */
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    /* targetUniChar is still missingCharMarker, so the byte is reported through the callback below */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                /* step back so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_JP,err);

                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* length of the reported sequence: bytes already pending plus the bytes consumed by this call */
                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
                if(myData->key==0) {
                    myData->isEmptySegment = TRUE;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    if(mySourceChar <= 0x7f) {
                        /* 7-bit byte stands for the corresponding byte in the upper half */
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek only; the trail byte is consumed below once we know what to do with it */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
                            if(cs == JISX208) {
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
                                mySourceChar = tmpSourceChar;
                            } else {
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
                                mySourceChar = tmpSourceChar;
                                if (cs == KSC5601) {
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
                                }
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
                                tempBuf[1] = (char)(tmpSourceChar);
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        /* else: only the lead byte is reported; the trail byte stays unconsumed */
                    } else {
                        /* input ends after the lead byte: park it until the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    /* mySourceChar > 0xff means two bytes were consumed for this character */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /*
                     * No room for the trail surrogate: stash it in the converter's overflow buffer.
                     * NOTE(review): *err is not set here; the overflow is only reported by the
                     * next loop iteration if more input remains - confirm the caller copes.
                     */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the positions reached so the caller can continue from here */
    args->target = myTarget;
    args->source = mySource;
}
/*
 * Converts UTF-16 from args->source into ISO-2022-KR bytes in args->target,
 * writing one source offset per output byte when args->offsets is non-NULL.
 *
 * When converterData->version==1 the whole call is delegated to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 *
 * Otherwise each UTF-16 code unit is looked up in the sub-converter's shared
 * data; single-byte results must be <= 0x7f and double-byte results must have
 * both bytes in A1..FE (they are written with 0x80 subtracted from each byte).
 * SO/SI is emitted whenever the output switches between double-byte and
 * single-byte mode; the current mode is kept in
 * args->converter->fromUnicodeStatus between calls.
 *
 * A pending lead surrogate from a previous call (fromUChar32 != 0) is resumed
 * at the "getTrail" label. Bytes that do not fit into the target go to
 * args->converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): fromUnicodeStatus was already read just above; this second read is redundant */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a lead surrogate was left over from the previous buffer: go look for its trail */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                /* append the shift sequence */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    /* the enclosing target<targetLimit check guarantees room for this one byte */
                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the targetByteUnit to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte: move both bytes from A1..FE down to 21..7E */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
                /*
                 * On overflow we do not break here: the next iteration fails the
                 * target<targetLimit test and leaves the loop from there.
                 */

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(UTF_IS_SURROGATE(sourceChar)) {
                    if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(UTF_IS_SECOND_SURROGATE(trail)) {
                                source++;
                                sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                                /* a complete supplementary code point is always reported as unassigned here */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending (or pending lead surrogate) code point for the caller / next call */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* emit the final SI, attributed to the last input character */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
subArgs.converter = myData->currentConverter; 2534 2535 /* remember the original start of the input for offsets */ 2536 sourceStart = args->source; 2537 2538 if(myData->key != 0) { 2539 /* continue with a partial escape sequence */ 2540 goto escape; 2541 } 2542 2543 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2544 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2545 subArgs.source = args->source; 2546 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2547 if(subArgs.source != subArgs.sourceLimit) { 2548 /* 2549 * get the current partial byte sequence 2550 * 2551 * it needs to be moved between the public and the subconverter 2552 * so that the conversion framework, which only sees the public 2553 * converter, can handle truncated and illegal input etc. 2554 */ 2555 if(args->converter->toULength > 0) { 2556 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2557 } 2558 subArgs.converter->toULength = args->converter->toULength; 2559 2560 /* 2561 * Convert up to the end of the input, or to before the next escape character. 2562 * Does not handle conversion extensions because the preToU[] state etc. 2563 * is not copied. 
2564 */ 2565 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2566 2567 if(args->offsets != NULL && sourceStart != args->source) { 2568 /* update offsets to base them on the actual start of the input */ 2569 int32_t *offsets = args->offsets; 2570 UChar *target = args->target; 2571 int32_t delta = (int32_t)(args->source - sourceStart); 2572 while(target < subArgs.target) { 2573 if(*offsets >= 0) { 2574 *offsets += delta; 2575 } 2576 ++offsets; 2577 ++target; 2578 } 2579 } 2580 args->source = subArgs.source; 2581 args->target = subArgs.target; 2582 args->offsets = subArgs.offsets; 2583 2584 /* copy input/error/overflow buffers */ 2585 if(subArgs.converter->toULength > 0) { 2586 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2587 } 2588 args->converter->toULength = subArgs.converter->toULength; 2589 2590 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2591 if(subArgs.converter->UCharErrorBufferLength > 0) { 2592 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2593 subArgs.converter->UCharErrorBufferLength); 2594 } 2595 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2596 subArgs.converter->UCharErrorBufferLength = 0; 2597 } 2598 } 2599 2600 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2601 return; 2602 } 2603 2604 escape: 2605 changeState_2022(args->converter, 2606 &(args->source), 2607 args->sourceLimit, 2608 ISO_2022_KR, 2609 err); 2610 } 2611 } 2612 2613 static void 2614 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2615 UErrorCode* err){ 2616 char tempBuf[2]; 2617 const char *mySource = ( char *) args->source; 2618 UChar *myTarget = args->target; 2619 const char *mySourceLimit = args->sourceLimit; 2620 UChar32 targetUniChar = 0x0000; 2621 UChar mySourceChar = 0x0000; 2622 UConverterDataISO2022* myData; 2623 UConverterSharedData* sharedData ; 2624 UBool useFallback; 2625 2626 
myData=(UConverterDataISO2022*)(args->converter->extraInfo); 2627 if(myData->version==1){ 2628 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2629 return; 2630 } 2631 2632 /* initialize state */ 2633 sharedData = myData->currentConverter->sharedData; 2634 useFallback = args->converter->useFallback; 2635 2636 if(myData->key != 0) { 2637 /* continue with a partial escape sequence */ 2638 goto escape; 2639 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2640 /* continue with a partial double-byte character */ 2641 mySourceChar = args->converter->toUBytes[0]; 2642 args->converter->toULength = 0; 2643 goto getTrailByte; 2644 } 2645 2646 while(mySource< mySourceLimit){ 2647 2648 if(myTarget < args->targetLimit){ 2649 2650 mySourceChar= (unsigned char) *mySource++; 2651 2652 if(mySourceChar==UCNV_SI){ 2653 myData->toU2022State.g = 0; 2654 if (myData->isEmptySegment) { 2655 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 2656 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2657 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2658 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2659 args->converter->toULength = 1; 2660 args->target = myTarget; 2661 args->source = mySource; 2662 return; 2663 } 2664 /*consume the source */ 2665 continue; 2666 }else if(mySourceChar==UCNV_SO){ 2667 myData->toU2022State.g = 1; 2668 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 2669 /*consume the source */ 2670 continue; 2671 }else if(mySourceChar==ESC_2022){ 2672 mySource--; 2673 escape: 2674 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2675 changeState_2022(args->converter,&(mySource), 2676 mySourceLimit, ISO_2022_KR, err); 2677 if(U_FAILURE(*err)){ 2678 args->target = myTarget; 2679 args->source = mySource; 2680 return; 2681 } 2682 continue; 2683 } 2684 2685 myData->isEmptySegment = 
FALSE; /* Any invalid char errors will be detected separately, so just reset this */ 2686 if(myData->toU2022State.g == 1) { 2687 if(mySource < mySourceLimit) { 2688 int leadIsOk, trailIsOk; 2689 uint8_t trailByte; 2690 getTrailByte: 2691 targetUniChar = missingCharMarker; 2692 trailByte = (uint8_t)*mySource; 2693 /* 2694 * Ticket 5691: consistent illegal sequences: 2695 * - We include at least the first byte in the illegal sequence. 2696 * - If any of the non-initial bytes could be the start of a character, 2697 * we stop the illegal sequence before the first one of those. 2698 * 2699 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2700 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2701 * Otherwise we convert or report the pair of bytes. 2702 */ 2703 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 2704 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 2705 if (leadIsOk && trailIsOk) { 2706 ++mySource; 2707 tempBuf[0] = (char)(mySourceChar + 0x80); 2708 tempBuf[1] = (char)(trailByte + 0x80); 2709 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2710 mySourceChar = (mySourceChar << 8) | trailByte; 2711 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2712 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2713 ++mySource; 2714 /* add another bit so that the code below writes 2 bytes in case of error */ 2715 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2716 } 2717 } else { 2718 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 2719 args->converter->toULength = 1; 2720 break; 2721 } 2722 } 2723 else if(mySourceChar <= 0x7f) { 2724 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2725 } else { 2726 targetUniChar = 0xffff; 2727 } 2728 if(targetUniChar < 0xfffe){ 2729 if(args->offsets) { 2730 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 
0xff ? 1 : 2)); 2731 } 2732 *(myTarget++)=(UChar)targetUniChar; 2733 } 2734 else { 2735 /* Call the callback function*/ 2736 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2737 break; 2738 } 2739 } 2740 else{ 2741 *err =U_BUFFER_OVERFLOW_ERROR; 2742 break; 2743 } 2744 } 2745 args->target = myTarget; 2746 args->source = mySource; 2747 } 2748 2749 /*************************** END ISO2022-KR *********************************/ 2750 2751 /*************************** ISO-2022-CN ********************************* 2752 * 2753 * Rules for ISO-2022-CN Encoding: 2754 * i) The designator sequence must appear once on a line before any instance 2755 * of character set it designates. 2756 * ii) If two lines contain characters from the same character set, both lines 2757 * must include the designator sequence. 2758 * iii) Once the designator sequence is known, a shifting sequence has to be found 2759 * to invoke the shifting 2760 * iv) All lines start in ASCII and end in ASCII. 2761 * v) Four shifting sequences are employed for this purpose: 2762 * 2763 * Sequcence ASCII Eq Charsets 2764 * ---------- ------- --------- 2765 * SI <SI> US-ASCII 2766 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2767 * SS2 <ESC>N CNS-11643-1992 Plane 2 2768 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2769 * 2770 * vi) 2771 * SOdesignator : ESC "$" ")" finalchar_for_SO 2772 * SS2designator : ESC "$" "*" finalchar_for_SS2 2773 * SS3designator : ESC "$" "+" finalchar_for_SS3 2774 * 2775 * ESC $ ) A Indicates the bytes following SO are Chinese 2776 * characters as defined in GB 2312-80, until 2777 * another SOdesignation appears 2778 * 2779 * 2780 * ESC $ ) E Indicates the bytes following SO are as defined 2781 * in ISO-IR-165 (for details, see section 2.1), 2782 * until another SOdesignation appears 2783 * 2784 * ESC $ ) G Indicates the bytes following SO are as defined 2785 * in CNS 11643-plane-1, until another 2786 * SOdesignation appears 2787 * 2788 * ESC $ * H Indicates the two 
bytes immediately following 2789 * SS2 is a Chinese character as defined in CNS 2790 * 11643-plane-2, until another SS2designation 2791 * appears 2792 * (Meaning <ESC>N must preceed every 2 byte 2793 * sequence.) 2794 * 2795 * ESC $ + I Indicates the immediate two bytes following SS3 2796 * is a Chinese character as defined in CNS 2797 * 11643-plane-3, until another SS3designation 2798 * appears 2799 * (Meaning <ESC>O must preceed every 2 byte 2800 * sequence.) 2801 * 2802 * ESC $ + J Indicates the immediate two bytes following SS3 2803 * is a Chinese character as defined in CNS 2804 * 11643-plane-4, until another SS3designation 2805 * appears 2806 * (In English: <ESC>O must preceed every 2 byte 2807 * sequence.) 2808 * 2809 * ESC $ + K Indicates the immediate two bytes following SS3 2810 * is a Chinese character as defined in CNS 2811 * 11643-plane-5, until another SS3designation 2812 * appears 2813 * 2814 * ESC $ + L Indicates the immediate two bytes following SS3 2815 * is a Chinese character as defined in CNS 2816 * 11643-plane-6, until another SS3designation 2817 * appears 2818 * 2819 * ESC $ + M Indicates the immediate two bytes following SS3 2820 * is a Chinese character as defined in CNS 2821 * 11643-plane-7, until another SS3designation 2822 * appears 2823 * 2824 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2825 * has its own designation information before any Chinese characters 2826 * appear 2827 * 2828 */ 2829 2830 /* The following are defined this way to make the strings truely readonly */ 2831 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2832 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2833 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2834 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2835 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2836 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2837 static const char 
CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2838 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2839 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2840 2841 /********************** ISO2022-CN Data **************************/ 2842 static const char* const escSeqCharsCN[10] ={ 2843 SHIFT_IN_STR, /* ASCII */ 2844 GB_2312_80_STR, 2845 ISO_IR_165_STR, 2846 CNS_11643_1992_Plane_1_STR, 2847 CNS_11643_1992_Plane_2_STR, 2848 CNS_11643_1992_Plane_3_STR, 2849 CNS_11643_1992_Plane_4_STR, 2850 CNS_11643_1992_Plane_5_STR, 2851 CNS_11643_1992_Plane_6_STR, 2852 CNS_11643_1992_Plane_7_STR 2853 }; 2854 2855 static void 2856 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2857 UConverter *cnv = args->converter; 2858 UConverterDataISO2022 *converterData; 2859 ISO2022State *pFromU2022State; 2860 uint8_t *target = (uint8_t *) args->target; 2861 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; 2862 const UChar* source = args->source; 2863 const UChar* sourceLimit = args->sourceLimit; 2864 int32_t* offsets = args->offsets; 2865 UChar32 sourceChar; 2866 char buffer[8]; 2867 int32_t len; 2868 int8_t choices[3]; 2869 int32_t choiceCount; 2870 uint32_t targetValue = 0; 2871 UBool useFallback; 2872 2873 /* set up the state */ 2874 converterData = (UConverterDataISO2022*)cnv->extraInfo; 2875 pFromU2022State = &converterData->fromU2022State; 2876 2877 choiceCount = 0; 2878 2879 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2880 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2881 goto getTrail; 2882 } 2883 2884 while( source < sourceLimit){ 2885 if(target < targetLimit){ 2886 2887 sourceChar = *(source++); 2888 /*check if the char is a First surrogate*/ 2889 if(UTF_IS_SURROGATE(sourceChar)) { 2890 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { 2891 getTrail: 2892 /*look ahead to find the trail surrogate*/ 2893 if(source < sourceLimit) { 
2894 /* test the following code unit */ 2895 UChar trail=(UChar) *source; 2896 if(UTF_IS_SECOND_SURROGATE(trail)) { 2897 source++; 2898 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); 2899 cnv->fromUChar32=0x00; 2900 /* convert this supplementary code point */ 2901 /* exit this condition tree */ 2902 } else { 2903 /* this is an unmatched lead code unit (1st surrogate) */ 2904 /* callback(illegal) */ 2905 *err=U_ILLEGAL_CHAR_FOUND; 2906 cnv->fromUChar32=sourceChar; 2907 break; 2908 } 2909 } else { 2910 /* no more input */ 2911 cnv->fromUChar32=sourceChar; 2912 break; 2913 } 2914 } else { 2915 /* this is an unmatched trail code unit (2nd surrogate) */ 2916 /* callback(illegal) */ 2917 *err=U_ILLEGAL_CHAR_FOUND; 2918 cnv->fromUChar32=sourceChar; 2919 break; 2920 } 2921 } 2922 2923 /* do the conversion */ 2924 if(sourceChar <= 0x007f ){ 2925 /* do not convert SO/SI/ESC */ 2926 if(IS_2022_CONTROL(sourceChar)) { 2927 /* callback(illegal) */ 2928 *err=U_ILLEGAL_CHAR_FOUND; 2929 cnv->fromUChar32=sourceChar; 2930 break; 2931 } 2932 2933 /* US-ASCII */ 2934 if(pFromU2022State->g == 0) { 2935 buffer[0] = (char)sourceChar; 2936 len = 1; 2937 } else { 2938 buffer[0] = UCNV_SI; 2939 buffer[1] = (char)sourceChar; 2940 len = 2; 2941 pFromU2022State->g = 0; 2942 choiceCount = 0; 2943 } 2944 if(sourceChar == CR || sourceChar == LF) { 2945 /* reset the state at the end of a line */ 2946 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 2947 choiceCount = 0; 2948 } 2949 } 2950 else{ 2951 /* convert U+0080..U+10ffff */ 2952 int32_t i; 2953 int8_t cs, g; 2954 2955 if(choiceCount == 0) { 2956 /* try the current SO/G1 converter first */ 2957 choices[0] = pFromU2022State->cs[1]; 2958 2959 /* default to GB2312_1 if none is designated yet */ 2960 if(choices[0] == 0) { 2961 choices[0] = GB2312_1; 2962 } 2963 2964 if(converterData->version == 0) { 2965 /* ISO-2022-CN */ 2966 2967 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 2968 if(choices[0] == 
GB2312_1) { 2969 choices[1] = (int8_t)CNS_11643_1; 2970 } else { 2971 choices[1] = (int8_t)GB2312_1; 2972 } 2973 2974 choiceCount = 2; 2975 } else { 2976 /* ISO-2022-CN-EXT */ 2977 2978 /* try one of the other converters */ 2979 switch(choices[0]) { 2980 case GB2312_1: 2981 choices[1] = (int8_t)CNS_11643_1; 2982 choices[2] = (int8_t)ISO_IR_165; 2983 break; 2984 case ISO_IR_165: 2985 choices[1] = (int8_t)GB2312_1; 2986 choices[2] = (int8_t)CNS_11643_1; 2987 break; 2988 default: /* CNS_11643_x */ 2989 choices[1] = (int8_t)GB2312_1; 2990 choices[2] = (int8_t)ISO_IR_165; 2991 break; 2992 } 2993 2994 choiceCount = 3; 2995 } 2996 } 2997 2998 cs = g = 0; 2999 /* 3000 * len==0: no mapping found yet 3001 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3002 * len>0: found a roundtrip result, done 3003 */ 3004 len = 0; 3005 /* 3006 * We will turn off useFallback after finding a fallback, 3007 * but we still get fallbacks from PUA code points as usual. 3008 * Therefore, we will also need to check that we don't overwrite 3009 * an early fallback with a later one. 
3010 */ 3011 useFallback = cnv->useFallback; 3012 3013 for(i = 0; i < choiceCount && len <= 0; ++i) { 3014 int8_t cs0 = choices[i]; 3015 if(cs0 > 0) { 3016 uint32_t value; 3017 int32_t len2; 3018 if(cs0 >= CNS_11643_0) { 3019 len2 = MBCS_FROM_UCHAR32_ISO2022( 3020 converterData->myConverterArray[CNS_11643], 3021 sourceChar, 3022 &value, 3023 useFallback, 3024 MBCS_OUTPUT_3); 3025 if(len2 == 3 || (len2 == -3 && len == 0)) { 3026 targetValue = value; 3027 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); 3028 if(len2 >= 0) { 3029 len = 2; 3030 } else { 3031 len = -2; 3032 useFallback = FALSE; 3033 } 3034 if(cs == CNS_11643_1) { 3035 g = 1; 3036 } else if(cs == CNS_11643_2) { 3037 g = 2; 3038 } else /* plane 3..7 */ if(converterData->version == 1) { 3039 g = 3; 3040 } else { 3041 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3042 len = 0; 3043 } 3044 } 3045 } else { 3046 /* GB2312_1 or ISO-IR-165 */ 3047 len2 = MBCS_FROM_UCHAR32_ISO2022( 3048 converterData->myConverterArray[cs0], 3049 sourceChar, 3050 &value, 3051 useFallback, 3052 MBCS_OUTPUT_2); 3053 if(len2 == 2 || (len2 == -2 && len == 0)) { 3054 targetValue = value; 3055 len = len2; 3056 cs = cs0; 3057 g = 1; 3058 useFallback = FALSE; 3059 } 3060 } 3061 } 3062 } 3063 3064 if(len != 0) { 3065 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3066 3067 /* write the designation sequence if necessary */ 3068 if(cs != pFromU2022State->cs[g]) { 3069 if(cs < CNS_11643) { 3070 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3071 } else { 3072 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3073 } 3074 len = 4; 3075 pFromU2022State->cs[g] = cs; 3076 if(g == 1) { 3077 /* changing the SO/G1 charset invalidates the choices[] */ 3078 choiceCount = 0; 3079 } 3080 } 3081 3082 /* write the shift sequence if necessary */ 3083 if(g != pFromU2022State->g) { 3084 switch(g) { 3085 case 1: 3086 buffer[len++] = UCNV_SO; 3087 3088 /* set the new state only if it is the locking shift 
SO/G1, not for SS2 or SS3 */ 3089 pFromU2022State->g = 1; 3090 break; 3091 case 2: 3092 buffer[len++] = 0x1b; 3093 buffer[len++] = 0x4e; 3094 break; 3095 default: /* case 3 */ 3096 buffer[len++] = 0x1b; 3097 buffer[len++] = 0x4f; 3098 break; 3099 } 3100 } 3101 3102 /* write the two output bytes */ 3103 buffer[len++] = (char)(targetValue >> 8); 3104 buffer[len++] = (char)targetValue; 3105 } else { 3106 /* if we cannot find the character after checking all codepages 3107 * then this is an error 3108 */ 3109 *err = U_INVALID_CHAR_FOUND; 3110 cnv->fromUChar32=sourceChar; 3111 break; 3112 } 3113 } 3114 3115 /* output len>0 bytes in buffer[] */ 3116 if(len == 1) { 3117 *target++ = buffer[0]; 3118 if(offsets) { 3119 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ 3120 } 3121 } else if(len == 2 && (target + 2) <= targetLimit) { 3122 *target++ = buffer[0]; 3123 *target++ = buffer[1]; 3124 if(offsets) { 3125 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); 3126 *offsets++ = sourceIndex; 3127 *offsets++ = sourceIndex; 3128 } 3129 } else { 3130 fromUWriteUInt8( 3131 cnv, 3132 buffer, len, 3133 &target, (const char *)targetLimit, 3134 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), 3135 err); 3136 if(U_FAILURE(*err)) { 3137 break; 3138 } 3139 } 3140 } /* end if(myTargetIndex<myTargetLength) */ 3141 else{ 3142 *err =U_BUFFER_OVERFLOW_ERROR; 3143 break; 3144 } 3145 3146 }/* end while(mySourceIndex<mySourceLength) */ 3147 3148 /* 3149 * the end of the input stream and detection of truncated input 3150 * are handled by the framework, but for ISO-2022-CN conversion 3151 * we need to be in ASCII mode at the very end 3152 * 3153 * conditions: 3154 * successful 3155 * not in ASCII mode 3156 * end of input and no truncated input 3157 */ 3158 if( U_SUCCESS(*err) && 3159 pFromU2022State->g!=0 && 3160 args->flush && source>=sourceLimit && cnv->fromUChar32==0 3161 ) { 3162 int32_t sourceIndex; 3163 3164 /* 
we are switching to ASCII */ 3165 pFromU2022State->g=0; 3166 3167 /* get the source index of the last input character */ 3168 /* 3169 * TODO this would be simpler and more reliable if we used a pair 3170 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3171 * so that we could simply use the prevSourceIndex here; 3172 * this code gives an incorrect result for the rare case of an unmatched 3173 * trail surrogate that is alone in the last buffer of the text stream 3174 */ 3175 sourceIndex=(int32_t)(source-args->source); 3176 if(sourceIndex>0) { 3177 --sourceIndex; 3178 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3179 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3180 ) { 3181 --sourceIndex; 3182 } 3183 } else { 3184 sourceIndex=-1; 3185 } 3186 3187 fromUWriteUInt8( 3188 cnv, 3189 SHIFT_IN_STR, 1, 3190 &target, (const char *)targetLimit, 3191 &offsets, sourceIndex, 3192 err); 3193 } 3194 3195 /*save the state and return */ 3196 args->source = source; 3197 args->target = (char*)target; 3198 } 3199 3200 3201 static void 3202 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3203 UErrorCode* err){ 3204 char tempBuf[3]; 3205 const char *mySource = (char *) args->source; 3206 UChar *myTarget = args->target; 3207 const char *mySourceLimit = args->sourceLimit; 3208 uint32_t targetUniChar = 0x0000; 3209 uint32_t mySourceChar = 0x0000; 3210 UConverterDataISO2022* myData; 3211 ISO2022State *pToU2022State; 3212 3213 myData=(UConverterDataISO2022*)(args->converter->extraInfo); 3214 pToU2022State = &myData->toU2022State; 3215 3216 if(myData->key != 0) { 3217 /* continue with a partial escape sequence */ 3218 goto escape; 3219 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3220 /* continue with a partial double-byte character */ 3221 mySourceChar = args->converter->toUBytes[0]; 3222 args->converter->toULength = 0; 3223 targetUniChar = missingCharMarker; 3224 goto getTrailByte; 
3225 } 3226 3227 while(mySource < mySourceLimit){ 3228 3229 targetUniChar =missingCharMarker; 3230 3231 if(myTarget < args->targetLimit){ 3232 3233 mySourceChar= (unsigned char) *mySource++; 3234 3235 switch(mySourceChar){ 3236 case UCNV_SI: 3237 pToU2022State->g=0; 3238 if (myData->isEmptySegment) { 3239 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ 3240 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3241 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3242 args->converter->toUBytes[0] = mySourceChar; 3243 args->converter->toULength = 1; 3244 args->target = myTarget; 3245 args->source = mySource; 3246 return; 3247 } 3248 continue; 3249 3250 case UCNV_SO: 3251 if(pToU2022State->cs[1] != 0) { 3252 pToU2022State->g=1; 3253 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ 3254 continue; 3255 } else { 3256 /* illegal to have SO before a matching designator */ 3257 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ 3258 break; 3259 } 3260 3261 case ESC_2022: 3262 mySource--; 3263 escape: 3264 { 3265 const char * mySourceBefore = mySource; 3266 int8_t toULengthBefore = args->converter->toULength; 3267 3268 changeState_2022(args->converter,&(mySource), 3269 mySourceLimit, ISO_2022_CN,err); 3270 3271 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3272 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3273 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3274 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3275 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); 3276 } 3277 } 3278 3279 /* invalid or illegal escape sequence */ 3280 if(U_FAILURE(*err)){ 3281 args->target = myTarget; 3282 args->source = mySource; 3283 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ 3284 return; 3285 } 3286 continue; 3287 3288 /* ISO-2022-CN does not use 
single-byte (C1) SS2 and SS3 */ 3289 3290 case CR: 3291 /*falls through*/ 3292 case LF: 3293 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3294 /* falls through */ 3295 default: 3296 /* convert one or two bytes */ 3297 myData->isEmptySegment = FALSE; 3298 if(pToU2022State->g != 0) { 3299 if(mySource < mySourceLimit) { 3300 UConverterSharedData *cnv; 3301 StateEnum tempState; 3302 int32_t tempBufLen; 3303 int leadIsOk, trailIsOk; 3304 uint8_t trailByte; 3305 getTrailByte: 3306 trailByte = (uint8_t)*mySource; 3307 /* 3308 * Ticket 5691: consistent illegal sequences: 3309 * - We include at least the first byte in the illegal sequence. 3310 * - If any of the non-initial bytes could be the start of a character, 3311 * we stop the illegal sequence before the first one of those. 3312 * 3313 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3314 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 3315 * Otherwise we convert or report the pair of bytes. 
3316 */ 3317 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); 3318 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); 3319 if (leadIsOk && trailIsOk) { 3320 ++mySource; 3321 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; 3322 if(tempState >= CNS_11643_0) { 3323 cnv = myData->myConverterArray[CNS_11643]; 3324 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); 3325 tempBuf[1] = (char) (mySourceChar); 3326 tempBuf[2] = (char) trailByte; 3327 tempBufLen = 3; 3328 3329 }else{ 3330 cnv = myData->myConverterArray[tempState]; 3331 tempBuf[0] = (char) (mySourceChar); 3332 tempBuf[1] = (char) trailByte; 3333 tempBufLen = 2; 3334 } 3335 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); 3336 mySourceChar = (mySourceChar << 8) | trailByte; 3337 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3338 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3339 ++mySource; 3340 /* add another bit so that the code below writes 2 bytes in case of error */ 3341 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3342 } 3343 if(pToU2022State->g>=2) { 3344 /* return from a single-shift state to the previous one */ 3345 pToU2022State->g=pToU2022State->prevG; 3346 } 3347 } else { 3348 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 3349 args->converter->toULength = 1; 3350 goto endloop; 3351 } 3352 } 3353 else{ 3354 if(mySourceChar <= 0x7f) { 3355 targetUniChar = (UChar) mySourceChar; 3356 } 3357 } 3358 break; 3359 } 3360 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3361 if(args->offsets){ 3362 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3363 } 3364 *(myTarget++)=(UChar)targetUniChar; 3365 } 3366 else if(targetUniChar > missingCharMarker){ 3367 /* disassemble the surrogate pair and write to output*/ 3368 targetUniChar-=0x0010000; 3369 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); 3370 if(args->offsets){ 3371 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3372 } 3373 ++myTarget; 3374 if(myTarget< args->targetLimit){ 3375 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3376 if(args->offsets){ 3377 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3378 } 3379 ++myTarget; 3380 }else{ 3381 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3382 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); 3383 } 3384 3385 } 3386 else{ 3387 /* Call the callback function*/ 3388 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3389 break; 3390 } 3391 } 3392 else{ 3393 *err =U_BUFFER_OVERFLOW_ERROR; 3394 break; 3395 } 3396 } 3397 endloop: 3398 args->target = myTarget; 3399 args->source = mySource; 3400 } 3401 3402 static void 3403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3404 UConverter *cnv = args->converter; 3405 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; 3406 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3407 char *p, *subchar; 3408 char buffer[8]; 3409 int32_t length; 3410 3411 subchar=(char *)cnv->subChars; 3412 length=cnv->subCharLen; /* assume length==1 for most variants */ 3413 3414 p = buffer; 3415 switch(myConverterData->locale[0]){ 3416 case 'j': 3417 { 3418 int8_t cs; 3419 3420 if(pFromU2022State->g == 1) { 3421 /* JIS7: switch from G1 to G0 */ 3422 pFromU2022State->g = 0; 3423 *p++ = UCNV_SI; 3424 } 3425 3426 cs = pFromU2022State->cs[0]; 3427 if(cs != ASCII && cs != JISX201) { 3428 /* not in ASCII or JIS X 0201: 
switch to ASCII */ 3429 pFromU2022State->cs[0] = (int8_t)ASCII; 3430 *p++ = '\x1b'; 3431 *p++ = '\x28'; 3432 *p++ = '\x42'; 3433 } 3434 3435 *p++ = subchar[0]; 3436 break; 3437 } 3438 case 'c': 3439 if(pFromU2022State->g != 0) { 3440 /* not in ASCII mode: switch to ASCII */ 3441 pFromU2022State->g = 0; 3442 *p++ = UCNV_SI; 3443 } 3444 *p++ = subchar[0]; 3445 break; 3446 case 'k': 3447 if(myConverterData->version == 0) { 3448 if(length == 1) { 3449 if((UBool)args->converter->fromUnicodeStatus) { 3450 /* in DBCS mode: switch to SBCS */ 3451 args->converter->fromUnicodeStatus = 0; 3452 *p++ = UCNV_SI; 3453 } 3454 *p++ = subchar[0]; 3455 } else /* length == 2*/ { 3456 if(!(UBool)args->converter->fromUnicodeStatus) { 3457 /* in SBCS mode: switch to DBCS */ 3458 args->converter->fromUnicodeStatus = 1; 3459 *p++ = UCNV_SO; 3460 } 3461 *p++ = subchar[0]; 3462 *p++ = subchar[1]; 3463 } 3464 break; 3465 } else { 3466 /* save the subconverter's substitution string */ 3467 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3468 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3469 3470 /* set our substitution string into the subconverter */ 3471 myConverterData->currentConverter->subChars = (uint8_t *)subchar; 3472 myConverterData->currentConverter->subCharLen = (int8_t)length; 3473 3474 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3475 args->converter = myConverterData->currentConverter; 3476 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3477 ucnv_cbFromUWriteSub(args, 0, err); 3478 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3479 args->converter = cnv; 3480 3481 /* restore the subconverter's substitution string */ 3482 myConverterData->currentConverter->subChars = currentSubChars; 3483 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3484 3485 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3486 
if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3487 uprv_memcpy( 3488 cnv->charErrorBuffer, 3489 myConverterData->currentConverter->charErrorBuffer, 3490 myConverterData->currentConverter->charErrorBufferLength); 3491 } 3492 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3493 myConverterData->currentConverter->charErrorBufferLength = 0; 3494 } 3495 return; 3496 } 3497 default: 3498 /* not expected */ 3499 break; 3500 } 3501 ucnv_cbFromUWriteBytes(args, 3502 buffer, (int32_t)(p - buffer), 3503 offsetIndex, err); 3504 } 3505 3506 /* 3507 * Structure for cloning an ISO 2022 converter into a single memory block. 3508 * ucnv_safeClone() of the converter will align the entire cloneStruct, 3509 * and then ucnv_safeClone() of the sub-converter may additionally align 3510 * currentConverter inside the cloneStruct, for which we need the deadSpace 3511 * after currentConverter. 3512 * This is because UAlignedMemory may be larger than the actually 3513 * necessary alignment size for the platform. 3514 * The other cloneStruct fields will not be moved around, 3515 * and are aligned properly with cloneStruct's alignment. 
3516 */ 3517 struct cloneStruct 3518 { 3519 UConverter cnv; 3520 UConverter currentConverter; 3521 UAlignedMemory deadSpace; 3522 UConverterDataISO2022 mydata; 3523 }; 3524 3525 3526 static UConverter * 3527 _ISO_2022_SafeClone( 3528 const UConverter *cnv, 3529 void *stackBuffer, 3530 int32_t *pBufferSize, 3531 UErrorCode *status) 3532 { 3533 struct cloneStruct * localClone; 3534 UConverterDataISO2022 *cnvData; 3535 int32_t i, size; 3536 3537 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3538 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3539 return NULL; 3540 } 3541 3542 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3543 localClone = (struct cloneStruct *)stackBuffer; 3544 3545 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3546 3547 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3548 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3549 localClone->cnv.isExtraLocal = TRUE; 3550 3551 /* share the subconverters */ 3552 3553 if(cnvData->currentConverter != NULL) { 3554 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ 3555 localClone->mydata.currentConverter = 3556 ucnv_safeClone(cnvData->currentConverter, 3557 &localClone->currentConverter, 3558 &size, status); 3559 if(U_FAILURE(*status)) { 3560 return NULL; 3561 } 3562 } 3563 3564 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3565 if(cnvData->myConverterArray[i] != NULL) { 3566 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3567 } 3568 } 3569 3570 return &localClone->cnv; 3571 } 3572 3573 static void 3574 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3575 const USetAdder *sa, 3576 UConverterUnicodeSet which, 3577 UErrorCode *pErrorCode) 3578 { 3579 int32_t i; 3580 UConverterDataISO2022* cnvData; 3581 3582 if (U_FAILURE(*pErrorCode)) { 3583 return; 3584 } 3585 #ifdef U_ENABLE_GENERIC_ISO_2022 3586 if (cnv->sharedData == &_ISO2022Data) { 
3587 /* We use UTF-8 in this case */ 3588 sa->addRange(sa->set, 0, 0xd7FF); 3589 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3590 return; 3591 } 3592 #endif 3593 3594 cnvData = (UConverterDataISO2022*)cnv->extraInfo; 3595 3596 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3597 switch(cnvData->locale[0]){ 3598 case 'j': 3599 /* include JIS X 0201 which is hardcoded */ 3600 sa->add(sa->set, 0xa5); 3601 sa->add(sa->set, 0x203e); 3602 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3603 /* include Latin-1 for some variants of JP */ 3604 sa->addRange(sa->set, 0, 0xff); 3605 } else { 3606 /* include ASCII for JP */ 3607 sa->addRange(sa->set, 0, 0x7f); 3608 } 3609 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3610 /* 3611 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3612 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3613 * use half-width Katakana. 3614 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3615 * half-width Katakana via the ESC ( I sequence. 3616 * However, we only emit (fromUnicode) half-width Katakana according to the 3617 * definition of each variant. 3618 * 3619 * When including fallbacks, 3620 * we need to include half-width Katakana Unicode code points for all JP variants because 3621 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 
3622 */ 3623 /* include half-width Katakana for JP */ 3624 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3625 } 3626 break; 3627 case 'c': 3628 case 'z': 3629 /* include ASCII for CN */ 3630 sa->addRange(sa->set, 0, 0x7f); 3631 break; 3632 case 'k': 3633 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3634 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3635 cnvData->currentConverter, sa, which, pErrorCode); 3636 /* the loop over myConverterArray[] will simply not find another converter */ 3637 break; 3638 default: 3639 break; 3640 } 3641 3642 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3643 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3644 cnvData->version==0 && i==CNS_11643 3645 ) { 3646 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3647 ucnv_MBCSGetUnicodeSetForBytes( 3648 cnvData->myConverterArray[i], 3649 sa, UCNV_ROUNDTRIP_SET, 3650 0, 0x81, 0x82, 3651 pErrorCode); 3652 } 3653 #endif 3654 3655 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3656 UConverterSetFilter filter; 3657 if(cnvData->myConverterArray[i]!=NULL) { 3658 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3659 cnvData->version==0 && i==CNS_11643 3660 ) { 3661 /* 3662 * Version-specific for CN: 3663 * CN version 0 does not map CNS planes 3..7 although 3664 * they are all available in the CNS conversion table; 3665 * CN version 1 (-EXT) does map them all. 3666 * The two versions create different Unicode sets. 3667 */ 3668 filter=UCNV_SET_FILTER_2022_CN; 3669 } else if(cnvData->locale[0]=='j' && i==JISX208) { 3670 /* 3671 * Only add code points that map to Shift-JIS codes 3672 * corresponding to JIS X 0208. 
3673 */ 3674 filter=UCNV_SET_FILTER_SJIS; 3675 } else if(i==KSC5601) { 3676 /* 3677 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3678 * are broader than GR94. 3679 */ 3680 filter=UCNV_SET_FILTER_GR94DBCS; 3681 } else { 3682 filter=UCNV_SET_FILTER_NONE; 3683 } 3684 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3685 } 3686 } 3687 3688 /* 3689 * ISO 2022 converters must not convert SO/SI/ESC despite what 3690 * sub-converters do by themselves. 3691 * Remove these characters from the set. 3692 */ 3693 sa->remove(sa->set, 0x0e); 3694 sa->remove(sa->set, 0x0f); 3695 sa->remove(sa->set, 0x1b); 3696 3697 /* ISO 2022 converters do not convert C1 controls either */ 3698 sa->removeRange(sa->set, 0x80, 0x9f); 3699 } 3700 3701 static const UConverterImpl _ISO2022Impl={ 3702 UCNV_ISO_2022, 3703 3704 NULL, 3705 NULL, 3706 3707 _ISO2022Open, 3708 _ISO2022Close, 3709 _ISO2022Reset, 3710 3711 #ifdef U_ENABLE_GENERIC_ISO_2022 3712 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3713 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3714 ucnv_fromUnicode_UTF8, 3715 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3716 #else 3717 NULL, 3718 NULL, 3719 NULL, 3720 NULL, 3721 #endif 3722 NULL, 3723 3724 NULL, 3725 _ISO2022getName, 3726 _ISO_2022_WriteSub, 3727 _ISO_2022_SafeClone, 3728 _ISO_2022_GetUnicodeSet 3729 }; 3730 static const UConverterStaticData _ISO2022StaticData={ 3731 sizeof(UConverterStaticData), 3732 "ISO_2022", 3733 2022, 3734 UCNV_IBM, 3735 UCNV_ISO_2022, 3736 1, 3737 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 3738 { 0x1a, 0, 0, 0 }, 3739 1, 3740 FALSE, 3741 FALSE, 3742 0, 3743 0, 3744 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3745 }; 3746 const UConverterSharedData _ISO2022Data={ 3747 sizeof(UConverterSharedData), 3748 ~((uint32_t) 0), 3749 NULL, 3750 NULL, 3751 &_ISO2022StaticData, 3752 FALSE, 3753 &_ISO2022Impl, 3754 0 3755 }; 3756 3757 
/*************JP****************/
/*
 * Function table for ISO-2022-JP.
 * Open/close/reset, getName, WriteSub, SafeClone and GetUnicodeSet are the
 * handlers shared by all ISO 2022 tables in this file; only the four
 * conversion entries are JP-specific.
 * NOTE(review): entries are positional; their meaning is given by the
 * UConverterImpl declaration, which is not visible here - confirm the order there.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /*
     * The same OFFSETS_LOGIC function is registered twice per direction
     * (presumably for the slots without and with offsets - confirm).
     */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/* static description of the ISO-2022-JP converter (positional initializer) */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* NOTE(review): looks like the substitution byte(s) - confirm against UConverterStaticData */
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* shared data for ISO-2022-JP: ties the static data above to the JP function table */
static const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022JPStaticData,
    FALSE,
    &_ISO2022JPImpl,
    0
};

/************* KR ***************/
/*
 * Function table for ISO-2022-KR; same layout as the JP table,
 * with the KR-specific conversion functions plugged in.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /* one OFFSETS_LOGIC function per direction, registered in both slots */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/* static description of the ISO-2022-KR converter (positional initializer) */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_KR",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* shared data for ISO-2022-KR: ties the static data above to the KR function table */
static const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022KRStaticData,
    FALSE,
    &_ISO2022KRImpl,
    0
};

/*************** CN ***************/
/*
 * Function table for ISO-2022-CN; same layout as the JP and KR tables,
 * with the CN-specific conversion functions plugged in.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /* one OFFSETS_LOGIC function per direction, registered in both slots */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/* static description of the ISO-2022-CN converter (positional initializer) */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_CN",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* shared data for ISO-2022-CN: ties the static data above to the CN function table */
static const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022CNStaticData,
    FALSE,
    &_ISO2022CNImpl,
    0
};



#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */