1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvmbcs.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000jul03 14 * created by: Markus W. Scherer 15 * 16 * The current code in this file replaces the previous implementation 17 * of conversion code from multi-byte codepages to Unicode and back. 18 * This implementation supports the following: 19 * - legacy variable-length codepages with up to 4 bytes per character 20 * - all Unicode code points (up to 0x10ffff) 21 * - efficient distinction of unassigned vs. illegal byte sequences 22 * - it is possible in fromUnicode() to directly deal with simple 23 * stateful encodings (used for EBCDIC_STATEFUL) 24 * - it is possible to convert Unicode code points 25 * to a single zero byte (but not as a fallback except for SBCS) 26 * 27 * Remaining limitations in fromUnicode: 28 * - byte sequences must not have leading zero bytes 29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30 * - limitation to up to 4 bytes per character 31 * 32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33 * limitations and adds m:n character mappings and other features. 34 * See ucnv_ext.h for details. 
35 * 36 * Change history: 37 * 38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40 * macros to ucnvmbcs.h file 41 */ 42 43 #include "unicode/utypes.h" 44 45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47 #include "unicode/ucnv.h" 48 #include "unicode/ucnv_cb.h" 49 #include "unicode/udata.h" 50 #include "unicode/uset.h" 51 #include "ucnv_bld.h" 52 #include "ucnvmbcs.h" 53 #include "ucnv_ext.h" 54 #include "ucnv_cnv.h" 55 #include "umutex.h" 56 #include "cmemory.h" 57 #include "cstring.h" 58 59 /* si_value is defined as a macro in some POSIX implementations' standard headers. */ 60 #ifdef si_value 61 #undef si_value 62 #endif 63 64 /* control optimizations according to the platform */ 65 #define MBCS_UNROLL_SINGLE_TO_BMP 1 66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 67 68 /* 69 * _MBCSHeader versions 5.3 & 4.3 70 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 71 * 72 * This version is optional. Version 5 is used for incompatible data format changes. 73 * makeconv will continue to generate version 4 files if possible. 74 * 75 * Changes from version 4: 76 * 77 * The main difference is an additional _MBCSHeader field with 78 * - the length (number of uint32_t) of the _MBCSHeader 79 * - flags for further incompatible data format changes 80 * - flags for further, backward compatible data format changes 81 * 82 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 83 * the file and needs to be reconstituted at load time. 84 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 85 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 86 * (For details about these structures see below, and see ucnvmbcs.h.) 87 * 88 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 89 * of the Unicode code points. 
(This requires that the .ucm file has the |0 etc. 90 * precision markers for all mappings.) 91 * 92 * All fallbacks have been moved to the extension table, leaving only roundtrips in the 93 * omitted data that can be reconstituted from the toUnicode data. 94 * 95 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 96 * With only roundtrip mappings in the base fromUnicode data, this part is fully 97 * redundant with the mbcsIndex and will be reconstituted from that (also using the 98 * stage 1 table which contains the information about how stage 2 was compacted). 99 * 100 * The rest of the stage 2 table, the part for code points above maxFastUChar, 101 * is stored in the file and will be appended to the reconstituted part. 102 * 103 * The entire fromUBytes array is omitted from the file and will be reconstituted. 104 * This is done by enumerating all toUnicode roundtrip mappings, performing 105 * each mapping (using the stage 1 and reconstituted stage 2 tables) and 106 * writing instead of reading the byte values. 107 * 108 * _MBCSHeader version 4.3 109 * 110 * Change from version 4.2: 111 * - Optional utf8Friendly data structures, with 64-entry stage 3 block 112 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 113 * files which can be used instead of stages 1 & 2. 114 * Faster lookups for roundtrips from most commonly used characters, 115 * and lookups from UTF-8 byte sequences with a natural bit distribution. 116 * See ucnvmbcs.h for more details. 117 * 118 * Change from version 4.1: 119 * - Added an optional extension table structure at the end of the .cnv file. 120 * It is present if the upper bits of the header flags field contains a non-zero 121 * byte offset to it. 122 * Files that contain only a conversion table and no base table 123 * use the special outputType MBCS_OUTPUT_EXT_ONLY. 124 * These contain the base table name between the MBCS header and the extension 125 * data. 
126 * 127 * Change from version 4.0: 128 * - Replace header.reserved with header.fromUBytesLength so that all 129 * fields in the data have length. 130 * 131 * Changes from version 3 (for performance improvements): 132 * - new bit distribution for state table entries 133 * - reordered action codes 134 * - new data structure for single-byte fromUnicode 135 * + stage 2 only contains indexes 136 * + stage 3 stores 16 bits per character with classification bits 15..8 137 * - no multiplier for stage 1 entries 138 * - stage 2 for non-single-byte codepages contains the index and the flags in 139 * one 32-bit value 140 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 141 * 142 * For more details about old versions of the MBCS data structure, see 143 * the corresponding versions of this file. 144 * 145 * Converting stateless codepage data ---------------------------------------*** 146 * (or codepage data with simple states) to Unicode. 147 * 148 * Data structure and algorithm for converting from complex legacy codepages 149 * to Unicode. (Designed before 2000-may-22.) 150 * 151 * The basic idea is that the structure of legacy codepages can be described 152 * with state tables. 153 * When reading a byte stream, each input byte causes a state transition. 154 * Some transitions result in the output of a code point, some result in 155 * "unassigned" or "illegal" output. 156 * This is used here for character conversion. 157 * 158 * The data structure begins with a state table consisting of a row 159 * per state, with 256 entries (columns) per row for each possible input 160 * byte value. 161 * Each entry is 32 bits wide, with two formats distinguished by 162 * the sign bit (bit 31): 163 * 164 * One format for transitional entries (bit 31 not set) for non-final bytes, and 165 * one format for final entries (bit 31 set). 166 * Both formats contain the number of the next state in the same bit 167 * positions. 168 * State 0 is the initial state. 
169 * 170 * Most of the time, the offset values of subsequent states are added 171 * up to a scalar value. This value will eventually be the index of 172 * the Unicode code point in a table that follows the state table. 173 * The effect is that the code points for final state table rows 174 * are contiguous. The code points of final state rows follow each other 175 * in the order of the references to those final states by previous 176 * states, etc. 177 * 178 * For some terminal states, the offset is itself the output Unicode 179 * code point (16 bits for a BMP code point or 20 bits for a supplementary 180 * code point (stored as code point minus 0x10000 so that 20 bits are enough). 181 * For others, the code point in the Unicode table is stored with either 182 * one or two code units: one for BMP code points, two for a pair of 183 * surrogates. 184 * All code points for a final state entry take up the same number of code 185 * units, regardless of whether they all actually _use_ the same number 186 * of code units. This is necessary for simple array access. 187 * 188 * An additional feature comes in with what in ICU is called "fallback" 189 * mappings: 190 * 191 * In addition to round-trippable, precise, 1:1 mappings, there are often 192 * mappings defined between similar, though not the same, characters. 193 * Typically, such mappings occur only in fromUnicode mapping tables because 194 * Unicode has a superset repertoire of most other codepages. However, it 195 * is possible to provide such mappings in the toUnicode tables, too. 196 * In this case, the fallback mappings are partly integrated into the 197 * general state tables because the structure of the encoding includes their 198 * byte sequences. 199 * For final entries in an initial state, fallback mappings are stored in 200 * the entry itself like with roundtrip mappings. 201 * For other final entries, they are stored in the code units table if 202 * the entry is for a pair of code units. 
203 * For single-unit results in the code units table, there is no space to 204 * alternatively hold a fallback mapping; in this case, the code unit 205 * is stored as U+fffe (unassigned), and the fallback mapping needs to 206 * be looked up by the scalar offset value in a separate table. 207 * 208 * "Unassigned" state entries really mean "structurally unassigned", 209 * i.e., such a byte sequence will never have a mapping result. 210 * 211 * The interpretation of the bits in each entry is as follows: 212 * 213 * Bit 31 not set, not a terminal entry ("transitional"): 214 * 30..24 next state 215 * 23..0 offset delta, to be added up 216 * 217 * Bit 31 set, terminal ("final") entry: 218 * 30..24 next state (regardless of action code) 219 * 23..20 action code: 220 * action codes 0 and 1 result in precise-mapping Unicode code points 221 * 0 valid byte sequence 222 * 19..16 not used, 0 223 * 15..0 16-bit Unicode BMP code point 224 * never U+fffe or U+ffff 225 * 1 valid byte sequence 226 * 19..0 20-bit Unicode supplementary code point 227 * never U+fffe or U+ffff 228 * 229 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 230 * 2 valid byte sequence (fallback) 231 * 19..16 not used, 0 232 * 15..0 16-bit Unicode BMP code point as fallback result 233 * 3 valid byte sequence (fallback) 234 * 19..0 20-bit Unicode supplementary code point as fallback result 235 * 236 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 237 * depending on the code units they result in 238 * 4 valid byte sequence 239 * 19..9 not used, 0 240 * 8..0 final offset delta 241 * pointing to one 16-bit code unit which may be 242 * fffe unassigned -- look for a fallback for this offset 243 * ffff illegal 244 * 5 valid byte sequence 245 * 19..9 not used, 0 246 * 8..0 final offset delta 247 * pointing to two 16-bit code units 248 * (typically UTF-16 surrogates) 249 * the result depends on the first code unit as follows: 250 * 0000..d7ff 
roundtrip BMP code point (1st alone) 251 * d800..dbff roundtrip surrogate pair (1st, 2nd) 252 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 253 * e000 roundtrip BMP code point (2nd alone) 254 * e001 fallback BMP code point (2nd alone) 255 * fffe unassigned 256 * ffff illegal 257 * (the final offset deltas are at most 255 * 2, 258 * times 2 because of storing code unit pairs) 259 * 260 * 6 unassigned byte sequence 261 * 19..16 not used, 0 262 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 263 * this does not contain a final offset delta because the main 264 * purpose of this action code is to save scalar offset values; 265 * therefore, fallback values cannot be assigned to byte 266 * sequences that result in this action code 267 * 7 illegal byte sequence 268 * 19..16 not used, 0 269 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 270 * 8 state change only 271 * 19..0 not used, 0 272 * useful for state changes in simple stateful encodings, 273 * at Shift-In/Shift-Out codes 274 * 275 * 276 * 9..15 reserved for future use 277 * current implementations will only perform a state change 278 * and ignore bits 19..0 279 * 280 * An encoding with contiguous ranges of unassigned byte sequences, like 281 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 282 * at least two states for the trail bytes: 283 * One trail byte state that results in code points, and one that only 284 * has "unassigned" and "illegal" terminal states. 285 * 286 * Note: partly by accident, this data structure supports simple stateful 287 * encodings without any additional logic. 288 * Currently, only simple Shift-In/Shift-Out schemes are handled with 289 * appropriate state tables (especially EBCDIC_STATEFUL!). 
290 * 291 * MBCS version 2 added: 292 * unassigned and illegal action codes have U+fffe and U+ffff 293 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 294 * 295 * Converting from Unicode to codepage bytes --------------------------------*** 296 * 297 * The conversion data structure for fromUnicode is designed for the known 298 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 299 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 300 * a roundtrip mapping. 301 * 302 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 303 * like in the character properties table. 304 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 305 * with the resulting bytes is at offsetFromUBytes. 306 * 307 * Beginning with version 4, single-byte codepages have a significantly different 308 * trie compared to other codepages. 309 * In all cases, the entry in stage 1 is directly the index of the block of 310 * 64 entries in stage 2. 311 * 312 * Single-byte lookup: 313 * 314 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 315 * Stage 3 contains one 16-bit word per result: 316 * Bits 15..8 indicate the kind of result: 317 * f roundtrip result 318 * c fallback result from private-use code point 319 * 8 fallback result from other code points 320 * 0 unassigned 321 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 322 * 323 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 324 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 325 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 326 * ASCII code points can be looked up with a linear array access into stage 3. 327 * See maxFastUChar and other details in ucnvmbcs.h. 
328 * 329 * Multi-byte lookup: 330 * 331 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 332 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 333 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 334 * If this test is false, then a non-zero result will be interpreted as 335 * a fallback mapping. 336 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 337 * 338 * Stage 3 contains 2, 3, or 4 bytes per result. 339 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 340 * while 3 bytes are stored as bytes in big-endian order. 341 * Leading zero bytes are ignored, and the number of bytes is counted. 342 * A zero byte mapping result is possible as a roundtrip result. 343 * For some output types, the actual result is processed from this; 344 * see ucnv_MBCSFromUnicodeWithOffsets(). 345 * 346 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 347 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 348 * 349 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 350 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 351 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 352 * ASCII code points can be looked up with a linear array access into stage 3. 353 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 354 * 355 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 356 * for compaction. 357 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 358 * may overlap by any number of entries. 
359 * 360 * MBCS version 2 added: 361 * the converter checks for known output types, which allows 362 * adding new ones without crashing an unaware converter 363 */ 364 365 static const UConverterImpl _SBCSUTF8Impl; 366 static const UConverterImpl _DBCSUTF8Impl; 367 368 /* GB 18030 data ------------------------------------------------------------ */ 369 370 /* helper macros for linear values for GB 18030 four-byte sequences */ 371 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 372 373 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 374 375 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 376 377 /* 378 * Some ranges of GB 18030 where both the Unicode code points and the 379 * GB four-byte sequences are contiguous and are handled algorithmically by 380 * the special callback functions below. 381 * The values are start & end of Unicode & GB codes. 382 * 383 * Note that single surrogates are not mapped by GB 18030 384 * as of the re-released mapping tables from 2000-nov-30. 
385 */ 386 static const uint32_t 387 gb18030Ranges[13][4]={ 388 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 389 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 390 {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)}, 391 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 392 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 393 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 394 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 395 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 396 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 397 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 398 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 399 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 400 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 401 }; 402 403 /* bit flag for UConverter.options indicating GB 18030 special handling */ 404 #define _MBCS_OPTION_GB18030 0x8000 405 406 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 407 #define _MBCS_OPTION_KEIS 0x01000 408 #define _MBCS_OPTION_JEF 0x02000 409 #define _MBCS_OPTION_JIPS 0x04000 410 411 #define KEIS_SO_CHAR_1 0x0A 412 #define KEIS_SO_CHAR_2 0x42 413 #define KEIS_SI_CHAR_1 0x0A 414 #define KEIS_SI_CHAR_2 0x41 415 416 #define JEF_SO_CHAR 0x28 417 #define JEF_SI_CHAR 0x29 418 419 #define JIPS_SO_CHAR_1 0x1A 420 #define JIPS_SO_CHAR_2 0x70 421 #define JIPS_SI_CHAR_1 0x1A 422 #define JIPS_SI_CHAR_2 0x71 423 424 enum SISO_Option { 425 SI, 426 SO 427 }; 428 typedef enum SISO_Option SISO_Option; 429 430 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 431 int32_t SISOLength = 0; 432 433 switch (option) { 434 case SI: 435 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 436 value[0] = KEIS_SI_CHAR_1; 437 value[1] = KEIS_SI_CHAR_2; 438 SISOLength = 2; 439 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 440 value[0] = JEF_SI_CHAR; 441 SISOLength = 1; 
442 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 443 value[0] = JIPS_SI_CHAR_1; 444 value[1] = JIPS_SI_CHAR_2; 445 SISOLength = 2; 446 } else { 447 value[0] = UCNV_SI; 448 SISOLength = 1; 449 } 450 break; 451 case SO: 452 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 453 value[0] = KEIS_SO_CHAR_1; 454 value[1] = KEIS_SO_CHAR_2; 455 SISOLength = 2; 456 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 457 value[0] = JEF_SO_CHAR; 458 SISOLength = 1; 459 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 460 value[0] = JIPS_SO_CHAR_1; 461 value[1] = JIPS_SO_CHAR_2; 462 SISOLength = 2; 463 } else { 464 value[0] = UCNV_SO; 465 SISOLength = 1; 466 } 467 break; 468 default: 469 /* Should never happen. */ 470 break; 471 } 472 473 return SISOLength; 474 } 475 476 /* Miscellaneous ------------------------------------------------------------ */ 477 478 /** 479 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 480 * consecutive sequences of bytes, starting from the one encoded in value, 481 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 482 * Does not currently support m:n mappings or reverse fallbacks. 483 * This function will not be called for sequences of bytes with leading zeros. 
484 * 485 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 486 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 487 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 488 * not map to anything 489 * @return TRUE to continue enumeration, FALSE to stop 490 */ 491 typedef UBool U_CALLCONV 492 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 493 494 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 495 static UBool 496 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 497 int32_t state, uint32_t offset, 498 uint32_t value, 499 UConverterEnumToUCallback *callback, const void *context, 500 UErrorCode *pErrorCode) { 501 UChar32 codePoints[32]; 502 const int32_t *row; 503 const uint16_t *unicodeCodeUnits; 504 UChar32 anyCodePoints; 505 int32_t b, limit; 506 507 row=mbcsTable->stateTable[state]; 508 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 509 510 value<<=8; 511 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 512 513 b=(stateProps[state]&0x38)<<2; 514 if(b==0 && stateProps[state]>=0x40) { 515 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 516 codePoints[0]=U_SENTINEL; 517 b=1; 518 } 519 limit=((stateProps[state]&7)+1)<<5; 520 while(b<limit) { 521 int32_t entry=row[b]; 522 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 523 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 524 if(stateProps[nextState]>=0) { 525 /* recurse to a state with non-ignorable actions */ 526 if(!enumToU( 527 mbcsTable, stateProps, nextState, 528 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 529 value|(uint32_t)b, 530 callback, context, 531 pErrorCode)) { 532 return FALSE; 533 } 534 } 535 codePoints[b&0x1f]=U_SENTINEL; 536 } else { 537 UChar32 c; 538 int32_t action; 539 540 /* 541 * An if-else-if chain provides more reliable performance for 542 * the most common cases compared to a switch. 
543 */ 544 action=MBCS_ENTRY_FINAL_ACTION(entry); 545 if(action==MBCS_STATE_VALID_DIRECT_16) { 546 /* output BMP code point */ 547 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 548 } else if(action==MBCS_STATE_VALID_16) { 549 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 550 c=unicodeCodeUnits[finalOffset]; 551 if(c<0xfffe) { 552 /* output BMP code point */ 553 } else { 554 c=U_SENTINEL; 555 } 556 } else if(action==MBCS_STATE_VALID_16_PAIR) { 557 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 558 c=unicodeCodeUnits[finalOffset++]; 559 if(c<0xd800) { 560 /* output BMP code point below 0xd800 */ 561 } else if(c<=0xdbff) { 562 /* output roundtrip or fallback supplementary code point */ 563 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 564 } else if(c==0xe000) { 565 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 566 c=unicodeCodeUnits[finalOffset]; 567 } else { 568 c=U_SENTINEL; 569 } 570 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 571 /* output supplementary code point */ 572 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 573 } else { 574 c=U_SENTINEL; 575 } 576 577 codePoints[b&0x1f]=c; 578 anyCodePoints&=c; 579 } 580 if(((++b)&0x1f)==0) { 581 if(anyCodePoints>=0) { 582 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 583 return FALSE; 584 } 585 anyCodePoints=-1; 586 } 587 } 588 } 589 return TRUE; 590 } 591 592 /* 593 * Only called if stateProps[state]==-1. 594 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 595 * MBCS_STATE_CHANGE_ONLY. 
596 */ 597 static int8_t 598 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 599 const int32_t *row; 600 int32_t min, max, entry, nextState; 601 602 row=stateTable[state]; 603 stateProps[state]=0; 604 605 /* find first non-ignorable state */ 606 for(min=0;; ++min) { 607 entry=row[min]; 608 nextState=MBCS_ENTRY_STATE(entry); 609 if(stateProps[nextState]==-1) { 610 getStateProp(stateTable, stateProps, nextState); 611 } 612 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 613 if(stateProps[nextState]>=0) { 614 break; 615 } 616 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 617 break; 618 } 619 if(min==0xff) { 620 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 621 return stateProps[state]; 622 } 623 } 624 stateProps[state]|=(int8_t)((min>>5)<<3); 625 626 /* find last non-ignorable state */ 627 for(max=0xff; min<max; --max) { 628 entry=row[max]; 629 nextState=MBCS_ENTRY_STATE(entry); 630 if(stateProps[nextState]==-1) { 631 getStateProp(stateTable, stateProps, nextState); 632 } 633 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 634 if(stateProps[nextState]>=0) { 635 break; 636 } 637 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 638 break; 639 } 640 } 641 stateProps[state]|=(int8_t)(max>>5); 642 643 /* recurse further and collect direct-state information */ 644 while(min<=max) { 645 entry=row[min]; 646 nextState=MBCS_ENTRY_STATE(entry); 647 if(stateProps[nextState]==-1) { 648 getStateProp(stateTable, stateProps, nextState); 649 } 650 if(MBCS_ENTRY_IS_FINAL(entry)) { 651 stateProps[nextState]|=0x40; 652 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 653 stateProps[state]|=0x40; 654 } 655 } 656 ++min; 657 } 658 return stateProps[state]; 659 } 660 661 /* 662 * Internal function enumerating the toUnicode data of an MBCS converter. 
663 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 664 * table, but could also be used for a future ucnv_getUnicodeSet() option 665 * that includes reverse fallbacks (after updating this function's implementation). 666 * Currently only handles roundtrip mappings. 667 * Does not currently handle extensions. 668 */ 669 static void 670 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 671 UConverterEnumToUCallback *callback, const void *context, 672 UErrorCode *pErrorCode) { 673 /* 674 * Properties for each state, to speed up the enumeration. 675 * Ignorable actions are unassigned/illegal/state-change-only: 676 * They do not lead to mappings. 677 * 678 * Bits 7..6: 679 * 1 direct/initial state (stateful converters have multiple) 680 * 0 non-initial state with transitions or with non-ignorable result actions 681 * -1 final state with only ignorable actions 682 * 683 * Bits 5..3: 684 * The lowest byte value with non-ignorable actions is 685 * value<<5 (rounded down). 686 * 687 * Bits 2..0: 688 * The highest byte value with non-ignorable actions is 689 * (value<<5)&0x1f (rounded up). 
690 */ 691 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 692 int32_t state; 693 694 uprv_memset(stateProps, -1, sizeof(stateProps)); 695 696 /* recurse from state 0 and set all stateProps */ 697 getStateProp(mbcsTable->stateTable, stateProps, 0); 698 699 for(state=0; state<mbcsTable->countStates; ++state) { 700 /*if(stateProps[state]==-1) { 701 printf("unused/unreachable <icu:state> %d\n", state); 702 }*/ 703 if(stateProps[state]>=0x40) { 704 /* start from each direct state */ 705 enumToU( 706 mbcsTable, stateProps, state, 0, 0, 707 callback, context, 708 pErrorCode); 709 } 710 } 711 } 712 713 U_CFUNC void 714 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 715 const USetAdder *sa, 716 UConverterUnicodeSet which, 717 UConverterSetFilter filter, 718 UErrorCode *pErrorCode) { 719 const UConverterMBCSTable *mbcsTable; 720 const uint16_t *table; 721 722 uint32_t st3; 723 uint16_t st1, maxStage1, st2; 724 725 UChar32 c; 726 727 /* enumerate the from-Unicode trie table */ 728 mbcsTable=&sharedData->mbcs; 729 table=mbcsTable->fromUnicodeTable; 730 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 731 maxStage1=0x440; 732 } else { 733 maxStage1=0x40; 734 } 735 736 c=0; /* keep track of the current code point while enumerating */ 737 738 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 739 const uint16_t *stage2, *stage3, *results; 740 uint16_t minValue; 741 742 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 743 744 /* 745 * Set a threshold variable for selecting which mappings to use. 746 * See ucnv_MBCSSingleFromBMPWithOffsets() and 747 * MBCS_SINGLE_RESULT_FROM_U() for details. 
748 */ 749 if(which==UCNV_ROUNDTRIP_SET) { 750 /* use only roundtrips */ 751 minValue=0xf00; 752 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 753 /* use all roundtrip and fallback results */ 754 minValue=0x800; 755 } 756 757 for(st1=0; st1<maxStage1; ++st1) { 758 st2=table[st1]; 759 if(st2>maxStage1) { 760 stage2=table+st2; 761 for(st2=0; st2<64; ++st2) { 762 if((st3=stage2[st2])!=0) { 763 /* read the stage 3 block */ 764 stage3=results+st3; 765 766 do { 767 if(*stage3++>=minValue) { 768 sa->add(sa->set, c); 769 } 770 } while((++c&0xf)!=0); 771 } else { 772 c+=16; /* empty stage 3 block */ 773 } 774 } 775 } else { 776 c+=1024; /* empty stage 2 block */ 777 } 778 } 779 } else { 780 const uint32_t *stage2; 781 const uint8_t *stage3, *bytes; 782 uint32_t st3Multiplier; 783 uint32_t value; 784 UBool useFallback; 785 786 bytes=mbcsTable->fromUnicodeBytes; 787 788 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 789 790 switch(mbcsTable->outputType) { 791 case MBCS_OUTPUT_3: 792 case MBCS_OUTPUT_4_EUC: 793 st3Multiplier=3; 794 break; 795 case MBCS_OUTPUT_4: 796 st3Multiplier=4; 797 break; 798 default: 799 st3Multiplier=2; 800 break; 801 } 802 803 for(st1=0; st1<maxStage1; ++st1) { 804 st2=table[st1]; 805 if(st2>(maxStage1>>1)) { 806 stage2=(const uint32_t *)table+st2; 807 for(st2=0; st2<64; ++st2) { 808 if((st3=stage2[st2])!=0) { 809 /* read the stage 3 block */ 810 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 811 812 /* get the roundtrip flags for the stage 3 block */ 813 st3>>=16; 814 815 /* 816 * Add code points for which the roundtrip flag is set, 817 * or which map to non-zero bytes if we use fallbacks. 818 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
819 */ 820 switch(filter) { 821 case UCNV_SET_FILTER_NONE: 822 do { 823 if(st3&1) { 824 sa->add(sa->set, c); 825 stage3+=st3Multiplier; 826 } else if(useFallback) { 827 uint8_t b=0; 828 switch(st3Multiplier) { 829 case 4: 830 b|=*stage3++; 831 case 3: 832 b|=*stage3++; 833 case 2: 834 b|=stage3[0]|stage3[1]; 835 stage3+=2; 836 default: 837 break; 838 } 839 if(b!=0) { 840 sa->add(sa->set, c); 841 } 842 } 843 st3>>=1; 844 } while((++c&0xf)!=0); 845 break; 846 case UCNV_SET_FILTER_DBCS_ONLY: 847 /* Ignore single-byte results (<0x100). */ 848 do { 849 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 850 sa->add(sa->set, c); 851 } 852 st3>>=1; 853 stage3+=2; /* +=st3Multiplier */ 854 } while((++c&0xf)!=0); 855 break; 856 case UCNV_SET_FILTER_2022_CN: 857 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 858 do { 859 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 860 sa->add(sa->set, c); 861 } 862 st3>>=1; 863 stage3+=3; /* +=st3Multiplier */ 864 } while((++c&0xf)!=0); 865 break; 866 case UCNV_SET_FILTER_SJIS: 867 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 868 do { 869 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 870 sa->add(sa->set, c); 871 } 872 st3>>=1; 873 stage3+=2; /* +=st3Multiplier */ 874 } while((++c&0xf)!=0); 875 break; 876 case UCNV_SET_FILTER_GR94DBCS: 877 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */ 878 do { 879 if( ((st3&1)!=0 || useFallback) && 880 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 881 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 882 ) { 883 sa->add(sa->set, c); 884 } 885 st3>>=1; 886 stage3+=2; /* +=st3Multiplier */ 887 } while((++c&0xf)!=0); 888 break; 889 case UCNV_SET_FILTER_HZ: 890 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). 
*/ 891 do { 892 if( ((st3&1)!=0 || useFallback) && 893 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 894 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 895 ) { 896 sa->add(sa->set, c); 897 } 898 st3>>=1; 899 stage3+=2; /* +=st3Multiplier */ 900 } while((++c&0xf)!=0); 901 break; 902 default: 903 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 904 return; 905 } 906 } else { 907 c+=16; /* empty stage 3 block */ 908 } 909 } 910 } else { 911 c+=1024; /* empty stage 2 block */ 912 } 913 } 914 } 915 916 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 917 } 918 919 U_CFUNC void 920 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 921 const USetAdder *sa, 922 UConverterUnicodeSet which, 923 UErrorCode *pErrorCode) { 924 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 925 sharedData, sa, which, 926 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 927 UCNV_SET_FILTER_DBCS_ONLY : 928 UCNV_SET_FILTER_NONE, 929 pErrorCode); 930 } 931 932 static void 933 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 934 const USetAdder *sa, 935 UConverterUnicodeSet which, 936 UErrorCode *pErrorCode) { 937 if(cnv->options&_MBCS_OPTION_GB18030) { 938 sa->addRange(sa->set, 0, 0xd7ff); 939 sa->addRange(sa->set, 0xe000, 0x10ffff); 940 } else { 941 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 942 } 943 } 944 945 /* conversion extensions for input not in the main table -------------------- */ 946 947 /* 948 * Hardcoded extension handling for GB 18030. 949 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 950 * 951 * In the future, conversion extensions may handle m:n mappings and delta tables, 952 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 953 * 954 * If an input character cannot be mapped, then these functions set an error 955 * code. The framework will then call the callback function. 
956 */ 957 958 /* 959 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 960 * else return 0 after output has been written to the target 961 */ 962 static UChar32 963 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 964 UChar32 cp, 965 const UChar **source, const UChar *sourceLimit, 966 uint8_t **target, const uint8_t *targetLimit, 967 int32_t **offsets, int32_t sourceIndex, 968 UBool flush, 969 UErrorCode *pErrorCode) { 970 const int32_t *cx; 971 972 cnv->useSubChar1=FALSE; 973 974 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 975 ucnv_extInitialMatchFromU( 976 cnv, cx, 977 cp, source, sourceLimit, 978 (char **)target, (char *)targetLimit, 979 offsets, sourceIndex, 980 flush, 981 pErrorCode) 982 ) { 983 return 0; /* an extension mapping handled the input */ 984 } 985 986 /* GB 18030 */ 987 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 988 const uint32_t *range; 989 int32_t i; 990 991 range=gb18030Ranges[0]; 992 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 993 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 994 /* found the Unicode code point, output the four-byte sequence for it */ 995 uint32_t linear; 996 char bytes[4]; 997 998 /* get the linear value of the first GB 18030 code in this range */ 999 linear=range[2]-LINEAR_18030_BASE; 1000 1001 /* add the offset from the beginning of the range */ 1002 linear+=((uint32_t)cp-range[0]); 1003 1004 /* turn this into a four-byte sequence */ 1005 bytes[3]=(char)(0x30+linear%10); linear/=10; 1006 bytes[2]=(char)(0x81+linear%126); linear/=126; 1007 bytes[1]=(char)(0x30+linear%10); linear/=10; 1008 bytes[0]=(char)(0x81+linear); 1009 1010 /* output this sequence */ 1011 ucnv_fromUWriteBytes(cnv, 1012 bytes, 4, (char **)target, (char *)targetLimit, 1013 offsets, sourceIndex, pErrorCode); 1014 return 0; 1015 } 1016 } 1017 } 1018 1019 /* no mapping */ 1020 *pErrorCode=U_INVALID_CHAR_FOUND; 1021 return cp; 1022 } 1023 1024 /* 1025 * Input sequence: 
cnv->toUBytes[0..length[ 1026 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1027 * else return 0 after output has been written to the target 1028 */ 1029 static int8_t 1030 _extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1031 int8_t length, 1032 const uint8_t **source, const uint8_t *sourceLimit, 1033 UChar **target, const UChar *targetLimit, 1034 int32_t **offsets, int32_t sourceIndex, 1035 UBool flush, 1036 UErrorCode *pErrorCode) { 1037 const int32_t *cx; 1038 1039 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1040 ucnv_extInitialMatchToU( 1041 cnv, cx, 1042 length, (const char **)source, (const char *)sourceLimit, 1043 target, targetLimit, 1044 offsets, sourceIndex, 1045 flush, 1046 pErrorCode) 1047 ) { 1048 return 0; /* an extension mapping handled the input */ 1049 } 1050 1051 /* GB 18030 */ 1052 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1053 const uint32_t *range; 1054 uint32_t linear; 1055 int32_t i; 1056 1057 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1058 range=gb18030Ranges[0]; 1059 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 1060 if(range[2]<=linear && linear<=range[3]) { 1061 /* found the sequence, output the Unicode code point for it */ 1062 *pErrorCode=U_ZERO_ERROR; 1063 1064 /* add the linear difference between the input and start sequences to the start code point */ 1065 linear=range[0]+(linear-range[2]); 1066 1067 /* output this code point */ 1068 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1069 1070 return 0; 1071 } 1072 } 1073 } 1074 1075 /* no mapping */ 1076 *pErrorCode=U_INVALID_CHAR_FOUND; 1077 return length; 1078 } 1079 1080 /* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1081 1082 /* 1083 * This code modifies a standard EBCDIC<->Unicode mapping table for 1084 * OS/390 (z/OS) Unix System Services (Open Edition). 
1085 * The difference is in the mapping of Line Feed and New Line control codes: 1086 * Standard EBCDIC maps 1087 * 1088 * <U000A> \x25 |0 1089 * <U0085> \x15 |0 1090 * 1091 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1092 * mapping 1093 * 1094 * <U000A> \x15 |0 1095 * <U0085> \x25 |0 1096 * 1097 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1098 * by copying it into allocated memory and swapping the LF and NL values. 1099 * It allows to support the same EBCDIC charset in both versions without 1100 * duplicating the entire installed table. 1101 */ 1102 1103 /* standard EBCDIC codes */ 1104 #define EBCDIC_LF 0x25 1105 #define EBCDIC_NL 0x15 1106 1107 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1108 #define EBCDIC_RT_LF 0xf25 1109 #define EBCDIC_RT_NL 0xf15 1110 1111 /* Unicode code points */ 1112 #define U_LF 0x0a 1113 #define U_NL 0x85 1114 1115 static UBool 1116 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1117 UConverterMBCSTable *mbcsTable; 1118 1119 const uint16_t *table, *results; 1120 const uint8_t *bytes; 1121 1122 int32_t (*newStateTable)[256]; 1123 uint16_t *newResults; 1124 uint8_t *p; 1125 char *name; 1126 1127 uint32_t stage2Entry; 1128 uint32_t size, sizeofFromUBytes; 1129 1130 mbcsTable=&sharedData->mbcs; 1131 1132 table=mbcsTable->fromUnicodeTable; 1133 bytes=mbcsTable->fromUnicodeBytes; 1134 results=(const uint16_t *)bytes; 1135 1136 /* 1137 * Check that this is an EBCDIC table with SBCS portion - 1138 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1139 * 1140 * If not, ignore the option. Options are always ignored if they do not apply. 
1141 */ 1142 if(!( 1143 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1144 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1145 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1146 )) { 1147 return FALSE; 1148 } 1149 1150 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1151 if(!( 1152 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1153 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1154 )) { 1155 return FALSE; 1156 } 1157 } else /* MBCS_OUTPUT_2_SISO */ { 1158 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1159 if(!( 1160 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1161 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1162 )) { 1163 return FALSE; 1164 } 1165 1166 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1167 if(!( 1168 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1169 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1170 )) { 1171 return FALSE; 1172 } 1173 } 1174 1175 if(mbcsTable->fromUBytesLength>0) { 1176 /* 1177 * We _know_ the number of bytes in the fromUnicodeBytes array 1178 * starting with header.version 4.1. 1179 */ 1180 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1181 } else { 1182 /* 1183 * Otherwise: 1184 * There used to be code to enumerate the fromUnicode 1185 * trie and find the highest entry, but it was removed in ICU 3.2 1186 * because it was not tested and caused a low code coverage number. 1187 * See Jitterbug 3674. 1188 * This affects only some .cnv file formats with a header.version 1189 * below 4.1, and only when swaplfnl is requested. 1190 * 1191 * ucnvmbcs.c revision 1.99 is the last one with the 1192 * ucnv_MBCSSizeofFromUBytes() function. 1193 */ 1194 *pErrorCode=U_INVALID_FORMAT_ERROR; 1195 return FALSE; 1196 } 1197 1198 /* 1199 * The table has an appropriate format. 
1200 * Allocate and build 1201 * - a modified to-Unicode state table 1202 * - a modified from-Unicode output array 1203 * - a converter name string with the swap option appended 1204 */ 1205 size= 1206 mbcsTable->countStates*1024+ 1207 sizeofFromUBytes+ 1208 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1209 p=(uint8_t *)uprv_malloc(size); 1210 if(p==NULL) { 1211 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1212 return FALSE; 1213 } 1214 1215 /* copy and modify the to-Unicode state table */ 1216 newStateTable=(int32_t (*)[256])p; 1217 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1218 1219 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1220 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1221 1222 /* copy and modify the from-Unicode result table */ 1223 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1224 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1225 1226 /* conveniently, the table access macros work on the left side of expressions */ 1227 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1228 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1229 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1230 } else /* MBCS_OUTPUT_2_SISO */ { 1231 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1232 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1233 1234 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1235 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1236 } 1237 1238 /* set the canonical converter name */ 1239 name=(char *)newResults+sizeofFromUBytes; 1240 uprv_strcpy(name, sharedData->staticData->name); 1241 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1242 1243 /* set the pointers */ 1244 umtx_lock(NULL); 1245 if(mbcsTable->swapLFNLStateTable==NULL) { 1246 mbcsTable->swapLFNLStateTable=newStateTable; 1247 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1248 
mbcsTable->swapLFNLName=name; 1249 1250 newStateTable=NULL; 1251 } 1252 umtx_unlock(NULL); 1253 1254 /* release the allocated memory if another thread beat us to it */ 1255 if(newStateTable!=NULL) { 1256 uprv_free(newStateTable); 1257 } 1258 return TRUE; 1259 } 1260 1261 /* reconstitute omitted fromUnicode data ------------------------------------ */ 1262 1263 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1264 static UBool U_CALLCONV 1265 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1266 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1267 const uint16_t *table; 1268 uint32_t *stage2; 1269 uint8_t *bytes, *p; 1270 UChar32 c; 1271 int32_t i, st3; 1272 1273 table=mbcsTable->fromUnicodeTable; 1274 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1275 1276 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1277 switch(mbcsTable->outputType) { 1278 case MBCS_OUTPUT_3_EUC: 1279 if(value<=0xffff) { 1280 /* short sequences are stored directly */ 1281 /* code set 0 or 1 */ 1282 } else if(value<=0x8effff) { 1283 /* code set 2 */ 1284 value&=0x7fff; 1285 } else /* first byte is 0x8f */ { 1286 /* code set 3 */ 1287 value&=0xff7f; 1288 } 1289 break; 1290 case MBCS_OUTPUT_4_EUC: 1291 if(value<=0xffffff) { 1292 /* short sequences are stored directly */ 1293 /* code set 0 or 1 */ 1294 } else if(value<=0x8effffff) { 1295 /* code set 2 */ 1296 value&=0x7fffff; 1297 } else /* first byte is 0x8f */ { 1298 /* code set 3 */ 1299 value&=0xff7fff; 1300 } 1301 break; 1302 default: 1303 break; 1304 } 1305 1306 for(i=0; i<=0x1f; ++value, ++i) { 1307 c=codePoints[i]; 1308 if(c<0) { 1309 continue; 1310 } 1311 1312 /* locate the stage 2 & 3 data */ 1313 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1314 p=bytes; 1315 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1316 1317 /* write the codepage bytes into stage 3 */ 1318 switch(mbcsTable->outputType) { 1319 case MBCS_OUTPUT_3: 
1320 case MBCS_OUTPUT_4_EUC: 1321 p+=st3*3; 1322 p[0]=(uint8_t)(value>>16); 1323 p[1]=(uint8_t)(value>>8); 1324 p[2]=(uint8_t)value; 1325 break; 1326 case MBCS_OUTPUT_4: 1327 ((uint32_t *)p)[st3]=value; 1328 break; 1329 default: 1330 /* 2 bytes per character */ 1331 ((uint16_t *)p)[st3]=(uint16_t)value; 1332 break; 1333 } 1334 1335 /* set the roundtrip flag */ 1336 *stage2|=(1UL<<(16+(c&0xf))); 1337 } 1338 return TRUE; 1339 } 1340 1341 static void 1342 reconstituteData(UConverterMBCSTable *mbcsTable, 1343 uint32_t stage1Length, uint32_t stage2Length, 1344 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1345 UErrorCode *pErrorCode) { 1346 uint16_t *stage1; 1347 uint32_t *stage2; 1348 uint8_t *bytes; 1349 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1350 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1351 if(mbcsTable->reconstitutedData==NULL) { 1352 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1353 return; 1354 } 1355 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1356 1357 /* copy existing data and reroute the pointers */ 1358 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1359 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1360 1361 stage2=(uint32_t *)(stage1+stage1Length); 1362 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1363 mbcsTable->fromUnicodeTable+stage1Length, 1364 stage2Length*4); 1365 1366 mbcsTable->fromUnicodeTable=stage1; 1367 mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length); 1368 1369 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1370 stage2=(uint32_t *)stage1; 1371 1372 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1373 { 1374 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1375 int32_t stageUTF8Index=0; 1376 int32_t st1, st2, st3, i; 1377 1378 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1379 st2=stage1[st1]; 1380 if(st2!=stage1Length/2) { 1381 
/* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1382 for(i=0; i<16; ++i) { 1383 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1384 if(st3!=0) { 1385 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1386 st3>>=4; 1387 /* 1388 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1389 * allocated together as a single 64-block for access from the mbcsIndex 1390 */ 1391 stage2[st2++]=st3++; 1392 stage2[st2++]=st3++; 1393 stage2[st2++]=st3++; 1394 stage2[st2++]=st3; 1395 } else { 1396 /* no stage 3 block, skip */ 1397 st2+=4; 1398 } 1399 } 1400 } else { 1401 /* no stage 2 block, skip */ 1402 stageUTF8Index+=16; 1403 } 1404 } 1405 } 1406 1407 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1408 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1409 } 1410 1411 /* MBCS setup functions ----------------------------------------------------- */ 1412 1413 static void 1414 ucnv_MBCSLoad(UConverterSharedData *sharedData, 1415 UConverterLoadArgs *pArgs, 1416 const uint8_t *raw, 1417 UErrorCode *pErrorCode) { 1418 UDataInfo info; 1419 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1420 _MBCSHeader *header=(_MBCSHeader *)raw; 1421 uint32_t offset; 1422 uint32_t headerLength; 1423 UBool noFromU=FALSE; 1424 1425 if(header->version[0]==4) { 1426 headerLength=MBCS_HEADER_V4_LENGTH; 1427 } else if(header->version[0]==5 && header->version[1]>=3 && 1428 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1429 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1430 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1431 } else { 1432 *pErrorCode=U_INVALID_TABLE_FORMAT; 1433 return; 1434 } 1435 1436 mbcsTable->outputType=(uint8_t)header->flags; 1437 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1438 *pErrorCode=U_INVALID_TABLE_FORMAT; 1439 return; 1440 } 1441 1442 /* extension data, header version 4.2 and higher */ 1443 
offset=header->flags>>8; 1444 if(offset!=0) { 1445 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1446 } 1447 1448 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1449 UConverterLoadArgs args={ 0 }; 1450 UConverterSharedData *baseSharedData; 1451 const int32_t *extIndexes; 1452 const char *baseName; 1453 1454 /* extension-only file, load the base table and set values appropriately */ 1455 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1456 /* extension-only file without extension */ 1457 *pErrorCode=U_INVALID_TABLE_FORMAT; 1458 return; 1459 } 1460 1461 if(pArgs->nestedLoads!=1) { 1462 /* an extension table must not be loaded as a base table */ 1463 *pErrorCode=U_INVALID_TABLE_FILE; 1464 return; 1465 } 1466 1467 /* load the base table */ 1468 baseName=(const char *)header+headerLength*4; 1469 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1470 /* forbid loading this same extension-only file */ 1471 *pErrorCode=U_INVALID_TABLE_FORMAT; 1472 return; 1473 } 1474 1475 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1476 args.size=sizeof(UConverterLoadArgs); 1477 args.nestedLoads=2; 1478 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1479 args.reserved=pArgs->reserved; 1480 args.options=pArgs->options; 1481 args.pkg=pArgs->pkg; 1482 args.name=baseName; 1483 baseSharedData=ucnv_load(&args, pErrorCode); 1484 if(U_FAILURE(*pErrorCode)) { 1485 return; 1486 } 1487 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1488 baseSharedData->mbcs.baseSharedData!=NULL 1489 ) { 1490 ucnv_unload(baseSharedData); 1491 *pErrorCode=U_INVALID_TABLE_FORMAT; 1492 return; 1493 } 1494 if(pArgs->onlyTestIsLoadable) { 1495 /* 1496 * Exit as soon as we know that we can load the converter 1497 * and the format is valid and supported. 1498 * The worst that can happen in the following code is a memory 1499 * allocation error. 
1500 */ 1501 ucnv_unload(baseSharedData); 1502 return; 1503 } 1504 1505 /* copy the base table data */ 1506 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1507 1508 /* overwrite values with relevant ones for the extension converter */ 1509 mbcsTable->baseSharedData=baseSharedData; 1510 mbcsTable->extIndexes=extIndexes; 1511 1512 /* 1513 * It would be possible to share the swapLFNL data with a base converter, 1514 * but the generated name would have to be different, and the memory 1515 * would have to be free'd only once. 1516 * It is easier to just create the data for the extension converter 1517 * separately when it is requested. 1518 */ 1519 mbcsTable->swapLFNLStateTable=NULL; 1520 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1521 mbcsTable->swapLFNLName=NULL; 1522 1523 /* 1524 * The reconstitutedData must be deleted only when the base converter 1525 * is unloaded. 1526 */ 1527 mbcsTable->reconstitutedData=NULL; 1528 1529 /* 1530 * Set a special, runtime-only outputType if the extension converter 1531 * is a DBCS version of a base converter that also maps single bytes. 
1532 */ 1533 if( sharedData->staticData->conversionType==UCNV_DBCS || 1534 (sharedData->staticData->conversionType==UCNV_MBCS && 1535 sharedData->staticData->minBytesPerChar>=2) 1536 ) { 1537 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1538 /* the base converter is SI/SO-stateful */ 1539 int32_t entry; 1540 1541 /* get the dbcs state from the state table entry for SO=0x0e */ 1542 entry=mbcsTable->stateTable[0][0xe]; 1543 if( MBCS_ENTRY_IS_FINAL(entry) && 1544 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1545 MBCS_ENTRY_FINAL_STATE(entry)!=0 1546 ) { 1547 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1548 1549 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1550 } 1551 } else if( 1552 baseSharedData->staticData->conversionType==UCNV_MBCS && 1553 baseSharedData->staticData->minBytesPerChar==1 && 1554 baseSharedData->staticData->maxBytesPerChar==2 && 1555 mbcsTable->countStates<=127 1556 ) { 1557 /* non-stateful base converter, need to modify the state table */ 1558 int32_t (*newStateTable)[256]; 1559 int32_t *state; 1560 int32_t i, count; 1561 1562 /* allocate a new state table and copy the base state table contents */ 1563 count=mbcsTable->countStates; 1564 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1565 if(newStateTable==NULL) { 1566 ucnv_unload(baseSharedData); 1567 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1568 return; 1569 } 1570 1571 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1572 1573 /* change all final single-byte entries to go to a new all-illegal state */ 1574 state=newStateTable[0]; 1575 for(i=0; i<256; ++i) { 1576 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1577 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1578 } 1579 } 1580 1581 /* build the new all-illegal state */ 1582 state=newStateTable[count]; 1583 for(i=0; i<256; ++i) { 1584 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1585 } 1586 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1587 
mbcsTable->countStates=(uint8_t)(count+1); 1588 mbcsTable->stateTableOwned=TRUE; 1589 1590 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1591 } 1592 } 1593 1594 /* 1595 * unlike below for files with base tables, do not get the unicodeMask 1596 * from the sharedData; instead, use the base table's unicodeMask, 1597 * which we copied in the memcpy above; 1598 * this is necessary because the static data unicodeMask, especially 1599 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1600 */ 1601 } else { 1602 /* conversion file with a base table; an additional extension table is optional */ 1603 /* make sure that the output type is known */ 1604 switch(mbcsTable->outputType) { 1605 case MBCS_OUTPUT_1: 1606 case MBCS_OUTPUT_2: 1607 case MBCS_OUTPUT_3: 1608 case MBCS_OUTPUT_4: 1609 case MBCS_OUTPUT_3_EUC: 1610 case MBCS_OUTPUT_4_EUC: 1611 case MBCS_OUTPUT_2_SISO: 1612 /* OK */ 1613 break; 1614 default: 1615 *pErrorCode=U_INVALID_TABLE_FORMAT; 1616 return; 1617 } 1618 if(pArgs->onlyTestIsLoadable) { 1619 /* 1620 * Exit as soon as we know that we can load the converter 1621 * and the format is valid and supported. 1622 * The worst that can happen in the following code is a memory 1623 * allocation error. 
1624 */ 1625 return; 1626 } 1627 1628 mbcsTable->countStates=(uint8_t)header->countStates; 1629 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1630 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1631 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1632 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1633 1634 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1635 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1636 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1637 1638 /* 1639 * converter versions 6.1 and up contain a unicodeMask that is 1640 * used here to select the most efficient function implementations 1641 */ 1642 info.size=sizeof(UDataInfo); 1643 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1644 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1645 /* mask off possible future extensions to be safe */ 1646 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1647 } else { 1648 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1649 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1650 } 1651 1652 /* 1653 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1654 * Check for the header version, SBCS vs. MBCS, and for whether the 1655 * data structures are optimized for code points as high as what the 1656 * runtime code is designed for. 1657 * The implementation does not handle mapping tables with entries for 1658 * unpaired surrogates. 1659 */ 1660 if( header->version[1]>=3 && 1661 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1662 (mbcsTable->countStates==1 ? 
1663 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1664 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1665 ) 1666 ) { 1667 mbcsTable->utf8Friendly=TRUE; 1668 1669 if(mbcsTable->countStates==1) { 1670 /* 1671 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1672 * Build a table with indexes to each block, to be used instead of 1673 * the regular stage 1/2 table. 1674 */ 1675 int32_t i; 1676 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1677 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1678 } 1679 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1680 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1681 } else { 1682 /* 1683 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1684 * The .cnv file is prebuilt with an additional stage table with indexes 1685 * to each block. 1686 */ 1687 mbcsTable->mbcsIndex=(const uint16_t *) 1688 (mbcsTable->fromUnicodeBytes+ 1689 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1690 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1691 } 1692 } 1693 1694 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1695 { 1696 uint32_t asciiRoundtrips=0xffffffff; 1697 int32_t i; 1698 1699 for(i=0; i<0x80; ++i) { 1700 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1701 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1702 } 1703 } 1704 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1705 } 1706 1707 if(noFromU) { 1708 uint32_t stage1Length= 1709 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
1710 0x440 : 0x40; 1711 uint32_t stage2Length= 1712 (header->offsetFromUBytes-header->offsetFromUTable)/4- 1713 stage1Length/2; 1714 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1715 } 1716 } 1717 1718 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1719 if(mbcsTable->utf8Friendly) { 1720 if(mbcsTable->countStates==1) { 1721 sharedData->impl=&_SBCSUTF8Impl; 1722 } else { 1723 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1724 sharedData->impl=&_DBCSUTF8Impl; 1725 } 1726 } 1727 } 1728 1729 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1730 /* 1731 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1732 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 1733 */ 1734 mbcsTable->asciiRoundtrips=0; 1735 } 1736 } 1737 1738 static void 1739 ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1740 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1741 1742 if(mbcsTable->swapLFNLStateTable!=NULL) { 1743 uprv_free(mbcsTable->swapLFNLStateTable); 1744 } 1745 if(mbcsTable->stateTableOwned) { 1746 uprv_free((void *)mbcsTable->stateTable); 1747 } 1748 if(mbcsTable->baseSharedData!=NULL) { 1749 ucnv_unload(mbcsTable->baseSharedData); 1750 } 1751 if(mbcsTable->reconstitutedData!=NULL) { 1752 uprv_free(mbcsTable->reconstitutedData); 1753 } 1754 } 1755 1756 static void 1757 ucnv_MBCSOpen(UConverter *cnv, 1758 UConverterLoadArgs *pArgs, 1759 UErrorCode *pErrorCode) { 1760 UConverterMBCSTable *mbcsTable; 1761 const int32_t *extIndexes; 1762 uint8_t outputType; 1763 int8_t maxBytesPerUChar; 1764 1765 if(pArgs->onlyTestIsLoadable) { 1766 return; 1767 } 1768 1769 mbcsTable=&cnv->sharedData->mbcs; 1770 outputType=mbcsTable->outputType; 1771 1772 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1773 /* the swaplfnl option does not apply, remove it */ 1774 
cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1775 } 1776 1777 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1778 /* do this because double-checked locking is broken */ 1779 UBool isCached; 1780 1781 umtx_lock(NULL); 1782 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1783 umtx_unlock(NULL); 1784 1785 if(!isCached) { 1786 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1787 if(U_FAILURE(*pErrorCode)) { 1788 return; /* something went wrong */ 1789 } 1790 1791 /* the option does not apply, remove it */ 1792 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1793 } 1794 } 1795 } 1796 1797 if(uprv_strstr(pArgs->name, "18030")!=NULL) { 1798 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1799 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1800 cnv->options|=_MBCS_OPTION_GB18030; 1801 } 1802 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) { 1803 /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1804 cnv->options|=_MBCS_OPTION_KEIS; 1805 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) { 1806 /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1807 cnv->options|=_MBCS_OPTION_JEF; 1808 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) { 1809 /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1810 cnv->options|=_MBCS_OPTION_JIPS; 1811 } 1812 1813 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1814 if(outputType==MBCS_OUTPUT_2_SISO) { 1815 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1816 } 1817 1818 extIndexes=mbcsTable->extIndexes; 1819 if(extIndexes!=NULL) { 1820 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1821 if(outputType==MBCS_OUTPUT_2_SISO) { 1822 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1823 } 1824 1825 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1826 cnv->maxBytesPerUChar=maxBytesPerUChar; 1827 } 1828 } 1829 1830 #if 0 1831 /* 1832 * documentation of UConverter fields used for status 1833 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1834 */ 1835 1836 /* toUnicode */ 1837 cnv->toUnicodeStatus=0; /* offset */ 1838 cnv->mode=0; /* state */ 1839 cnv->toULength=0; /* byteIndex */ 1840 1841 /* fromUnicode */ 1842 cnv->fromUChar32=0; 1843 cnv->fromUnicodeStatus=1; /* prevLength */ 1844 #endif 1845 } 1846 1847 static const char * 1848 ucnv_MBCSGetName(const UConverter *cnv) { 1849 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1850 return cnv->sharedData->mbcs.swapLFNLName; 1851 } else { 1852 return cnv->sharedData->staticData->name; 1853 } 1854 } 1855 1856 /* MBCS-to-Unicode conversion functions ------------------------------------- */ 1857 1858 static UChar32 1859 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 1860 const _MBCSToUFallback *toUFallbacks; 1861 uint32_t i, start, limit; 1862 1863 limit=mbcsTable->countToUFallbacks; 1864 if(limit>0) { 1865 /* do a binary search for the fallback mapping */ 1866 toUFallbacks=mbcsTable->toUFallbacks; 1867 start=0; 1868 while(start<limit-1) { 1869 i=(start+limit)/2; 1870 if(offset<toUFallbacks[i].offset) { 1871 limit=i; 1872 } else { 1873 start=i; 1874 } 1875 } 1876 1877 /* did we really find it? 
*/ 1878 if(offset==toUFallbacks[start].offset) { 1879 return toUFallbacks[start].codePoint; 1880 } 1881 } 1882 1883 return 0xfffe; 1884 } 1885 1886 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ 1887 static void 1888 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1889 UErrorCode *pErrorCode) { 1890 UConverter *cnv; 1891 const uint8_t *source, *sourceLimit; 1892 UChar *target; 1893 const UChar *targetLimit; 1894 int32_t *offsets; 1895 1896 const int32_t (*stateTable)[256]; 1897 1898 int32_t sourceIndex; 1899 1900 int32_t entry; 1901 UChar c; 1902 uint8_t action; 1903 1904 /* set up the local pointers */ 1905 cnv=pArgs->converter; 1906 source=(const uint8_t *)pArgs->source; 1907 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1908 target=pArgs->target; 1909 targetLimit=pArgs->targetLimit; 1910 offsets=pArgs->offsets; 1911 1912 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1913 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1914 } else { 1915 stateTable=cnv->sharedData->mbcs.stateTable; 1916 } 1917 1918 /* sourceIndex=-1 if the current character began in the previous buffer */ 1919 sourceIndex=0; 1920 1921 /* conversion loop */ 1922 while(source<sourceLimit) { 1923 /* 1924 * This following test is to see if available input would overflow the output. 1925 * It does not catch output of more than one code unit that 1926 * overflows as a result of a surrogate pair or callback output 1927 * from the last source byte. 1928 * Therefore, those situations also test for overflows and will 1929 * then break the loop, too. 
1930 */ 1931 if(target>=targetLimit) { 1932 /* target is full */ 1933 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1934 break; 1935 } 1936 1937 entry=stateTable[0][*source++]; 1938 /* MBCS_ENTRY_IS_FINAL(entry) */ 1939 1940 /* test the most common case first */ 1941 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1942 /* output BMP code point */ 1943 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1944 if(offsets!=NULL) { 1945 *offsets++=sourceIndex; 1946 } 1947 1948 /* normal end of action codes: prepare for a new character */ 1949 ++sourceIndex; 1950 continue; 1951 } 1952 1953 /* 1954 * An if-else-if chain provides more reliable performance for 1955 * the most common cases compared to a switch. 1956 */ 1957 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1958 if(action==MBCS_STATE_VALID_DIRECT_20 || 1959 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1960 ) { 1961 entry=MBCS_ENTRY_FINAL_VALUE(entry); 1962 /* output surrogate pair */ 1963 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 1964 if(offsets!=NULL) { 1965 *offsets++=sourceIndex; 1966 } 1967 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 1968 if(target<targetLimit) { 1969 *target++=c; 1970 if(offsets!=NULL) { 1971 *offsets++=sourceIndex; 1972 } 1973 } else { 1974 /* target overflow */ 1975 cnv->UCharErrorBuffer[0]=c; 1976 cnv->UCharErrorBufferLength=1; 1977 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1978 break; 1979 } 1980 1981 ++sourceIndex; 1982 continue; 1983 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1984 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1985 /* output BMP code point */ 1986 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1987 if(offsets!=NULL) { 1988 *offsets++=sourceIndex; 1989 } 1990 1991 ++sourceIndex; 1992 continue; 1993 } 1994 } else if(action==MBCS_STATE_UNASSIGNED) { 1995 /* just fall through */ 1996 } else if(action==MBCS_STATE_ILLEGAL) { 1997 /* callback(illegal) */ 1998 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1999 } else { 2000 /* reserved, must never occur */ 2001 
++sourceIndex; 2002 continue; 2003 } 2004 2005 if(U_FAILURE(*pErrorCode)) { 2006 /* callback(illegal) */ 2007 break; 2008 } else /* unassigned sequences indicated with byteIndex>0 */ { 2009 /* try an extension mapping */ 2010 pArgs->source=(const char *)source; 2011 cnv->toUBytes[0]=*(source-1); 2012 cnv->toULength=_extToU(cnv, cnv->sharedData, 2013 1, &source, sourceLimit, 2014 &target, targetLimit, 2015 &offsets, sourceIndex, 2016 pArgs->flush, 2017 pErrorCode); 2018 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 2019 2020 if(U_FAILURE(*pErrorCode)) { 2021 /* not mappable or buffer overflow */ 2022 break; 2023 } 2024 } 2025 } 2026 2027 /* write back the updated pointers */ 2028 pArgs->source=(const char *)source; 2029 pArgs->target=target; 2030 pArgs->offsets=offsets; 2031 } 2032 2033 /* 2034 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2035 * that only map to and from the BMP. 2036 * In addition to single-byte optimizations, the offset calculations 2037 * become much easier. 
2038 */ 2039 static void 2040 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2041 UErrorCode *pErrorCode) { 2042 UConverter *cnv; 2043 const uint8_t *source, *sourceLimit, *lastSource; 2044 UChar *target; 2045 int32_t targetCapacity, length; 2046 int32_t *offsets; 2047 2048 const int32_t (*stateTable)[256]; 2049 2050 int32_t sourceIndex; 2051 2052 int32_t entry; 2053 uint8_t action; 2054 2055 /* set up the local pointers */ 2056 cnv=pArgs->converter; 2057 source=(const uint8_t *)pArgs->source; 2058 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2059 target=pArgs->target; 2060 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2061 offsets=pArgs->offsets; 2062 2063 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2064 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2065 } else { 2066 stateTable=cnv->sharedData->mbcs.stateTable; 2067 } 2068 2069 /* sourceIndex=-1 if the current character began in the previous buffer */ 2070 sourceIndex=0; 2071 lastSource=source; 2072 2073 /* 2074 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 2075 * for the minimum of the sourceLength and targetCapacity 2076 */ 2077 length=(int32_t)(sourceLimit-source); 2078 if(length<targetCapacity) { 2079 targetCapacity=length; 2080 } 2081 2082 #if MBCS_UNROLL_SINGLE_TO_BMP 2083 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2084 /* unroll the loop with the most common case */ 2085 unrolled: 2086 if(targetCapacity>=16) { 2087 int32_t count, loops, oredEntries; 2088 2089 loops=count=targetCapacity>>4; 2090 do { 2091 oredEntries=entry=stateTable[0][*source++]; 2092 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2093 oredEntries|=entry=stateTable[0][*source++]; 2094 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2095 oredEntries|=entry=stateTable[0][*source++]; 2096 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2097 oredEntries|=entry=stateTable[0][*source++]; 2098 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2099 oredEntries|=entry=stateTable[0][*source++]; 2100 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2101 oredEntries|=entry=stateTable[0][*source++]; 2102 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2103 oredEntries|=entry=stateTable[0][*source++]; 2104 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2105 oredEntries|=entry=stateTable[0][*source++]; 2106 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2107 oredEntries|=entry=stateTable[0][*source++]; 2108 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2109 oredEntries|=entry=stateTable[0][*source++]; 2110 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2111 oredEntries|=entry=stateTable[0][*source++]; 2112 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2113 oredEntries|=entry=stateTable[0][*source++]; 2114 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2115 oredEntries|=entry=stateTable[0][*source++]; 2116 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2117 oredEntries|=entry=stateTable[0][*source++]; 2118 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2119 oredEntries|=entry=stateTable[0][*source++]; 2120 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2121 oredEntries|=entry=stateTable[0][*source++]; 2122 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2123 2124 /* were all 16 entries really valid? 
*/ 2125 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2126 /* no, return to the first of these 16 */ 2127 source-=16; 2128 target-=16; 2129 break; 2130 } 2131 } while(--count>0); 2132 count=loops-count; 2133 targetCapacity-=16*count; 2134 2135 if(offsets!=NULL) { 2136 lastSource+=16*count; 2137 while(count>0) { 2138 *offsets++=sourceIndex++; 2139 *offsets++=sourceIndex++; 2140 *offsets++=sourceIndex++; 2141 *offsets++=sourceIndex++; 2142 *offsets++=sourceIndex++; 2143 *offsets++=sourceIndex++; 2144 *offsets++=sourceIndex++; 2145 *offsets++=sourceIndex++; 2146 *offsets++=sourceIndex++; 2147 *offsets++=sourceIndex++; 2148 *offsets++=sourceIndex++; 2149 *offsets++=sourceIndex++; 2150 *offsets++=sourceIndex++; 2151 *offsets++=sourceIndex++; 2152 *offsets++=sourceIndex++; 2153 *offsets++=sourceIndex++; 2154 --count; 2155 } 2156 } 2157 } 2158 #endif 2159 2160 /* conversion loop */ 2161 while(targetCapacity > 0 && source < sourceLimit) { 2162 entry=stateTable[0][*source++]; 2163 /* MBCS_ENTRY_IS_FINAL(entry) */ 2164 2165 /* test the most common case first */ 2166 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2167 /* output BMP code point */ 2168 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2169 --targetCapacity; 2170 continue; 2171 } 2172 2173 /* 2174 * An if-else-if chain provides more reliable performance for 2175 * the most common cases compared to a switch. 
2176 */ 2177 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2178 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2179 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2180 /* output BMP code point */ 2181 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2182 --targetCapacity; 2183 continue; 2184 } 2185 } else if(action==MBCS_STATE_UNASSIGNED) { 2186 /* just fall through */ 2187 } else if(action==MBCS_STATE_ILLEGAL) { 2188 /* callback(illegal) */ 2189 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2190 } else { 2191 /* reserved, must never occur */ 2192 continue; 2193 } 2194 2195 /* set offsets since the start or the last extension */ 2196 if(offsets!=NULL) { 2197 int32_t count=(int32_t)(source-lastSource); 2198 2199 /* predecrement: do not set the offset for the callback-causing character */ 2200 while(--count>0) { 2201 *offsets++=sourceIndex++; 2202 } 2203 /* offset and sourceIndex are now set for the current character */ 2204 } 2205 2206 if(U_FAILURE(*pErrorCode)) { 2207 /* callback(illegal) */ 2208 break; 2209 } else /* unassigned sequences indicated with byteIndex>0 */ { 2210 /* try an extension mapping */ 2211 lastSource=source; 2212 cnv->toUBytes[0]=*(source-1); 2213 cnv->toULength=_extToU(cnv, cnv->sharedData, 2214 1, &source, sourceLimit, 2215 &target, pArgs->targetLimit, 2216 &offsets, sourceIndex, 2217 pArgs->flush, 2218 pErrorCode); 2219 sourceIndex+=1+(int32_t)(source-lastSource); 2220 2221 if(U_FAILURE(*pErrorCode)) { 2222 /* not mappable or buffer overflow */ 2223 break; 2224 } 2225 2226 /* recalculate the targetCapacity after an extension mapping */ 2227 targetCapacity=(int32_t)(pArgs->targetLimit-target); 2228 length=(int32_t)(sourceLimit-source); 2229 if(length<targetCapacity) { 2230 targetCapacity=length; 2231 } 2232 } 2233 2234 #if MBCS_UNROLL_SINGLE_TO_BMP 2235 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2236 goto unrolled; 2237 #endif 2238 } 2239 2240 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2241 /* target is full 
*/ 2242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2243 } 2244 2245 /* set offsets since the start or the last callback */ 2246 if(offsets!=NULL) { 2247 size_t count=source-lastSource; 2248 while(count>0) { 2249 *offsets++=sourceIndex++; 2250 --count; 2251 } 2252 } 2253 2254 /* write back the updated pointers */ 2255 pArgs->source=(const char *)source; 2256 pArgs->target=target; 2257 pArgs->offsets=offsets; 2258 } 2259 2260 static UBool 2261 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2262 const int32_t *row=stateTable[state]; 2263 int32_t b, entry; 2264 /* First test for final entries in this state for some commonly valid byte values. */ 2265 entry=row[0xa1]; 2266 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2267 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2268 ) { 2269 return TRUE; 2270 } 2271 entry=row[0x41]; 2272 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2273 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2274 ) { 2275 return TRUE; 2276 } 2277 /* Then test for final entries in this state. */ 2278 for(b=0; b<=0xff; ++b) { 2279 entry=row[b]; 2280 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2281 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2282 ) { 2283 return TRUE; 2284 } 2285 } 2286 /* Then recurse for transition entries. */ 2287 for(b=0; b<=0xff; ++b) { 2288 entry=row[b]; 2289 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2290 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 2291 ) { 2292 return TRUE; 2293 } 2294 } 2295 return FALSE; 2296 } 2297 2298 /* 2299 * Is byte b a single/lead byte in this state? 2300 * Recurse for transition states, because here we don't want to say that 2301 * b is a lead byte if all byte sequences that start with b are illegal. 
2302 */ 2303 static UBool 2304 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2305 const int32_t *row=stateTable[state]; 2306 int32_t entry=row[b]; 2307 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2308 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 2309 } else { 2310 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2311 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2312 return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2313 } else { 2314 return action!=MBCS_STATE_ILLEGAL; 2315 } 2316 } 2317 } 2318 2319 U_CFUNC void 2320 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2321 UErrorCode *pErrorCode) { 2322 UConverter *cnv; 2323 const uint8_t *source, *sourceLimit; 2324 UChar *target; 2325 const UChar *targetLimit; 2326 int32_t *offsets; 2327 2328 const int32_t (*stateTable)[256]; 2329 const uint16_t *unicodeCodeUnits; 2330 2331 uint32_t offset; 2332 uint8_t state; 2333 int8_t byteIndex; 2334 uint8_t *bytes; 2335 2336 int32_t sourceIndex, nextSourceIndex; 2337 2338 int32_t entry; 2339 UChar c; 2340 uint8_t action; 2341 2342 /* use optimized function if possible */ 2343 cnv=pArgs->converter; 2344 2345 if(cnv->preToULength>0) { 2346 /* 2347 * pass sourceIndex=-1 because we continue from an earlier buffer 2348 * in the future, this may change with continuous offsets 2349 */ 2350 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2351 2352 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2353 return; 2354 } 2355 } 2356 2357 if(cnv->sharedData->mbcs.countStates==1) { 2358 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2359 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2360 } else { 2361 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2362 } 2363 return; 2364 } 2365 2366 /* set up the local pointers */ 2367 source=(const uint8_t *)pArgs->source; 2368 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2369 
target=pArgs->target; 2370 targetLimit=pArgs->targetLimit; 2371 offsets=pArgs->offsets; 2372 2373 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2374 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2375 } else { 2376 stateTable=cnv->sharedData->mbcs.stateTable; 2377 } 2378 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2379 2380 /* get the converter state from UConverter */ 2381 offset=cnv->toUnicodeStatus; 2382 byteIndex=cnv->toULength; 2383 bytes=cnv->toUBytes; 2384 2385 /* 2386 * if we are in the SBCS state for a DBCS-only converter, 2387 * then load the DBCS state from the MBCS data 2388 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2389 */ 2390 if((state=(uint8_t)(cnv->mode))==0) { 2391 state=cnv->sharedData->mbcs.dbcsOnlyState; 2392 } 2393 2394 /* sourceIndex=-1 if the current character began in the previous buffer */ 2395 sourceIndex=byteIndex==0 ? 0 : -1; 2396 nextSourceIndex=0; 2397 2398 /* conversion loop */ 2399 while(source<sourceLimit) { 2400 /* 2401 * This following test is to see if available input would overflow the output. 2402 * It does not catch output of more than one code unit that 2403 * overflows as a result of a surrogate pair or callback output 2404 * from the last source byte. 2405 * Therefore, those situations also test for overflows and will 2406 * then break the loop, too. 
2407 */ 2408 if(target>=targetLimit) { 2409 /* target is full */ 2410 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2411 break; 2412 } 2413 2414 if(byteIndex==0) { 2415 /* optimized loop for 1/2-byte input and BMP output */ 2416 if(offsets==NULL) { 2417 do { 2418 entry=stateTable[state][*source]; 2419 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2420 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2421 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2422 2423 ++source; 2424 if( source<sourceLimit && 2425 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2426 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2427 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2428 ) { 2429 ++source; 2430 *target++=c; 2431 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2432 offset=0; 2433 } else { 2434 /* set the state and leave the optimized loop */ 2435 bytes[0]=*(source-1); 2436 byteIndex=1; 2437 break; 2438 } 2439 } else { 2440 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2441 /* output BMP code point */ 2442 ++source; 2443 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2444 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2445 } else { 2446 /* leave the optimized loop */ 2447 break; 2448 } 2449 } 2450 } while(source<sourceLimit && target<targetLimit); 2451 } else /* offsets!=NULL */ { 2452 do { 2453 entry=stateTable[state][*source]; 2454 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2455 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2456 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2457 2458 ++source; 2459 if( source<sourceLimit && 2460 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2461 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2462 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2463 ) { 2464 ++source; 2465 *target++=c; 2466 if(offsets!=NULL) { 2467 *offsets++=sourceIndex; 2468 sourceIndex=(nextSourceIndex+=2); 2469 } 2470 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 2471 offset=0; 2472 } else { 2473 /* set the state and leave the optimized loop */ 2474 ++nextSourceIndex; 2475 bytes[0]=*(source-1); 2476 byteIndex=1; 2477 break; 2478 } 2479 } else { 2480 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2481 /* output BMP code point */ 2482 ++source; 2483 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2484 if(offsets!=NULL) { 2485 *offsets++=sourceIndex; 2486 sourceIndex=++nextSourceIndex; 2487 } 2488 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2489 } else { 2490 /* leave the optimized loop */ 2491 break; 2492 } 2493 } 2494 } while(source<sourceLimit && target<targetLimit); 2495 } 2496 2497 /* 2498 * these tests and break statements could be put inside the loop 2499 * if C had "break outerLoop" like Java 2500 */ 2501 if(source>=sourceLimit) { 2502 break; 2503 } 2504 if(target>=targetLimit) { 2505 /* target is full */ 2506 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2507 break; 2508 } 2509 2510 ++nextSourceIndex; 2511 bytes[byteIndex++]=*source++; 2512 } else /* byteIndex>0 */ { 2513 ++nextSourceIndex; 2514 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2515 } 2516 2517 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2518 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2519 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2520 continue; 2521 } 2522 2523 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2524 cnv->mode=state; 2525 2526 /* set the next state early so that we can reuse the entry variable */ 2527 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2528 2529 /* 2530 * An if-else-if chain provides more reliable performance for 2531 * the most common cases compared to a switch. 
2532 */ 2533 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2534 if(action==MBCS_STATE_VALID_16) { 2535 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2536 c=unicodeCodeUnits[offset]; 2537 if(c<0xfffe) { 2538 /* output BMP code point */ 2539 *target++=c; 2540 if(offsets!=NULL) { 2541 *offsets++=sourceIndex; 2542 } 2543 byteIndex=0; 2544 } else if(c==0xfffe) { 2545 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2546 /* output fallback BMP code point */ 2547 *target++=(UChar)entry; 2548 if(offsets!=NULL) { 2549 *offsets++=sourceIndex; 2550 } 2551 byteIndex=0; 2552 } 2553 } else { 2554 /* callback(illegal) */ 2555 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2556 } 2557 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2558 /* output BMP code point */ 2559 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2560 if(offsets!=NULL) { 2561 *offsets++=sourceIndex; 2562 } 2563 byteIndex=0; 2564 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2565 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2566 c=unicodeCodeUnits[offset++]; 2567 if(c<0xd800) { 2568 /* output BMP code point below 0xd800 */ 2569 *target++=c; 2570 if(offsets!=NULL) { 2571 *offsets++=sourceIndex; 2572 } 2573 byteIndex=0; 2574 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2575 /* output roundtrip or fallback surrogate pair */ 2576 *target++=(UChar)(c&0xdbff); 2577 if(offsets!=NULL) { 2578 *offsets++=sourceIndex; 2579 } 2580 byteIndex=0; 2581 if(target<targetLimit) { 2582 *target++=unicodeCodeUnits[offset]; 2583 if(offsets!=NULL) { 2584 *offsets++=sourceIndex; 2585 } 2586 } else { 2587 /* target overflow */ 2588 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2589 cnv->UCharErrorBufferLength=1; 2590 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2591 2592 offset=0; 2593 break; 2594 } 2595 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2596 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2597 *target++=unicodeCodeUnits[offset]; 2598 if(offsets!=NULL) { 2599 *offsets++=sourceIndex; 2600 } 2601 byteIndex=0; 2602 } else if(c==0xffff) { 2603 /* callback(illegal) */ 2604 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2605 } 2606 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2607 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2608 ) { 2609 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2610 /* output surrogate pair */ 2611 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2612 if(offsets!=NULL) { 2613 *offsets++=sourceIndex; 2614 } 2615 byteIndex=0; 2616 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2617 if(target<targetLimit) { 2618 *target++=c; 2619 if(offsets!=NULL) { 2620 *offsets++=sourceIndex; 2621 } 2622 } else { 2623 /* target overflow */ 2624 cnv->UCharErrorBuffer[0]=c; 2625 cnv->UCharErrorBufferLength=1; 2626 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2627 2628 offset=0; 2629 break; 2630 } 2631 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2632 /* 2633 * This serves as a state change without any output. 2634 * It is useful for reading simple stateful encodings, 2635 * for example using just Shift-In/Shift-Out codes. 2636 * The 21 unused bits may later be used for more sophisticated 2637 * state transitions. 
2638 */ 2639 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2640 byteIndex=0; 2641 } else { 2642 /* SI/SO are illegal for DBCS-only conversion */ 2643 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2644 2645 /* callback(illegal) */ 2646 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2647 } 2648 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2649 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2650 /* output BMP code point */ 2651 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2652 if(offsets!=NULL) { 2653 *offsets++=sourceIndex; 2654 } 2655 byteIndex=0; 2656 } 2657 } else if(action==MBCS_STATE_UNASSIGNED) { 2658 /* just fall through */ 2659 } else if(action==MBCS_STATE_ILLEGAL) { 2660 /* callback(illegal) */ 2661 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2662 } else { 2663 /* reserved, must never occur */ 2664 byteIndex=0; 2665 } 2666 2667 /* end of action codes: prepare for a new character */ 2668 offset=0; 2669 2670 if(byteIndex==0) { 2671 sourceIndex=nextSourceIndex; 2672 } else if(U_FAILURE(*pErrorCode)) { 2673 /* callback(illegal) */ 2674 if(byteIndex>1) { 2675 /* 2676 * Ticket 5691: consistent illegal sequences: 2677 * - We include at least the first byte in the illegal sequence. 2678 * - If any of the non-initial bytes could be the start of a character, 2679 * we stop the illegal sequence before the first one of those. 2680 */ 2681 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 2682 int8_t i; 2683 for(i=1; 2684 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2685 ++i) {} 2686 if(i<byteIndex) { 2687 /* Back out some bytes. */ 2688 int8_t backOutDistance=byteIndex-i; 2689 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2690 byteIndex=i; /* length of reported illegal byte sequence */ 2691 if(backOutDistance<=bytesFromThisBuffer) { 2692 source-=backOutDistance; 2693 } else { 2694 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2695 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2696 /* preToULength is negative! */ 2697 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2698 source=(const uint8_t *)pArgs->source; 2699 } 2700 } 2701 } 2702 break; 2703 } else /* unassigned sequences indicated with byteIndex>0 */ { 2704 /* try an extension mapping */ 2705 pArgs->source=(const char *)source; 2706 byteIndex=_extToU(cnv, cnv->sharedData, 2707 byteIndex, &source, sourceLimit, 2708 &target, targetLimit, 2709 &offsets, sourceIndex, 2710 pArgs->flush, 2711 pErrorCode); 2712 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2713 2714 if(U_FAILURE(*pErrorCode)) { 2715 /* not mappable or buffer overflow */ 2716 break; 2717 } 2718 } 2719 } 2720 2721 /* set the converter state back into UConverter */ 2722 cnv->toUnicodeStatus=offset; 2723 cnv->mode=state; 2724 cnv->toULength=byteIndex; 2725 2726 /* write back the updated pointers */ 2727 pArgs->source=(const char *)source; 2728 pArgs->target=target; 2729 pArgs->offsets=offsets; 2730 } 2731 2732 /* 2733 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2734 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2735 */ 2736 static UChar32 2737 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2738 UErrorCode *pErrorCode) { 2739 UConverter *cnv; 2740 const int32_t (*stateTable)[256]; 2741 const uint8_t *source, *sourceLimit; 2742 2743 int32_t entry; 2744 uint8_t action; 2745 2746 /* set up the local pointers */ 2747 cnv=pArgs->converter; 2748 source=(const uint8_t *)pArgs->source; 2749 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2750 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2751 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2752 } else { 2753 stateTable=cnv->sharedData->mbcs.stateTable; 2754 } 2755 2756 /* conversion loop */ 2757 while(source<sourceLimit) { 2758 entry=stateTable[0][*source++]; 2759 /* MBCS_ENTRY_IS_FINAL(entry) */ 2760 2761 /* write back the updated pointer early so that we can return directly */ 2762 pArgs->source=(const char *)source; 2763 2764 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2765 /* output BMP code point */ 2766 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2767 } 2768 2769 /* 2770 * An if-else-if chain provides more reliable performance for 2771 * the most common cases compared to a switch. 
2772 */ 2773 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2774 if( action==MBCS_STATE_VALID_DIRECT_20 || 2775 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2776 ) { 2777 /* output supplementary code point */ 2778 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2779 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2780 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2781 /* output BMP code point */ 2782 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2783 } 2784 } else if(action==MBCS_STATE_UNASSIGNED) { 2785 /* just fall through */ 2786 } else if(action==MBCS_STATE_ILLEGAL) { 2787 /* callback(illegal) */ 2788 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2789 } else { 2790 /* reserved, must never occur */ 2791 continue; 2792 } 2793 2794 if(U_FAILURE(*pErrorCode)) { 2795 /* callback(illegal) */ 2796 break; 2797 } else /* unassigned sequence */ { 2798 /* defer to the generic implementation */ 2799 pArgs->source=(const char *)source-1; 2800 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2801 } 2802 } 2803 2804 /* no output because of empty input or only state changes */ 2805 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2806 return 0xffff; 2807 } 2808 2809 /* 2810 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2811 * conversion without offset handling. 2812 * 2813 * When a character does not have a mapping to Unicode, then we return to the 2814 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2815 * handling. 2816 * We also defer to the generic code in other complicated cases and have them 2817 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2818 * 2819 * All normal mappings and errors are handled here. 
2820 */ 2821 static UChar32 2822 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2823 UErrorCode *pErrorCode) { 2824 UConverter *cnv; 2825 const uint8_t *source, *sourceLimit, *lastSource; 2826 2827 const int32_t (*stateTable)[256]; 2828 const uint16_t *unicodeCodeUnits; 2829 2830 uint32_t offset; 2831 uint8_t state; 2832 2833 int32_t entry; 2834 UChar32 c; 2835 uint8_t action; 2836 2837 /* use optimized function if possible */ 2838 cnv=pArgs->converter; 2839 2840 if(cnv->preToULength>0) { 2841 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2842 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2843 } 2844 2845 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2846 /* 2847 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2848 * with the rare case of a codepage that maps single surrogates 2849 * without adding the complexity to this already complicated function here. 2850 */ 2851 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2852 } else if(cnv->sharedData->mbcs.countStates==1) { 2853 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2854 } 2855 2856 /* set up the local pointers */ 2857 source=lastSource=(const uint8_t *)pArgs->source; 2858 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2859 2860 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2861 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2862 } else { 2863 stateTable=cnv->sharedData->mbcs.stateTable; 2864 } 2865 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2866 2867 /* get the converter state from UConverter */ 2868 offset=cnv->toUnicodeStatus; 2869 2870 /* 2871 * if we are in the SBCS state for a DBCS-only converter, 2872 * then load the DBCS state from the MBCS data 2873 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2874 */ 2875 if((state=(uint8_t)(cnv->mode))==0) { 2876 state=cnv->sharedData->mbcs.dbcsOnlyState; 2877 } 2878 2879 /* conversion loop */ 2880 c=U_SENTINEL; 2881 while(source<sourceLimit) { 
2882 entry=stateTable[state][*source++]; 2883 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2884 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2885 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2886 2887 /* optimization for 1/2-byte input and BMP output */ 2888 if( source<sourceLimit && 2889 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2890 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2891 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2892 ) { 2893 ++source; 2894 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2895 /* output BMP code point */ 2896 break; 2897 } 2898 } else { 2899 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2900 cnv->mode=state; 2901 2902 /* set the next state early so that we can reuse the entry variable */ 2903 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2904 2905 /* 2906 * An if-else-if chain provides more reliable performance for 2907 * the most common cases compared to a switch. 2908 */ 2909 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2910 if(action==MBCS_STATE_VALID_DIRECT_16) { 2911 /* output BMP code point */ 2912 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2913 break; 2914 } else if(action==MBCS_STATE_VALID_16) { 2915 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2916 c=unicodeCodeUnits[offset]; 2917 if(c<0xfffe) { 2918 /* output BMP code point */ 2919 break; 2920 } else if(c==0xfffe) { 2921 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2922 break; 2923 } 2924 } else { 2925 /* callback(illegal) */ 2926 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2927 } 2928 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2929 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2930 c=unicodeCodeUnits[offset++]; 2931 if(c<0xd800) { 2932 /* output BMP code point below 0xd800 */ 2933 break; 2934 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 2935 /* output roundtrip or fallback supplementary code point */ 2936 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 2937 break; 2938 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 2939 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2940 c=unicodeCodeUnits[offset]; 2941 break; 2942 } else if(c==0xffff) { 2943 /* callback(illegal) */ 2944 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2945 } 2946 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2947 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2948 ) { 2949 /* output supplementary code point */ 2950 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2951 break; 2952 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2953 /* 2954 * This serves as a state change without any output. 2955 * It is useful for reading simple stateful encodings, 2956 * for example using just Shift-In/Shift-Out codes. 2957 * The 21 unused bits may later be used for more sophisticated 2958 * state transitions. 
2959 */ 2960 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 2961 /* SI/SO are illegal for DBCS-only conversion */ 2962 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2963 2964 /* callback(illegal) */ 2965 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2966 } 2967 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2968 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2969 /* output BMP code point */ 2970 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2971 break; 2972 } 2973 } else if(action==MBCS_STATE_UNASSIGNED) { 2974 /* just fall through */ 2975 } else if(action==MBCS_STATE_ILLEGAL) { 2976 /* callback(illegal) */ 2977 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2978 } else { 2979 /* reserved (must never occur), or only state change */ 2980 offset=0; 2981 lastSource=source; 2982 continue; 2983 } 2984 2985 /* end of action codes: prepare for a new character */ 2986 offset=0; 2987 2988 if(U_FAILURE(*pErrorCode)) { 2989 /* callback(illegal) */ 2990 break; 2991 } else /* unassigned sequence */ { 2992 /* defer to the generic implementation */ 2993 cnv->toUnicodeStatus=0; 2994 cnv->mode=state; 2995 pArgs->source=(const char *)lastSource; 2996 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2997 } 2998 } 2999 } 3000 3001 if(c<0) { 3002 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3003 /* incomplete character byte sequence */ 3004 uint8_t *bytes=cnv->toUBytes; 3005 cnv->toULength=(int8_t)(source-lastSource); 3006 do { 3007 *bytes++=*lastSource++; 3008 } while(lastSource<source); 3009 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3010 } else if(U_FAILURE(*pErrorCode)) { 3011 /* callback(illegal) */ 3012 /* 3013 * Ticket 5691: consistent illegal sequences: 3014 * - We include at least the first byte in the illegal sequence. 3015 * - If any of the non-initial bytes could be the start of a character, 3016 * we stop the illegal sequence before the first one of those. 
3017 */ 3018 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3019 uint8_t *bytes=cnv->toUBytes; 3020 *bytes++=*lastSource++; /* first byte */ 3021 if(lastSource==source) { 3022 cnv->toULength=1; 3023 } else /* lastSource<source: multi-byte character */ { 3024 int8_t i; 3025 for(i=1; 3026 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3027 ++i 3028 ) { 3029 *bytes++=*lastSource++; 3030 } 3031 cnv->toULength=i; 3032 source=lastSource; 3033 } 3034 } else { 3035 /* no output because of empty input or only state changes */ 3036 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3037 } 3038 c=0xffff; 3039 } 3040 3041 /* set the converter state back into UConverter, ready for a new character */ 3042 cnv->toUnicodeStatus=0; 3043 cnv->mode=state; 3044 3045 /* write back the updated pointer */ 3046 pArgs->source=(const char *)source; 3047 return c; 3048 } 3049 3050 #if 0 3051 /* 3052 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3053 * Removal improves code coverage. 3054 */ 3055 /** 3056 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3057 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3058 * It does not handle conversion extensions (_extToU()). 3059 */ 3060 U_CFUNC UChar32 3061 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3062 uint8_t b, UBool useFallback) { 3063 int32_t entry; 3064 uint8_t action; 3065 3066 entry=sharedData->mbcs.stateTable[0][b]; 3067 /* MBCS_ENTRY_IS_FINAL(entry) */ 3068 3069 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3070 /* output BMP code point */ 3071 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3072 } 3073 3074 /* 3075 * An if-else-if chain provides more reliable performance for 3076 * the most common cases compared to a switch. 
3077 */ 3078 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3079 if(action==MBCS_STATE_VALID_DIRECT_20) { 3080 /* output supplementary code point */ 3081 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3082 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3083 if(!TO_U_USE_FALLBACK(useFallback)) { 3084 return 0xfffe; 3085 } 3086 /* output BMP code point */ 3087 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3088 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3089 if(!TO_U_USE_FALLBACK(useFallback)) { 3090 return 0xfffe; 3091 } 3092 /* output supplementary code point */ 3093 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3094 } else if(action==MBCS_STATE_UNASSIGNED) { 3095 return 0xfffe; 3096 } else if(action==MBCS_STATE_ILLEGAL) { 3097 return 0xffff; 3098 } else { 3099 /* reserved, must never occur */ 3100 return 0xffff; 3101 } 3102 } 3103 #endif 3104 3105 /* 3106 * This is a simple version of _MBCSGetNextUChar() that is used 3107 * by other converter implementations. 3108 * It only returns an "assigned" result if it consumes the entire input. 3109 * It does not use state from the converter, nor error codes. 3110 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3111 * It handles conversion extensions but not GB 18030. 3112 * 3113 * Return value: 3114 * U+fffe unassigned 3115 * U+ffff illegal 3116 * otherwise the Unicode code point 3117 */ 3118 U_CFUNC UChar32 3119 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3120 const char *source, int32_t length, 3121 UBool useFallback) { 3122 const int32_t (*stateTable)[256]; 3123 const uint16_t *unicodeCodeUnits; 3124 3125 uint32_t offset; 3126 uint8_t state, action; 3127 3128 UChar32 c; 3129 int32_t i, entry; 3130 3131 if(length<=0) { 3132 /* no input at all: "illegal" */ 3133 return 0xffff; 3134 } 3135 3136 #if 0 3137 /* 3138 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3139 * TODO In future releases, verify that this function is never called for SBCS 3140 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3141 * Removal improves code coverage. 3142 */ 3143 /* use optimized function if possible */ 3144 if(sharedData->mbcs.countStates==1) { 3145 if(length==1) { 3146 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3147 } else { 3148 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3149 } 3150 } 3151 #endif 3152 3153 /* set up the local pointers */ 3154 stateTable=sharedData->mbcs.stateTable; 3155 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3156 3157 /* converter state */ 3158 offset=0; 3159 state=sharedData->mbcs.dbcsOnlyState; 3160 3161 /* conversion loop */ 3162 for(i=0;;) { 3163 entry=stateTable[state][(uint8_t)source[i++]]; 3164 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3165 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3166 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3167 3168 if(i==length) { 3169 return 0xffff; /* truncated character */ 3170 } 3171 } else { 3172 /* 3173 * An if-else-if chain provides more reliable performance for 3174 * the most common cases compared to a switch. 
3175 */ 3176 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3177 if(action==MBCS_STATE_VALID_16) { 3178 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3179 c=unicodeCodeUnits[offset]; 3180 if(c!=0xfffe) { 3181 /* done */ 3182 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3183 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3184 /* else done with 0xfffe */ 3185 } 3186 break; 3187 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3188 /* output BMP code point */ 3189 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3190 break; 3191 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3192 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3193 c=unicodeCodeUnits[offset++]; 3194 if(c<0xd800) { 3195 /* output BMP code point below 0xd800 */ 3196 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3197 /* output roundtrip or fallback supplementary code point */ 3198 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3199 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3200 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3201 c=unicodeCodeUnits[offset]; 3202 } else if(c==0xffff) { 3203 return 0xffff; 3204 } else { 3205 c=0xfffe; 3206 } 3207 break; 3208 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3209 /* output supplementary code point */ 3210 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3211 break; 3212 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3213 if(!TO_U_USE_FALLBACK(useFallback)) { 3214 c=0xfffe; 3215 break; 3216 } 3217 /* output BMP code point */ 3218 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3219 break; 3220 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3221 if(!TO_U_USE_FALLBACK(useFallback)) { 3222 c=0xfffe; 3223 break; 3224 } 3225 /* output supplementary code point */ 3226 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3227 break; 3228 } else if(action==MBCS_STATE_UNASSIGNED) { 3229 c=0xfffe; 3230 break; 3231 } 3232 3233 /* 3234 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3235 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3236 */ 3237 return 0xffff; 3238 } 3239 } 3240 3241 if(i!=length) { 3242 /* illegal for this function: not all input consumed */ 3243 return 0xffff; 3244 } 3245 3246 if(c==0xfffe) { 3247 /* try an extension mapping */ 3248 const int32_t *cx=sharedData->mbcs.extIndexes; 3249 if(cx!=NULL) { 3250 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3251 } 3252 } 3253 3254 return c; 3255 } 3256 3257 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3258 3259 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3260 static void 3261 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3262 UErrorCode *pErrorCode) { 3263 UConverter *cnv; 3264 const UChar *source, *sourceLimit; 3265 uint8_t *target; 3266 int32_t targetCapacity; 3267 int32_t *offsets; 3268 3269 const uint16_t *table; 3270 const uint16_t *mbcsIndex; 3271 const uint8_t *bytes; 3272 3273 UChar32 c; 3274 3275 int32_t sourceIndex, nextSourceIndex; 3276 3277 uint32_t stage2Entry; 3278 uint32_t asciiRoundtrips; 3279 uint32_t value; 3280 uint8_t unicodeMask; 3281 3282 /* use optimized function if possible */ 3283 cnv=pArgs->converter; 3284 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3285 3286 /* set up the local pointers */ 3287 source=pArgs->source; 3288 sourceLimit=pArgs->sourceLimit; 3289 target=(uint8_t *)pArgs->target; 3290 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3291 offsets=pArgs->offsets; 3292 3293 table=cnv->sharedData->mbcs.fromUnicodeTable; 3294 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3295 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3296 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3297 } else { 3298 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3299 } 3300 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3301 3302 /* get the converter state from UConverter */ 3303 c=cnv->fromUChar32; 3304 3305 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 3306 sourceIndex= c==0 ? 0 : -1; 3307 nextSourceIndex=0; 3308 3309 /* conversion loop */ 3310 if(c!=0 && targetCapacity>0) { 3311 goto getTrail; 3312 } 3313 3314 while(source<sourceLimit) { 3315 /* 3316 * This following test is to see if available input would overflow the output. 3317 * It does not catch output of more than one byte that 3318 * overflows as a result of a multi-byte character or callback output 3319 * from the last source character. 3320 * Therefore, those situations also test for overflows and will 3321 * then break the loop, too. 3322 */ 3323 if(targetCapacity>0) { 3324 /* 3325 * Get a correct Unicode code point: 3326 * a single UChar for a BMP code point or 3327 * a matched surrogate pair for a "supplementary code point". 3328 */ 3329 c=*source++; 3330 ++nextSourceIndex; 3331 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3332 *target++=(uint8_t)c; 3333 if(offsets!=NULL) { 3334 *offsets++=sourceIndex; 3335 sourceIndex=nextSourceIndex; 3336 } 3337 --targetCapacity; 3338 c=0; 3339 continue; 3340 } 3341 /* 3342 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3343 * to avoid dealing with surrogates. 3344 * MBCS_FAST_MAX must be >=0xd7ff. 3345 */ 3346 if(c<=0xd7ff) { 3347 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3348 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3349 if(value==0) { 3350 goto unassigned; 3351 } 3352 /* output the value */ 3353 } else { 3354 /* 3355 * This also tests if the codepage maps single surrogates. 3356 * If it does, then surrogates are not paired but mapped separately. 3357 * Note that in this case unmatched surrogates are not detected. 
3358 */ 3359 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3360 if(UTF_IS_SURROGATE_FIRST(c)) { 3361 getTrail: 3362 if(source<sourceLimit) { 3363 /* test the following code unit */ 3364 UChar trail=*source; 3365 if(UTF_IS_SECOND_SURROGATE(trail)) { 3366 ++source; 3367 ++nextSourceIndex; 3368 c=UTF16_GET_PAIR_VALUE(c, trail); 3369 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3370 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3371 /* callback(unassigned) */ 3372 goto unassigned; 3373 } 3374 /* convert this supplementary code point */ 3375 /* exit this condition tree */ 3376 } else { 3377 /* this is an unmatched lead code unit (1st surrogate) */ 3378 /* callback(illegal) */ 3379 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3380 break; 3381 } 3382 } else { 3383 /* no more input */ 3384 break; 3385 } 3386 } else { 3387 /* this is an unmatched trail code unit (2nd surrogate) */ 3388 /* callback(illegal) */ 3389 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3390 break; 3391 } 3392 } 3393 3394 /* convert the Unicode code point in c into codepage bytes */ 3395 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3396 3397 /* get the bytes and the length for the output */ 3398 /* MBCS_OUTPUT_2 */ 3399 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3400 3401 /* is this code point assigned, or do we use fallbacks? */ 3402 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3403 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3404 ) { 3405 /* 3406 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3407 * There is no way with this data structure for fallback output 3408 * to be a zero byte. 
3409 */ 3410 3411 unassigned: 3412 /* try an extension mapping */ 3413 pArgs->source=source; 3414 c=_extFromU(cnv, cnv->sharedData, 3415 c, &source, sourceLimit, 3416 &target, target+targetCapacity, 3417 &offsets, sourceIndex, 3418 pArgs->flush, 3419 pErrorCode); 3420 nextSourceIndex+=(int32_t)(source-pArgs->source); 3421 3422 if(U_FAILURE(*pErrorCode)) { 3423 /* not mappable or buffer overflow */ 3424 break; 3425 } else { 3426 /* a mapping was written to the target, continue */ 3427 3428 /* recalculate the targetCapacity after an extension mapping */ 3429 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3430 3431 /* normal end of conversion: prepare for a new character */ 3432 sourceIndex=nextSourceIndex; 3433 continue; 3434 } 3435 } 3436 } 3437 3438 /* write the output character bytes from value and length */ 3439 /* from the first if in the loop we know that targetCapacity>0 */ 3440 if(value<=0xff) { 3441 /* this is easy because we know that there is enough space */ 3442 *target++=(uint8_t)value; 3443 if(offsets!=NULL) { 3444 *offsets++=sourceIndex; 3445 } 3446 --targetCapacity; 3447 } else /* length==2 */ { 3448 *target++=(uint8_t)(value>>8); 3449 if(2<=targetCapacity) { 3450 *target++=(uint8_t)value; 3451 if(offsets!=NULL) { 3452 *offsets++=sourceIndex; 3453 *offsets++=sourceIndex; 3454 } 3455 targetCapacity-=2; 3456 } else { 3457 if(offsets!=NULL) { 3458 *offsets++=sourceIndex; 3459 } 3460 cnv->charErrorBuffer[0]=(char)value; 3461 cnv->charErrorBufferLength=1; 3462 3463 /* target overflow */ 3464 targetCapacity=0; 3465 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3466 c=0; 3467 break; 3468 } 3469 } 3470 3471 /* normal end of conversion: prepare for a new character */ 3472 c=0; 3473 sourceIndex=nextSourceIndex; 3474 continue; 3475 } else { 3476 /* target is full */ 3477 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3478 break; 3479 } 3480 } 3481 3482 /* set the converter state back into UConverter */ 3483 cnv->fromUChar32=c; 3484 3485 /* write back the updated 
pointers */ 3486 pArgs->source=source; 3487 pArgs->target=(char *)target; 3488 pArgs->offsets=offsets; 3489 } 3490 3491 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3492 static void 3493 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3494 UErrorCode *pErrorCode) { 3495 UConverter *cnv; 3496 const UChar *source, *sourceLimit; 3497 uint8_t *target; 3498 int32_t targetCapacity; 3499 int32_t *offsets; 3500 3501 const uint16_t *table; 3502 const uint16_t *results; 3503 3504 UChar32 c; 3505 3506 int32_t sourceIndex, nextSourceIndex; 3507 3508 uint16_t value, minValue; 3509 UBool hasSupplementary; 3510 3511 /* set up the local pointers */ 3512 cnv=pArgs->converter; 3513 source=pArgs->source; 3514 sourceLimit=pArgs->sourceLimit; 3515 target=(uint8_t *)pArgs->target; 3516 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3517 offsets=pArgs->offsets; 3518 3519 table=cnv->sharedData->mbcs.fromUnicodeTable; 3520 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3521 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3522 } else { 3523 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3524 } 3525 3526 if(cnv->useFallback) { 3527 /* use all roundtrip and fallback results */ 3528 minValue=0x800; 3529 } else { 3530 /* use only roundtrips and fallbacks from private-use characters */ 3531 minValue=0xc00; 3532 } 3533 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3534 3535 /* get the converter state from UConverter */ 3536 c=cnv->fromUChar32; 3537 3538 /* sourceIndex=-1 if the current character began in the previous buffer */ 3539 sourceIndex= c==0 ? 0 : -1; 3540 nextSourceIndex=0; 3541 3542 /* conversion loop */ 3543 if(c!=0 && targetCapacity>0) { 3544 goto getTrail; 3545 } 3546 3547 while(source<sourceLimit) { 3548 /* 3549 * This following test is to see if available input would overflow the output. 
3550 * It does not catch output of more than one byte that 3551 * overflows as a result of a multi-byte character or callback output 3552 * from the last source character. 3553 * Therefore, those situations also test for overflows and will 3554 * then break the loop, too. 3555 */ 3556 if(targetCapacity>0) { 3557 /* 3558 * Get a correct Unicode code point: 3559 * a single UChar for a BMP code point or 3560 * a matched surrogate pair for a "supplementary code point". 3561 */ 3562 c=*source++; 3563 ++nextSourceIndex; 3564 if(UTF_IS_SURROGATE(c)) { 3565 if(UTF_IS_SURROGATE_FIRST(c)) { 3566 getTrail: 3567 if(source<sourceLimit) { 3568 /* test the following code unit */ 3569 UChar trail=*source; 3570 if(UTF_IS_SECOND_SURROGATE(trail)) { 3571 ++source; 3572 ++nextSourceIndex; 3573 c=UTF16_GET_PAIR_VALUE(c, trail); 3574 if(!hasSupplementary) { 3575 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3576 /* callback(unassigned) */ 3577 goto unassigned; 3578 } 3579 /* convert this supplementary code point */ 3580 /* exit this condition tree */ 3581 } else { 3582 /* this is an unmatched lead code unit (1st surrogate) */ 3583 /* callback(illegal) */ 3584 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3585 break; 3586 } 3587 } else { 3588 /* no more input */ 3589 break; 3590 } 3591 } else { 3592 /* this is an unmatched trail code unit (2nd surrogate) */ 3593 /* callback(illegal) */ 3594 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3595 break; 3596 } 3597 } 3598 3599 /* convert the Unicode code point in c into codepage bytes */ 3600 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3601 3602 /* is this code point assigned, or do we use fallbacks? 
*/ 3603 if(value>=minValue) { 3604 /* assigned, write the output character bytes from value and length */ 3605 /* length==1 */ 3606 /* this is easy because we know that there is enough space */ 3607 *target++=(uint8_t)value; 3608 if(offsets!=NULL) { 3609 *offsets++=sourceIndex; 3610 } 3611 --targetCapacity; 3612 3613 /* normal end of conversion: prepare for a new character */ 3614 c=0; 3615 sourceIndex=nextSourceIndex; 3616 } else { /* unassigned */ 3617 unassigned: 3618 /* try an extension mapping */ 3619 pArgs->source=source; 3620 c=_extFromU(cnv, cnv->sharedData, 3621 c, &source, sourceLimit, 3622 &target, target+targetCapacity, 3623 &offsets, sourceIndex, 3624 pArgs->flush, 3625 pErrorCode); 3626 nextSourceIndex+=(int32_t)(source-pArgs->source); 3627 3628 if(U_FAILURE(*pErrorCode)) { 3629 /* not mappable or buffer overflow */ 3630 break; 3631 } else { 3632 /* a mapping was written to the target, continue */ 3633 3634 /* recalculate the targetCapacity after an extension mapping */ 3635 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3636 3637 /* normal end of conversion: prepare for a new character */ 3638 sourceIndex=nextSourceIndex; 3639 } 3640 } 3641 } else { 3642 /* target is full */ 3643 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3644 break; 3645 } 3646 } 3647 3648 /* set the converter state back into UConverter */ 3649 cnv->fromUChar32=c; 3650 3651 /* write back the updated pointers */ 3652 pArgs->source=source; 3653 pArgs->target=(char *)target; 3654 pArgs->offsets=offsets; 3655 } 3656 3657 /* 3658 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3659 * that map only to and from the BMP. 3660 * In addition to single-byte/state optimizations, the offset calculations 3661 * become much easier. 3662 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3663 * but measurements have shown that this diminishes performance 3664 * in more cases than it improves it. 
3665 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3666 * for various MBCS and SBCS optimizations. 3667 */ 3668 static void 3669 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3670 UErrorCode *pErrorCode) { 3671 UConverter *cnv; 3672 const UChar *source, *sourceLimit, *lastSource; 3673 uint8_t *target; 3674 int32_t targetCapacity, length; 3675 int32_t *offsets; 3676 3677 const uint16_t *table; 3678 const uint16_t *results; 3679 3680 UChar32 c; 3681 3682 int32_t sourceIndex; 3683 3684 uint32_t asciiRoundtrips; 3685 uint16_t value, minValue; 3686 3687 /* set up the local pointers */ 3688 cnv=pArgs->converter; 3689 source=pArgs->source; 3690 sourceLimit=pArgs->sourceLimit; 3691 target=(uint8_t *)pArgs->target; 3692 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3693 offsets=pArgs->offsets; 3694 3695 table=cnv->sharedData->mbcs.fromUnicodeTable; 3696 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3697 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3698 } else { 3699 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3700 } 3701 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3702 3703 if(cnv->useFallback) { 3704 /* use all roundtrip and fallback results */ 3705 minValue=0x800; 3706 } else { 3707 /* use only roundtrips and fallbacks from private-use characters */ 3708 minValue=0xc00; 3709 } 3710 3711 /* get the converter state from UConverter */ 3712 c=cnv->fromUChar32; 3713 3714 /* sourceIndex=-1 if the current character began in the previous buffer */ 3715 sourceIndex= c==0 ? 
0 : -1; 3716 lastSource=source; 3717 3718 /* 3719 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3720 * for the minimum of the sourceLength and targetCapacity 3721 */ 3722 length=(int32_t)(sourceLimit-source); 3723 if(length<targetCapacity) { 3724 targetCapacity=length; 3725 } 3726 3727 /* conversion loop */ 3728 if(c!=0 && targetCapacity>0) { 3729 goto getTrail; 3730 } 3731 3732 #if MBCS_UNROLL_SINGLE_FROM_BMP 3733 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3734 /* unroll the loop with the most common case */ 3735 unrolled: 3736 if(targetCapacity>=4) { 3737 int32_t count, loops; 3738 uint16_t andedValues; 3739 3740 loops=count=targetCapacity>>2; 3741 do { 3742 c=*source++; 3743 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3744 *target++=(uint8_t)value; 3745 c=*source++; 3746 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3747 *target++=(uint8_t)value; 3748 c=*source++; 3749 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3750 *target++=(uint8_t)value; 3751 c=*source++; 3752 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3753 *target++=(uint8_t)value; 3754 3755 /* were all 4 entries really valid? */ 3756 if(andedValues<minValue) { 3757 /* no, return to the first of these 4 */ 3758 source-=4; 3759 target-=4; 3760 break; 3761 } 3762 } while(--count>0); 3763 count=loops-count; 3764 targetCapacity-=4*count; 3765 3766 if(offsets!=NULL) { 3767 lastSource+=4*count; 3768 while(count>0) { 3769 *offsets++=sourceIndex++; 3770 *offsets++=sourceIndex++; 3771 *offsets++=sourceIndex++; 3772 *offsets++=sourceIndex++; 3773 --count; 3774 } 3775 } 3776 3777 c=0; 3778 } 3779 #endif 3780 3781 while(targetCapacity>0) { 3782 /* 3783 * Get a correct Unicode code point: 3784 * a single UChar for a BMP code point or 3785 * a matched surrogate pair for a "supplementary code point". 
3786 */ 3787 c=*source++; 3788 /* 3789 * Do not immediately check for single surrogates: 3790 * Assume that they are unassigned and check for them in that case. 3791 * This speeds up the conversion of assigned characters. 3792 */ 3793 /* convert the Unicode code point in c into codepage bytes */ 3794 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3795 *target++=(uint8_t)c; 3796 --targetCapacity; 3797 c=0; 3798 continue; 3799 } 3800 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3801 /* is this code point assigned, or do we use fallbacks? */ 3802 if(value>=minValue) { 3803 /* assigned, write the output character bytes from value and length */ 3804 /* length==1 */ 3805 /* this is easy because we know that there is enough space */ 3806 *target++=(uint8_t)value; 3807 --targetCapacity; 3808 3809 /* normal end of conversion: prepare for a new character */ 3810 c=0; 3811 continue; 3812 } else if(!UTF_IS_SURROGATE(c)) { 3813 /* normal, unassigned BMP character */ 3814 } else if(UTF_IS_SURROGATE_FIRST(c)) { 3815 getTrail: 3816 if(source<sourceLimit) { 3817 /* test the following code unit */ 3818 UChar trail=*source; 3819 if(UTF_IS_SECOND_SURROGATE(trail)) { 3820 ++source; 3821 c=UTF16_GET_PAIR_VALUE(c, trail); 3822 /* this codepage does not map supplementary code points */ 3823 /* callback(unassigned) */ 3824 } else { 3825 /* this is an unmatched lead code unit (1st surrogate) */ 3826 /* callback(illegal) */ 3827 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3828 break; 3829 } 3830 } else { 3831 /* no more input */ 3832 if (pArgs->flush) { 3833 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3834 } 3835 break; 3836 } 3837 } else { 3838 /* this is an unmatched trail code unit (2nd surrogate) */ 3839 /* callback(illegal) */ 3840 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3841 break; 3842 } 3843 3844 /* c does not have a mapping */ 3845 3846 /* get the number of code units for c to correctly advance sourceIndex */ 3847 length=U16_LENGTH(c); 3848 3849 /* set offsets since the start or the last 
extension */ 3850 if(offsets!=NULL) { 3851 int32_t count=(int32_t)(source-lastSource); 3852 3853 /* do not set the offset for this character */ 3854 count-=length; 3855 3856 while(count>0) { 3857 *offsets++=sourceIndex++; 3858 --count; 3859 } 3860 /* offsets and sourceIndex are now set for the current character */ 3861 } 3862 3863 /* try an extension mapping */ 3864 lastSource=source; 3865 c=_extFromU(cnv, cnv->sharedData, 3866 c, &source, sourceLimit, 3867 &target, (const uint8_t *)(pArgs->targetLimit), 3868 &offsets, sourceIndex, 3869 pArgs->flush, 3870 pErrorCode); 3871 sourceIndex+=length+(int32_t)(source-lastSource); 3872 lastSource=source; 3873 3874 if(U_FAILURE(*pErrorCode)) { 3875 /* not mappable or buffer overflow */ 3876 break; 3877 } else { 3878 /* a mapping was written to the target, continue */ 3879 3880 /* recalculate the targetCapacity after an extension mapping */ 3881 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3882 length=(int32_t)(sourceLimit-source); 3883 if(length<targetCapacity) { 3884 targetCapacity=length; 3885 } 3886 } 3887 3888 #if MBCS_UNROLL_SINGLE_FROM_BMP 3889 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3890 goto unrolled; 3891 #endif 3892 } 3893 3894 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 3895 /* target is full */ 3896 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3897 } 3898 3899 /* set offsets since the start or the last callback */ 3900 if(offsets!=NULL) { 3901 size_t count=source-lastSource; 3902 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 3903 /* 3904 Caller gave us a partial supplementary character, 3905 which this function couldn't convert in any case. 3906 The callback will handle the offset. 
3907 */ 3908 count--; 3909 } 3910 while(count>0) { 3911 *offsets++=sourceIndex++; 3912 --count; 3913 } 3914 } 3915 3916 /* set the converter state back into UConverter */ 3917 cnv->fromUChar32=c; 3918 3919 /* write back the updated pointers */ 3920 pArgs->source=source; 3921 pArgs->target=(char *)target; 3922 pArgs->offsets=offsets; 3923 } 3924 3925 U_CFUNC void 3926 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3927 UErrorCode *pErrorCode) { 3928 UConverter *cnv; 3929 const UChar *source, *sourceLimit; 3930 uint8_t *target; 3931 int32_t targetCapacity; 3932 int32_t *offsets; 3933 3934 const uint16_t *table; 3935 const uint16_t *mbcsIndex; 3936 const uint8_t *p, *bytes; 3937 uint8_t outputType; 3938 3939 UChar32 c; 3940 3941 int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 3942 3943 uint32_t stage2Entry; 3944 uint32_t asciiRoundtrips; 3945 uint32_t value; 3946 uint8_t si_value[2] = {0, 0}; 3947 uint8_t so_value[2] = {0, 0}; 3948 uint8_t si_value_length, so_value_length; 3949 int32_t length = 0, prevLength; 3950 uint8_t unicodeMask; 3951 3952 cnv=pArgs->converter; 3953 3954 if(cnv->preFromUFirstCP>=0) { 3955 /* 3956 * pass sourceIndex=-1 because we continue from an earlier buffer 3957 * in the future, this may change with continuous offsets 3958 */ 3959 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 3960 3961 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 3962 return; 3963 } 3964 } 3965 3966 /* use optimized function if possible */ 3967 outputType=cnv->sharedData->mbcs.outputType; 3968 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3969 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3970 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3971 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 3972 } else { 3973 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 3974 } 3975 return; 3976 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) { 3977 
ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 3978 return; 3979 } 3980 3981 /* set up the local pointers */ 3982 source=pArgs->source; 3983 sourceLimit=pArgs->sourceLimit; 3984 target=(uint8_t *)pArgs->target; 3985 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3986 offsets=pArgs->offsets; 3987 3988 table=cnv->sharedData->mbcs.fromUnicodeTable; 3989 if(cnv->sharedData->mbcs.utf8Friendly) { 3990 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3991 } else { 3992 mbcsIndex=NULL; 3993 } 3994 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3995 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3996 } else { 3997 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3998 } 3999 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4000 4001 /* get the converter state from UConverter */ 4002 c=cnv->fromUChar32; 4003 4004 if(outputType==MBCS_OUTPUT_2_SISO) { 4005 prevLength=cnv->fromUnicodeStatus; 4006 if(prevLength==0) { 4007 /* set the real value */ 4008 prevLength=1; 4009 } 4010 } else { 4011 /* prevent fromUnicodeStatus from being set to something non-0 */ 4012 prevLength=0; 4013 } 4014 4015 /* sourceIndex=-1 if the current character began in the previous buffer */ 4016 prevSourceIndex=-1; 4017 sourceIndex= c==0 ? 0 : -1; 4018 nextSourceIndex=0; 4019 4020 /* Get the SI/SO character for the converter */ 4021 si_value_length = getSISOBytes(SI, cnv->options, si_value); 4022 so_value_length = getSISOBytes(SO, cnv->options, so_value); 4023 4024 /* conversion loop */ 4025 /* 4026 * This is another piece of ugly code: 4027 * A goto into the loop if the converter state contains a first surrogate 4028 * from the previous function call. 4029 * It saves me to check in each loop iteration a check of if(c==0) 4030 * and duplicating the trail-surrogate-handling code in the else 4031 * branch of that check. 
4032 * I could not find any other way to get around this other than 4033 * using a function call for the conversion and callback, which would 4034 * be even more inefficient. 4035 * 4036 * Markus Scherer 2000-jul-19 4037 */ 4038 if(c!=0 && targetCapacity>0) { 4039 goto getTrail; 4040 } 4041 4042 while(source<sourceLimit) { 4043 /* 4044 * This following test is to see if available input would overflow the output. 4045 * It does not catch output of more than one byte that 4046 * overflows as a result of a multi-byte character or callback output 4047 * from the last source character. 4048 * Therefore, those situations also test for overflows and will 4049 * then break the loop, too. 4050 */ 4051 if(targetCapacity>0) { 4052 /* 4053 * Get a correct Unicode code point: 4054 * a single UChar for a BMP code point or 4055 * a matched surrogate pair for a "supplementary code point". 4056 */ 4057 c=*source++; 4058 ++nextSourceIndex; 4059 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4060 *target++=(uint8_t)c; 4061 if(offsets!=NULL) { 4062 *offsets++=sourceIndex; 4063 prevSourceIndex=sourceIndex; 4064 sourceIndex=nextSourceIndex; 4065 } 4066 --targetCapacity; 4067 c=0; 4068 continue; 4069 } 4070 /* 4071 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4072 * to avoid dealing with surrogates. 4073 * MBCS_FAST_MAX must be >=0xd7ff. 4074 */ 4075 if(c<=0xd7ff && mbcsIndex!=NULL) { 4076 value=mbcsIndex[c>>6]; 4077 4078 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4079 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4080 switch(outputType) { 4081 case MBCS_OUTPUT_2: 4082 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4083 if(value<=0xff) { 4084 if(value==0) { 4085 goto unassigned; 4086 } else { 4087 length=1; 4088 } 4089 } else { 4090 length=2; 4091 } 4092 break; 4093 case MBCS_OUTPUT_2_SISO: 4094 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4095 /* 4096 * Save the old state in the converter object 4097 * right here, then change the local prevLength state variable if necessary. 4098 * Then, if this character turns out to be unassigned or a fallback that 4099 * is not taken, the callback code must not save the new state in the converter 4100 * because the new state is for a character that is not output. 4101 * However, the callback must still restore the state from the converter 4102 * in case the callback function changed it for its output. 4103 */ 4104 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4105 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4106 if(value<=0xff) { 4107 if(value==0) { 4108 goto unassigned; 4109 } else if(prevLength<=1) { 4110 length=1; 4111 } else { 4112 /* change from double-byte mode to single-byte */ 4113 if (si_value_length == 1) { 4114 value|=(uint32_t)si_value[0]<<8; 4115 length = 2; 4116 } else if (si_value_length == 2) { 4117 value|=(uint32_t)si_value[1]<<8; 4118 value|=(uint32_t)si_value[0]<<16; 4119 length = 3; 4120 } 4121 prevLength=1; 4122 } 4123 } else { 4124 if(prevLength==2) { 4125 length=2; 4126 } else { 4127 /* change from single-byte mode to double-byte */ 4128 if (so_value_length == 1) { 4129 value|=(uint32_t)so_value[0]<<16; 4130 length = 3; 4131 } else if (so_value_length == 2) { 4132 value|=(uint32_t)so_value[1]<<16; 4133 value|=(uint32_t)so_value[0]<<24; 4134 length = 4; 4135 } 4136 prevLength=2; 4137 } 4138 } 4139 break; 4140 case MBCS_OUTPUT_DBCS_ONLY: 4141 /* table with single-byte results, but only DBCS mappings used */ 4142 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4143 if(value<=0xff) { 4144 
/* no mapping or SBCS result, not taken for DBCS-only */ 4145 goto unassigned; 4146 } else { 4147 length=2; 4148 } 4149 break; 4150 case MBCS_OUTPUT_3: 4151 p=bytes+(value+(c&0x3f))*3; 4152 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4153 if(value<=0xff) { 4154 if(value==0) { 4155 goto unassigned; 4156 } else { 4157 length=1; 4158 } 4159 } else if(value<=0xffff) { 4160 length=2; 4161 } else { 4162 length=3; 4163 } 4164 break; 4165 case MBCS_OUTPUT_4: 4166 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4167 if(value<=0xff) { 4168 if(value==0) { 4169 goto unassigned; 4170 } else { 4171 length=1; 4172 } 4173 } else if(value<=0xffff) { 4174 length=2; 4175 } else if(value<=0xffffff) { 4176 length=3; 4177 } else { 4178 length=4; 4179 } 4180 break; 4181 case MBCS_OUTPUT_3_EUC: 4182 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4183 /* EUC 16-bit fixed-length representation */ 4184 if(value<=0xff) { 4185 if(value==0) { 4186 goto unassigned; 4187 } else { 4188 length=1; 4189 } 4190 } else if((value&0x8000)==0) { 4191 value|=0x8e8000; 4192 length=3; 4193 } else if((value&0x80)==0) { 4194 value|=0x8f0080; 4195 length=3; 4196 } else { 4197 length=2; 4198 } 4199 break; 4200 case MBCS_OUTPUT_4_EUC: 4201 p=bytes+(value+(c&0x3f))*3; 4202 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4203 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4204 if(value<=0xff) { 4205 if(value==0) { 4206 goto unassigned; 4207 } else { 4208 length=1; 4209 } 4210 } else if(value<=0xffff) { 4211 length=2; 4212 } else if((value&0x800000)==0) { 4213 value|=0x8e800000; 4214 length=4; 4215 } else if((value&0x8000)==0) { 4216 value|=0x8f008000; 4217 length=4; 4218 } else { 4219 length=3; 4220 } 4221 break; 4222 default: 4223 /* must not occur */ 4224 /* 4225 * To avoid compiler warnings that value & length may be 4226 * used without having been initialized, we set them here. 4227 * In reality, this is unreachable code. 
4228 * Not having a default branch also causes warnings with 4229 * some compilers. 4230 */ 4231 value=0; 4232 length=0; 4233 break; 4234 } 4235 /* output the value */ 4236 } else { 4237 /* 4238 * This also tests if the codepage maps single surrogates. 4239 * If it does, then surrogates are not paired but mapped separately. 4240 * Note that in this case unmatched surrogates are not detected. 4241 */ 4242 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4243 if(UTF_IS_SURROGATE_FIRST(c)) { 4244 getTrail: 4245 if(source<sourceLimit) { 4246 /* test the following code unit */ 4247 UChar trail=*source; 4248 if(UTF_IS_SECOND_SURROGATE(trail)) { 4249 ++source; 4250 ++nextSourceIndex; 4251 c=UTF16_GET_PAIR_VALUE(c, trail); 4252 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4253 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4254 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4255 /* callback(unassigned) */ 4256 goto unassigned; 4257 } 4258 /* convert this supplementary code point */ 4259 /* exit this condition tree */ 4260 } else { 4261 /* this is an unmatched lead code unit (1st surrogate) */ 4262 /* callback(illegal) */ 4263 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4264 break; 4265 } 4266 } else { 4267 /* no more input */ 4268 break; 4269 } 4270 } else { 4271 /* this is an unmatched trail code unit (2nd surrogate) */ 4272 /* callback(illegal) */ 4273 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4274 break; 4275 } 4276 } 4277 4278 /* convert the Unicode code point in c into codepage bytes */ 4279 4280 /* 4281 * The basic lookup is a triple-stage compact array (trie) lookup. 4282 * For details see the beginning of this file. 4283 * 4284 * Single-byte codepages are handled with a different data structure 4285 * by _MBCSSingle... functions. 4286 * 4287 * The result consists of a 32-bit value from stage 2 and 4288 * a pointer to as many bytes as are stored per character. 
4289 * The pointer points to the character's bytes in stage 3. 4290 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4291 * for that pointer, while bits 31..16 are flags for which of 4292 * the 16 characters in the block are roundtrip-assigned. 4293 * 4294 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4295 * respectively as uint32_t, in the platform encoding. 4296 * For 3-byte codepages, the bytes are always stored in big-endian order. 4297 * 4298 * For EUC encodings that use only either 0x8e or 0x8f as the first 4299 * byte of their longest byte sequences, the first two bytes in 4300 * this third stage indicate with their 7th bits whether these bytes 4301 * are to be written directly or actually need to be preceeded by 4302 * one of the two Single-Shift codes. With this, the third stage 4303 * stores one byte fewer per character than the actual maximum length of 4304 * EUC byte sequences. 4305 * 4306 * Other than that, leading zero bytes are removed and the other 4307 * bytes output. A single zero byte may be output if the "assigned" 4308 * bit in stage 2 was on. 4309 * The data structure does not support zero byte output as a fallback, 4310 * and also does not allow output of leading zeros. 4311 */ 4312 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4313 4314 /* get the bytes and the length for the output */ 4315 switch(outputType) { 4316 case MBCS_OUTPUT_2: 4317 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4318 if(value<=0xff) { 4319 length=1; 4320 } else { 4321 length=2; 4322 } 4323 break; 4324 case MBCS_OUTPUT_2_SISO: 4325 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4326 /* 4327 * Save the old state in the converter object 4328 * right here, then change the local prevLength state variable if necessary. 
4329 * Then, if this character turns out to be unassigned or a fallback that 4330 * is not taken, the callback code must not save the new state in the converter 4331 * because the new state is for a character that is not output. 4332 * However, the callback must still restore the state from the converter 4333 * in case the callback function changed it for its output. 4334 */ 4335 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4336 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4337 if(value<=0xff) { 4338 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4339 /* no mapping, leave value==0 */ 4340 length=0; 4341 } else if(prevLength<=1) { 4342 length=1; 4343 } else { 4344 /* change from double-byte mode to single-byte */ 4345 if (si_value_length == 1) { 4346 value|=(uint32_t)si_value[0]<<8; 4347 length = 2; 4348 } else if (si_value_length == 2) { 4349 value|=(uint32_t)si_value[1]<<8; 4350 value|=(uint32_t)si_value[0]<<16; 4351 length = 3; 4352 } 4353 prevLength=1; 4354 } 4355 } else { 4356 if(prevLength==2) { 4357 length=2; 4358 } else { 4359 /* change from single-byte mode to double-byte */ 4360 if (so_value_length == 1) { 4361 value|=(uint32_t)so_value[0]<<16; 4362 length = 3; 4363 } else if (so_value_length == 2) { 4364 value|=(uint32_t)so_value[1]<<16; 4365 value|=(uint32_t)so_value[0]<<24; 4366 length = 4; 4367 } 4368 prevLength=2; 4369 } 4370 } 4371 break; 4372 case MBCS_OUTPUT_DBCS_ONLY: 4373 /* table with single-byte results, but only DBCS mappings used */ 4374 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4375 if(value<=0xff) { 4376 /* no mapping or SBCS result, not taken for DBCS-only */ 4377 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4378 length=0; 4379 } else { 4380 length=2; 4381 } 4382 break; 4383 case MBCS_OUTPUT_3: 4384 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4385 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4386 if(value<=0xff) { 4387 length=1; 4388 } else 
if(value<=0xffff) { 4389 length=2; 4390 } else { 4391 length=3; 4392 } 4393 break; 4394 case MBCS_OUTPUT_4: 4395 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4396 if(value<=0xff) { 4397 length=1; 4398 } else if(value<=0xffff) { 4399 length=2; 4400 } else if(value<=0xffffff) { 4401 length=3; 4402 } else { 4403 length=4; 4404 } 4405 break; 4406 case MBCS_OUTPUT_3_EUC: 4407 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4408 /* EUC 16-bit fixed-length representation */ 4409 if(value<=0xff) { 4410 length=1; 4411 } else if((value&0x8000)==0) { 4412 value|=0x8e8000; 4413 length=3; 4414 } else if((value&0x80)==0) { 4415 value|=0x8f0080; 4416 length=3; 4417 } else { 4418 length=2; 4419 } 4420 break; 4421 case MBCS_OUTPUT_4_EUC: 4422 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4423 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4424 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4425 if(value<=0xff) { 4426 length=1; 4427 } else if(value<=0xffff) { 4428 length=2; 4429 } else if((value&0x800000)==0) { 4430 value|=0x8e800000; 4431 length=4; 4432 } else if((value&0x8000)==0) { 4433 value|=0x8f008000; 4434 length=4; 4435 } else { 4436 length=3; 4437 } 4438 break; 4439 default: 4440 /* must not occur */ 4441 /* 4442 * To avoid compiler warnings that value & length may be 4443 * used without having been initialized, we set them here. 4444 * In reality, this is unreachable code. 4445 * Not having a default branch also causes warnings with 4446 * some compilers. 4447 */ 4448 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4449 length=0; 4450 break; 4451 } 4452 4453 /* is this code point assigned, or do we use fallbacks? */ 4454 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4455 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4456 ) { 4457 /* 4458 * We allow a 0 byte output if the "assigned" bit is set for this entry. 
4459 * There is no way with this data structure for fallback output 4460 * to be a zero byte. 4461 */ 4462 4463 unassigned: 4464 /* try an extension mapping */ 4465 pArgs->source=source; 4466 c=_extFromU(cnv, cnv->sharedData, 4467 c, &source, sourceLimit, 4468 &target, target+targetCapacity, 4469 &offsets, sourceIndex, 4470 pArgs->flush, 4471 pErrorCode); 4472 nextSourceIndex+=(int32_t)(source-pArgs->source); 4473 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4474 4475 if(U_FAILURE(*pErrorCode)) { 4476 /* not mappable or buffer overflow */ 4477 break; 4478 } else { 4479 /* a mapping was written to the target, continue */ 4480 4481 /* recalculate the targetCapacity after an extension mapping */ 4482 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4483 4484 /* normal end of conversion: prepare for a new character */ 4485 if(offsets!=NULL) { 4486 prevSourceIndex=sourceIndex; 4487 sourceIndex=nextSourceIndex; 4488 } 4489 continue; 4490 } 4491 } 4492 } 4493 4494 /* write the output character bytes from value and length */ 4495 /* from the first if in the loop we know that targetCapacity>0 */ 4496 if(length<=targetCapacity) { 4497 if(offsets==NULL) { 4498 switch(length) { 4499 /* each branch falls through to the next one */ 4500 case 4: 4501 *target++=(uint8_t)(value>>24); 4502 case 3: 4503 *target++=(uint8_t)(value>>16); 4504 case 2: 4505 *target++=(uint8_t)(value>>8); 4506 case 1: 4507 *target++=(uint8_t)value; 4508 default: 4509 /* will never occur */ 4510 break; 4511 } 4512 } else { 4513 switch(length) { 4514 /* each branch falls through to the next one */ 4515 case 4: 4516 *target++=(uint8_t)(value>>24); 4517 *offsets++=sourceIndex; 4518 case 3: 4519 *target++=(uint8_t)(value>>16); 4520 *offsets++=sourceIndex; 4521 case 2: 4522 *target++=(uint8_t)(value>>8); 4523 *offsets++=sourceIndex; 4524 case 1: 4525 *target++=(uint8_t)value; 4526 *offsets++=sourceIndex; 4527 default: 4528 /* will never occur */ 4529 break; 4530 } 4531 } 4532 
targetCapacity-=length; 4533 } else { 4534 uint8_t *charErrorBuffer; 4535 4536 /* 4537 * We actually do this backwards here: 4538 * In order to save an intermediate variable, we output 4539 * first to the overflow buffer what does not fit into the 4540 * regular target. 4541 */ 4542 /* we know that 1<=targetCapacity<length<=4 */ 4543 length-=targetCapacity; 4544 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4545 switch(length) { 4546 /* each branch falls through to the next one */ 4547 case 3: 4548 *charErrorBuffer++=(uint8_t)(value>>16); 4549 case 2: 4550 *charErrorBuffer++=(uint8_t)(value>>8); 4551 case 1: 4552 *charErrorBuffer=(uint8_t)value; 4553 default: 4554 /* will never occur */ 4555 break; 4556 } 4557 cnv->charErrorBufferLength=(int8_t)length; 4558 4559 /* now output what fits into the regular target */ 4560 value>>=8*length; /* length was reduced by targetCapacity */ 4561 switch(targetCapacity) { 4562 /* each branch falls through to the next one */ 4563 case 3: 4564 *target++=(uint8_t)(value>>16); 4565 if(offsets!=NULL) { 4566 *offsets++=sourceIndex; 4567 } 4568 case 2: 4569 *target++=(uint8_t)(value>>8); 4570 if(offsets!=NULL) { 4571 *offsets++=sourceIndex; 4572 } 4573 case 1: 4574 *target++=(uint8_t)value; 4575 if(offsets!=NULL) { 4576 *offsets++=sourceIndex; 4577 } 4578 default: 4579 /* will never occur */ 4580 break; 4581 } 4582 4583 /* target overflow */ 4584 targetCapacity=0; 4585 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4586 c=0; 4587 break; 4588 } 4589 4590 /* normal end of conversion: prepare for a new character */ 4591 c=0; 4592 if(offsets!=NULL) { 4593 prevSourceIndex=sourceIndex; 4594 sourceIndex=nextSourceIndex; 4595 } 4596 continue; 4597 } else { 4598 /* target is full */ 4599 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4600 break; 4601 } 4602 } 4603 4604 /* 4605 * the end of the input stream and detection of truncated input 4606 * are handled by the framework, but for EBCDIC_STATEFUL conversion 4607 * we need to emit an SI at the very end 4608 * 
4609 * conditions: 4610 * successful 4611 * EBCDIC_STATEFUL in DBCS mode 4612 * end of input and no truncated input 4613 */ 4614 if( U_SUCCESS(*pErrorCode) && 4615 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4616 pArgs->flush && source>=sourceLimit && c==0 4617 ) { 4618 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4619 if(targetCapacity>0) { 4620 *target++=(uint8_t)si_value[0]; 4621 if (si_value_length == 2) { 4622 if (targetCapacity<2) { 4623 cnv->charErrorBuffer[0]=(uint8_t)si_value[1]; 4624 cnv->charErrorBufferLength=1; 4625 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4626 } else { 4627 *target++=(uint8_t)si_value[1]; 4628 } 4629 } 4630 if(offsets!=NULL) { 4631 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4632 *offsets++=prevSourceIndex; 4633 } 4634 } else { 4635 /* target is full */ 4636 cnv->charErrorBuffer[0]=(uint8_t)si_value[0]; 4637 if (si_value_length == 2) { 4638 cnv->charErrorBuffer[1]=(uint8_t)si_value[1]; 4639 } 4640 cnv->charErrorBufferLength=si_value_length; 4641 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4642 } 4643 prevLength=1; /* we switched into SBCS */ 4644 } 4645 4646 /* set the converter state back into UConverter */ 4647 cnv->fromUChar32=c; 4648 cnv->fromUnicodeStatus=prevLength; 4649 4650 /* write back the updated pointers */ 4651 pArgs->source=source; 4652 pArgs->target=(char *)target; 4653 pArgs->offsets=offsets; 4654 } 4655 4656 /* 4657 * This is another simple conversion function for internal use by other 4658 * conversion implementations. 4659 * It does not use the converter state nor call callbacks. 4660 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4661 * It handles conversion extensions but not GB 18030. 4662 * 4663 * It converts one single Unicode code point into codepage bytes, encoded 4664 * as one 32-bit value. 
The function returns the number of bytes in *pValue: 4665 * 1..4 the number of bytes in *pValue 4666 * 0 unassigned (*pValue undefined) 4667 * -1 illegal (currently not used, *pValue undefined) 4668 * 4669 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4670 * the second to last byte in bits 15..8, etc. 4671 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4672 */ 4673 U_CFUNC int32_t 4674 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4675 UChar32 c, uint32_t *pValue, 4676 UBool useFallback) { 4677 const int32_t *cx; 4678 const uint16_t *table; 4679 #if 0 4680 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4681 const uint8_t *p; 4682 #endif 4683 uint32_t stage2Entry; 4684 uint32_t value; 4685 int32_t length; 4686 4687 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4688 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4689 table=sharedData->mbcs.fromUnicodeTable; 4690 4691 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4692 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4693 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4694 /* is this code point assigned, or do we use fallbacks? */ 4695 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4696 *pValue=value&0xff; 4697 return 1; 4698 } 4699 } else /* outputType!=MBCS_OUTPUT_1 */ { 4700 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4701 4702 /* get the bytes and the length for the output */ 4703 switch(sharedData->mbcs.outputType) { 4704 case MBCS_OUTPUT_2: 4705 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4706 if(value<=0xff) { 4707 length=1; 4708 } else { 4709 length=2; 4710 } 4711 break; 4712 #if 0 4713 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4714 case MBCS_OUTPUT_DBCS_ONLY: 4715 /* table with single-byte results, but only DBCS mappings used */ 4716 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4717 if(value<=0xff) { 4718 /* no mapping or SBCS result, not taken for DBCS-only */ 4719 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4720 length=0; 4721 } else { 4722 length=2; 4723 } 4724 break; 4725 case MBCS_OUTPUT_3: 4726 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4727 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4728 if(value<=0xff) { 4729 length=1; 4730 } else if(value<=0xffff) { 4731 length=2; 4732 } else { 4733 length=3; 4734 } 4735 break; 4736 case MBCS_OUTPUT_4: 4737 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4738 if(value<=0xff) { 4739 length=1; 4740 } else if(value<=0xffff) { 4741 length=2; 4742 } else if(value<=0xffffff) { 4743 length=3; 4744 } else { 4745 length=4; 4746 } 4747 break; 4748 case MBCS_OUTPUT_3_EUC: 4749 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4750 /* EUC 16-bit fixed-length representation */ 4751 if(value<=0xff) { 4752 length=1; 4753 } else if((value&0x8000)==0) { 4754 value|=0x8e8000; 4755 length=3; 4756 } else if((value&0x80)==0) { 4757 value|=0x8f0080; 4758 length=3; 4759 } else { 4760 length=2; 4761 } 4762 break; 4763 case 
MBCS_OUTPUT_4_EUC: 4764 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4765 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4766 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4767 if(value<=0xff) { 4768 length=1; 4769 } else if(value<=0xffff) { 4770 length=2; 4771 } else if((value&0x800000)==0) { 4772 value|=0x8e800000; 4773 length=4; 4774 } else if((value&0x8000)==0) { 4775 value|=0x8f008000; 4776 length=4; 4777 } else { 4778 length=3; 4779 } 4780 break; 4781 #endif 4782 default: 4783 /* must not occur */ 4784 return -1; 4785 } 4786 4787 /* is this code point assigned, or do we use fallbacks? */ 4788 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4789 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4790 ) { 4791 /* 4792 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4793 * There is no way with this data structure for fallback output 4794 * to be a zero byte. 4795 */ 4796 /* assigned */ 4797 *pValue=value; 4798 return length; 4799 } 4800 } 4801 } 4802 4803 cx=sharedData->mbcs.extIndexes; 4804 if(cx!=NULL) { 4805 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4806 return length>=0 ? length : -length; /* return abs(length); */ 4807 } 4808 4809 /* unassigned */ 4810 return 0; 4811 } 4812 4813 4814 #if 0 4815 /* 4816 * This function has been moved to ucnv2022.c for inlining. 4817 * This implementation is here only for documentation purposes 4818 */ 4819 4820 /** 4821 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4822 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4823 * It does not handle conversion extensions (_extFromU()). 4824 * 4825 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4826 */ 4827 U_CFUNC int32_t 4828 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4829 UChar32 c, 4830 UBool useFallback) { 4831 const uint16_t *table; 4832 int32_t value; 4833 4834 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4835 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4836 return -1; 4837 } 4838 4839 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4840 table=sharedData->mbcs.fromUnicodeTable; 4841 4842 /* get the byte for the output */ 4843 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4844 /* is this code point assigned, or do we use fallbacks? */ 4845 if(useFallback ? value>=0x800 : value>=0xc00) { 4846 return value&0xff; 4847 } else { 4848 return -1; 4849 } 4850 } 4851 #endif 4852 4853 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 4854 4855 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 4856 static const UChar32 4857 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 4858 4859 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/ 4860 static const UChar32 4861 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 4862 4863 static void 4864 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 4865 UConverterToUnicodeArgs *pToUArgs, 4866 UErrorCode *pErrorCode) { 4867 UConverter *utf8, *cnv; 4868 const uint8_t *source, *sourceLimit; 4869 uint8_t *target; 4870 int32_t targetCapacity; 4871 4872 const uint16_t *table, *sbcsIndex; 4873 const uint16_t *results; 4874 4875 int8_t oldToULength, toULength, toULimit; 4876 4877 UChar32 c; 4878 uint8_t b, t1, t2; 4879 4880 uint32_t asciiRoundtrips; 4881 uint16_t value, minValue; 4882 UBool hasSupplementary; 4883 4884 /* set up the local pointers */ 4885 utf8=pToUArgs->converter; 4886 cnv=pFromUArgs->converter; 4887 source=(uint8_t *)pToUArgs->source; 4888 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 4889 target=(uint8_t *)pFromUArgs->target; 4890 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 4891 4892 table=cnv->sharedData->mbcs.fromUnicodeTable; 4893 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 4894 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4895 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4896 } else { 4897 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 4898 } 4899 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4900 4901 if(cnv->useFallback) { 4902 /* use all roundtrip and fallback results */ 4903 minValue=0x800; 4904 } else { 4905 /* use only roundtrips and fallbacks from private-use characters */ 4906 minValue=0xc00; 4907 } 4908 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 4909 4910 /* get the converter state from the UTF-8 UConverter */ 4911 c=(UChar32)utf8->toUnicodeStatus; 4912 if(c!=0) { 4913 toULength=oldToULength=utf8->toULength; 4914 toULimit=(int8_t)utf8->mode; 4915 } else { 4916 toULength=oldToULength=toULimit=0; 4917 } 4918 4919 /* 4920 * Make sure that the last byte sequence before sourceLimit is complete 4921 * or runs into 
a lead byte. 4922 * Do not go back into the bytes that will be read for finishing a partial 4923 * sequence from the previous buffer. 4924 * In the conversion loop compare source with sourceLimit only once 4925 * per multi-byte character. 4926 */ 4927 { 4928 int32_t i, length; 4929 4930 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4931 for(i=0; i<3 && i<length;) { 4932 b=*(sourceLimit-i-1); 4933 if(U8_IS_TRAIL(b)) { 4934 ++i; 4935 } else { 4936 if(i<utf8_countTrailBytes[b]) { 4937 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4938 sourceLimit-=i+1; 4939 } 4940 break; 4941 } 4942 } 4943 } 4944 4945 if(c!=0 && targetCapacity>0) { 4946 utf8->toUnicodeStatus=0; 4947 utf8->toULength=0; 4948 goto moreBytes; 4949 /* 4950 * Note: We could avoid the goto by duplicating some of the moreBytes 4951 * code, but only up to the point of collecting a complete UTF-8 4952 * sequence; then recurse for the toUBytes[toULength] 4953 * and then continue with normal conversion. 4954 * 4955 * If so, move this code to just after initializing the minimum 4956 * set of local variables for reading the UTF-8 input 4957 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
4958 * 4959 * Potential advantages: 4960 * - avoid the goto 4961 * - oldToULength could become a local variable in just those code blocks 4962 * that deal with buffer boundaries 4963 * - possibly faster if the goto prevents some compiler optimizations 4964 * (this would need measuring to confirm) 4965 * Disadvantage: 4966 * - code duplication 4967 */ 4968 } 4969 4970 /* conversion loop */ 4971 while(source<sourceLimit) { 4972 if(targetCapacity>0) { 4973 b=*source++; 4974 if((int8_t)b>=0) { 4975 /* convert ASCII */ 4976 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4977 *target++=(uint8_t)b; 4978 --targetCapacity; 4979 continue; 4980 } else { 4981 c=b; 4982 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 4983 } 4984 } else { 4985 if(b<0xe0) { 4986 if( /* handle U+0080..U+07FF inline */ 4987 b>=0xc2 && 4988 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4989 ) { 4990 c=b&0x1f; 4991 ++source; 4992 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 4993 if(value>=minValue) { 4994 *target++=(uint8_t)value; 4995 --targetCapacity; 4996 continue; 4997 } else { 4998 c=(c<<6)|t1; 4999 } 5000 } else { 5001 c=-1; 5002 } 5003 } else if(b==0xe0) { 5004 if( /* handle U+0800..U+0FFF inline */ 5005 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 5006 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5007 ) { 5008 c=t1; 5009 source+=2; 5010 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 5011 if(value>=minValue) { 5012 *target++=(uint8_t)value; 5013 --targetCapacity; 5014 continue; 5015 } else { 5016 c=(c<<6)|t2; 5017 } 5018 } else { 5019 c=-1; 5020 } 5021 } else { 5022 c=-1; 5023 } 5024 5025 if(c<0) { 5026 /* handle "complicated" and error cases, and continuing partial characters */ 5027 oldToULength=0; 5028 toULength=1; 5029 toULimit=utf8_countTrailBytes[b]+1; 5030 c=b; 5031 moreBytes: 5032 while(toULength<toULimit) { 5033 /* 5034 * The sourceLimit may have been adjusted before the conversion loop 5035 * to stop before a truncated sequence. 
5036 * Here we need to use the real limit in case we have two truncated 5037 * sequences at the end. 5038 * See ticket #7492. 5039 */ 5040 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5041 b=*source; 5042 if(U8_IS_TRAIL(b)) { 5043 ++source; 5044 ++toULength; 5045 c=(c<<6)+b; 5046 } else { 5047 break; /* sequence too short, stop with toULength<toULimit */ 5048 } 5049 } else { 5050 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5051 source-=(toULength-oldToULength); 5052 while(oldToULength<toULength) { 5053 utf8->toUBytes[oldToULength++]=*source++; 5054 } 5055 utf8->toUnicodeStatus=c; 5056 utf8->toULength=toULength; 5057 utf8->mode=toULimit; 5058 pToUArgs->source=(char *)source; 5059 pFromUArgs->target=(char *)target; 5060 return; 5061 } 5062 } 5063 5064 if( toULength==toULimit && /* consumed all trail bytes */ 5065 (toULength==3 || toULength==2) && /* BMP */ 5066 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5067 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5068 ) { 5069 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5070 } else if( 5071 toULength==toULimit && toULength==4 && 5072 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5073 ) { 5074 /* supplementary code point */ 5075 if(!hasSupplementary) { 5076 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5077 value=0; 5078 } else { 5079 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5080 } 5081 } else { 5082 /* error handling: illegal UTF-8 byte sequence */ 5083 source-=(toULength-oldToULength); 5084 while(oldToULength<toULength) { 5085 utf8->toUBytes[oldToULength++]=*source++; 5086 } 5087 utf8->toULength=toULength; 5088 pToUArgs->source=(char *)source; 5089 pFromUArgs->target=(char *)target; 5090 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5091 return; 5092 } 5093 } 5094 } 5095 5096 if(value>=minValue) { 5097 /* output the mapping for c */ 5098 *target++=(uint8_t)value; 5099 --targetCapacity; 5100 } else { 5101 /* 
value<minValue means c is unassigned (unmappable) */ 5102 /* 5103 * Try an extension mapping. 5104 * Pass in no source because we don't have UTF-16 input. 5105 * If we have a partial match on c, we will return and revert 5106 * to UTF-8->UTF-16->charset conversion. 5107 */ 5108 static const UChar nul=0; 5109 const UChar *noSource=&nul; 5110 c=_extFromU(cnv, cnv->sharedData, 5111 c, &noSource, noSource, 5112 &target, target+targetCapacity, 5113 NULL, -1, 5114 pFromUArgs->flush, 5115 pErrorCode); 5116 5117 if(U_FAILURE(*pErrorCode)) { 5118 /* not mappable or buffer overflow */ 5119 cnv->fromUChar32=c; 5120 break; 5121 } else if(cnv->preFromUFirstCP>=0) { 5122 /* 5123 * Partial match, return and revert to pivoting. 5124 * In normal from-UTF-16 conversion, we would just continue 5125 * but then exit the loop because the extension match would 5126 * have consumed the source. 5127 */ 5128 break; 5129 } else { 5130 /* a mapping was written to the target, continue */ 5131 5132 /* recalculate the targetCapacity after an extension mapping */ 5133 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5134 } 5135 } 5136 } else { 5137 /* target is full */ 5138 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5139 break; 5140 } 5141 } 5142 5143 /* 5144 * The sourceLimit may have been adjusted before the conversion loop 5145 * to stop before a truncated sequence. 5146 * If so, then collect the truncated sequence now. 
*/
    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=utf8_countTrailBytes[b]+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/*
 * Convert UTF-8 input directly to the bytes of a charset whose characters
 * are written as one or two bytes each, without going through UTF-16.
 *
 * pFromUArgs  supplies the target converter (cnv), the output buffer
 *             (target/targetLimit) and the flush flag.
 * pToUArgs    supplies the UTF-8 converter (utf8) and the input buffer
 *             (source/sourceLimit).
 * pErrorCode  is set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *             to U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence,
 *             or to whatever _extFromU() reports for a code point that has
 *             no mapping in the main table.
 *
 * A UTF-8 sequence that is cut off at the end of the input is saved in
 * utf8->toUBytes, utf8->toULength, utf8->toUnicodeStatus and utf8->mode,
 * and is picked up again at the start of the next call.
 * On every exit path the current positions are written back to
 * pToUArgs->source and pFromUArgs->target.
 */
static void
ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *mbcsIndex;
    const uint16_t *results;

    /* byte counts of the UTF-8 sequence being assembled: already stored, read so far, expected total */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint16_t value, minValue;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    /* pick the result table variant that matches the LF/NL swap option */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    /*
     * NOTE(review): minValue is assigned here but is not read again anywhere
     * in this function; the inline paths below test value==0 instead.
     */
    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    /* a non-zero toUnicodeStatus is treated as "a partial sequence is pending" */
    c=(UChar32)utf8->toUnicodeStatus;
    if(c!=0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
    } else {
        toULength=oldToULength=toULimit=0;
    }

    /*
     * Make sure that the last byte sequence before sourceLimit is complete
     * or runs into a lead byte.
     * Do not go back into the bytes that will be read for finishing a partial
     * sequence from the previous buffer.
     * In the conversion loop compare source with sourceLimit only once
     * per multi-byte character.
     */
    {
        int32_t i, length;

        /* number of input bytes that do not belong to the pending partial sequence */
        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        /* look back over at most 3 trailing bytes for a lead byte that lacks trail bytes */
        for(i=0; i<3 && i<length;) {
            b=*(sourceLimit-i-1);
            if(U8_IS_TRAIL(b)) {
                ++i;
            } else {
                if(i<utf8_countTrailBytes[b]) {
                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                    sourceLimit-=i+1;
                }
                break;
            }
        }
    }

    /* resume a sequence left over from the previous call; this jumps into the loop body below */
    if(c!=0 && targetCapacity>0) {
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if((int8_t)b>=0) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    *target++=b;
                    --targetCapacity;
                    continue;
                } else {
                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
                    if(value==0) {
                        c=b;
                        goto unassigned;
                    }
                }
            } else {
                if(b>0xe0) {
                    /*
                     * The comma expression computes t1 before either alternative is tested,
                     * so the b==0xed test on the next line sees the same t1.
                     * For b==0xed, t1<=0x1f keeps the result below the surrogate range.
                     */
                    if( /* handle U+1000..U+D7FF inline */
                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
                                                        (b==0xed && (t1 <= 0x1f))) &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        c=((b&0xf)<<6)|t1;
                        source+=2;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                        if(value==0) {
                            /* rebuild the full code point for the extension lookup */
                            c=(c<<6)|t2;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else if(b<0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
                        if(value==0) {
                            /* rebuild the full code point for the extension lookup */
                            c=(c<<6)|t1;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    /* b==0xe0 is not handled inline */
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=utf8_countTrailBytes[b]+1;
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(U8_IS_TRAIL(b)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* back up to the first byte not yet stored in toUBytes, then copy from there */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    /* note: c is adjusted by utf8_offsets[] inside the conditions below */
                    if( toULength==toULimit && /* consumed all trail bytes */
                        (toULength==3 || toULength==2) && /* BMP */
                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
                        (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
                    ) {
                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                    } else if(
                        toULength==toULimit && toULength==4 &&
                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
                    ) {
                        /* supplementary code point */
                        if(!hasSupplementary) {
                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                            stage2Entry=0;
                        } else {
                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter via toUBytes/toULength */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }

                    /* get the bytes and the length for the output */
                    /* MBCS_OUTPUT_2 */
                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

                    /* is this code point assigned, or do we use fallbacks? */
                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                    ) {
                        goto unassigned;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(value<=0xff) {
                /* this is easy because we know that there is enough space */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else /* length==2 */ {
                /* the lead byte always fits; the second byte may have to go to the overflow buffer */
                *target++=(uint8_t)(value>>8);
                if(2<=targetCapacity) {
                    *target++=(uint8_t)value;
                    targetCapacity-=2;
                } else {
                    cnv->charErrorBuffer[0]=(char)value;
                    cnv->charErrorBufferLength=1;

                    /* target overflow */
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
            continue;

unassigned:
            {
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                    continue;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     */
    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=utf8_countTrailBytes[b]+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Fill starters[0..255] with a flag per byte value: TRUE when the state-table
 * entry for that byte, in the row selected by mbcs.dbcsOnlyState, is a
 * transition entry (MBCS_ENTRY_IS_TRANSITION).
 * pErrorCode is neither read nor written here.
 */
static void
ucnv_MBCSGetStarters(const UConverter* cnv,
                 UBool starters[256],
                 UErrorCode *pErrorCode) {
    const int32_t *state0;
    int i;

    /* the starting row is stateTable[dbcsOnlyState], not necessarily row 0 */
    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
    for(i=0; i<256; ++i) {
        /* all bytes that cause a state transition from state 0 are lead bytes */
        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
    }
}

/*
 * This is an internal function that allows other converter implementations
 * to check whether a byte is a lead byte.
5499 */ 5500 U_CFUNC UBool 5501 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5502 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5503 } 5504 5505 static void 5506 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5507 int32_t offsetIndex, 5508 UErrorCode *pErrorCode) { 5509 UConverter *cnv=pArgs->converter; 5510 char *p, *subchar; 5511 char buffer[4]; 5512 int32_t length; 5513 5514 /* first, select between subChar and subChar1 */ 5515 if( cnv->subChar1!=0 && 5516 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5517 cnv->useSubChar1 : 5518 (cnv->invalidUCharBuffer[0]<=0xff)) 5519 ) { 5520 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5521 subchar=(char *)&cnv->subChar1; 5522 length=1; 5523 } else { 5524 /* select subChar in all other cases */ 5525 subchar=(char *)cnv->subChars; 5526 length=cnv->subCharLen; 5527 } 5528 5529 /* reset the selector for the next code point */ 5530 cnv->useSubChar1=FALSE; 5531 5532 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5533 p=buffer; 5534 5535 /* fromUnicodeStatus contains prevLength */ 5536 switch(length) { 5537 case 1: 5538 if(cnv->fromUnicodeStatus==2) { 5539 /* DBCS mode and SBCS sub char: change to SBCS */ 5540 cnv->fromUnicodeStatus=1; 5541 *p++=UCNV_SI; 5542 } 5543 *p++=subchar[0]; 5544 break; 5545 case 2: 5546 if(cnv->fromUnicodeStatus<=1) { 5547 /* SBCS mode and DBCS sub char: change to DBCS */ 5548 cnv->fromUnicodeStatus=2; 5549 *p++=UCNV_SO; 5550 } 5551 *p++=subchar[0]; 5552 *p++=subchar[1]; 5553 break; 5554 default: 5555 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5556 return; 5557 } 5558 subchar=buffer; 5559 length=(int32_t)(p-buffer); 5560 } 5561 5562 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5563 } 5564 5565 U_CFUNC UConverterType 5566 ucnv_MBCSGetType(const UConverter* converter) { 5567 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but 
here we cheat a little */ 5568 if(converter->sharedData->mbcs.countStates==1) { 5569 return (UConverterType)UCNV_SBCS; 5570 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5571 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5572 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5573 return (UConverterType)UCNV_DBCS; 5574 } 5575 return (UConverterType)UCNV_MBCS; 5576 } 5577 5578 static const UConverterImpl _SBCSUTF8Impl={ 5579 UCNV_MBCS, 5580 5581 ucnv_MBCSLoad, 5582 ucnv_MBCSUnload, 5583 5584 ucnv_MBCSOpen, 5585 NULL, 5586 NULL, 5587 5588 ucnv_MBCSToUnicodeWithOffsets, 5589 ucnv_MBCSToUnicodeWithOffsets, 5590 ucnv_MBCSFromUnicodeWithOffsets, 5591 ucnv_MBCSFromUnicodeWithOffsets, 5592 ucnv_MBCSGetNextUChar, 5593 5594 ucnv_MBCSGetStarters, 5595 ucnv_MBCSGetName, 5596 ucnv_MBCSWriteSub, 5597 NULL, 5598 ucnv_MBCSGetUnicodeSet, 5599 5600 NULL, 5601 ucnv_SBCSFromUTF8 5602 }; 5603 5604 static const UConverterImpl _DBCSUTF8Impl={ 5605 UCNV_MBCS, 5606 5607 ucnv_MBCSLoad, 5608 ucnv_MBCSUnload, 5609 5610 ucnv_MBCSOpen, 5611 NULL, 5612 NULL, 5613 5614 ucnv_MBCSToUnicodeWithOffsets, 5615 ucnv_MBCSToUnicodeWithOffsets, 5616 ucnv_MBCSFromUnicodeWithOffsets, 5617 ucnv_MBCSFromUnicodeWithOffsets, 5618 ucnv_MBCSGetNextUChar, 5619 5620 ucnv_MBCSGetStarters, 5621 ucnv_MBCSGetName, 5622 ucnv_MBCSWriteSub, 5623 NULL, 5624 ucnv_MBCSGetUnicodeSet, 5625 5626 NULL, 5627 ucnv_DBCSFromUTF8 5628 }; 5629 5630 static const UConverterImpl _MBCSImpl={ 5631 UCNV_MBCS, 5632 5633 ucnv_MBCSLoad, 5634 ucnv_MBCSUnload, 5635 5636 ucnv_MBCSOpen, 5637 NULL, 5638 NULL, 5639 5640 ucnv_MBCSToUnicodeWithOffsets, 5641 ucnv_MBCSToUnicodeWithOffsets, 5642 ucnv_MBCSFromUnicodeWithOffsets, 5643 ucnv_MBCSFromUnicodeWithOffsets, 5644 ucnv_MBCSGetNextUChar, 5645 5646 ucnv_MBCSGetStarters, 5647 ucnv_MBCSGetName, 5648 ucnv_MBCSWriteSub, 5649 NULL, 5650 ucnv_MBCSGetUnicodeSet 5651 }; 5652 5653 5654 /* Static data is in 
tools/makeconv/ucnvstat.c for data-based
 * converters. Be sure to update it as well.
 */

/*
 * Shared-data record for the generic MBCS converter; it points at _MBCSImpl.
 * NOTE(review): the initializers are positional and follow the member order
 * of UConverterSharedData, which is declared elsewhere (presumably in
 * ucnv_bld.h) - confirm against that declaration before changing any value.
 */
const UConverterSharedData _MBCSData={
    sizeof(UConverterSharedData), 1,
    NULL, NULL, NULL, FALSE, &_MBCSImpl,
    0
};

#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */