1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvmbcs.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000jul03 14 * created by: Markus W. Scherer 15 * 16 * The current code in this file replaces the previous implementation 17 * of conversion code from multi-byte codepages to Unicode and back. 18 * This implementation supports the following: 19 * - legacy variable-length codepages with up to 4 bytes per character 20 * - all Unicode code points (up to 0x10ffff) 21 * - efficient distinction of unassigned vs. illegal byte sequences 22 * - it is possible in fromUnicode() to directly deal with simple 23 * stateful encodings (used for EBCDIC_STATEFUL) 24 * - it is possible to convert Unicode code points 25 * to a single zero byte (but not as a fallback except for SBCS) 26 * 27 * Remaining limitations in fromUnicode: 28 * - byte sequences must not have leading zero bytes 29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30 * - limitation to up to 4 bytes per character 31 * 32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33 * limitations and adds m:n character mappings and other features. 34 * See ucnv_ext.h for details. 
35 * 36 * Change history: 37 * 38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40 * macros to ucnvmbcs.h file 41 */ 42 43 #include "unicode/utypes.h" 44 45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47 #include "unicode/ucnv.h" 48 #include "unicode/ucnv_cb.h" 49 #include "unicode/udata.h" 50 #include "unicode/uset.h" 51 #include "unicode/utf8.h" 52 #include "unicode/utf16.h" 53 #include "ucnv_bld.h" 54 #include "ucnvmbcs.h" 55 #include "ucnv_ext.h" 56 #include "ucnv_cnv.h" 57 #include "cmemory.h" 58 #include "cstring.h" 59 #include "umutex.h" 60 61 /* control optimizations according to the platform */ 62 #define MBCS_UNROLL_SINGLE_TO_BMP 1 63 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 64 65 /* 66 * _MBCSHeader versions 5.3 & 4.3 67 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 68 * 69 * This version is optional. Version 5 is used for incompatible data format changes. 70 * makeconv will continue to generate version 4 files if possible. 71 * 72 * Changes from version 4: 73 * 74 * The main difference is an additional _MBCSHeader field with 75 * - the length (number of uint32_t) of the _MBCSHeader 76 * - flags for further incompatible data format changes 77 * - flags for further, backward compatible data format changes 78 * 79 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 80 * the file and needs to be reconstituted at load time. 81 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 82 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 83 * (For details about these structures see below, and see ucnvmbcs.h.) 84 * 85 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 86 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 87 * precision markers for all mappings.) 
 *
 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
 * omitted data that can be reconstituted from the toUnicode data.
 *
 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
 * With only roundtrip mappings in the base fromUnicode data, this part is fully
 * redundant with the mbcsIndex and will be reconstituted from that (also using the
 * stage 1 table which contains the information about how stage 2 was compacted).
 *
 * The rest of the stage 2 table, the part for code points above maxFastUChar,
 * is stored in the file and will be appended to the reconstituted part.
 *
 * The entire fromUBytes array is omitted from the file and will be reconstituted.
 * This is done by enumerating all toUnicode roundtrip mappings, performing
 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
 * writing instead of reading the byte values.
 *
 * _MBCSHeader version 4.3
 *
 * Change from version 4.2:
 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
 *   files which can be used instead of stages 1 & 2.
 *   Faster lookups for roundtrips from most commonly used characters,
 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
 *   See ucnvmbcs.h for more details.
 *
 * Change from version 4.1:
 * - Added an optional extension table structure at the end of the .cnv file.
 *   It is present if the upper bits of the header flags field contain a non-zero
 *   byte offset to it.
 *   Files that contain only a conversion table and no base table
 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
 *   These contain the base table name between the MBCS header and the extension
 *   data.
123 * 124 * Change from version 4.0: 125 * - Replace header.reserved with header.fromUBytesLength so that all 126 * fields in the data have length. 127 * 128 * Changes from version 3 (for performance improvements): 129 * - new bit distribution for state table entries 130 * - reordered action codes 131 * - new data structure for single-byte fromUnicode 132 * + stage 2 only contains indexes 133 * + stage 3 stores 16 bits per character with classification bits 15..8 134 * - no multiplier for stage 1 entries 135 * - stage 2 for non-single-byte codepages contains the index and the flags in 136 * one 32-bit value 137 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 138 * 139 * For more details about old versions of the MBCS data structure, see 140 * the corresponding versions of this file. 141 * 142 * Converting stateless codepage data ---------------------------------------*** 143 * (or codepage data with simple states) to Unicode. 144 * 145 * Data structure and algorithm for converting from complex legacy codepages 146 * to Unicode. (Designed before 2000-may-22.) 147 * 148 * The basic idea is that the structure of legacy codepages can be described 149 * with state tables. 150 * When reading a byte stream, each input byte causes a state transition. 151 * Some transitions result in the output of a code point, some result in 152 * "unassigned" or "illegal" output. 153 * This is used here for character conversion. 154 * 155 * The data structure begins with a state table consisting of a row 156 * per state, with 256 entries (columns) per row for each possible input 157 * byte value. 158 * Each entry is 32 bits wide, with two formats distinguished by 159 * the sign bit (bit 31): 160 * 161 * One format for transitional entries (bit 31 not set) for non-final bytes, and 162 * one format for final entries (bit 31 set). 163 * Both formats contain the number of the next state in the same bit 164 * positions. 165 * State 0 is the initial state. 
 *
 * Most of the time, the offset values of subsequent states are added
 * up to a scalar value. This value will eventually be the index of
 * the Unicode code point in a table that follows the state table.
 * The effect is that the code points for final state table rows
 * are contiguous. The code points of final state rows follow each other
 * in the order of the references to those final states by previous
 * states, etc.
 *
 * For some terminal states, the offset is itself the output Unicode
 * code point: 16 bits for a BMP code point, or 20 bits for a supplementary
 * code point (stored as code point minus 0x10000 so that 20 bits are enough).
 * For others, the code point in the Unicode table is stored with either
 * one or two code units: one for BMP code points, two for a pair of
 * surrogates.
 * All code points for a final state entry take up the same number of code
 * units, regardless of whether they all actually _use_ the same number
 * of code units. This is necessary for simple array access.
 *
 * An additional feature comes in with what in ICU is called "fallback"
 * mappings:
 *
 * In addition to round-trippable, precise, 1:1 mappings, there are often
 * mappings defined between similar, though not the same, characters.
 * Typically, such mappings occur only in fromUnicode mapping tables because
 * Unicode has a superset repertoire of most other codepages. However, it
 * is possible to provide such mappings in the toUnicode tables, too.
 * In this case, the fallback mappings are partly integrated into the
 * general state tables because the structure of the encoding includes their
 * byte sequences.
 * For final entries in an initial state, fallback mappings are stored in
 * the entry itself like with roundtrip mappings.
 * For other final entries, they are stored in the code units table if
 * the entry is for a pair of code units.
200 * For single-unit results in the code units table, there is no space to 201 * alternatively hold a fallback mapping; in this case, the code unit 202 * is stored as U+fffe (unassigned), and the fallback mapping needs to 203 * be looked up by the scalar offset value in a separate table. 204 * 205 * "Unassigned" state entries really mean "structurally unassigned", 206 * i.e., such a byte sequence will never have a mapping result. 207 * 208 * The interpretation of the bits in each entry is as follows: 209 * 210 * Bit 31 not set, not a terminal entry ("transitional"): 211 * 30..24 next state 212 * 23..0 offset delta, to be added up 213 * 214 * Bit 31 set, terminal ("final") entry: 215 * 30..24 next state (regardless of action code) 216 * 23..20 action code: 217 * action codes 0 and 1 result in precise-mapping Unicode code points 218 * 0 valid byte sequence 219 * 19..16 not used, 0 220 * 15..0 16-bit Unicode BMP code point 221 * never U+fffe or U+ffff 222 * 1 valid byte sequence 223 * 19..0 20-bit Unicode supplementary code point 224 * never U+fffe or U+ffff 225 * 226 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 227 * 2 valid byte sequence (fallback) 228 * 19..16 not used, 0 229 * 15..0 16-bit Unicode BMP code point as fallback result 230 * 3 valid byte sequence (fallback) 231 * 19..0 20-bit Unicode supplementary code point as fallback result 232 * 233 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 234 * depending on the code units they result in 235 * 4 valid byte sequence 236 * 19..9 not used, 0 237 * 8..0 final offset delta 238 * pointing to one 16-bit code unit which may be 239 * fffe unassigned -- look for a fallback for this offset 240 * ffff illegal 241 * 5 valid byte sequence 242 * 19..9 not used, 0 243 * 8..0 final offset delta 244 * pointing to two 16-bit code units 245 * (typically UTF-16 surrogates) 246 * the result depends on the first code unit as follows: 247 * 0000..d7ff 
roundtrip BMP code point (1st alone) 248 * d800..dbff roundtrip surrogate pair (1st, 2nd) 249 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 250 * e000 roundtrip BMP code point (2nd alone) 251 * e001 fallback BMP code point (2nd alone) 252 * fffe unassigned 253 * ffff illegal 254 * (the final offset deltas are at most 255 * 2, 255 * times 2 because of storing code unit pairs) 256 * 257 * 6 unassigned byte sequence 258 * 19..16 not used, 0 259 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 260 * this does not contain a final offset delta because the main 261 * purpose of this action code is to save scalar offset values; 262 * therefore, fallback values cannot be assigned to byte 263 * sequences that result in this action code 264 * 7 illegal byte sequence 265 * 19..16 not used, 0 266 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 267 * 8 state change only 268 * 19..0 not used, 0 269 * useful for state changes in simple stateful encodings, 270 * at Shift-In/Shift-Out codes 271 * 272 * 273 * 9..15 reserved for future use 274 * current implementations will only perform a state change 275 * and ignore bits 19..0 276 * 277 * An encoding with contiguous ranges of unassigned byte sequences, like 278 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 279 * at least two states for the trail bytes: 280 * One trail byte state that results in code points, and one that only 281 * has "unassigned" and "illegal" terminal states. 282 * 283 * Note: partly by accident, this data structure supports simple stateful 284 * encodings without any additional logic. 285 * Currently, only simple Shift-In/Shift-Out schemes are handled with 286 * appropriate state tables (especially EBCDIC_STATEFUL!). 
287 * 288 * MBCS version 2 added: 289 * unassigned and illegal action codes have U+fffe and U+ffff 290 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 291 * 292 * Converting from Unicode to codepage bytes --------------------------------*** 293 * 294 * The conversion data structure for fromUnicode is designed for the known 295 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 296 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 297 * a roundtrip mapping. 298 * 299 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 300 * like in the character properties table. 301 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 302 * with the resulting bytes is at offsetFromUBytes. 303 * 304 * Beginning with version 4, single-byte codepages have a significantly different 305 * trie compared to other codepages. 306 * In all cases, the entry in stage 1 is directly the index of the block of 307 * 64 entries in stage 2. 308 * 309 * Single-byte lookup: 310 * 311 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 312 * Stage 3 contains one 16-bit word per result: 313 * Bits 15..8 indicate the kind of result: 314 * f roundtrip result 315 * c fallback result from private-use code point 316 * 8 fallback result from other code points 317 * 0 unassigned 318 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 319 * 320 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 321 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 322 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 323 * ASCII code points can be looked up with a linear array access into stage 3. 324 * See maxFastUChar and other details in ucnvmbcs.h. 
325 * 326 * Multi-byte lookup: 327 * 328 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 329 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 330 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 331 * If this test is false, then a non-zero result will be interpreted as 332 * a fallback mapping. 333 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 334 * 335 * Stage 3 contains 2, 3, or 4 bytes per result. 336 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 337 * while 3 bytes are stored as bytes in big-endian order. 338 * Leading zero bytes are ignored, and the number of bytes is counted. 339 * A zero byte mapping result is possible as a roundtrip result. 340 * For some output types, the actual result is processed from this; 341 * see ucnv_MBCSFromUnicodeWithOffsets(). 342 * 343 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 344 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 345 * 346 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 347 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 348 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 349 * ASCII code points can be looked up with a linear array access into stage 3. 350 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 351 * 352 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 353 * for compaction. 354 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 355 * may overlap by any number of entries. 
356 * 357 * MBCS version 2 added: 358 * the converter checks for known output types, which allows 359 * adding new ones without crashing an unaware converter 360 */ 361 362 static const UConverterImpl _SBCSUTF8Impl; 363 static const UConverterImpl _DBCSUTF8Impl; 364 365 /* GB 18030 data ------------------------------------------------------------ */ 366 367 /* helper macros for linear values for GB 18030 four-byte sequences */ 368 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 369 370 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 371 372 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 373 374 /* 375 * Some ranges of GB 18030 where both the Unicode code points and the 376 * GB four-byte sequences are contiguous and are handled algorithmically by 377 * the special callback functions below. 378 * The values are start & end of Unicode & GB codes. 379 * 380 * Note that single surrogates are not mapped by GB 18030 381 * as of the re-released mapping tables from 2000-nov-30. 
382 */ 383 static const uint32_t 384 gb18030Ranges[14][4]={ 385 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 386 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 387 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 388 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 389 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 390 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 391 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 392 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 393 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 394 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 395 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 396 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 397 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 398 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 399 }; 400 401 /* bit flag for UConverter.options indicating GB 18030 special handling */ 402 #define _MBCS_OPTION_GB18030 0x8000 403 404 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 405 #define _MBCS_OPTION_KEIS 0x01000 406 #define _MBCS_OPTION_JEF 0x02000 407 #define _MBCS_OPTION_JIPS 0x04000 408 409 #define KEIS_SO_CHAR_1 0x0A 410 #define KEIS_SO_CHAR_2 0x42 411 #define KEIS_SI_CHAR_1 0x0A 412 #define KEIS_SI_CHAR_2 0x41 413 414 #define JEF_SO_CHAR 0x28 415 #define JEF_SI_CHAR 0x29 416 417 #define JIPS_SO_CHAR_1 0x1A 418 #define JIPS_SO_CHAR_2 0x70 419 #define JIPS_SI_CHAR_1 0x1A 420 #define JIPS_SI_CHAR_2 0x71 421 422 enum SISO_Option { 423 SI, 424 SO 425 }; 426 typedef enum SISO_Option SISO_Option; 427 428 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 429 int32_t SISOLength = 0; 430 431 switch (option) { 432 case SI: 433 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 434 value[0] = KEIS_SI_CHAR_1; 435 value[1] = KEIS_SI_CHAR_2; 436 SISOLength = 2; 437 } else if 
((cnvOption&_MBCS_OPTION_JEF)!=0) { 438 value[0] = JEF_SI_CHAR; 439 SISOLength = 1; 440 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 441 value[0] = JIPS_SI_CHAR_1; 442 value[1] = JIPS_SI_CHAR_2; 443 SISOLength = 2; 444 } else { 445 value[0] = UCNV_SI; 446 SISOLength = 1; 447 } 448 break; 449 case SO: 450 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 451 value[0] = KEIS_SO_CHAR_1; 452 value[1] = KEIS_SO_CHAR_2; 453 SISOLength = 2; 454 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 455 value[0] = JEF_SO_CHAR; 456 SISOLength = 1; 457 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 458 value[0] = JIPS_SO_CHAR_1; 459 value[1] = JIPS_SO_CHAR_2; 460 SISOLength = 2; 461 } else { 462 value[0] = UCNV_SO; 463 SISOLength = 1; 464 } 465 break; 466 default: 467 /* Should never happen. */ 468 break; 469 } 470 471 return SISOLength; 472 } 473 474 /* Miscellaneous ------------------------------------------------------------ */ 475 476 /** 477 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 478 * consecutive sequences of bytes, starting from the one encoded in value, 479 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 480 * Does not currently support m:n mappings or reverse fallbacks. 481 * This function will not be called for sequences of bytes with leading zeros. 
482 * 483 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 484 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 485 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 486 * not map to anything 487 * @return TRUE to continue enumeration, FALSE to stop 488 */ 489 typedef UBool U_CALLCONV 490 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 491 492 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 493 static UBool 494 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 495 int32_t state, uint32_t offset, 496 uint32_t value, 497 UConverterEnumToUCallback *callback, const void *context, 498 UErrorCode *pErrorCode) { 499 UChar32 codePoints[32]; 500 const int32_t *row; 501 const uint16_t *unicodeCodeUnits; 502 UChar32 anyCodePoints; 503 int32_t b, limit; 504 505 row=mbcsTable->stateTable[state]; 506 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 507 508 value<<=8; 509 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 510 511 b=(stateProps[state]&0x38)<<2; 512 if(b==0 && stateProps[state]>=0x40) { 513 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 514 codePoints[0]=U_SENTINEL; 515 b=1; 516 } 517 limit=((stateProps[state]&7)+1)<<5; 518 while(b<limit) { 519 int32_t entry=row[b]; 520 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 521 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 522 if(stateProps[nextState]>=0) { 523 /* recurse to a state with non-ignorable actions */ 524 if(!enumToU( 525 mbcsTable, stateProps, nextState, 526 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 527 value|(uint32_t)b, 528 callback, context, 529 pErrorCode)) { 530 return FALSE; 531 } 532 } 533 codePoints[b&0x1f]=U_SENTINEL; 534 } else { 535 UChar32 c; 536 int32_t action; 537 538 /* 539 * An if-else-if chain provides more reliable performance for 540 * the most common cases compared to a switch. 
541 */ 542 action=MBCS_ENTRY_FINAL_ACTION(entry); 543 if(action==MBCS_STATE_VALID_DIRECT_16) { 544 /* output BMP code point */ 545 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 546 } else if(action==MBCS_STATE_VALID_16) { 547 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 548 c=unicodeCodeUnits[finalOffset]; 549 if(c<0xfffe) { 550 /* output BMP code point */ 551 } else { 552 c=U_SENTINEL; 553 } 554 } else if(action==MBCS_STATE_VALID_16_PAIR) { 555 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 556 c=unicodeCodeUnits[finalOffset++]; 557 if(c<0xd800) { 558 /* output BMP code point below 0xd800 */ 559 } else if(c<=0xdbff) { 560 /* output roundtrip or fallback supplementary code point */ 561 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 562 } else if(c==0xe000) { 563 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 564 c=unicodeCodeUnits[finalOffset]; 565 } else { 566 c=U_SENTINEL; 567 } 568 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 569 /* output supplementary code point */ 570 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 571 } else { 572 c=U_SENTINEL; 573 } 574 575 codePoints[b&0x1f]=c; 576 anyCodePoints&=c; 577 } 578 if(((++b)&0x1f)==0) { 579 if(anyCodePoints>=0) { 580 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 581 return FALSE; 582 } 583 anyCodePoints=-1; 584 } 585 } 586 } 587 return TRUE; 588 } 589 590 /* 591 * Only called if stateProps[state]==-1. 592 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 593 * MBCS_STATE_CHANGE_ONLY. 
594 */ 595 static int8_t 596 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 597 const int32_t *row; 598 int32_t min, max, entry, nextState; 599 600 row=stateTable[state]; 601 stateProps[state]=0; 602 603 /* find first non-ignorable state */ 604 for(min=0;; ++min) { 605 entry=row[min]; 606 nextState=MBCS_ENTRY_STATE(entry); 607 if(stateProps[nextState]==-1) { 608 getStateProp(stateTable, stateProps, nextState); 609 } 610 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 611 if(stateProps[nextState]>=0) { 612 break; 613 } 614 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 615 break; 616 } 617 if(min==0xff) { 618 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 619 return stateProps[state]; 620 } 621 } 622 stateProps[state]|=(int8_t)((min>>5)<<3); 623 624 /* find last non-ignorable state */ 625 for(max=0xff; min<max; --max) { 626 entry=row[max]; 627 nextState=MBCS_ENTRY_STATE(entry); 628 if(stateProps[nextState]==-1) { 629 getStateProp(stateTable, stateProps, nextState); 630 } 631 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 632 if(stateProps[nextState]>=0) { 633 break; 634 } 635 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 636 break; 637 } 638 } 639 stateProps[state]|=(int8_t)(max>>5); 640 641 /* recurse further and collect direct-state information */ 642 while(min<=max) { 643 entry=row[min]; 644 nextState=MBCS_ENTRY_STATE(entry); 645 if(stateProps[nextState]==-1) { 646 getStateProp(stateTable, stateProps, nextState); 647 } 648 if(MBCS_ENTRY_IS_FINAL(entry)) { 649 stateProps[nextState]|=0x40; 650 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 651 stateProps[state]|=0x40; 652 } 653 } 654 ++min; 655 } 656 return stateProps[state]; 657 } 658 659 /* 660 * Internal function enumerating the toUnicode data of an MBCS converter. 
661 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 662 * table, but could also be used for a future ucnv_getUnicodeSet() option 663 * that includes reverse fallbacks (after updating this function's implementation). 664 * Currently only handles roundtrip mappings. 665 * Does not currently handle extensions. 666 */ 667 static void 668 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 669 UConverterEnumToUCallback *callback, const void *context, 670 UErrorCode *pErrorCode) { 671 /* 672 * Properties for each state, to speed up the enumeration. 673 * Ignorable actions are unassigned/illegal/state-change-only: 674 * They do not lead to mappings. 675 * 676 * Bits 7..6: 677 * 1 direct/initial state (stateful converters have multiple) 678 * 0 non-initial state with transitions or with non-ignorable result actions 679 * -1 final state with only ignorable actions 680 * 681 * Bits 5..3: 682 * The lowest byte value with non-ignorable actions is 683 * value<<5 (rounded down). 684 * 685 * Bits 2..0: 686 * The highest byte value with non-ignorable actions is 687 * (value<<5)&0x1f (rounded up). 
688 */ 689 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 690 int32_t state; 691 692 uprv_memset(stateProps, -1, sizeof(stateProps)); 693 694 /* recurse from state 0 and set all stateProps */ 695 getStateProp(mbcsTable->stateTable, stateProps, 0); 696 697 for(state=0; state<mbcsTable->countStates; ++state) { 698 /*if(stateProps[state]==-1) { 699 printf("unused/unreachable <icu:state> %d\n", state); 700 }*/ 701 if(stateProps[state]>=0x40) { 702 /* start from each direct state */ 703 enumToU( 704 mbcsTable, stateProps, state, 0, 0, 705 callback, context, 706 pErrorCode); 707 } 708 } 709 } 710 711 U_CFUNC void 712 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 713 const USetAdder *sa, 714 UConverterUnicodeSet which, 715 UConverterSetFilter filter, 716 UErrorCode *pErrorCode) { 717 const UConverterMBCSTable *mbcsTable; 718 const uint16_t *table; 719 720 uint32_t st3; 721 uint16_t st1, maxStage1, st2; 722 723 UChar32 c; 724 725 /* enumerate the from-Unicode trie table */ 726 mbcsTable=&sharedData->mbcs; 727 table=mbcsTable->fromUnicodeTable; 728 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 729 maxStage1=0x440; 730 } else { 731 maxStage1=0x40; 732 } 733 734 c=0; /* keep track of the current code point while enumerating */ 735 736 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 737 const uint16_t *stage2, *stage3, *results; 738 uint16_t minValue; 739 740 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 741 742 /* 743 * Set a threshold variable for selecting which mappings to use. 744 * See ucnv_MBCSSingleFromBMPWithOffsets() and 745 * MBCS_SINGLE_RESULT_FROM_U() for details. 
746 */ 747 if(which==UCNV_ROUNDTRIP_SET) { 748 /* use only roundtrips */ 749 minValue=0xf00; 750 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 751 /* use all roundtrip and fallback results */ 752 minValue=0x800; 753 } 754 755 for(st1=0; st1<maxStage1; ++st1) { 756 st2=table[st1]; 757 if(st2>maxStage1) { 758 stage2=table+st2; 759 for(st2=0; st2<64; ++st2) { 760 if((st3=stage2[st2])!=0) { 761 /* read the stage 3 block */ 762 stage3=results+st3; 763 764 do { 765 if(*stage3++>=minValue) { 766 sa->add(sa->set, c); 767 } 768 } while((++c&0xf)!=0); 769 } else { 770 c+=16; /* empty stage 3 block */ 771 } 772 } 773 } else { 774 c+=1024; /* empty stage 2 block */ 775 } 776 } 777 } else { 778 const uint32_t *stage2; 779 const uint8_t *stage3, *bytes; 780 uint32_t st3Multiplier; 781 uint32_t value; 782 UBool useFallback; 783 784 bytes=mbcsTable->fromUnicodeBytes; 785 786 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 787 788 switch(mbcsTable->outputType) { 789 case MBCS_OUTPUT_3: 790 case MBCS_OUTPUT_4_EUC: 791 st3Multiplier=3; 792 break; 793 case MBCS_OUTPUT_4: 794 st3Multiplier=4; 795 break; 796 default: 797 st3Multiplier=2; 798 break; 799 } 800 801 for(st1=0; st1<maxStage1; ++st1) { 802 st2=table[st1]; 803 if(st2>(maxStage1>>1)) { 804 stage2=(const uint32_t *)table+st2; 805 for(st2=0; st2<64; ++st2) { 806 if((st3=stage2[st2])!=0) { 807 /* read the stage 3 block */ 808 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 809 810 /* get the roundtrip flags for the stage 3 block */ 811 st3>>=16; 812 813 /* 814 * Add code points for which the roundtrip flag is set, 815 * or which map to non-zero bytes if we use fallbacks. 816 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
817 */ 818 switch(filter) { 819 case UCNV_SET_FILTER_NONE: 820 do { 821 if(st3&1) { 822 sa->add(sa->set, c); 823 stage3+=st3Multiplier; 824 } else if(useFallback) { 825 uint8_t b=0; 826 switch(st3Multiplier) { 827 case 4: 828 b|=*stage3++; 829 case 3: /*fall through*/ 830 b|=*stage3++; 831 case 2: /*fall through*/ 832 b|=stage3[0]|stage3[1]; 833 stage3+=2; 834 default: 835 break; 836 } 837 if(b!=0) { 838 sa->add(sa->set, c); 839 } 840 } 841 st3>>=1; 842 } while((++c&0xf)!=0); 843 break; 844 case UCNV_SET_FILTER_DBCS_ONLY: 845 /* Ignore single-byte results (<0x100). */ 846 do { 847 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 848 sa->add(sa->set, c); 849 } 850 st3>>=1; 851 stage3+=2; /* +=st3Multiplier */ 852 } while((++c&0xf)!=0); 853 break; 854 case UCNV_SET_FILTER_2022_CN: 855 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 856 do { 857 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 858 sa->add(sa->set, c); 859 } 860 st3>>=1; 861 stage3+=3; /* +=st3Multiplier */ 862 } while((++c&0xf)!=0); 863 break; 864 case UCNV_SET_FILTER_SJIS: 865 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 866 do { 867 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 868 sa->add(sa->set, c); 869 } 870 st3>>=1; 871 stage3+=2; /* +=st3Multiplier */ 872 } while((++c&0xf)!=0); 873 break; 874 case UCNV_SET_FILTER_GR94DBCS: 875 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */ 876 do { 877 if( ((st3&1)!=0 || useFallback) && 878 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 879 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 880 ) { 881 sa->add(sa->set, c); 882 } 883 st3>>=1; 884 stage3+=2; /* +=st3Multiplier */ 885 } while((++c&0xf)!=0); 886 break; 887 case UCNV_SET_FILTER_HZ: 888 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). 
*/ 889 do { 890 if( ((st3&1)!=0 || useFallback) && 891 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 892 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 893 ) { 894 sa->add(sa->set, c); 895 } 896 st3>>=1; 897 stage3+=2; /* +=st3Multiplier */ 898 } while((++c&0xf)!=0); 899 break; 900 default: 901 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 902 return; 903 } 904 } else { 905 c+=16; /* empty stage 3 block */ 906 } 907 } 908 } else { 909 c+=1024; /* empty stage 2 block */ 910 } 911 } 912 } 913 914 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 915 } 916 917 U_CFUNC void 918 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 919 const USetAdder *sa, 920 UConverterUnicodeSet which, 921 UErrorCode *pErrorCode) { 922 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 923 sharedData, sa, which, 924 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 925 UCNV_SET_FILTER_DBCS_ONLY : 926 UCNV_SET_FILTER_NONE, 927 pErrorCode); 928 } 929 930 static void 931 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 932 const USetAdder *sa, 933 UConverterUnicodeSet which, 934 UErrorCode *pErrorCode) { 935 if(cnv->options&_MBCS_OPTION_GB18030) { 936 sa->addRange(sa->set, 0, 0xd7ff); 937 sa->addRange(sa->set, 0xe000, 0x10ffff); 938 } else { 939 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 940 } 941 } 942 943 /* conversion extensions for input not in the main table -------------------- */ 944 945 /* 946 * Hardcoded extension handling for GB 18030. 947 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 948 * 949 * In the future, conversion extensions may handle m:n mappings and delta tables, 950 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 951 * 952 * If an input character cannot be mapped, then these functions set an error 953 * code. The framework will then call the callback function. 
954 */ 955 956 /* 957 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 958 * else return 0 after output has been written to the target 959 */ 960 static UChar32 961 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 962 UChar32 cp, 963 const UChar **source, const UChar *sourceLimit, 964 uint8_t **target, const uint8_t *targetLimit, 965 int32_t **offsets, int32_t sourceIndex, 966 UBool flush, 967 UErrorCode *pErrorCode) { 968 const int32_t *cx; 969 970 cnv->useSubChar1=FALSE; 971 972 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 973 ucnv_extInitialMatchFromU( 974 cnv, cx, 975 cp, source, sourceLimit, 976 (char **)target, (char *)targetLimit, 977 offsets, sourceIndex, 978 flush, 979 pErrorCode) 980 ) { 981 return 0; /* an extension mapping handled the input */ 982 } 983 984 /* GB 18030 */ 985 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 986 const uint32_t *range; 987 int32_t i; 988 989 range=gb18030Ranges[0]; 990 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 991 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 992 /* found the Unicode code point, output the four-byte sequence for it */ 993 uint32_t linear; 994 char bytes[4]; 995 996 /* get the linear value of the first GB 18030 code in this range */ 997 linear=range[2]-LINEAR_18030_BASE; 998 999 /* add the offset from the beginning of the range */ 1000 linear+=((uint32_t)cp-range[0]); 1001 1002 /* turn this into a four-byte sequence */ 1003 bytes[3]=(char)(0x30+linear%10); linear/=10; 1004 bytes[2]=(char)(0x81+linear%126); linear/=126; 1005 bytes[1]=(char)(0x30+linear%10); linear/=10; 1006 bytes[0]=(char)(0x81+linear); 1007 1008 /* output this sequence */ 1009 ucnv_fromUWriteBytes(cnv, 1010 bytes, 4, (char **)target, (char *)targetLimit, 1011 offsets, sourceIndex, pErrorCode); 1012 return 0; 1013 } 1014 } 1015 } 1016 1017 /* no mapping */ 1018 *pErrorCode=U_INVALID_CHAR_FOUND; 1019 return cp; 1020 } 1021 1022 /* 1023 * Input sequence: 
cnv->toUBytes[0..length[ 1024 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1025 * else return 0 after output has been written to the target 1026 */ 1027 static int8_t 1028 _extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1029 int8_t length, 1030 const uint8_t **source, const uint8_t *sourceLimit, 1031 UChar **target, const UChar *targetLimit, 1032 int32_t **offsets, int32_t sourceIndex, 1033 UBool flush, 1034 UErrorCode *pErrorCode) { 1035 const int32_t *cx; 1036 1037 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1038 ucnv_extInitialMatchToU( 1039 cnv, cx, 1040 length, (const char **)source, (const char *)sourceLimit, 1041 target, targetLimit, 1042 offsets, sourceIndex, 1043 flush, 1044 pErrorCode) 1045 ) { 1046 return 0; /* an extension mapping handled the input */ 1047 } 1048 1049 /* GB 18030 */ 1050 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1051 const uint32_t *range; 1052 uint32_t linear; 1053 int32_t i; 1054 1055 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1056 range=gb18030Ranges[0]; 1057 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) { 1058 if(range[2]<=linear && linear<=range[3]) { 1059 /* found the sequence, output the Unicode code point for it */ 1060 *pErrorCode=U_ZERO_ERROR; 1061 1062 /* add the linear difference between the input and start sequences to the start code point */ 1063 linear=range[0]+(linear-range[2]); 1064 1065 /* output this code point */ 1066 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1067 1068 return 0; 1069 } 1070 } 1071 } 1072 1073 /* no mapping */ 1074 *pErrorCode=U_INVALID_CHAR_FOUND; 1075 return length; 1076 } 1077 1078 /* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1079 1080 /* 1081 * This code modifies a standard EBCDIC<->Unicode mapping table for 1082 * OS/390 (z/OS) Unix System Services (Open Edition). 
1083 * The difference is in the mapping of Line Feed and New Line control codes: 1084 * Standard EBCDIC maps 1085 * 1086 * <U000A> \x25 |0 1087 * <U0085> \x15 |0 1088 * 1089 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1090 * mapping 1091 * 1092 * <U000A> \x15 |0 1093 * <U0085> \x25 |0 1094 * 1095 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1096 * by copying it into allocated memory and swapping the LF and NL values. 1097 * It allows to support the same EBCDIC charset in both versions without 1098 * duplicating the entire installed table. 1099 */ 1100 1101 /* standard EBCDIC codes */ 1102 #define EBCDIC_LF 0x25 1103 #define EBCDIC_NL 0x15 1104 1105 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1106 #define EBCDIC_RT_LF 0xf25 1107 #define EBCDIC_RT_NL 0xf15 1108 1109 /* Unicode code points */ 1110 #define U_LF 0x0a 1111 #define U_NL 0x85 1112 1113 static UBool 1114 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1115 UConverterMBCSTable *mbcsTable; 1116 1117 const uint16_t *table, *results; 1118 const uint8_t *bytes; 1119 1120 int32_t (*newStateTable)[256]; 1121 uint16_t *newResults; 1122 uint8_t *p; 1123 char *name; 1124 1125 uint32_t stage2Entry; 1126 uint32_t size, sizeofFromUBytes; 1127 1128 mbcsTable=&sharedData->mbcs; 1129 1130 table=mbcsTable->fromUnicodeTable; 1131 bytes=mbcsTable->fromUnicodeBytes; 1132 results=(const uint16_t *)bytes; 1133 1134 /* 1135 * Check that this is an EBCDIC table with SBCS portion - 1136 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1137 * 1138 * If not, ignore the option. Options are always ignored if they do not apply. 
1139 */ 1140 if(!( 1141 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1142 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1143 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1144 )) { 1145 return FALSE; 1146 } 1147 1148 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1149 if(!( 1150 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1151 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1152 )) { 1153 return FALSE; 1154 } 1155 } else /* MBCS_OUTPUT_2_SISO */ { 1156 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1157 if(!( 1158 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1159 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1160 )) { 1161 return FALSE; 1162 } 1163 1164 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1165 if(!( 1166 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1167 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1168 )) { 1169 return FALSE; 1170 } 1171 } 1172 1173 if(mbcsTable->fromUBytesLength>0) { 1174 /* 1175 * We _know_ the number of bytes in the fromUnicodeBytes array 1176 * starting with header.version 4.1. 1177 */ 1178 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1179 } else { 1180 /* 1181 * Otherwise: 1182 * There used to be code to enumerate the fromUnicode 1183 * trie and find the highest entry, but it was removed in ICU 3.2 1184 * because it was not tested and caused a low code coverage number. 1185 * See Jitterbug 3674. 1186 * This affects only some .cnv file formats with a header.version 1187 * below 4.1, and only when swaplfnl is requested. 1188 * 1189 * ucnvmbcs.c revision 1.99 is the last one with the 1190 * ucnv_MBCSSizeofFromUBytes() function. 1191 */ 1192 *pErrorCode=U_INVALID_FORMAT_ERROR; 1193 return FALSE; 1194 } 1195 1196 /* 1197 * The table has an appropriate format. 
1198 * Allocate and build 1199 * - a modified to-Unicode state table 1200 * - a modified from-Unicode output array 1201 * - a converter name string with the swap option appended 1202 */ 1203 size= 1204 mbcsTable->countStates*1024+ 1205 sizeofFromUBytes+ 1206 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1207 p=(uint8_t *)uprv_malloc(size); 1208 if(p==NULL) { 1209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1210 return FALSE; 1211 } 1212 1213 /* copy and modify the to-Unicode state table */ 1214 newStateTable=(int32_t (*)[256])p; 1215 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1216 1217 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1218 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1219 1220 /* copy and modify the from-Unicode result table */ 1221 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1222 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1223 1224 /* conveniently, the table access macros work on the left side of expressions */ 1225 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1226 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1227 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1228 } else /* MBCS_OUTPUT_2_SISO */ { 1229 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1230 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1231 1232 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1233 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1234 } 1235 1236 /* set the canonical converter name */ 1237 name=(char *)newResults+sizeofFromUBytes; 1238 uprv_strcpy(name, sharedData->staticData->name); 1239 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1240 1241 /* set the pointers */ 1242 umtx_lock(NULL); 1243 if(mbcsTable->swapLFNLStateTable==NULL) { 1244 mbcsTable->swapLFNLStateTable=newStateTable; 1245 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1246 
mbcsTable->swapLFNLName=name; 1247 1248 newStateTable=NULL; 1249 } 1250 umtx_unlock(NULL); 1251 1252 /* release the allocated memory if another thread beat us to it */ 1253 if(newStateTable!=NULL) { 1254 uprv_free(newStateTable); 1255 } 1256 return TRUE; 1257 } 1258 1259 /* reconstitute omitted fromUnicode data ------------------------------------ */ 1260 1261 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1262 static UBool U_CALLCONV 1263 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1264 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1265 const uint16_t *table; 1266 uint32_t *stage2; 1267 uint8_t *bytes, *p; 1268 UChar32 c; 1269 int32_t i, st3; 1270 1271 table=mbcsTable->fromUnicodeTable; 1272 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1273 1274 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1275 switch(mbcsTable->outputType) { 1276 case MBCS_OUTPUT_3_EUC: 1277 if(value<=0xffff) { 1278 /* short sequences are stored directly */ 1279 /* code set 0 or 1 */ 1280 } else if(value<=0x8effff) { 1281 /* code set 2 */ 1282 value&=0x7fff; 1283 } else /* first byte is 0x8f */ { 1284 /* code set 3 */ 1285 value&=0xff7f; 1286 } 1287 break; 1288 case MBCS_OUTPUT_4_EUC: 1289 if(value<=0xffffff) { 1290 /* short sequences are stored directly */ 1291 /* code set 0 or 1 */ 1292 } else if(value<=0x8effffff) { 1293 /* code set 2 */ 1294 value&=0x7fffff; 1295 } else /* first byte is 0x8f */ { 1296 /* code set 3 */ 1297 value&=0xff7fff; 1298 } 1299 break; 1300 default: 1301 break; 1302 } 1303 1304 for(i=0; i<=0x1f; ++value, ++i) { 1305 c=codePoints[i]; 1306 if(c<0) { 1307 continue; 1308 } 1309 1310 /* locate the stage 2 & 3 data */ 1311 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1312 p=bytes; 1313 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1314 1315 /* write the codepage bytes into stage 3 */ 1316 switch(mbcsTable->outputType) { 1317 case MBCS_OUTPUT_3: 
1318 case MBCS_OUTPUT_4_EUC: 1319 p+=st3*3; 1320 p[0]=(uint8_t)(value>>16); 1321 p[1]=(uint8_t)(value>>8); 1322 p[2]=(uint8_t)value; 1323 break; 1324 case MBCS_OUTPUT_4: 1325 ((uint32_t *)p)[st3]=value; 1326 break; 1327 default: 1328 /* 2 bytes per character */ 1329 ((uint16_t *)p)[st3]=(uint16_t)value; 1330 break; 1331 } 1332 1333 /* set the roundtrip flag */ 1334 *stage2|=(1UL<<(16+(c&0xf))); 1335 } 1336 return TRUE; 1337 } 1338 1339 static void 1340 reconstituteData(UConverterMBCSTable *mbcsTable, 1341 uint32_t stage1Length, uint32_t stage2Length, 1342 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1343 UErrorCode *pErrorCode) { 1344 uint16_t *stage1; 1345 uint32_t *stage2; 1346 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1347 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1348 if(mbcsTable->reconstitutedData==NULL) { 1349 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1350 return; 1351 } 1352 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1353 1354 /* copy existing data and reroute the pointers */ 1355 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1356 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1357 1358 stage2=(uint32_t *)(stage1+stage1Length); 1359 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1360 mbcsTable->fromUnicodeTable+stage1Length, 1361 stage2Length*4); 1362 1363 mbcsTable->fromUnicodeTable=stage1; 1364 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length); 1365 1366 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1367 stage2=(uint32_t *)stage1; 1368 1369 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1370 { 1371 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1372 int32_t stageUTF8Index=0; 1373 int32_t st1, st2, st3, i; 1374 1375 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1376 st2=stage1[st1]; 1377 if(st2!=stage1Length/2) { 1378 /* each stage 2 block has 
64 entries corresponding to 16 entries in the mbcsIndex */ 1379 for(i=0; i<16; ++i) { 1380 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1381 if(st3!=0) { 1382 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1383 st3>>=4; 1384 /* 1385 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1386 * allocated together as a single 64-block for access from the mbcsIndex 1387 */ 1388 stage2[st2++]=st3++; 1389 stage2[st2++]=st3++; 1390 stage2[st2++]=st3++; 1391 stage2[st2++]=st3; 1392 } else { 1393 /* no stage 3 block, skip */ 1394 st2+=4; 1395 } 1396 } 1397 } else { 1398 /* no stage 2 block, skip */ 1399 stageUTF8Index+=16; 1400 } 1401 } 1402 } 1403 1404 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1405 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1406 } 1407 1408 /* MBCS setup functions ----------------------------------------------------- */ 1409 1410 static void 1411 ucnv_MBCSLoad(UConverterSharedData *sharedData, 1412 UConverterLoadArgs *pArgs, 1413 const uint8_t *raw, 1414 UErrorCode *pErrorCode) { 1415 UDataInfo info; 1416 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1417 _MBCSHeader *header=(_MBCSHeader *)raw; 1418 uint32_t offset; 1419 uint32_t headerLength; 1420 UBool noFromU=FALSE; 1421 1422 if(header->version[0]==4) { 1423 headerLength=MBCS_HEADER_V4_LENGTH; 1424 } else if(header->version[0]==5 && header->version[1]>=3 && 1425 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1426 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1427 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1428 } else { 1429 *pErrorCode=U_INVALID_TABLE_FORMAT; 1430 return; 1431 } 1432 1433 mbcsTable->outputType=(uint8_t)header->flags; 1434 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1435 *pErrorCode=U_INVALID_TABLE_FORMAT; 1436 return; 1437 } 1438 1439 /* extension data, header version 4.2 and higher */ 1440 offset=header->flags>>8; 1441 
if(offset!=0) { 1442 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1443 } 1444 1445 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1446 UConverterLoadArgs args={ 0 }; 1447 UConverterSharedData *baseSharedData; 1448 const int32_t *extIndexes; 1449 const char *baseName; 1450 1451 /* extension-only file, load the base table and set values appropriately */ 1452 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1453 /* extension-only file without extension */ 1454 *pErrorCode=U_INVALID_TABLE_FORMAT; 1455 return; 1456 } 1457 1458 if(pArgs->nestedLoads!=1) { 1459 /* an extension table must not be loaded as a base table */ 1460 *pErrorCode=U_INVALID_TABLE_FILE; 1461 return; 1462 } 1463 1464 /* load the base table */ 1465 baseName=(const char *)header+headerLength*4; 1466 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1467 /* forbid loading this same extension-only file */ 1468 *pErrorCode=U_INVALID_TABLE_FORMAT; 1469 return; 1470 } 1471 1472 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1473 args.size=sizeof(UConverterLoadArgs); 1474 args.nestedLoads=2; 1475 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1476 args.reserved=pArgs->reserved; 1477 args.options=pArgs->options; 1478 args.pkg=pArgs->pkg; 1479 args.name=baseName; 1480 baseSharedData=ucnv_load(&args, pErrorCode); 1481 if(U_FAILURE(*pErrorCode)) { 1482 return; 1483 } 1484 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1485 baseSharedData->mbcs.baseSharedData!=NULL 1486 ) { 1487 ucnv_unload(baseSharedData); 1488 *pErrorCode=U_INVALID_TABLE_FORMAT; 1489 return; 1490 } 1491 if(pArgs->onlyTestIsLoadable) { 1492 /* 1493 * Exit as soon as we know that we can load the converter 1494 * and the format is valid and supported. 1495 * The worst that can happen in the following code is a memory 1496 * allocation error. 
1497 */ 1498 ucnv_unload(baseSharedData); 1499 return; 1500 } 1501 1502 /* copy the base table data */ 1503 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1504 1505 /* overwrite values with relevant ones for the extension converter */ 1506 mbcsTable->baseSharedData=baseSharedData; 1507 mbcsTable->extIndexes=extIndexes; 1508 1509 /* 1510 * It would be possible to share the swapLFNL data with a base converter, 1511 * but the generated name would have to be different, and the memory 1512 * would have to be free'd only once. 1513 * It is easier to just create the data for the extension converter 1514 * separately when it is requested. 1515 */ 1516 mbcsTable->swapLFNLStateTable=NULL; 1517 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1518 mbcsTable->swapLFNLName=NULL; 1519 1520 /* 1521 * The reconstitutedData must be deleted only when the base converter 1522 * is unloaded. 1523 */ 1524 mbcsTable->reconstitutedData=NULL; 1525 1526 /* 1527 * Set a special, runtime-only outputType if the extension converter 1528 * is a DBCS version of a base converter that also maps single bytes. 
1529 */ 1530 if( sharedData->staticData->conversionType==UCNV_DBCS || 1531 (sharedData->staticData->conversionType==UCNV_MBCS && 1532 sharedData->staticData->minBytesPerChar>=2) 1533 ) { 1534 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1535 /* the base converter is SI/SO-stateful */ 1536 int32_t entry; 1537 1538 /* get the dbcs state from the state table entry for SO=0x0e */ 1539 entry=mbcsTable->stateTable[0][0xe]; 1540 if( MBCS_ENTRY_IS_FINAL(entry) && 1541 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1542 MBCS_ENTRY_FINAL_STATE(entry)!=0 1543 ) { 1544 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1545 1546 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1547 } 1548 } else if( 1549 baseSharedData->staticData->conversionType==UCNV_MBCS && 1550 baseSharedData->staticData->minBytesPerChar==1 && 1551 baseSharedData->staticData->maxBytesPerChar==2 && 1552 mbcsTable->countStates<=127 1553 ) { 1554 /* non-stateful base converter, need to modify the state table */ 1555 int32_t (*newStateTable)[256]; 1556 int32_t *state; 1557 int32_t i, count; 1558 1559 /* allocate a new state table and copy the base state table contents */ 1560 count=mbcsTable->countStates; 1561 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1562 if(newStateTable==NULL) { 1563 ucnv_unload(baseSharedData); 1564 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1565 return; 1566 } 1567 1568 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1569 1570 /* change all final single-byte entries to go to a new all-illegal state */ 1571 state=newStateTable[0]; 1572 for(i=0; i<256; ++i) { 1573 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1574 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1575 } 1576 } 1577 1578 /* build the new all-illegal state */ 1579 state=newStateTable[count]; 1580 for(i=0; i<256; ++i) { 1581 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1582 } 1583 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1584 
mbcsTable->countStates=(uint8_t)(count+1); 1585 mbcsTable->stateTableOwned=TRUE; 1586 1587 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1588 } 1589 } 1590 1591 /* 1592 * unlike below for files with base tables, do not get the unicodeMask 1593 * from the sharedData; instead, use the base table's unicodeMask, 1594 * which we copied in the memcpy above; 1595 * this is necessary because the static data unicodeMask, especially 1596 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1597 */ 1598 } else { 1599 /* conversion file with a base table; an additional extension table is optional */ 1600 /* make sure that the output type is known */ 1601 switch(mbcsTable->outputType) { 1602 case MBCS_OUTPUT_1: 1603 case MBCS_OUTPUT_2: 1604 case MBCS_OUTPUT_3: 1605 case MBCS_OUTPUT_4: 1606 case MBCS_OUTPUT_3_EUC: 1607 case MBCS_OUTPUT_4_EUC: 1608 case MBCS_OUTPUT_2_SISO: 1609 /* OK */ 1610 break; 1611 default: 1612 *pErrorCode=U_INVALID_TABLE_FORMAT; 1613 return; 1614 } 1615 if(pArgs->onlyTestIsLoadable) { 1616 /* 1617 * Exit as soon as we know that we can load the converter 1618 * and the format is valid and supported. 1619 * The worst that can happen in the following code is a memory 1620 * allocation error. 
1621 */ 1622 return; 1623 } 1624 1625 mbcsTable->countStates=(uint8_t)header->countStates; 1626 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1627 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1628 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1629 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1630 1631 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1632 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1633 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1634 1635 /* 1636 * converter versions 6.1 and up contain a unicodeMask that is 1637 * used here to select the most efficient function implementations 1638 */ 1639 info.size=sizeof(UDataInfo); 1640 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1641 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1642 /* mask off possible future extensions to be safe */ 1643 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1644 } else { 1645 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1646 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1647 } 1648 1649 /* 1650 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1651 * Check for the header version, SBCS vs. MBCS, and for whether the 1652 * data structures are optimized for code points as high as what the 1653 * runtime code is designed for. 1654 * The implementation does not handle mapping tables with entries for 1655 * unpaired surrogates. 1656 */ 1657 if( header->version[1]>=3 && 1658 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1659 (mbcsTable->countStates==1 ? 
1660 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1661 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1662 ) 1663 ) { 1664 mbcsTable->utf8Friendly=TRUE; 1665 1666 if(mbcsTable->countStates==1) { 1667 /* 1668 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1669 * Build a table with indexes to each block, to be used instead of 1670 * the regular stage 1/2 table. 1671 */ 1672 int32_t i; 1673 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1674 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1675 } 1676 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1677 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1678 } else { 1679 /* 1680 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1681 * The .cnv file is prebuilt with an additional stage table with indexes 1682 * to each block. 1683 */ 1684 mbcsTable->mbcsIndex=(const uint16_t *) 1685 (mbcsTable->fromUnicodeBytes+ 1686 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1687 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1688 } 1689 } 1690 1691 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1692 { 1693 uint32_t asciiRoundtrips=0xffffffff; 1694 int32_t i; 1695 1696 for(i=0; i<0x80; ++i) { 1697 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1698 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1699 } 1700 } 1701 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1702 } 1703 1704 if(noFromU) { 1705 uint32_t stage1Length= 1706 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
1707 0x440 : 0x40; 1708 uint32_t stage2Length= 1709 (header->offsetFromUBytes-header->offsetFromUTable)/4- 1710 stage1Length/2; 1711 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1712 } 1713 } 1714 1715 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1716 if(mbcsTable->utf8Friendly) { 1717 if(mbcsTable->countStates==1) { 1718 sharedData->impl=&_SBCSUTF8Impl; 1719 } else { 1720 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1721 sharedData->impl=&_DBCSUTF8Impl; 1722 } 1723 } 1724 } 1725 1726 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1727 /* 1728 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1729 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 1730 */ 1731 mbcsTable->asciiRoundtrips=0; 1732 } 1733 } 1734 1735 static void 1736 ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1737 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1738 1739 if(mbcsTable->swapLFNLStateTable!=NULL) { 1740 uprv_free(mbcsTable->swapLFNLStateTable); 1741 } 1742 if(mbcsTable->stateTableOwned) { 1743 uprv_free((void *)mbcsTable->stateTable); 1744 } 1745 if(mbcsTable->baseSharedData!=NULL) { 1746 ucnv_unload(mbcsTable->baseSharedData); 1747 } 1748 if(mbcsTable->reconstitutedData!=NULL) { 1749 uprv_free(mbcsTable->reconstitutedData); 1750 } 1751 } 1752 1753 static void 1754 ucnv_MBCSOpen(UConverter *cnv, 1755 UConverterLoadArgs *pArgs, 1756 UErrorCode *pErrorCode) { 1757 UConverterMBCSTable *mbcsTable; 1758 const int32_t *extIndexes; 1759 uint8_t outputType; 1760 int8_t maxBytesPerUChar; 1761 1762 if(pArgs->onlyTestIsLoadable) { 1763 return; 1764 } 1765 1766 mbcsTable=&cnv->sharedData->mbcs; 1767 outputType=mbcsTable->outputType; 1768 1769 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1770 /* the swaplfnl option does not apply, remove it */ 1771 
cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1772 } 1773 1774 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1775 /* do this because double-checked locking is broken */ 1776 UBool isCached; 1777 1778 umtx_lock(NULL); 1779 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1780 umtx_unlock(NULL); 1781 1782 if(!isCached) { 1783 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1784 if(U_FAILURE(*pErrorCode)) { 1785 return; /* something went wrong */ 1786 } 1787 1788 /* the option does not apply, remove it */ 1789 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1790 } 1791 } 1792 } 1793 1794 if(uprv_strstr(pArgs->name, "18030")!=NULL) { 1795 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1796 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1797 cnv->options|=_MBCS_OPTION_GB18030; 1798 } 1799 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) { 1800 /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1801 cnv->options|=_MBCS_OPTION_KEIS; 1802 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) { 1803 /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1804 cnv->options|=_MBCS_OPTION_JEF; 1805 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) { 1806 /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1807 cnv->options|=_MBCS_OPTION_JIPS; 1808 } 1809 1810 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1811 if(outputType==MBCS_OUTPUT_2_SISO) { 1812 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1813 } 1814 1815 extIndexes=mbcsTable->extIndexes; 1816 if(extIndexes!=NULL) { 1817 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1818 if(outputType==MBCS_OUTPUT_2_SISO) { 1819 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1820 } 1821 1822 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1823 cnv->maxBytesPerUChar=maxBytesPerUChar; 1824 } 1825 } 1826 1827 #if 0 1828 /* 1829 * documentation of UConverter fields used for status 1830 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1831 */ 1832 1833 /* toUnicode */ 1834 cnv->toUnicodeStatus=0; /* offset */ 1835 cnv->mode=0; /* state */ 1836 cnv->toULength=0; /* byteIndex */ 1837 1838 /* fromUnicode */ 1839 cnv->fromUChar32=0; 1840 cnv->fromUnicodeStatus=1; /* prevLength */ 1841 #endif 1842 } 1843 1844 static const char * 1845 ucnv_MBCSGetName(const UConverter *cnv) { 1846 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1847 return cnv->sharedData->mbcs.swapLFNLName; 1848 } else { 1849 return cnv->sharedData->staticData->name; 1850 } 1851 } 1852 1853 /* MBCS-to-Unicode conversion functions ------------------------------------- */ 1854 1855 static UChar32 1856 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 1857 const _MBCSToUFallback *toUFallbacks; 1858 uint32_t i, start, limit; 1859 1860 limit=mbcsTable->countToUFallbacks; 1861 if(limit>0) { 1862 /* do a binary search for the fallback mapping */ 1863 toUFallbacks=mbcsTable->toUFallbacks; 1864 start=0; 1865 while(start<limit-1) { 1866 i=(start+limit)/2; 1867 if(offset<toUFallbacks[i].offset) { 1868 limit=i; 1869 } else { 1870 start=i; 1871 } 1872 } 1873 1874 /* did we really find it? 
*/ 1875 if(offset==toUFallbacks[start].offset) { 1876 return toUFallbacks[start].codePoint; 1877 } 1878 } 1879 1880 return 0xfffe; 1881 } 1882 1883 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ 1884 static void 1885 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1886 UErrorCode *pErrorCode) { 1887 UConverter *cnv; 1888 const uint8_t *source, *sourceLimit; 1889 UChar *target; 1890 const UChar *targetLimit; 1891 int32_t *offsets; 1892 1893 const int32_t (*stateTable)[256]; 1894 1895 int32_t sourceIndex; 1896 1897 int32_t entry; 1898 UChar c; 1899 uint8_t action; 1900 1901 /* set up the local pointers */ 1902 cnv=pArgs->converter; 1903 source=(const uint8_t *)pArgs->source; 1904 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1905 target=pArgs->target; 1906 targetLimit=pArgs->targetLimit; 1907 offsets=pArgs->offsets; 1908 1909 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1910 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 1911 } else { 1912 stateTable=cnv->sharedData->mbcs.stateTable; 1913 } 1914 1915 /* sourceIndex=-1 if the current character began in the previous buffer */ 1916 sourceIndex=0; 1917 1918 /* conversion loop */ 1919 while(source<sourceLimit) { 1920 /* 1921 * This following test is to see if available input would overflow the output. 1922 * It does not catch output of more than one code unit that 1923 * overflows as a result of a surrogate pair or callback output 1924 * from the last source byte. 1925 * Therefore, those situations also test for overflows and will 1926 * then break the loop, too. 
1927 */ 1928 if(target>=targetLimit) { 1929 /* target is full */ 1930 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1931 break; 1932 } 1933 1934 entry=stateTable[0][*source++]; 1935 /* MBCS_ENTRY_IS_FINAL(entry) */ 1936 1937 /* test the most common case first */ 1938 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 1939 /* output BMP code point */ 1940 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1941 if(offsets!=NULL) { 1942 *offsets++=sourceIndex; 1943 } 1944 1945 /* normal end of action codes: prepare for a new character */ 1946 ++sourceIndex; 1947 continue; 1948 } 1949 1950 /* 1951 * An if-else-if chain provides more reliable performance for 1952 * the most common cases compared to a switch. 1953 */ 1954 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 1955 if(action==MBCS_STATE_VALID_DIRECT_20 || 1956 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 1957 ) { 1958 entry=MBCS_ENTRY_FINAL_VALUE(entry); 1959 /* output surrogate pair */ 1960 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 1961 if(offsets!=NULL) { 1962 *offsets++=sourceIndex; 1963 } 1964 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 1965 if(target<targetLimit) { 1966 *target++=c; 1967 if(offsets!=NULL) { 1968 *offsets++=sourceIndex; 1969 } 1970 } else { 1971 /* target overflow */ 1972 cnv->UCharErrorBuffer[0]=c; 1973 cnv->UCharErrorBufferLength=1; 1974 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1975 break; 1976 } 1977 1978 ++sourceIndex; 1979 continue; 1980 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 1981 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 1982 /* output BMP code point */ 1983 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 1984 if(offsets!=NULL) { 1985 *offsets++=sourceIndex; 1986 } 1987 1988 ++sourceIndex; 1989 continue; 1990 } 1991 } else if(action==MBCS_STATE_UNASSIGNED) { 1992 /* just fall through */ 1993 } else if(action==MBCS_STATE_ILLEGAL) { 1994 /* callback(illegal) */ 1995 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1996 } else { 1997 /* reserved, must never occur */ 1998 
++sourceIndex; 1999 continue; 2000 } 2001 2002 if(U_FAILURE(*pErrorCode)) { 2003 /* callback(illegal) */ 2004 break; 2005 } else /* unassigned sequences indicated with byteIndex>0 */ { 2006 /* try an extension mapping */ 2007 pArgs->source=(const char *)source; 2008 cnv->toUBytes[0]=*(source-1); 2009 cnv->toULength=_extToU(cnv, cnv->sharedData, 2010 1, &source, sourceLimit, 2011 &target, targetLimit, 2012 &offsets, sourceIndex, 2013 pArgs->flush, 2014 pErrorCode); 2015 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 2016 2017 if(U_FAILURE(*pErrorCode)) { 2018 /* not mappable or buffer overflow */ 2019 break; 2020 } 2021 } 2022 } 2023 2024 /* write back the updated pointers */ 2025 pArgs->source=(const char *)source; 2026 pArgs->target=target; 2027 pArgs->offsets=offsets; 2028 } 2029 2030 /* 2031 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2032 * that only map to and from the BMP. 2033 * In addition to single-byte optimizations, the offset calculations 2034 * become much easier. 
2035 */ 2036 static void 2037 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2038 UErrorCode *pErrorCode) { 2039 UConverter *cnv; 2040 const uint8_t *source, *sourceLimit, *lastSource; 2041 UChar *target; 2042 int32_t targetCapacity, length; 2043 int32_t *offsets; 2044 2045 const int32_t (*stateTable)[256]; 2046 2047 int32_t sourceIndex; 2048 2049 int32_t entry; 2050 uint8_t action; 2051 2052 /* set up the local pointers */ 2053 cnv=pArgs->converter; 2054 source=(const uint8_t *)pArgs->source; 2055 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2056 target=pArgs->target; 2057 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2058 offsets=pArgs->offsets; 2059 2060 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2061 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2062 } else { 2063 stateTable=cnv->sharedData->mbcs.stateTable; 2064 } 2065 2066 /* sourceIndex=-1 if the current character began in the previous buffer */ 2067 sourceIndex=0; 2068 lastSource=source; 2069 2070 /* 2071 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 2072 * for the minimum of the sourceLength and targetCapacity 2073 */ 2074 length=(int32_t)(sourceLimit-source); 2075 if(length<targetCapacity) { 2076 targetCapacity=length; 2077 } 2078 2079 #if MBCS_UNROLL_SINGLE_TO_BMP 2080 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2081 /* unroll the loop with the most common case */ 2082 unrolled: 2083 if(targetCapacity>=16) { 2084 int32_t count, loops, oredEntries; 2085 2086 loops=count=targetCapacity>>4; 2087 do { 2088 oredEntries=entry=stateTable[0][*source++]; 2089 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2090 oredEntries|=entry=stateTable[0][*source++]; 2091 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2092 oredEntries|=entry=stateTable[0][*source++]; 2093 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2094 oredEntries|=entry=stateTable[0][*source++]; 2095 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2096 oredEntries|=entry=stateTable[0][*source++]; 2097 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2098 oredEntries|=entry=stateTable[0][*source++]; 2099 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2100 oredEntries|=entry=stateTable[0][*source++]; 2101 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2102 oredEntries|=entry=stateTable[0][*source++]; 2103 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2104 oredEntries|=entry=stateTable[0][*source++]; 2105 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2106 oredEntries|=entry=stateTable[0][*source++]; 2107 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2108 oredEntries|=entry=stateTable[0][*source++]; 2109 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2110 oredEntries|=entry=stateTable[0][*source++]; 2111 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2112 oredEntries|=entry=stateTable[0][*source++]; 2113 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2114 oredEntries|=entry=stateTable[0][*source++]; 2115 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2116 oredEntries|=entry=stateTable[0][*source++]; 2117 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2118 oredEntries|=entry=stateTable[0][*source++]; 2119 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2120 2121 /* were all 16 entries really valid? 
*/ 2122 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2123 /* no, return to the first of these 16 */ 2124 source-=16; 2125 target-=16; 2126 break; 2127 } 2128 } while(--count>0); 2129 count=loops-count; 2130 targetCapacity-=16*count; 2131 2132 if(offsets!=NULL) { 2133 lastSource+=16*count; 2134 while(count>0) { 2135 *offsets++=sourceIndex++; 2136 *offsets++=sourceIndex++; 2137 *offsets++=sourceIndex++; 2138 *offsets++=sourceIndex++; 2139 *offsets++=sourceIndex++; 2140 *offsets++=sourceIndex++; 2141 *offsets++=sourceIndex++; 2142 *offsets++=sourceIndex++; 2143 *offsets++=sourceIndex++; 2144 *offsets++=sourceIndex++; 2145 *offsets++=sourceIndex++; 2146 *offsets++=sourceIndex++; 2147 *offsets++=sourceIndex++; 2148 *offsets++=sourceIndex++; 2149 *offsets++=sourceIndex++; 2150 *offsets++=sourceIndex++; 2151 --count; 2152 } 2153 } 2154 } 2155 #endif 2156 2157 /* conversion loop */ 2158 while(targetCapacity > 0 && source < sourceLimit) { 2159 entry=stateTable[0][*source++]; 2160 /* MBCS_ENTRY_IS_FINAL(entry) */ 2161 2162 /* test the most common case first */ 2163 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2164 /* output BMP code point */ 2165 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2166 --targetCapacity; 2167 continue; 2168 } 2169 2170 /* 2171 * An if-else-if chain provides more reliable performance for 2172 * the most common cases compared to a switch. 
2173 */ 2174 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2175 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2176 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2177 /* output BMP code point */ 2178 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2179 --targetCapacity; 2180 continue; 2181 } 2182 } else if(action==MBCS_STATE_UNASSIGNED) { 2183 /* just fall through */ 2184 } else if(action==MBCS_STATE_ILLEGAL) { 2185 /* callback(illegal) */ 2186 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2187 } else { 2188 /* reserved, must never occur */ 2189 continue; 2190 } 2191 2192 /* set offsets since the start or the last extension */ 2193 if(offsets!=NULL) { 2194 int32_t count=(int32_t)(source-lastSource); 2195 2196 /* predecrement: do not set the offset for the callback-causing character */ 2197 while(--count>0) { 2198 *offsets++=sourceIndex++; 2199 } 2200 /* offset and sourceIndex are now set for the current character */ 2201 } 2202 2203 if(U_FAILURE(*pErrorCode)) { 2204 /* callback(illegal) */ 2205 break; 2206 } else /* unassigned sequences indicated with byteIndex>0 */ { 2207 /* try an extension mapping */ 2208 lastSource=source; 2209 cnv->toUBytes[0]=*(source-1); 2210 cnv->toULength=_extToU(cnv, cnv->sharedData, 2211 1, &source, sourceLimit, 2212 &target, pArgs->targetLimit, 2213 &offsets, sourceIndex, 2214 pArgs->flush, 2215 pErrorCode); 2216 sourceIndex+=1+(int32_t)(source-lastSource); 2217 2218 if(U_FAILURE(*pErrorCode)) { 2219 /* not mappable or buffer overflow */ 2220 break; 2221 } 2222 2223 /* recalculate the targetCapacity after an extension mapping */ 2224 targetCapacity=(int32_t)(pArgs->targetLimit-target); 2225 length=(int32_t)(sourceLimit-source); 2226 if(length<targetCapacity) { 2227 targetCapacity=length; 2228 } 2229 } 2230 2231 #if MBCS_UNROLL_SINGLE_TO_BMP 2232 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2233 goto unrolled; 2234 #endif 2235 } 2236 2237 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2238 /* target is full 
*/ 2239 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2240 } 2241 2242 /* set offsets since the start or the last callback */ 2243 if(offsets!=NULL) { 2244 size_t count=source-lastSource; 2245 while(count>0) { 2246 *offsets++=sourceIndex++; 2247 --count; 2248 } 2249 } 2250 2251 /* write back the updated pointers */ 2252 pArgs->source=(const char *)source; 2253 pArgs->target=target; 2254 pArgs->offsets=offsets; 2255 } 2256 2257 static UBool 2258 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2259 const int32_t *row=stateTable[state]; 2260 int32_t b, entry; 2261 /* First test for final entries in this state for some commonly valid byte values. */ 2262 entry=row[0xa1]; 2263 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2264 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2265 ) { 2266 return TRUE; 2267 } 2268 entry=row[0x41]; 2269 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2270 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2271 ) { 2272 return TRUE; 2273 } 2274 /* Then test for final entries in this state. */ 2275 for(b=0; b<=0xff; ++b) { 2276 entry=row[b]; 2277 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2278 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2279 ) { 2280 return TRUE; 2281 } 2282 } 2283 /* Then recurse for transition entries. */ 2284 for(b=0; b<=0xff; ++b) { 2285 entry=row[b]; 2286 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2287 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 2288 ) { 2289 return TRUE; 2290 } 2291 } 2292 return FALSE; 2293 } 2294 2295 /* 2296 * Is byte b a single/lead byte in this state? 2297 * Recurse for transition states, because here we don't want to say that 2298 * b is a lead byte if all byte sequences that start with b are illegal. 
2299 */ 2300 static UBool 2301 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2302 const int32_t *row=stateTable[state]; 2303 int32_t entry=row[b]; 2304 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2305 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 2306 } else { 2307 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2308 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2309 return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2310 } else { 2311 return action!=MBCS_STATE_ILLEGAL; 2312 } 2313 } 2314 } 2315 2316 U_CFUNC void 2317 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2318 UErrorCode *pErrorCode) { 2319 UConverter *cnv; 2320 const uint8_t *source, *sourceLimit; 2321 UChar *target; 2322 const UChar *targetLimit; 2323 int32_t *offsets; 2324 2325 const int32_t (*stateTable)[256]; 2326 const uint16_t *unicodeCodeUnits; 2327 2328 uint32_t offset; 2329 uint8_t state; 2330 int8_t byteIndex; 2331 uint8_t *bytes; 2332 2333 int32_t sourceIndex, nextSourceIndex; 2334 2335 int32_t entry; 2336 UChar c; 2337 uint8_t action; 2338 2339 /* use optimized function if possible */ 2340 cnv=pArgs->converter; 2341 2342 if(cnv->preToULength>0) { 2343 /* 2344 * pass sourceIndex=-1 because we continue from an earlier buffer 2345 * in the future, this may change with continuous offsets 2346 */ 2347 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2348 2349 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2350 return; 2351 } 2352 } 2353 2354 if(cnv->sharedData->mbcs.countStates==1) { 2355 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2356 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2357 } else { 2358 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2359 } 2360 return; 2361 } 2362 2363 /* set up the local pointers */ 2364 source=(const uint8_t *)pArgs->source; 2365 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2366 
target=pArgs->target; 2367 targetLimit=pArgs->targetLimit; 2368 offsets=pArgs->offsets; 2369 2370 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2371 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2372 } else { 2373 stateTable=cnv->sharedData->mbcs.stateTable; 2374 } 2375 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2376 2377 /* get the converter state from UConverter */ 2378 offset=cnv->toUnicodeStatus; 2379 byteIndex=cnv->toULength; 2380 bytes=cnv->toUBytes; 2381 2382 /* 2383 * if we are in the SBCS state for a DBCS-only converter, 2384 * then load the DBCS state from the MBCS data 2385 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2386 */ 2387 if((state=(uint8_t)(cnv->mode))==0) { 2388 state=cnv->sharedData->mbcs.dbcsOnlyState; 2389 } 2390 2391 /* sourceIndex=-1 if the current character began in the previous buffer */ 2392 sourceIndex=byteIndex==0 ? 0 : -1; 2393 nextSourceIndex=0; 2394 2395 /* conversion loop */ 2396 while(source<sourceLimit) { 2397 /* 2398 * This following test is to see if available input would overflow the output. 2399 * It does not catch output of more than one code unit that 2400 * overflows as a result of a surrogate pair or callback output 2401 * from the last source byte. 2402 * Therefore, those situations also test for overflows and will 2403 * then break the loop, too. 
2404 */ 2405 if(target>=targetLimit) { 2406 /* target is full */ 2407 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2408 break; 2409 } 2410 2411 if(byteIndex==0) { 2412 /* optimized loop for 1/2-byte input and BMP output */ 2413 if(offsets==NULL) { 2414 do { 2415 entry=stateTable[state][*source]; 2416 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2417 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2418 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2419 2420 ++source; 2421 if( source<sourceLimit && 2422 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2423 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2424 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2425 ) { 2426 ++source; 2427 *target++=c; 2428 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2429 offset=0; 2430 } else { 2431 /* set the state and leave the optimized loop */ 2432 bytes[0]=*(source-1); 2433 byteIndex=1; 2434 break; 2435 } 2436 } else { 2437 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2438 /* output BMP code point */ 2439 ++source; 2440 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2441 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2442 } else { 2443 /* leave the optimized loop */ 2444 break; 2445 } 2446 } 2447 } while(source<sourceLimit && target<targetLimit); 2448 } else /* offsets!=NULL */ { 2449 do { 2450 entry=stateTable[state][*source]; 2451 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2452 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2453 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2454 2455 ++source; 2456 if( source<sourceLimit && 2457 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2458 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2459 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2460 ) { 2461 ++source; 2462 *target++=c; 2463 if(offsets!=NULL) { 2464 *offsets++=sourceIndex; 2465 sourceIndex=(nextSourceIndex+=2); 2466 } 2467 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 2468 offset=0; 2469 } else { 2470 /* set the state and leave the optimized loop */ 2471 ++nextSourceIndex; 2472 bytes[0]=*(source-1); 2473 byteIndex=1; 2474 break; 2475 } 2476 } else { 2477 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2478 /* output BMP code point */ 2479 ++source; 2480 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2481 if(offsets!=NULL) { 2482 *offsets++=sourceIndex; 2483 sourceIndex=++nextSourceIndex; 2484 } 2485 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2486 } else { 2487 /* leave the optimized loop */ 2488 break; 2489 } 2490 } 2491 } while(source<sourceLimit && target<targetLimit); 2492 } 2493 2494 /* 2495 * these tests and break statements could be put inside the loop 2496 * if C had "break outerLoop" like Java 2497 */ 2498 if(source>=sourceLimit) { 2499 break; 2500 } 2501 if(target>=targetLimit) { 2502 /* target is full */ 2503 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2504 break; 2505 } 2506 2507 ++nextSourceIndex; 2508 bytes[byteIndex++]=*source++; 2509 } else /* byteIndex>0 */ { 2510 ++nextSourceIndex; 2511 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2512 } 2513 2514 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2515 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2516 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2517 continue; 2518 } 2519 2520 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2521 cnv->mode=state; 2522 2523 /* set the next state early so that we can reuse the entry variable */ 2524 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2525 2526 /* 2527 * An if-else-if chain provides more reliable performance for 2528 * the most common cases compared to a switch. 
2529 */ 2530 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2531 if(action==MBCS_STATE_VALID_16) { 2532 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2533 c=unicodeCodeUnits[offset]; 2534 if(c<0xfffe) { 2535 /* output BMP code point */ 2536 *target++=c; 2537 if(offsets!=NULL) { 2538 *offsets++=sourceIndex; 2539 } 2540 byteIndex=0; 2541 } else if(c==0xfffe) { 2542 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2543 /* output fallback BMP code point */ 2544 *target++=(UChar)entry; 2545 if(offsets!=NULL) { 2546 *offsets++=sourceIndex; 2547 } 2548 byteIndex=0; 2549 } 2550 } else { 2551 /* callback(illegal) */ 2552 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2553 } 2554 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2555 /* output BMP code point */ 2556 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2557 if(offsets!=NULL) { 2558 *offsets++=sourceIndex; 2559 } 2560 byteIndex=0; 2561 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2562 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2563 c=unicodeCodeUnits[offset++]; 2564 if(c<0xd800) { 2565 /* output BMP code point below 0xd800 */ 2566 *target++=c; 2567 if(offsets!=NULL) { 2568 *offsets++=sourceIndex; 2569 } 2570 byteIndex=0; 2571 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2572 /* output roundtrip or fallback surrogate pair */ 2573 *target++=(UChar)(c&0xdbff); 2574 if(offsets!=NULL) { 2575 *offsets++=sourceIndex; 2576 } 2577 byteIndex=0; 2578 if(target<targetLimit) { 2579 *target++=unicodeCodeUnits[offset]; 2580 if(offsets!=NULL) { 2581 *offsets++=sourceIndex; 2582 } 2583 } else { 2584 /* target overflow */ 2585 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2586 cnv->UCharErrorBufferLength=1; 2587 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2588 2589 offset=0; 2590 break; 2591 } 2592 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2593 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2594 *target++=unicodeCodeUnits[offset]; 2595 if(offsets!=NULL) { 2596 *offsets++=sourceIndex; 2597 } 2598 byteIndex=0; 2599 } else if(c==0xffff) { 2600 /* callback(illegal) */ 2601 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2602 } 2603 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2604 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2605 ) { 2606 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2607 /* output surrogate pair */ 2608 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2609 if(offsets!=NULL) { 2610 *offsets++=sourceIndex; 2611 } 2612 byteIndex=0; 2613 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2614 if(target<targetLimit) { 2615 *target++=c; 2616 if(offsets!=NULL) { 2617 *offsets++=sourceIndex; 2618 } 2619 } else { 2620 /* target overflow */ 2621 cnv->UCharErrorBuffer[0]=c; 2622 cnv->UCharErrorBufferLength=1; 2623 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2624 2625 offset=0; 2626 break; 2627 } 2628 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2629 /* 2630 * This serves as a state change without any output. 2631 * It is useful for reading simple stateful encodings, 2632 * for example using just Shift-In/Shift-Out codes. 2633 * The 21 unused bits may later be used for more sophisticated 2634 * state transitions. 
2635 */ 2636 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2637 byteIndex=0; 2638 } else { 2639 /* SI/SO are illegal for DBCS-only conversion */ 2640 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2641 2642 /* callback(illegal) */ 2643 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2644 } 2645 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2646 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2647 /* output BMP code point */ 2648 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2649 if(offsets!=NULL) { 2650 *offsets++=sourceIndex; 2651 } 2652 byteIndex=0; 2653 } 2654 } else if(action==MBCS_STATE_UNASSIGNED) { 2655 /* just fall through */ 2656 } else if(action==MBCS_STATE_ILLEGAL) { 2657 /* callback(illegal) */ 2658 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2659 } else { 2660 /* reserved, must never occur */ 2661 byteIndex=0; 2662 } 2663 2664 /* end of action codes: prepare for a new character */ 2665 offset=0; 2666 2667 if(byteIndex==0) { 2668 sourceIndex=nextSourceIndex; 2669 } else if(U_FAILURE(*pErrorCode)) { 2670 /* callback(illegal) */ 2671 if(byteIndex>1) { 2672 /* 2673 * Ticket 5691: consistent illegal sequences: 2674 * - We include at least the first byte in the illegal sequence. 2675 * - If any of the non-initial bytes could be the start of a character, 2676 * we stop the illegal sequence before the first one of those. 2677 */ 2678 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 2679 int8_t i; 2680 for(i=1; 2681 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2682 ++i) {} 2683 if(i<byteIndex) { 2684 /* Back out some bytes. */ 2685 int8_t backOutDistance=byteIndex-i; 2686 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2687 byteIndex=i; /* length of reported illegal byte sequence */ 2688 if(backOutDistance<=bytesFromThisBuffer) { 2689 source-=backOutDistance; 2690 } else { 2691 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2692 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2693 /* preToULength is negative! */ 2694 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2695 source=(const uint8_t *)pArgs->source; 2696 } 2697 } 2698 } 2699 break; 2700 } else /* unassigned sequences indicated with byteIndex>0 */ { 2701 /* try an extension mapping */ 2702 pArgs->source=(const char *)source; 2703 byteIndex=_extToU(cnv, cnv->sharedData, 2704 byteIndex, &source, sourceLimit, 2705 &target, targetLimit, 2706 &offsets, sourceIndex, 2707 pArgs->flush, 2708 pErrorCode); 2709 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2710 2711 if(U_FAILURE(*pErrorCode)) { 2712 /* not mappable or buffer overflow */ 2713 break; 2714 } 2715 } 2716 } 2717 2718 /* set the converter state back into UConverter */ 2719 cnv->toUnicodeStatus=offset; 2720 cnv->mode=state; 2721 cnv->toULength=byteIndex; 2722 2723 /* write back the updated pointers */ 2724 pArgs->source=(const char *)source; 2725 pArgs->target=target; 2726 pArgs->offsets=offsets; 2727 } 2728 2729 /* 2730 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2731 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2732 */ 2733 static UChar32 2734 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2735 UErrorCode *pErrorCode) { 2736 UConverter *cnv; 2737 const int32_t (*stateTable)[256]; 2738 const uint8_t *source, *sourceLimit; 2739 2740 int32_t entry; 2741 uint8_t action; 2742 2743 /* set up the local pointers */ 2744 cnv=pArgs->converter; 2745 source=(const uint8_t *)pArgs->source; 2746 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2747 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2748 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2749 } else { 2750 stateTable=cnv->sharedData->mbcs.stateTable; 2751 } 2752 2753 /* conversion loop */ 2754 while(source<sourceLimit) { 2755 entry=stateTable[0][*source++]; 2756 /* MBCS_ENTRY_IS_FINAL(entry) */ 2757 2758 /* write back the updated pointer early so that we can return directly */ 2759 pArgs->source=(const char *)source; 2760 2761 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2762 /* output BMP code point */ 2763 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2764 } 2765 2766 /* 2767 * An if-else-if chain provides more reliable performance for 2768 * the most common cases compared to a switch. 
2769 */ 2770 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2771 if( action==MBCS_STATE_VALID_DIRECT_20 || 2772 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2773 ) { 2774 /* output supplementary code point */ 2775 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2776 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2777 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2778 /* output BMP code point */ 2779 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2780 } 2781 } else if(action==MBCS_STATE_UNASSIGNED) { 2782 /* just fall through */ 2783 } else if(action==MBCS_STATE_ILLEGAL) { 2784 /* callback(illegal) */ 2785 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2786 } else { 2787 /* reserved, must never occur */ 2788 continue; 2789 } 2790 2791 if(U_FAILURE(*pErrorCode)) { 2792 /* callback(illegal) */ 2793 break; 2794 } else /* unassigned sequence */ { 2795 /* defer to the generic implementation */ 2796 pArgs->source=(const char *)source-1; 2797 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2798 } 2799 } 2800 2801 /* no output because of empty input or only state changes */ 2802 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2803 return 0xffff; 2804 } 2805 2806 /* 2807 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2808 * conversion without offset handling. 2809 * 2810 * When a character does not have a mapping to Unicode, then we return to the 2811 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2812 * handling. 2813 * We also defer to the generic code in other complicated cases and have them 2814 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2815 * 2816 * All normal mappings and errors are handled here. 
2817 */ 2818 static UChar32 2819 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2820 UErrorCode *pErrorCode) { 2821 UConverter *cnv; 2822 const uint8_t *source, *sourceLimit, *lastSource; 2823 2824 const int32_t (*stateTable)[256]; 2825 const uint16_t *unicodeCodeUnits; 2826 2827 uint32_t offset; 2828 uint8_t state; 2829 2830 int32_t entry; 2831 UChar32 c; 2832 uint8_t action; 2833 2834 /* use optimized function if possible */ 2835 cnv=pArgs->converter; 2836 2837 if(cnv->preToULength>0) { 2838 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2839 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2840 } 2841 2842 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2843 /* 2844 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2845 * with the rare case of a codepage that maps single surrogates 2846 * without adding the complexity to this already complicated function here. 2847 */ 2848 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2849 } else if(cnv->sharedData->mbcs.countStates==1) { 2850 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2851 } 2852 2853 /* set up the local pointers */ 2854 source=lastSource=(const uint8_t *)pArgs->source; 2855 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2856 2857 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2858 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2859 } else { 2860 stateTable=cnv->sharedData->mbcs.stateTable; 2861 } 2862 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2863 2864 /* get the converter state from UConverter */ 2865 offset=cnv->toUnicodeStatus; 2866 2867 /* 2868 * if we are in the SBCS state for a DBCS-only converter, 2869 * then load the DBCS state from the MBCS data 2870 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2871 */ 2872 if((state=(uint8_t)(cnv->mode))==0) { 2873 state=cnv->sharedData->mbcs.dbcsOnlyState; 2874 } 2875 2876 /* conversion loop */ 2877 c=U_SENTINEL; 2878 while(source<sourceLimit) { 
2879 entry=stateTable[state][*source++]; 2880 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2881 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2882 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2883 2884 /* optimization for 1/2-byte input and BMP output */ 2885 if( source<sourceLimit && 2886 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2887 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2888 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2889 ) { 2890 ++source; 2891 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2892 /* output BMP code point */ 2893 break; 2894 } 2895 } else { 2896 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2897 cnv->mode=state; 2898 2899 /* set the next state early so that we can reuse the entry variable */ 2900 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2901 2902 /* 2903 * An if-else-if chain provides more reliable performance for 2904 * the most common cases compared to a switch. 2905 */ 2906 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2907 if(action==MBCS_STATE_VALID_DIRECT_16) { 2908 /* output BMP code point */ 2909 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2910 break; 2911 } else if(action==MBCS_STATE_VALID_16) { 2912 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2913 c=unicodeCodeUnits[offset]; 2914 if(c<0xfffe) { 2915 /* output BMP code point */ 2916 break; 2917 } else if(c==0xfffe) { 2918 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2919 break; 2920 } 2921 } else { 2922 /* callback(illegal) */ 2923 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2924 } 2925 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2926 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2927 c=unicodeCodeUnits[offset++]; 2928 if(c<0xd800) { 2929 /* output BMP code point below 0xd800 */ 2930 break; 2931 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 2932 /* output roundtrip or fallback supplementary code point */ 2933 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 2934 break; 2935 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 2936 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2937 c=unicodeCodeUnits[offset]; 2938 break; 2939 } else if(c==0xffff) { 2940 /* callback(illegal) */ 2941 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2942 } 2943 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2944 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2945 ) { 2946 /* output supplementary code point */ 2947 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2948 break; 2949 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2950 /* 2951 * This serves as a state change without any output. 2952 * It is useful for reading simple stateful encodings, 2953 * for example using just Shift-In/Shift-Out codes. 2954 * The 21 unused bits may later be used for more sophisticated 2955 * state transitions. 
2956 */ 2957 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 2958 /* SI/SO are illegal for DBCS-only conversion */ 2959 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2960 2961 /* callback(illegal) */ 2962 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2963 } 2964 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2965 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2966 /* output BMP code point */ 2967 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2968 break; 2969 } 2970 } else if(action==MBCS_STATE_UNASSIGNED) { 2971 /* just fall through */ 2972 } else if(action==MBCS_STATE_ILLEGAL) { 2973 /* callback(illegal) */ 2974 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2975 } else { 2976 /* reserved (must never occur), or only state change */ 2977 offset=0; 2978 lastSource=source; 2979 continue; 2980 } 2981 2982 /* end of action codes: prepare for a new character */ 2983 offset=0; 2984 2985 if(U_FAILURE(*pErrorCode)) { 2986 /* callback(illegal) */ 2987 break; 2988 } else /* unassigned sequence */ { 2989 /* defer to the generic implementation */ 2990 cnv->toUnicodeStatus=0; 2991 cnv->mode=state; 2992 pArgs->source=(const char *)lastSource; 2993 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2994 } 2995 } 2996 } 2997 2998 if(c<0) { 2999 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3000 /* incomplete character byte sequence */ 3001 uint8_t *bytes=cnv->toUBytes; 3002 cnv->toULength=(int8_t)(source-lastSource); 3003 do { 3004 *bytes++=*lastSource++; 3005 } while(lastSource<source); 3006 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3007 } else if(U_FAILURE(*pErrorCode)) { 3008 /* callback(illegal) */ 3009 /* 3010 * Ticket 5691: consistent illegal sequences: 3011 * - We include at least the first byte in the illegal sequence. 3012 * - If any of the non-initial bytes could be the start of a character, 3013 * we stop the illegal sequence before the first one of those. 
3014 */ 3015 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3016 uint8_t *bytes=cnv->toUBytes; 3017 *bytes++=*lastSource++; /* first byte */ 3018 if(lastSource==source) { 3019 cnv->toULength=1; 3020 } else /* lastSource<source: multi-byte character */ { 3021 int8_t i; 3022 for(i=1; 3023 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3024 ++i 3025 ) { 3026 *bytes++=*lastSource++; 3027 } 3028 cnv->toULength=i; 3029 source=lastSource; 3030 } 3031 } else { 3032 /* no output because of empty input or only state changes */ 3033 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3034 } 3035 c=0xffff; 3036 } 3037 3038 /* set the converter state back into UConverter, ready for a new character */ 3039 cnv->toUnicodeStatus=0; 3040 cnv->mode=state; 3041 3042 /* write back the updated pointer */ 3043 pArgs->source=(const char *)source; 3044 return c; 3045 } 3046 3047 #if 0 3048 /* 3049 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3050 * Removal improves code coverage. 3051 */ 3052 /** 3053 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3054 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3055 * It does not handle conversion extensions (_extToU()). 3056 */ 3057 U_CFUNC UChar32 3058 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3059 uint8_t b, UBool useFallback) { 3060 int32_t entry; 3061 uint8_t action; 3062 3063 entry=sharedData->mbcs.stateTable[0][b]; 3064 /* MBCS_ENTRY_IS_FINAL(entry) */ 3065 3066 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3067 /* output BMP code point */ 3068 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3069 } 3070 3071 /* 3072 * An if-else-if chain provides more reliable performance for 3073 * the most common cases compared to a switch. 
3074 */ 3075 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3076 if(action==MBCS_STATE_VALID_DIRECT_20) { 3077 /* output supplementary code point */ 3078 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3079 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3080 if(!TO_U_USE_FALLBACK(useFallback)) { 3081 return 0xfffe; 3082 } 3083 /* output BMP code point */ 3084 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3085 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3086 if(!TO_U_USE_FALLBACK(useFallback)) { 3087 return 0xfffe; 3088 } 3089 /* output supplementary code point */ 3090 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3091 } else if(action==MBCS_STATE_UNASSIGNED) { 3092 return 0xfffe; 3093 } else if(action==MBCS_STATE_ILLEGAL) { 3094 return 0xffff; 3095 } else { 3096 /* reserved, must never occur */ 3097 return 0xffff; 3098 } 3099 } 3100 #endif 3101 3102 /* 3103 * This is a simple version of _MBCSGetNextUChar() that is used 3104 * by other converter implementations. 3105 * It only returns an "assigned" result if it consumes the entire input. 3106 * It does not use state from the converter, nor error codes. 3107 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3108 * It handles conversion extensions but not GB 18030. 3109 * 3110 * Return value: 3111 * U+fffe unassigned 3112 * U+ffff illegal 3113 * otherwise the Unicode code point 3114 */ 3115 U_CFUNC UChar32 3116 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3117 const char *source, int32_t length, 3118 UBool useFallback) { 3119 const int32_t (*stateTable)[256]; 3120 const uint16_t *unicodeCodeUnits; 3121 3122 uint32_t offset; 3123 uint8_t state, action; 3124 3125 UChar32 c; 3126 int32_t i, entry; 3127 3128 if(length<=0) { 3129 /* no input at all: "illegal" */ 3130 return 0xffff; 3131 } 3132 3133 #if 0 3134 /* 3135 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3136 * TODO In future releases, verify that this function is never called for SBCS 3137 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3138 * Removal improves code coverage. 3139 */ 3140 /* use optimized function if possible */ 3141 if(sharedData->mbcs.countStates==1) { 3142 if(length==1) { 3143 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3144 } else { 3145 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3146 } 3147 } 3148 #endif 3149 3150 /* set up the local pointers */ 3151 stateTable=sharedData->mbcs.stateTable; 3152 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3153 3154 /* converter state */ 3155 offset=0; 3156 state=sharedData->mbcs.dbcsOnlyState; 3157 3158 /* conversion loop */ 3159 for(i=0;;) { 3160 entry=stateTable[state][(uint8_t)source[i++]]; 3161 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3162 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3163 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3164 3165 if(i==length) { 3166 return 0xffff; /* truncated character */ 3167 } 3168 } else { 3169 /* 3170 * An if-else-if chain provides more reliable performance for 3171 * the most common cases compared to a switch. 
3172 */ 3173 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3174 if(action==MBCS_STATE_VALID_16) { 3175 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3176 c=unicodeCodeUnits[offset]; 3177 if(c!=0xfffe) { 3178 /* done */ 3179 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3180 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3181 /* else done with 0xfffe */ 3182 } 3183 break; 3184 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3185 /* output BMP code point */ 3186 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3187 break; 3188 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3189 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3190 c=unicodeCodeUnits[offset++]; 3191 if(c<0xd800) { 3192 /* output BMP code point below 0xd800 */ 3193 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3194 /* output roundtrip or fallback supplementary code point */ 3195 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3196 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3197 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3198 c=unicodeCodeUnits[offset]; 3199 } else if(c==0xffff) { 3200 return 0xffff; 3201 } else { 3202 c=0xfffe; 3203 } 3204 break; 3205 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3206 /* output supplementary code point */ 3207 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3208 break; 3209 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3210 if(!TO_U_USE_FALLBACK(useFallback)) { 3211 c=0xfffe; 3212 break; 3213 } 3214 /* output BMP code point */ 3215 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3216 break; 3217 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3218 if(!TO_U_USE_FALLBACK(useFallback)) { 3219 c=0xfffe; 3220 break; 3221 } 3222 /* output supplementary code point */ 3223 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3224 break; 3225 } else if(action==MBCS_STATE_UNASSIGNED) { 3226 c=0xfffe; 3227 break; 3228 } 3229 3230 /* 3231 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3232 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3233 */ 3234 return 0xffff; 3235 } 3236 } 3237 3238 if(i!=length) { 3239 /* illegal for this function: not all input consumed */ 3240 return 0xffff; 3241 } 3242 3243 if(c==0xfffe) { 3244 /* try an extension mapping */ 3245 const int32_t *cx=sharedData->mbcs.extIndexes; 3246 if(cx!=NULL) { 3247 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3248 } 3249 } 3250 3251 return c; 3252 } 3253 3254 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3255 3256 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3257 static void 3258 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3259 UErrorCode *pErrorCode) { 3260 UConverter *cnv; 3261 const UChar *source, *sourceLimit; 3262 uint8_t *target; 3263 int32_t targetCapacity; 3264 int32_t *offsets; 3265 3266 const uint16_t *table; 3267 const uint16_t *mbcsIndex; 3268 const uint8_t *bytes; 3269 3270 UChar32 c; 3271 3272 int32_t sourceIndex, nextSourceIndex; 3273 3274 uint32_t stage2Entry; 3275 uint32_t asciiRoundtrips; 3276 uint32_t value; 3277 uint8_t unicodeMask; 3278 3279 /* use optimized function if possible */ 3280 cnv=pArgs->converter; 3281 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3282 3283 /* set up the local pointers */ 3284 source=pArgs->source; 3285 sourceLimit=pArgs->sourceLimit; 3286 target=(uint8_t *)pArgs->target; 3287 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3288 offsets=pArgs->offsets; 3289 3290 table=cnv->sharedData->mbcs.fromUnicodeTable; 3291 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3292 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3293 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3294 } else { 3295 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3296 } 3297 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3298 3299 /* get the converter state from UConverter */ 3300 c=cnv->fromUChar32; 3301 3302 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 3303 sourceIndex= c==0 ? 0 : -1; 3304 nextSourceIndex=0; 3305 3306 /* conversion loop */ 3307 if(c!=0 && targetCapacity>0) { 3308 goto getTrail; 3309 } 3310 3311 while(source<sourceLimit) { 3312 /* 3313 * This following test is to see if available input would overflow the output. 3314 * It does not catch output of more than one byte that 3315 * overflows as a result of a multi-byte character or callback output 3316 * from the last source character. 3317 * Therefore, those situations also test for overflows and will 3318 * then break the loop, too. 3319 */ 3320 if(targetCapacity>0) { 3321 /* 3322 * Get a correct Unicode code point: 3323 * a single UChar for a BMP code point or 3324 * a matched surrogate pair for a "supplementary code point". 3325 */ 3326 c=*source++; 3327 ++nextSourceIndex; 3328 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3329 *target++=(uint8_t)c; 3330 if(offsets!=NULL) { 3331 *offsets++=sourceIndex; 3332 sourceIndex=nextSourceIndex; 3333 } 3334 --targetCapacity; 3335 c=0; 3336 continue; 3337 } 3338 /* 3339 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3340 * to avoid dealing with surrogates. 3341 * MBCS_FAST_MAX must be >=0xd7ff. 3342 */ 3343 if(c<=0xd7ff) { 3344 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3345 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3346 if(value==0) { 3347 goto unassigned; 3348 } 3349 /* output the value */ 3350 } else { 3351 /* 3352 * This also tests if the codepage maps single surrogates. 3353 * If it does, then surrogates are not paired but mapped separately. 3354 * Note that in this case unmatched surrogates are not detected. 
3355 */ 3356 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3357 if(U16_IS_SURROGATE_LEAD(c)) { 3358 getTrail: 3359 if(source<sourceLimit) { 3360 /* test the following code unit */ 3361 UChar trail=*source; 3362 if(U16_IS_TRAIL(trail)) { 3363 ++source; 3364 ++nextSourceIndex; 3365 c=U16_GET_SUPPLEMENTARY(c, trail); 3366 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3367 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3368 /* callback(unassigned) */ 3369 goto unassigned; 3370 } 3371 /* convert this supplementary code point */ 3372 /* exit this condition tree */ 3373 } else { 3374 /* this is an unmatched lead code unit (1st surrogate) */ 3375 /* callback(illegal) */ 3376 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3377 break; 3378 } 3379 } else { 3380 /* no more input */ 3381 break; 3382 } 3383 } else { 3384 /* this is an unmatched trail code unit (2nd surrogate) */ 3385 /* callback(illegal) */ 3386 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3387 break; 3388 } 3389 } 3390 3391 /* convert the Unicode code point in c into codepage bytes */ 3392 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3393 3394 /* get the bytes and the length for the output */ 3395 /* MBCS_OUTPUT_2 */ 3396 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3397 3398 /* is this code point assigned, or do we use fallbacks? */ 3399 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3400 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3401 ) { 3402 /* 3403 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3404 * There is no way with this data structure for fallback output 3405 * to be a zero byte. 
3406 */ 3407 3408 unassigned: 3409 /* try an extension mapping */ 3410 pArgs->source=source; 3411 c=_extFromU(cnv, cnv->sharedData, 3412 c, &source, sourceLimit, 3413 &target, target+targetCapacity, 3414 &offsets, sourceIndex, 3415 pArgs->flush, 3416 pErrorCode); 3417 nextSourceIndex+=(int32_t)(source-pArgs->source); 3418 3419 if(U_FAILURE(*pErrorCode)) { 3420 /* not mappable or buffer overflow */ 3421 break; 3422 } else { 3423 /* a mapping was written to the target, continue */ 3424 3425 /* recalculate the targetCapacity after an extension mapping */ 3426 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3427 3428 /* normal end of conversion: prepare for a new character */ 3429 sourceIndex=nextSourceIndex; 3430 continue; 3431 } 3432 } 3433 } 3434 3435 /* write the output character bytes from value and length */ 3436 /* from the first if in the loop we know that targetCapacity>0 */ 3437 if(value<=0xff) { 3438 /* this is easy because we know that there is enough space */ 3439 *target++=(uint8_t)value; 3440 if(offsets!=NULL) { 3441 *offsets++=sourceIndex; 3442 } 3443 --targetCapacity; 3444 } else /* length==2 */ { 3445 *target++=(uint8_t)(value>>8); 3446 if(2<=targetCapacity) { 3447 *target++=(uint8_t)value; 3448 if(offsets!=NULL) { 3449 *offsets++=sourceIndex; 3450 *offsets++=sourceIndex; 3451 } 3452 targetCapacity-=2; 3453 } else { 3454 if(offsets!=NULL) { 3455 *offsets++=sourceIndex; 3456 } 3457 cnv->charErrorBuffer[0]=(char)value; 3458 cnv->charErrorBufferLength=1; 3459 3460 /* target overflow */ 3461 targetCapacity=0; 3462 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3463 c=0; 3464 break; 3465 } 3466 } 3467 3468 /* normal end of conversion: prepare for a new character */ 3469 c=0; 3470 sourceIndex=nextSourceIndex; 3471 continue; 3472 } else { 3473 /* target is full */ 3474 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3475 break; 3476 } 3477 } 3478 3479 /* set the converter state back into UConverter */ 3480 cnv->fromUChar32=c; 3481 3482 /* write back the updated 
pointers */ 3483 pArgs->source=source; 3484 pArgs->target=(char *)target; 3485 pArgs->offsets=offsets; 3486 } 3487 3488 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3489 static void 3490 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3491 UErrorCode *pErrorCode) { 3492 UConverter *cnv; 3493 const UChar *source, *sourceLimit; 3494 uint8_t *target; 3495 int32_t targetCapacity; 3496 int32_t *offsets; 3497 3498 const uint16_t *table; 3499 const uint16_t *results; 3500 3501 UChar32 c; 3502 3503 int32_t sourceIndex, nextSourceIndex; 3504 3505 uint16_t value, minValue; 3506 UBool hasSupplementary; 3507 3508 /* set up the local pointers */ 3509 cnv=pArgs->converter; 3510 source=pArgs->source; 3511 sourceLimit=pArgs->sourceLimit; 3512 target=(uint8_t *)pArgs->target; 3513 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3514 offsets=pArgs->offsets; 3515 3516 table=cnv->sharedData->mbcs.fromUnicodeTable; 3517 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3518 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3519 } else { 3520 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3521 } 3522 3523 if(cnv->useFallback) { 3524 /* use all roundtrip and fallback results */ 3525 minValue=0x800; 3526 } else { 3527 /* use only roundtrips and fallbacks from private-use characters */ 3528 minValue=0xc00; 3529 } 3530 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3531 3532 /* get the converter state from UConverter */ 3533 c=cnv->fromUChar32; 3534 3535 /* sourceIndex=-1 if the current character began in the previous buffer */ 3536 sourceIndex= c==0 ? 0 : -1; 3537 nextSourceIndex=0; 3538 3539 /* conversion loop */ 3540 if(c!=0 && targetCapacity>0) { 3541 goto getTrail; 3542 } 3543 3544 while(source<sourceLimit) { 3545 /* 3546 * This following test is to see if available input would overflow the output. 
3547 * It does not catch output of more than one byte that 3548 * overflows as a result of a multi-byte character or callback output 3549 * from the last source character. 3550 * Therefore, those situations also test for overflows and will 3551 * then break the loop, too. 3552 */ 3553 if(targetCapacity>0) { 3554 /* 3555 * Get a correct Unicode code point: 3556 * a single UChar for a BMP code point or 3557 * a matched surrogate pair for a "supplementary code point". 3558 */ 3559 c=*source++; 3560 ++nextSourceIndex; 3561 if(U16_IS_SURROGATE(c)) { 3562 if(U16_IS_SURROGATE_LEAD(c)) { 3563 getTrail: 3564 if(source<sourceLimit) { 3565 /* test the following code unit */ 3566 UChar trail=*source; 3567 if(U16_IS_TRAIL(trail)) { 3568 ++source; 3569 ++nextSourceIndex; 3570 c=U16_GET_SUPPLEMENTARY(c, trail); 3571 if(!hasSupplementary) { 3572 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3573 /* callback(unassigned) */ 3574 goto unassigned; 3575 } 3576 /* convert this supplementary code point */ 3577 /* exit this condition tree */ 3578 } else { 3579 /* this is an unmatched lead code unit (1st surrogate) */ 3580 /* callback(illegal) */ 3581 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3582 break; 3583 } 3584 } else { 3585 /* no more input */ 3586 break; 3587 } 3588 } else { 3589 /* this is an unmatched trail code unit (2nd surrogate) */ 3590 /* callback(illegal) */ 3591 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3592 break; 3593 } 3594 } 3595 3596 /* convert the Unicode code point in c into codepage bytes */ 3597 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3598 3599 /* is this code point assigned, or do we use fallbacks? 
*/ 3600 if(value>=minValue) { 3601 /* assigned, write the output character bytes from value and length */ 3602 /* length==1 */ 3603 /* this is easy because we know that there is enough space */ 3604 *target++=(uint8_t)value; 3605 if(offsets!=NULL) { 3606 *offsets++=sourceIndex; 3607 } 3608 --targetCapacity; 3609 3610 /* normal end of conversion: prepare for a new character */ 3611 c=0; 3612 sourceIndex=nextSourceIndex; 3613 } else { /* unassigned */ 3614 unassigned: 3615 /* try an extension mapping */ 3616 pArgs->source=source; 3617 c=_extFromU(cnv, cnv->sharedData, 3618 c, &source, sourceLimit, 3619 &target, target+targetCapacity, 3620 &offsets, sourceIndex, 3621 pArgs->flush, 3622 pErrorCode); 3623 nextSourceIndex+=(int32_t)(source-pArgs->source); 3624 3625 if(U_FAILURE(*pErrorCode)) { 3626 /* not mappable or buffer overflow */ 3627 break; 3628 } else { 3629 /* a mapping was written to the target, continue */ 3630 3631 /* recalculate the targetCapacity after an extension mapping */ 3632 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3633 3634 /* normal end of conversion: prepare for a new character */ 3635 sourceIndex=nextSourceIndex; 3636 } 3637 } 3638 } else { 3639 /* target is full */ 3640 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3641 break; 3642 } 3643 } 3644 3645 /* set the converter state back into UConverter */ 3646 cnv->fromUChar32=c; 3647 3648 /* write back the updated pointers */ 3649 pArgs->source=source; 3650 pArgs->target=(char *)target; 3651 pArgs->offsets=offsets; 3652 } 3653 3654 /* 3655 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3656 * that map only to and from the BMP. 3657 * In addition to single-byte/state optimizations, the offset calculations 3658 * become much easier. 3659 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3660 * but measurements have shown that this diminishes performance 3661 * in more cases than it improves it. 
3662 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3663 * for various MBCS and SBCS optimizations. 3664 */ 3665 static void 3666 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3667 UErrorCode *pErrorCode) { 3668 UConverter *cnv; 3669 const UChar *source, *sourceLimit, *lastSource; 3670 uint8_t *target; 3671 int32_t targetCapacity, length; 3672 int32_t *offsets; 3673 3674 const uint16_t *table; 3675 const uint16_t *results; 3676 3677 UChar32 c; 3678 3679 int32_t sourceIndex; 3680 3681 uint32_t asciiRoundtrips; 3682 uint16_t value, minValue; 3683 3684 /* set up the local pointers */ 3685 cnv=pArgs->converter; 3686 source=pArgs->source; 3687 sourceLimit=pArgs->sourceLimit; 3688 target=(uint8_t *)pArgs->target; 3689 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3690 offsets=pArgs->offsets; 3691 3692 table=cnv->sharedData->mbcs.fromUnicodeTable; 3693 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3694 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3695 } else { 3696 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3697 } 3698 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3699 3700 if(cnv->useFallback) { 3701 /* use all roundtrip and fallback results */ 3702 minValue=0x800; 3703 } else { 3704 /* use only roundtrips and fallbacks from private-use characters */ 3705 minValue=0xc00; 3706 } 3707 3708 /* get the converter state from UConverter */ 3709 c=cnv->fromUChar32; 3710 3711 /* sourceIndex=-1 if the current character began in the previous buffer */ 3712 sourceIndex= c==0 ? 
0 : -1; 3713 lastSource=source; 3714 3715 /* 3716 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3717 * for the minimum of the sourceLength and targetCapacity 3718 */ 3719 length=(int32_t)(sourceLimit-source); 3720 if(length<targetCapacity) { 3721 targetCapacity=length; 3722 } 3723 3724 /* conversion loop */ 3725 if(c!=0 && targetCapacity>0) { 3726 goto getTrail; 3727 } 3728 3729 #if MBCS_UNROLL_SINGLE_FROM_BMP 3730 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3731 /* unroll the loop with the most common case */ 3732 unrolled: 3733 if(targetCapacity>=4) { 3734 int32_t count, loops; 3735 uint16_t andedValues; 3736 3737 loops=count=targetCapacity>>2; 3738 do { 3739 c=*source++; 3740 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3741 *target++=(uint8_t)value; 3742 c=*source++; 3743 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3744 *target++=(uint8_t)value; 3745 c=*source++; 3746 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3747 *target++=(uint8_t)value; 3748 c=*source++; 3749 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3750 *target++=(uint8_t)value; 3751 3752 /* were all 4 entries really valid? */ 3753 if(andedValues<minValue) { 3754 /* no, return to the first of these 4 */ 3755 source-=4; 3756 target-=4; 3757 break; 3758 } 3759 } while(--count>0); 3760 count=loops-count; 3761 targetCapacity-=4*count; 3762 3763 if(offsets!=NULL) { 3764 lastSource+=4*count; 3765 while(count>0) { 3766 *offsets++=sourceIndex++; 3767 *offsets++=sourceIndex++; 3768 *offsets++=sourceIndex++; 3769 *offsets++=sourceIndex++; 3770 --count; 3771 } 3772 } 3773 3774 c=0; 3775 } 3776 #endif 3777 3778 while(targetCapacity>0) { 3779 /* 3780 * Get a correct Unicode code point: 3781 * a single UChar for a BMP code point or 3782 * a matched surrogate pair for a "supplementary code point". 
3783 */ 3784 c=*source++; 3785 /* 3786 * Do not immediately check for single surrogates: 3787 * Assume that they are unassigned and check for them in that case. 3788 * This speeds up the conversion of assigned characters. 3789 */ 3790 /* convert the Unicode code point in c into codepage bytes */ 3791 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3792 *target++=(uint8_t)c; 3793 --targetCapacity; 3794 c=0; 3795 continue; 3796 } 3797 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3798 /* is this code point assigned, or do we use fallbacks? */ 3799 if(value>=minValue) { 3800 /* assigned, write the output character bytes from value and length */ 3801 /* length==1 */ 3802 /* this is easy because we know that there is enough space */ 3803 *target++=(uint8_t)value; 3804 --targetCapacity; 3805 3806 /* normal end of conversion: prepare for a new character */ 3807 c=0; 3808 continue; 3809 } else if(!U16_IS_SURROGATE(c)) { 3810 /* normal, unassigned BMP character */ 3811 } else if(U16_IS_SURROGATE_LEAD(c)) { 3812 getTrail: 3813 if(source<sourceLimit) { 3814 /* test the following code unit */ 3815 UChar trail=*source; 3816 if(U16_IS_TRAIL(trail)) { 3817 ++source; 3818 c=U16_GET_SUPPLEMENTARY(c, trail); 3819 /* this codepage does not map supplementary code points */ 3820 /* callback(unassigned) */ 3821 } else { 3822 /* this is an unmatched lead code unit (1st surrogate) */ 3823 /* callback(illegal) */ 3824 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3825 break; 3826 } 3827 } else { 3828 /* no more input */ 3829 if (pArgs->flush) { 3830 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3831 } 3832 break; 3833 } 3834 } else { 3835 /* this is an unmatched trail code unit (2nd surrogate) */ 3836 /* callback(illegal) */ 3837 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3838 break; 3839 } 3840 3841 /* c does not have a mapping */ 3842 3843 /* get the number of code units for c to correctly advance sourceIndex */ 3844 length=U16_LENGTH(c); 3845 3846 /* set offsets since the start or the last extension */ 
3847 if(offsets!=NULL) { 3848 int32_t count=(int32_t)(source-lastSource); 3849 3850 /* do not set the offset for this character */ 3851 count-=length; 3852 3853 while(count>0) { 3854 *offsets++=sourceIndex++; 3855 --count; 3856 } 3857 /* offsets and sourceIndex are now set for the current character */ 3858 } 3859 3860 /* try an extension mapping */ 3861 lastSource=source; 3862 c=_extFromU(cnv, cnv->sharedData, 3863 c, &source, sourceLimit, 3864 &target, (const uint8_t *)(pArgs->targetLimit), 3865 &offsets, sourceIndex, 3866 pArgs->flush, 3867 pErrorCode); 3868 sourceIndex+=length+(int32_t)(source-lastSource); 3869 lastSource=source; 3870 3871 if(U_FAILURE(*pErrorCode)) { 3872 /* not mappable or buffer overflow */ 3873 break; 3874 } else { 3875 /* a mapping was written to the target, continue */ 3876 3877 /* recalculate the targetCapacity after an extension mapping */ 3878 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3879 length=(int32_t)(sourceLimit-source); 3880 if(length<targetCapacity) { 3881 targetCapacity=length; 3882 } 3883 } 3884 3885 #if MBCS_UNROLL_SINGLE_FROM_BMP 3886 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3887 goto unrolled; 3888 #endif 3889 } 3890 3891 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 3892 /* target is full */ 3893 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3894 } 3895 3896 /* set offsets since the start or the last callback */ 3897 if(offsets!=NULL) { 3898 size_t count=source-lastSource; 3899 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 3900 /* 3901 Caller gave us a partial supplementary character, 3902 which this function couldn't convert in any case. 3903 The callback will handle the offset. 
3904 */ 3905 count--; 3906 } 3907 while(count>0) { 3908 *offsets++=sourceIndex++; 3909 --count; 3910 } 3911 } 3912 3913 /* set the converter state back into UConverter */ 3914 cnv->fromUChar32=c; 3915 3916 /* write back the updated pointers */ 3917 pArgs->source=source; 3918 pArgs->target=(char *)target; 3919 pArgs->offsets=offsets; 3920 } 3921 3922 /* Begin Android-added */ 3923 #undef si_value 3924 #undef so_value 3925 /* End Android-added */ 3926 3927 U_CFUNC void 3928 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3929 UErrorCode *pErrorCode) { 3930 UConverter *cnv; 3931 const UChar *source, *sourceLimit; 3932 uint8_t *target; 3933 int32_t targetCapacity; 3934 int32_t *offsets; 3935 3936 const uint16_t *table; 3937 const uint16_t *mbcsIndex; 3938 const uint8_t *p, *bytes; 3939 uint8_t outputType; 3940 3941 UChar32 c; 3942 3943 int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 3944 3945 uint32_t stage2Entry; 3946 uint32_t asciiRoundtrips; 3947 uint32_t value; 3948 uint8_t si_value[2] = {0, 0}; 3949 uint8_t so_value[2] = {0, 0}; 3950 uint8_t si_value_length, so_value_length; 3951 int32_t length = 0, prevLength; 3952 uint8_t unicodeMask; 3953 3954 cnv=pArgs->converter; 3955 3956 if(cnv->preFromUFirstCP>=0) { 3957 /* 3958 * pass sourceIndex=-1 because we continue from an earlier buffer 3959 * in the future, this may change with continuous offsets 3960 */ 3961 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 3962 3963 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 3964 return; 3965 } 3966 } 3967 3968 /* use optimized function if possible */ 3969 outputType=cnv->sharedData->mbcs.outputType; 3970 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3971 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3972 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3973 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 3974 } else { 3975 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 3976 } 3977 return; 3978 } else 
if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) { 3979 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 3980 return; 3981 } 3982 3983 /* set up the local pointers */ 3984 source=pArgs->source; 3985 sourceLimit=pArgs->sourceLimit; 3986 target=(uint8_t *)pArgs->target; 3987 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3988 offsets=pArgs->offsets; 3989 3990 table=cnv->sharedData->mbcs.fromUnicodeTable; 3991 if(cnv->sharedData->mbcs.utf8Friendly) { 3992 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3993 } else { 3994 mbcsIndex=NULL; 3995 } 3996 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3997 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3998 } else { 3999 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 4000 } 4001 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4002 4003 /* get the converter state from UConverter */ 4004 c=cnv->fromUChar32; 4005 4006 if(outputType==MBCS_OUTPUT_2_SISO) { 4007 prevLength=cnv->fromUnicodeStatus; 4008 if(prevLength==0) { 4009 /* set the real value */ 4010 prevLength=1; 4011 } 4012 } else { 4013 /* prevent fromUnicodeStatus from being set to something non-0 */ 4014 prevLength=0; 4015 } 4016 4017 /* sourceIndex=-1 if the current character began in the previous buffer */ 4018 prevSourceIndex=-1; 4019 sourceIndex= c==0 ? 0 : -1; 4020 nextSourceIndex=0; 4021 4022 /* Get the SI/SO character for the converter */ 4023 si_value_length = getSISOBytes(SI, cnv->options, si_value); 4024 so_value_length = getSISOBytes(SO, cnv->options, so_value); 4025 4026 /* conversion loop */ 4027 /* 4028 * This is another piece of ugly code: 4029 * A goto into the loop if the converter state contains a first surrogate 4030 * from the previous function call. 4031 * It saves me to check in each loop iteration a check of if(c==0) 4032 * and duplicating the trail-surrogate-handling code in the else 4033 * branch of that check. 
4034 * I could not find any other way to get around this other than 4035 * using a function call for the conversion and callback, which would 4036 * be even more inefficient. 4037 * 4038 * Markus Scherer 2000-jul-19 4039 */ 4040 if(c!=0 && targetCapacity>0) { 4041 goto getTrail; 4042 } 4043 4044 while(source<sourceLimit) { 4045 /* 4046 * This following test is to see if available input would overflow the output. 4047 * It does not catch output of more than one byte that 4048 * overflows as a result of a multi-byte character or callback output 4049 * from the last source character. 4050 * Therefore, those situations also test for overflows and will 4051 * then break the loop, too. 4052 */ 4053 if(targetCapacity>0) { 4054 /* 4055 * Get a correct Unicode code point: 4056 * a single UChar for a BMP code point or 4057 * a matched surrogate pair for a "supplementary code point". 4058 */ 4059 c=*source++; 4060 ++nextSourceIndex; 4061 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4062 *target++=(uint8_t)c; 4063 if(offsets!=NULL) { 4064 *offsets++=sourceIndex; 4065 prevSourceIndex=sourceIndex; 4066 sourceIndex=nextSourceIndex; 4067 } 4068 --targetCapacity; 4069 c=0; 4070 continue; 4071 } 4072 /* 4073 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4074 * to avoid dealing with surrogates. 4075 * MBCS_FAST_MAX must be >=0xd7ff. 4076 */ 4077 if(c<=0xd7ff && mbcsIndex!=NULL) { 4078 value=mbcsIndex[c>>6]; 4079 4080 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4081 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4082 switch(outputType) { 4083 case MBCS_OUTPUT_2: 4084 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4085 if(value<=0xff) { 4086 if(value==0) { 4087 goto unassigned; 4088 } else { 4089 length=1; 4090 } 4091 } else { 4092 length=2; 4093 } 4094 break; 4095 case MBCS_OUTPUT_2_SISO: 4096 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4097 /* 4098 * Save the old state in the converter object 4099 * right here, then change the local prevLength state variable if necessary. 4100 * Then, if this character turns out to be unassigned or a fallback that 4101 * is not taken, the callback code must not save the new state in the converter 4102 * because the new state is for a character that is not output. 4103 * However, the callback must still restore the state from the converter 4104 * in case the callback function changed it for its output. 4105 */ 4106 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4107 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4108 if(value<=0xff) { 4109 if(value==0) { 4110 goto unassigned; 4111 } else if(prevLength<=1) { 4112 length=1; 4113 } else { 4114 /* change from double-byte mode to single-byte */ 4115 if (si_value_length == 1) { 4116 value|=(uint32_t)si_value[0]<<8; 4117 length = 2; 4118 } else if (si_value_length == 2) { 4119 value|=(uint32_t)si_value[1]<<8; 4120 value|=(uint32_t)si_value[0]<<16; 4121 length = 3; 4122 } 4123 prevLength=1; 4124 } 4125 } else { 4126 if(prevLength==2) { 4127 length=2; 4128 } else { 4129 /* change from single-byte mode to double-byte */ 4130 if (so_value_length == 1) { 4131 value|=(uint32_t)so_value[0]<<16; 4132 length = 3; 4133 } else if (so_value_length == 2) { 4134 value|=(uint32_t)so_value[1]<<16; 4135 value|=(uint32_t)so_value[0]<<24; 4136 length = 4; 4137 } 4138 prevLength=2; 4139 } 4140 } 4141 break; 4142 case MBCS_OUTPUT_DBCS_ONLY: 4143 /* table with single-byte results, but only DBCS mappings used */ 4144 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4145 if(value<=0xff) { 4146 
/* no mapping or SBCS result, not taken for DBCS-only */ 4147 goto unassigned; 4148 } else { 4149 length=2; 4150 } 4151 break; 4152 case MBCS_OUTPUT_3: 4153 p=bytes+(value+(c&0x3f))*3; 4154 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4155 if(value<=0xff) { 4156 if(value==0) { 4157 goto unassigned; 4158 } else { 4159 length=1; 4160 } 4161 } else if(value<=0xffff) { 4162 length=2; 4163 } else { 4164 length=3; 4165 } 4166 break; 4167 case MBCS_OUTPUT_4: 4168 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4169 if(value<=0xff) { 4170 if(value==0) { 4171 goto unassigned; 4172 } else { 4173 length=1; 4174 } 4175 } else if(value<=0xffff) { 4176 length=2; 4177 } else if(value<=0xffffff) { 4178 length=3; 4179 } else { 4180 length=4; 4181 } 4182 break; 4183 case MBCS_OUTPUT_3_EUC: 4184 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4185 /* EUC 16-bit fixed-length representation */ 4186 if(value<=0xff) { 4187 if(value==0) { 4188 goto unassigned; 4189 } else { 4190 length=1; 4191 } 4192 } else if((value&0x8000)==0) { 4193 value|=0x8e8000; 4194 length=3; 4195 } else if((value&0x80)==0) { 4196 value|=0x8f0080; 4197 length=3; 4198 } else { 4199 length=2; 4200 } 4201 break; 4202 case MBCS_OUTPUT_4_EUC: 4203 p=bytes+(value+(c&0x3f))*3; 4204 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4205 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4206 if(value<=0xff) { 4207 if(value==0) { 4208 goto unassigned; 4209 } else { 4210 length=1; 4211 } 4212 } else if(value<=0xffff) { 4213 length=2; 4214 } else if((value&0x800000)==0) { 4215 value|=0x8e800000; 4216 length=4; 4217 } else if((value&0x8000)==0) { 4218 value|=0x8f008000; 4219 length=4; 4220 } else { 4221 length=3; 4222 } 4223 break; 4224 default: 4225 /* must not occur */ 4226 /* 4227 * To avoid compiler warnings that value & length may be 4228 * used without having been initialized, we set them here. 4229 * In reality, this is unreachable code. 
4230 * Not having a default branch also causes warnings with 4231 * some compilers. 4232 */ 4233 value=0; 4234 length=0; 4235 break; 4236 } 4237 /* output the value */ 4238 } else { 4239 /* 4240 * This also tests if the codepage maps single surrogates. 4241 * If it does, then surrogates are not paired but mapped separately. 4242 * Note that in this case unmatched surrogates are not detected. 4243 */ 4244 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4245 if(U16_IS_SURROGATE_LEAD(c)) { 4246 getTrail: 4247 if(source<sourceLimit) { 4248 /* test the following code unit */ 4249 UChar trail=*source; 4250 if(U16_IS_TRAIL(trail)) { 4251 ++source; 4252 ++nextSourceIndex; 4253 c=U16_GET_SUPPLEMENTARY(c, trail); 4254 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4255 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4256 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4257 /* callback(unassigned) */ 4258 goto unassigned; 4259 } 4260 /* convert this supplementary code point */ 4261 /* exit this condition tree */ 4262 } else { 4263 /* this is an unmatched lead code unit (1st surrogate) */ 4264 /* callback(illegal) */ 4265 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4266 break; 4267 } 4268 } else { 4269 /* no more input */ 4270 break; 4271 } 4272 } else { 4273 /* this is an unmatched trail code unit (2nd surrogate) */ 4274 /* callback(illegal) */ 4275 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4276 break; 4277 } 4278 } 4279 4280 /* convert the Unicode code point in c into codepage bytes */ 4281 4282 /* 4283 * The basic lookup is a triple-stage compact array (trie) lookup. 4284 * For details see the beginning of this file. 4285 * 4286 * Single-byte codepages are handled with a different data structure 4287 * by _MBCSSingle... functions. 4288 * 4289 * The result consists of a 32-bit value from stage 2 and 4290 * a pointer to as many bytes as are stored per character. 4291 * The pointer points to the character's bytes in stage 3. 
4292 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4293 * for that pointer, while bits 31..16 are flags for which of 4294 * the 16 characters in the block are roundtrip-assigned. 4295 * 4296 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4297 * respectively as uint32_t, in the platform encoding. 4298 * For 3-byte codepages, the bytes are always stored in big-endian order. 4299 * 4300 * For EUC encodings that use only either 0x8e or 0x8f as the first 4301 * byte of their longest byte sequences, the first two bytes in 4302 * this third stage indicate with their 7th bits whether these bytes 4303 * are to be written directly or actually need to be preceeded by 4304 * one of the two Single-Shift codes. With this, the third stage 4305 * stores one byte fewer per character than the actual maximum length of 4306 * EUC byte sequences. 4307 * 4308 * Other than that, leading zero bytes are removed and the other 4309 * bytes output. A single zero byte may be output if the "assigned" 4310 * bit in stage 2 was on. 4311 * The data structure does not support zero byte output as a fallback, 4312 * and also does not allow output of leading zeros. 4313 */ 4314 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4315 4316 /* get the bytes and the length for the output */ 4317 switch(outputType) { 4318 case MBCS_OUTPUT_2: 4319 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4320 if(value<=0xff) { 4321 length=1; 4322 } else { 4323 length=2; 4324 } 4325 break; 4326 case MBCS_OUTPUT_2_SISO: 4327 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4328 /* 4329 * Save the old state in the converter object 4330 * right here, then change the local prevLength state variable if necessary. 4331 * Then, if this character turns out to be unassigned or a fallback that 4332 * is not taken, the callback code must not save the new state in the converter 4333 * because the new state is for a character that is not output. 
4334 * However, the callback must still restore the state from the converter 4335 * in case the callback function changed it for its output. 4336 */ 4337 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4338 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4339 if(value<=0xff) { 4340 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4341 /* no mapping, leave value==0 */ 4342 length=0; 4343 } else if(prevLength<=1) { 4344 length=1; 4345 } else { 4346 /* change from double-byte mode to single-byte */ 4347 if (si_value_length == 1) { 4348 value|=(uint32_t)si_value[0]<<8; 4349 length = 2; 4350 } else if (si_value_length == 2) { 4351 value|=(uint32_t)si_value[1]<<8; 4352 value|=(uint32_t)si_value[0]<<16; 4353 length = 3; 4354 } 4355 prevLength=1; 4356 } 4357 } else { 4358 if(prevLength==2) { 4359 length=2; 4360 } else { 4361 /* change from single-byte mode to double-byte */ 4362 if (so_value_length == 1) { 4363 value|=(uint32_t)so_value[0]<<16; 4364 length = 3; 4365 } else if (so_value_length == 2) { 4366 value|=(uint32_t)so_value[1]<<16; 4367 value|=(uint32_t)so_value[0]<<24; 4368 length = 4; 4369 } 4370 prevLength=2; 4371 } 4372 } 4373 break; 4374 case MBCS_OUTPUT_DBCS_ONLY: 4375 /* table with single-byte results, but only DBCS mappings used */ 4376 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4377 if(value<=0xff) { 4378 /* no mapping or SBCS result, not taken for DBCS-only */ 4379 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4380 length=0; 4381 } else { 4382 length=2; 4383 } 4384 break; 4385 case MBCS_OUTPUT_3: 4386 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4387 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4388 if(value<=0xff) { 4389 length=1; 4390 } else if(value<=0xffff) { 4391 length=2; 4392 } else { 4393 length=3; 4394 } 4395 break; 4396 case MBCS_OUTPUT_4: 4397 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4398 if(value<=0xff) { 4399 length=1; 4400 } else if(value<=0xffff) 
{ 4401 length=2; 4402 } else if(value<=0xffffff) { 4403 length=3; 4404 } else { 4405 length=4; 4406 } 4407 break; 4408 case MBCS_OUTPUT_3_EUC: 4409 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4410 /* EUC 16-bit fixed-length representation */ 4411 if(value<=0xff) { 4412 length=1; 4413 } else if((value&0x8000)==0) { 4414 value|=0x8e8000; 4415 length=3; 4416 } else if((value&0x80)==0) { 4417 value|=0x8f0080; 4418 length=3; 4419 } else { 4420 length=2; 4421 } 4422 break; 4423 case MBCS_OUTPUT_4_EUC: 4424 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4425 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4426 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4427 if(value<=0xff) { 4428 length=1; 4429 } else if(value<=0xffff) { 4430 length=2; 4431 } else if((value&0x800000)==0) { 4432 value|=0x8e800000; 4433 length=4; 4434 } else if((value&0x8000)==0) { 4435 value|=0x8f008000; 4436 length=4; 4437 } else { 4438 length=3; 4439 } 4440 break; 4441 default: 4442 /* must not occur */ 4443 /* 4444 * To avoid compiler warnings that value & length may be 4445 * used without having been initialized, we set them here. 4446 * In reality, this is unreachable code. 4447 * Not having a default branch also causes warnings with 4448 * some compilers. 4449 */ 4450 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4451 length=0; 4452 break; 4453 } 4454 4455 /* is this code point assigned, or do we use fallbacks? */ 4456 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4457 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4458 ) { 4459 /* 4460 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4461 * There is no way with this data structure for fallback output 4462 * to be a zero byte. 
4463 */ 4464 4465 unassigned: 4466 /* try an extension mapping */ 4467 pArgs->source=source; 4468 c=_extFromU(cnv, cnv->sharedData, 4469 c, &source, sourceLimit, 4470 &target, target+targetCapacity, 4471 &offsets, sourceIndex, 4472 pArgs->flush, 4473 pErrorCode); 4474 nextSourceIndex+=(int32_t)(source-pArgs->source); 4475 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4476 4477 if(U_FAILURE(*pErrorCode)) { 4478 /* not mappable or buffer overflow */ 4479 break; 4480 } else { 4481 /* a mapping was written to the target, continue */ 4482 4483 /* recalculate the targetCapacity after an extension mapping */ 4484 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4485 4486 /* normal end of conversion: prepare for a new character */ 4487 if(offsets!=NULL) { 4488 prevSourceIndex=sourceIndex; 4489 sourceIndex=nextSourceIndex; 4490 } 4491 continue; 4492 } 4493 } 4494 } 4495 4496 /* write the output character bytes from value and length */ 4497 /* from the first if in the loop we know that targetCapacity>0 */ 4498 if(length<=targetCapacity) { 4499 if(offsets==NULL) { 4500 switch(length) { 4501 /* each branch falls through to the next one */ 4502 case 4: 4503 *target++=(uint8_t)(value>>24); 4504 case 3: /*fall through*/ 4505 *target++=(uint8_t)(value>>16); 4506 case 2: /*fall through*/ 4507 *target++=(uint8_t)(value>>8); 4508 case 1: /*fall through*/ 4509 *target++=(uint8_t)value; 4510 default: 4511 /* will never occur */ 4512 break; 4513 } 4514 } else { 4515 switch(length) { 4516 /* each branch falls through to the next one */ 4517 case 4: 4518 *target++=(uint8_t)(value>>24); 4519 *offsets++=sourceIndex; 4520 case 3: /*fall through*/ 4521 *target++=(uint8_t)(value>>16); 4522 *offsets++=sourceIndex; 4523 case 2: /*fall through*/ 4524 *target++=(uint8_t)(value>>8); 4525 *offsets++=sourceIndex; 4526 case 1: /*fall through*/ 4527 *target++=(uint8_t)value; 4528 *offsets++=sourceIndex; 4529 default: 4530 /* will never occur */ 4531 break; 4532 } 4533 } 4534 
targetCapacity-=length; 4535 } else { 4536 uint8_t *charErrorBuffer; 4537 4538 /* 4539 * We actually do this backwards here: 4540 * In order to save an intermediate variable, we output 4541 * first to the overflow buffer what does not fit into the 4542 * regular target. 4543 */ 4544 /* we know that 1<=targetCapacity<length<=4 */ 4545 length-=targetCapacity; 4546 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4547 switch(length) { 4548 /* each branch falls through to the next one */ 4549 case 3: 4550 *charErrorBuffer++=(uint8_t)(value>>16); 4551 case 2: /*fall through*/ 4552 *charErrorBuffer++=(uint8_t)(value>>8); 4553 case 1: /*fall through*/ 4554 *charErrorBuffer=(uint8_t)value; 4555 default: 4556 /* will never occur */ 4557 break; 4558 } 4559 cnv->charErrorBufferLength=(int8_t)length; 4560 4561 /* now output what fits into the regular target */ 4562 value>>=8*length; /* length was reduced by targetCapacity */ 4563 switch(targetCapacity) { 4564 /* each branch falls through to the next one */ 4565 case 3: 4566 *target++=(uint8_t)(value>>16); 4567 if(offsets!=NULL) { 4568 *offsets++=sourceIndex; 4569 } 4570 case 2: /*fall through*/ 4571 *target++=(uint8_t)(value>>8); 4572 if(offsets!=NULL) { 4573 *offsets++=sourceIndex; 4574 } 4575 case 1: /*fall through*/ 4576 *target++=(uint8_t)value; 4577 if(offsets!=NULL) { 4578 *offsets++=sourceIndex; 4579 } 4580 default: 4581 /* will never occur */ 4582 break; 4583 } 4584 4585 /* target overflow */ 4586 targetCapacity=0; 4587 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4588 c=0; 4589 break; 4590 } 4591 4592 /* normal end of conversion: prepare for a new character */ 4593 c=0; 4594 if(offsets!=NULL) { 4595 prevSourceIndex=sourceIndex; 4596 sourceIndex=nextSourceIndex; 4597 } 4598 continue; 4599 } else { 4600 /* target is full */ 4601 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4602 break; 4603 } 4604 } 4605 4606 /* 4607 * the end of the input stream and detection of truncated input 4608 * are handled by the framework, but for 
EBCDIC_STATEFUL conversion 4609 * we need to emit an SI at the very end 4610 * 4611 * conditions: 4612 * successful 4613 * EBCDIC_STATEFUL in DBCS mode 4614 * end of input and no truncated input 4615 */ 4616 if( U_SUCCESS(*pErrorCode) && 4617 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4618 pArgs->flush && source>=sourceLimit && c==0 4619 ) { 4620 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4621 if(targetCapacity>0) { 4622 *target++=(uint8_t)si_value[0]; 4623 if (si_value_length == 2) { 4624 if (targetCapacity<2) { 4625 cnv->charErrorBuffer[0]=(uint8_t)si_value[1]; 4626 cnv->charErrorBufferLength=1; 4627 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4628 } else { 4629 *target++=(uint8_t)si_value[1]; 4630 } 4631 } 4632 if(offsets!=NULL) { 4633 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4634 *offsets++=prevSourceIndex; 4635 } 4636 } else { 4637 /* target is full */ 4638 cnv->charErrorBuffer[0]=(uint8_t)si_value[0]; 4639 if (si_value_length == 2) { 4640 cnv->charErrorBuffer[1]=(uint8_t)si_value[1]; 4641 } 4642 cnv->charErrorBufferLength=si_value_length; 4643 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4644 } 4645 prevLength=1; /* we switched into SBCS */ 4646 } 4647 4648 /* set the converter state back into UConverter */ 4649 cnv->fromUChar32=c; 4650 cnv->fromUnicodeStatus=prevLength; 4651 4652 /* write back the updated pointers */ 4653 pArgs->source=source; 4654 pArgs->target=(char *)target; 4655 pArgs->offsets=offsets; 4656 } 4657 4658 /* 4659 * This is another simple conversion function for internal use by other 4660 * conversion implementations. 4661 * It does not use the converter state nor call callbacks. 4662 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4663 * It handles conversion extensions but not GB 18030. 4664 * 4665 * It converts one single Unicode code point into codepage bytes, encoded 4666 * as one 32-bit value. 
The function returns the number of bytes in *pValue: 4667 * 1..4 the number of bytes in *pValue 4668 * 0 unassigned (*pValue undefined) 4669 * -1 illegal (currently not used, *pValue undefined) 4670 * 4671 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4672 * the second to last byte in bits 15..8, etc. 4673 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4674 */ 4675 U_CFUNC int32_t 4676 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4677 UChar32 c, uint32_t *pValue, 4678 UBool useFallback) { 4679 const int32_t *cx; 4680 const uint16_t *table; 4681 #if 0 4682 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4683 const uint8_t *p; 4684 #endif 4685 uint32_t stage2Entry; 4686 uint32_t value; 4687 int32_t length; 4688 4689 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4690 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4691 table=sharedData->mbcs.fromUnicodeTable; 4692 4693 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4694 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4695 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4696 /* is this code point assigned, or do we use fallbacks? */ 4697 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4698 *pValue=value&0xff; 4699 return 1; 4700 } 4701 } else /* outputType!=MBCS_OUTPUT_1 */ { 4702 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4703 4704 /* get the bytes and the length for the output */ 4705 switch(sharedData->mbcs.outputType) { 4706 case MBCS_OUTPUT_2: 4707 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4708 if(value<=0xff) { 4709 length=1; 4710 } else { 4711 length=2; 4712 } 4713 break; 4714 #if 0 4715 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4716 case MBCS_OUTPUT_DBCS_ONLY: 4717 /* table with single-byte results, but only DBCS mappings used */ 4718 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4719 if(value<=0xff) { 4720 /* no mapping or SBCS result, not taken for DBCS-only */ 4721 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4722 length=0; 4723 } else { 4724 length=2; 4725 } 4726 break; 4727 case MBCS_OUTPUT_3: 4728 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4729 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4730 if(value<=0xff) { 4731 length=1; 4732 } else if(value<=0xffff) { 4733 length=2; 4734 } else { 4735 length=3; 4736 } 4737 break; 4738 case MBCS_OUTPUT_4: 4739 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4740 if(value<=0xff) { 4741 length=1; 4742 } else if(value<=0xffff) { 4743 length=2; 4744 } else if(value<=0xffffff) { 4745 length=3; 4746 } else { 4747 length=4; 4748 } 4749 break; 4750 case MBCS_OUTPUT_3_EUC: 4751 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4752 /* EUC 16-bit fixed-length representation */ 4753 if(value<=0xff) { 4754 length=1; 4755 } else if((value&0x8000)==0) { 4756 value|=0x8e8000; 4757 length=3; 4758 } else if((value&0x80)==0) { 4759 value|=0x8f0080; 4760 length=3; 4761 } else { 4762 length=2; 4763 } 4764 break; 4765 case 
MBCS_OUTPUT_4_EUC: 4766 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4767 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4768 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4769 if(value<=0xff) { 4770 length=1; 4771 } else if(value<=0xffff) { 4772 length=2; 4773 } else if((value&0x800000)==0) { 4774 value|=0x8e800000; 4775 length=4; 4776 } else if((value&0x8000)==0) { 4777 value|=0x8f008000; 4778 length=4; 4779 } else { 4780 length=3; 4781 } 4782 break; 4783 #endif 4784 default: 4785 /* must not occur */ 4786 return -1; 4787 } 4788 4789 /* is this code point assigned, or do we use fallbacks? */ 4790 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4791 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4792 ) { 4793 /* 4794 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4795 * There is no way with this data structure for fallback output 4796 * to be a zero byte. 4797 */ 4798 /* assigned */ 4799 *pValue=value; 4800 return length; 4801 } 4802 } 4803 } 4804 4805 cx=sharedData->mbcs.extIndexes; 4806 if(cx!=NULL) { 4807 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4808 return length>=0 ? length : -length; /* return abs(length); */ 4809 } 4810 4811 /* unassigned */ 4812 return 0; 4813 } 4814 4815 4816 #if 0 4817 /* 4818 * This function has been moved to ucnv2022.c for inlining. 4819 * This implementation is here only for documentation purposes 4820 */ 4821 4822 /** 4823 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4824 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4825 * It does not handle conversion extensions (_extFromU()). 4826 * 4827 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4828 */ 4829 U_CFUNC int32_t 4830 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4831 UChar32 c, 4832 UBool useFallback) { 4833 const uint16_t *table; 4834 int32_t value; 4835 4836 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4837 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4838 return -1; 4839 } 4840 4841 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4842 table=sharedData->mbcs.fromUnicodeTable; 4843 4844 /* get the byte for the output */ 4845 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4846 /* is this code point assigned, or do we use fallbacks? */ 4847 if(useFallback ? value>=0x800 : value>=0xc00) { 4848 return value&0xff; 4849 } else { 4850 return -1; 4851 } 4852 } 4853 #endif 4854 4855 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 4856 4857 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 4858 static const UChar32 4859 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 4860 4861 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/ 4862 static const UChar32 4863 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 4864 4865 static void 4866 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 4867 UConverterToUnicodeArgs *pToUArgs, 4868 UErrorCode *pErrorCode) { 4869 UConverter *utf8, *cnv; 4870 const uint8_t *source, *sourceLimit; 4871 uint8_t *target; 4872 int32_t targetCapacity; 4873 4874 const uint16_t *table, *sbcsIndex; 4875 const uint16_t *results; 4876 4877 int8_t oldToULength, toULength, toULimit; 4878 4879 UChar32 c; 4880 uint8_t b, t1, t2; 4881 4882 uint32_t asciiRoundtrips; 4883 uint16_t value, minValue; 4884 UBool hasSupplementary; 4885 4886 /* set up the local pointers */ 4887 utf8=pToUArgs->converter; 4888 cnv=pFromUArgs->converter; 4889 source=(uint8_t *)pToUArgs->source; 4890 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 4891 target=(uint8_t *)pFromUArgs->target; 4892 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 4893 4894 table=cnv->sharedData->mbcs.fromUnicodeTable; 4895 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 4896 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4897 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4898 } else { 4899 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 4900 } 4901 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4902 4903 if(cnv->useFallback) { 4904 /* use all roundtrip and fallback results */ 4905 minValue=0x800; 4906 } else { 4907 /* use only roundtrips and fallbacks from private-use characters */ 4908 minValue=0xc00; 4909 } 4910 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 4911 4912 /* get the converter state from the UTF-8 UConverter */ 4913 c=(UChar32)utf8->toUnicodeStatus; 4914 if(c!=0) { 4915 toULength=oldToULength=utf8->toULength; 4916 toULimit=(int8_t)utf8->mode; 4917 } else { 4918 toULength=oldToULength=toULimit=0; 4919 } 4920 4921 /* 4922 * Make sure that the last byte sequence before sourceLimit is complete 4923 * or runs into 
a lead byte. 4924 * Do not go back into the bytes that will be read for finishing a partial 4925 * sequence from the previous buffer. 4926 * In the conversion loop compare source with sourceLimit only once 4927 * per multi-byte character. 4928 */ 4929 { 4930 int32_t i, length; 4931 4932 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 4933 for(i=0; i<3 && i<length;) { 4934 b=*(sourceLimit-i-1); 4935 if(U8_IS_TRAIL(b)) { 4936 ++i; 4937 } else { 4938 if(i<U8_COUNT_TRAIL_BYTES(b)) { 4939 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 4940 sourceLimit-=i+1; 4941 } 4942 break; 4943 } 4944 } 4945 } 4946 4947 if(c!=0 && targetCapacity>0) { 4948 utf8->toUnicodeStatus=0; 4949 utf8->toULength=0; 4950 goto moreBytes; 4951 /* 4952 * Note: We could avoid the goto by duplicating some of the moreBytes 4953 * code, but only up to the point of collecting a complete UTF-8 4954 * sequence; then recurse for the toUBytes[toULength] 4955 * and then continue with normal conversion. 4956 * 4957 * If so, move this code to just after initializing the minimum 4958 * set of local variables for reading the UTF-8 input 4959 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
4960 * 4961 * Potential advantages: 4962 * - avoid the goto 4963 * - oldToULength could become a local variable in just those code blocks 4964 * that deal with buffer boundaries 4965 * - possibly faster if the goto prevents some compiler optimizations 4966 * (this would need measuring to confirm) 4967 * Disadvantage: 4968 * - code duplication 4969 */ 4970 } 4971 4972 /* conversion loop */ 4973 while(source<sourceLimit) { 4974 if(targetCapacity>0) { 4975 b=*source++; 4976 if((int8_t)b>=0) { 4977 /* convert ASCII */ 4978 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 4979 *target++=(uint8_t)b; 4980 --targetCapacity; 4981 continue; 4982 } else { 4983 c=b; 4984 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 4985 } 4986 } else { 4987 if(b<0xe0) { 4988 if( /* handle U+0080..U+07FF inline */ 4989 b>=0xc2 && 4990 (t1=(uint8_t)(*source-0x80)) <= 0x3f 4991 ) { 4992 c=b&0x1f; 4993 ++source; 4994 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 4995 if(value>=minValue) { 4996 *target++=(uint8_t)value; 4997 --targetCapacity; 4998 continue; 4999 } else { 5000 c=(c<<6)|t1; 5001 } 5002 } else { 5003 c=-1; 5004 } 5005 } else if(b==0xe0) { 5006 if( /* handle U+0800..U+0FFF inline */ 5007 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 5008 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5009 ) { 5010 c=t1; 5011 source+=2; 5012 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 5013 if(value>=minValue) { 5014 *target++=(uint8_t)value; 5015 --targetCapacity; 5016 continue; 5017 } else { 5018 c=(c<<6)|t2; 5019 } 5020 } else { 5021 c=-1; 5022 } 5023 } else { 5024 c=-1; 5025 } 5026 5027 if(c<0) { 5028 /* handle "complicated" and error cases, and continuing partial characters */ 5029 oldToULength=0; 5030 toULength=1; 5031 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 5032 c=b; 5033 moreBytes: 5034 while(toULength<toULimit) { 5035 /* 5036 * The sourceLimit may have been adjusted before the conversion loop 5037 * to stop before a truncated sequence. 
5038 * Here we need to use the real limit in case we have two truncated 5039 * sequences at the end. 5040 * See ticket #7492. 5041 */ 5042 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5043 b=*source; 5044 if(U8_IS_TRAIL(b)) { 5045 ++source; 5046 ++toULength; 5047 c=(c<<6)+b; 5048 } else { 5049 break; /* sequence too short, stop with toULength<toULimit */ 5050 } 5051 } else { 5052 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5053 source-=(toULength-oldToULength); 5054 while(oldToULength<toULength) { 5055 utf8->toUBytes[oldToULength++]=*source++; 5056 } 5057 utf8->toUnicodeStatus=c; 5058 utf8->toULength=toULength; 5059 utf8->mode=toULimit; 5060 pToUArgs->source=(char *)source; 5061 pFromUArgs->target=(char *)target; 5062 return; 5063 } 5064 } 5065 5066 if( toULength==toULimit && /* consumed all trail bytes */ 5067 (toULength==3 || toULength==2) && /* BMP */ 5068 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5069 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5070 ) { 5071 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5072 } else if( 5073 toULength==toULimit && toULength==4 && 5074 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5075 ) { 5076 /* supplementary code point */ 5077 if(!hasSupplementary) { 5078 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5079 value=0; 5080 } else { 5081 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5082 } 5083 } else { 5084 /* error handling: illegal UTF-8 byte sequence */ 5085 source-=(toULength-oldToULength); 5086 while(oldToULength<toULength) { 5087 utf8->toUBytes[oldToULength++]=*source++; 5088 } 5089 utf8->toULength=toULength; 5090 pToUArgs->source=(char *)source; 5091 pFromUArgs->target=(char *)target; 5092 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5093 return; 5094 } 5095 } 5096 } 5097 5098 if(value>=minValue) { 5099 /* output the mapping for c */ 5100 *target++=(uint8_t)value; 5101 --targetCapacity; 5102 } else { 5103 /* 
value<minValue means c is unassigned (unmappable) */ 5104 /* 5105 * Try an extension mapping. 5106 * Pass in no source because we don't have UTF-16 input. 5107 * If we have a partial match on c, we will return and revert 5108 * to UTF-8->UTF-16->charset conversion. 5109 */ 5110 static const UChar nul=0; 5111 const UChar *noSource=&nul; 5112 c=_extFromU(cnv, cnv->sharedData, 5113 c, &noSource, noSource, 5114 &target, target+targetCapacity, 5115 NULL, -1, 5116 pFromUArgs->flush, 5117 pErrorCode); 5118 5119 if(U_FAILURE(*pErrorCode)) { 5120 /* not mappable or buffer overflow */ 5121 cnv->fromUChar32=c; 5122 break; 5123 } else if(cnv->preFromUFirstCP>=0) { 5124 /* 5125 * Partial match, return and revert to pivoting. 5126 * In normal from-UTF-16 conversion, we would just continue 5127 * but then exit the loop because the extension match would 5128 * have consumed the source. 5129 */ 5130 *pErrorCode=U_USING_DEFAULT_WARNING; 5131 break; 5132 } else { 5133 /* a mapping was written to the target, continue */ 5134 5135 /* recalculate the targetCapacity after an extension mapping */ 5136 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5137 } 5138 } 5139 } else { 5140 /* target is full */ 5141 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5142 break; 5143 } 5144 } 5145 5146 /* 5147 * The sourceLimit may have been adjusted before the conversion loop 5148 * to stop before a truncated sequence. 5149 * If so, then collect the truncated sequence now. 
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/*
 * Direct UTF-8 -> charset conversion that writes one or two bytes per
 * code point, without going through UTF-16.
 *
 * pFromUArgs supplies the charset converter, the target buffer
 *            (target/targetLimit) and the flush flag;
 *            target is updated on return.
 * pToUArgs   supplies the UTF-8 converter and the source buffer
 *            (source/sourceLimit); source is updated on return.
 * pErrorCode is set by this function to
 *            U_ILLEGAL_CHAR_FOUND      for a malformed UTF-8 sequence
 *                                      (its bytes are stored in utf8->toUBytes),
 *            U_BUFFER_OVERFLOW_ERROR   when the target is full,
 *            U_USING_DEFAULT_WARNING   when an extension lookup reports a
 *                                      partial match (cnv->preFromUFirstCP>=0);
 *            _extFromU() may also set it.
 *
 * A UTF-8 sequence that is truncated at the end of the source buffer is
 * saved in the UTF-8 converter (toUBytes, toULength, toUnicodeStatus, mode)
 * and picked up again at the start of the next call.
 */
static void
ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *mbcsIndex;
    const uint16_t *results;

    /*
     * Byte counts for the UTF-8 sequence being assembled:
     * oldToULength = bytes already stored in utf8->toUBytes from a previous call,
     * toULength    = bytes collected so far,
     * toULimit     = total bytes expected for the lead byte.
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint16_t value;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    /* the LF/NL swap option selects an alternate result array */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    c=(UChar32)utf8->toUnicodeStatus;
    if(c!=0) {
        /* a partial sequence was saved by a previous call */
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
    } else {
        toULength=oldToULength=toULimit=0;
    }

    /*
     * Make sure that the last byte sequence before sourceLimit is complete
     * or runs into a lead byte.
     * Do not go back into the bytes that will be read for finishing a partial
     * sequence from the previous buffer.
     * In the conversion loop compare source with sourceLimit only once
     * per multi-byte character.
     */
    {
        int32_t i, length;

        /* number of source bytes beyond those needed to finish a saved partial sequence */
        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        /* look back over at most 3 trailing bytes for the last non-trail byte */
        for(i=0; i<3 && i<length;) {
            b=*(sourceLimit-i-1);
            if(U8_IS_TRAIL(b)) {
                ++i;
            } else {
                if(i<U8_COUNT_TRAIL_BYTES(b)) {
                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                    sourceLimit-=i+1;
                }
                break;
            }
        }
    }

    if(c!=0 && targetCapacity>0) {
        /* resume the saved partial sequence: clear the saved state and jump into the loop body */
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if((int8_t)b>=0) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    /* copy the byte through unchanged */
                    *target++=b;
                    --targetCapacity;
                    continue;
                } else {
                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
                    if(value==0) {
                        c=b;
                        goto unassigned;
                    }
                }
            } else {
                if(b>0xe0) {
                    /*
                     * The comma operator assigns t1 before b<0xed is tested,
                     * so t1 is also set when the b==0xed alternative is evaluated.
                     * Lead bytes above 0xed match neither alternative and
                     * take the c=-1 path below.
                     */
                    if( /* handle U+1000..U+D7FF inline */
                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
                                                        (b==0xed && (t1 <= 0x1f))) &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        c=((b&0xf)<<6)|t1;
                        source+=2;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t2;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else if(b<0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t1;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    /* b==0xe0 is not handled inline */
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(U8_IS_TRAIL(b)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that is not yet in toUBytes */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    /*
                     * Validate the collected sequence. Note that c is modified
                     * (offset subtracted) inside the conditions below.
                     */
                    if( toULength==toULimit && /* consumed all trail bytes */
                        (toULength==3 || toULength==2) && /* BMP */
                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
                        (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
                    ) {
                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                    } else if(
                        toULength==toULimit && toULength==4 &&
                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
                    ) {
                        /* supplementary code point */
                        if(!hasSupplementary) {
                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                            stage2Entry=0;
                        } else {
                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter's toUBytes */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }

                    /* get the bytes and the length for the output */
                    /* MBCS_OUTPUT_2 */
                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

                    /* is this code point assigned, or do we use fallbacks? */
                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                    ) {
                        goto unassigned;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(value<=0xff) {
                /* this is easy because we know that there is enough space */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else /* length==2 */ {
                /* the first byte always fits; the second may have to go to the overflow buffer */
                *target++=(uint8_t)(value>>8);
                if(2<=targetCapacity) {
                    *target++=(uint8_t)value;
                    targetCapacity-=2;
                } else {
                    cnv->charErrorBuffer[0]=(char)value;
                    cnv->charErrorBufferLength=1;

                    /* target overflow */
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
            continue;

unassigned:
            {
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                    continue;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     *
     * NOTE(review): the explicit preFromUFirstCP<0 test skips this step after
     * the partial-match exit above; presumably U_USING_DEFAULT_WARNING alone
     * would still pass U_SUCCESS() -- confirm against the UErrorCode definition.
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Fill starters[0..255]: starters[i] is set to whether the state table row
 * selected by mbcs.dbcsOnlyState has a transition entry for byte value i.
 * pErrorCode is not read or written by this function.
 */
static void
ucnv_MBCSGetStarters(const UConverter* cnv,
                 UBool starters[256],
                 UErrorCode *pErrorCode) {
    const int32_t *state0;
    int i;

    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
    for(i=0; i<256; ++i) {
        /* all bytes that cause a state transition from state 0 are lead bytes */
        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
    }
}

/*
 * This is an internal function that allows other converter implementations
 * to check whether a byte is a lead byte.
5500 */ 5501 U_CFUNC UBool 5502 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5503 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5504 } 5505 5506 static void 5507 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5508 int32_t offsetIndex, 5509 UErrorCode *pErrorCode) { 5510 UConverter *cnv=pArgs->converter; 5511 char *p, *subchar; 5512 char buffer[4]; 5513 int32_t length; 5514 5515 /* first, select between subChar and subChar1 */ 5516 if( cnv->subChar1!=0 && 5517 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5518 cnv->useSubChar1 : 5519 (cnv->invalidUCharBuffer[0]<=0xff)) 5520 ) { 5521 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5522 subchar=(char *)&cnv->subChar1; 5523 length=1; 5524 } else { 5525 /* select subChar in all other cases */ 5526 subchar=(char *)cnv->subChars; 5527 length=cnv->subCharLen; 5528 } 5529 5530 /* reset the selector for the next code point */ 5531 cnv->useSubChar1=FALSE; 5532 5533 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5534 p=buffer; 5535 5536 /* fromUnicodeStatus contains prevLength */ 5537 switch(length) { 5538 case 1: 5539 if(cnv->fromUnicodeStatus==2) { 5540 /* DBCS mode and SBCS sub char: change to SBCS */ 5541 cnv->fromUnicodeStatus=1; 5542 *p++=UCNV_SI; 5543 } 5544 *p++=subchar[0]; 5545 break; 5546 case 2: 5547 if(cnv->fromUnicodeStatus<=1) { 5548 /* SBCS mode and DBCS sub char: change to DBCS */ 5549 cnv->fromUnicodeStatus=2; 5550 *p++=UCNV_SO; 5551 } 5552 *p++=subchar[0]; 5553 *p++=subchar[1]; 5554 break; 5555 default: 5556 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5557 return; 5558 } 5559 subchar=buffer; 5560 length=(int32_t)(p-buffer); 5561 } 5562 5563 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5564 } 5565 5566 U_CFUNC UConverterType 5567 ucnv_MBCSGetType(const UConverter* converter) { 5568 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but 
here we cheat a little */ 5569 if(converter->sharedData->mbcs.countStates==1) { 5570 return (UConverterType)UCNV_SBCS; 5571 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5572 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5573 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5574 return (UConverterType)UCNV_DBCS; 5575 } 5576 return (UConverterType)UCNV_MBCS; 5577 } 5578 5579 static const UConverterImpl _SBCSUTF8Impl={ 5580 UCNV_MBCS, 5581 5582 ucnv_MBCSLoad, 5583 ucnv_MBCSUnload, 5584 5585 ucnv_MBCSOpen, 5586 NULL, 5587 NULL, 5588 5589 ucnv_MBCSToUnicodeWithOffsets, 5590 ucnv_MBCSToUnicodeWithOffsets, 5591 ucnv_MBCSFromUnicodeWithOffsets, 5592 ucnv_MBCSFromUnicodeWithOffsets, 5593 ucnv_MBCSGetNextUChar, 5594 5595 ucnv_MBCSGetStarters, 5596 ucnv_MBCSGetName, 5597 ucnv_MBCSWriteSub, 5598 NULL, 5599 ucnv_MBCSGetUnicodeSet, 5600 5601 NULL, 5602 ucnv_SBCSFromUTF8 5603 }; 5604 5605 static const UConverterImpl _DBCSUTF8Impl={ 5606 UCNV_MBCS, 5607 5608 ucnv_MBCSLoad, 5609 ucnv_MBCSUnload, 5610 5611 ucnv_MBCSOpen, 5612 NULL, 5613 NULL, 5614 5615 ucnv_MBCSToUnicodeWithOffsets, 5616 ucnv_MBCSToUnicodeWithOffsets, 5617 ucnv_MBCSFromUnicodeWithOffsets, 5618 ucnv_MBCSFromUnicodeWithOffsets, 5619 ucnv_MBCSGetNextUChar, 5620 5621 ucnv_MBCSGetStarters, 5622 ucnv_MBCSGetName, 5623 ucnv_MBCSWriteSub, 5624 NULL, 5625 ucnv_MBCSGetUnicodeSet, 5626 5627 NULL, 5628 ucnv_DBCSFromUTF8 5629 }; 5630 5631 static const UConverterImpl _MBCSImpl={ 5632 UCNV_MBCS, 5633 5634 ucnv_MBCSLoad, 5635 ucnv_MBCSUnload, 5636 5637 ucnv_MBCSOpen, 5638 NULL, 5639 NULL, 5640 5641 ucnv_MBCSToUnicodeWithOffsets, 5642 ucnv_MBCSToUnicodeWithOffsets, 5643 ucnv_MBCSFromUnicodeWithOffsets, 5644 ucnv_MBCSFromUnicodeWithOffsets, 5645 ucnv_MBCSGetNextUChar, 5646 5647 ucnv_MBCSGetStarters, 5648 ucnv_MBCSGetName, 5649 ucnv_MBCSWriteSub, 5650 NULL, 5651 ucnv_MBCSGetUnicodeSet 5652 }; 5653 5654 5655 /* Static data is in 
tools/makeconv/ucnvstat.c for data-based
 * converters. Be sure to update it as well.
 */

/*
 * Shared-data record that points at the generic _MBCSImpl dispatch table.
 * The meaning of the other positional initializers is given by the
 * UConverterSharedData declaration, which is not part of this file.
 */
const UConverterSharedData _MBCSData={
    sizeof(UConverterSharedData), 1,
    NULL, NULL, NULL, FALSE, &_MBCSImpl,
    0
};

#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */