1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2000-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvmbcs.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000jul03 16 * created by: Markus W. Scherer 17 * 18 * The current code in this file replaces the previous implementation 19 * of conversion code from multi-byte codepages to Unicode and back. 20 * This implementation supports the following: 21 * - legacy variable-length codepages with up to 4 bytes per character 22 * - all Unicode code points (up to 0x10ffff) 23 * - efficient distinction of unassigned vs. illegal byte sequences 24 * - it is possible in fromUnicode() to directly deal with simple 25 * stateful encodings (used for EBCDIC_STATEFUL) 26 * - it is possible to convert Unicode code points 27 * to a single zero byte (but not as a fallback except for SBCS) 28 * 29 * Remaining limitations in fromUnicode: 30 * - byte sequences must not have leading zero bytes 31 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 32 * - limitation to up to 4 bytes per character 33 * 34 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 35 * limitations and adds m:n character mappings and other features. 36 * See ucnv_ext.h for details. 
37 * 38 * Change history: 39 * 40 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 41 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 42 * macros to ucnvmbcs.h file 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 48 49 #include "unicode/ucnv.h" 50 #include "unicode/ucnv_cb.h" 51 #include "unicode/udata.h" 52 #include "unicode/uset.h" 53 #include "unicode/utf8.h" 54 #include "unicode/utf16.h" 55 #include "ucnv_bld.h" 56 #include "ucnvmbcs.h" 57 #include "ucnv_ext.h" 58 #include "ucnv_cnv.h" 59 #include "cmemory.h" 60 #include "cstring.h" 61 #include "umutex.h" 62 #include "ustr_imp.h" 63 64 /* control optimizations according to the platform */ 65 #define MBCS_UNROLL_SINGLE_TO_BMP 1 66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 67 68 /* 69 * _MBCSHeader versions 5.3 & 4.3 70 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 71 * 72 * This version is optional. Version 5 is used for incompatible data format changes. 73 * makeconv will continue to generate version 4 files if possible. 74 * 75 * Changes from version 4: 76 * 77 * The main difference is an additional _MBCSHeader field with 78 * - the length (number of uint32_t) of the _MBCSHeader 79 * - flags for further incompatible data format changes 80 * - flags for further, backward compatible data format changes 81 * 82 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 83 * the file and needs to be reconstituted at load time. 84 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 85 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 86 * (For details about these structures see below, and see ucnvmbcs.h.) 87 * 88 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 89 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 
90 * precision markers for all mappings.) 91 * 92 * All fallbacks have been moved to the extension table, leaving only roundtrips in the 93 * omitted data that can be reconstituted from the toUnicode data. 94 * 95 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 96 * With only roundtrip mappings in the base fromUnicode data, this part is fully 97 * redundant with the mbcsIndex and will be reconstituted from that (also using the 98 * stage 1 table which contains the information about how stage 2 was compacted). 99 * 100 * The rest of the stage 2 table, the part for code points above maxFastUChar, 101 * is stored in the file and will be appended to the reconstituted part. 102 * 103 * The entire fromUBytes array is omitted from the file and will be reconstituted. 104 * This is done by enumerating all toUnicode roundtrip mappings, performing 105 * each mapping (using the stage 1 and reconstituted stage 2 tables) and 106 * writing instead of reading the byte values. 107 * 108 * _MBCSHeader version 4.3 109 * 110 * Change from version 4.2: 111 * - Optional utf8Friendly data structures, with 64-entry stage 3 block 112 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 113 * files which can be used instead of stages 1 & 2. 114 * Faster lookups for roundtrips from most commonly used characters, 115 * and lookups from UTF-8 byte sequences with a natural bit distribution. 116 * See ucnvmbcs.h for more details. 117 * 118 * Change from version 4.1: 119 * - Added an optional extension table structure at the end of the .cnv file. 120 * It is present if the upper bits of the header flags field contains a non-zero 121 * byte offset to it. 122 * Files that contain only a conversion table and no base table 123 * use the special outputType MBCS_OUTPUT_EXT_ONLY. 124 * These contain the base table name between the MBCS header and the extension 125 * data. 
126 * 127 * Change from version 4.0: 128 * - Replace header.reserved with header.fromUBytesLength so that all 129 * fields in the data have length. 130 * 131 * Changes from version 3 (for performance improvements): 132 * - new bit distribution for state table entries 133 * - reordered action codes 134 * - new data structure for single-byte fromUnicode 135 * + stage 2 only contains indexes 136 * + stage 3 stores 16 bits per character with classification bits 15..8 137 * - no multiplier for stage 1 entries 138 * - stage 2 for non-single-byte codepages contains the index and the flags in 139 * one 32-bit value 140 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 141 * 142 * For more details about old versions of the MBCS data structure, see 143 * the corresponding versions of this file. 144 * 145 * Converting stateless codepage data ---------------------------------------*** 146 * (or codepage data with simple states) to Unicode. 147 * 148 * Data structure and algorithm for converting from complex legacy codepages 149 * to Unicode. (Designed before 2000-may-22.) 150 * 151 * The basic idea is that the structure of legacy codepages can be described 152 * with state tables. 153 * When reading a byte stream, each input byte causes a state transition. 154 * Some transitions result in the output of a code point, some result in 155 * "unassigned" or "illegal" output. 156 * This is used here for character conversion. 157 * 158 * The data structure begins with a state table consisting of a row 159 * per state, with 256 entries (columns) per row for each possible input 160 * byte value. 161 * Each entry is 32 bits wide, with two formats distinguished by 162 * the sign bit (bit 31): 163 * 164 * One format for transitional entries (bit 31 not set) for non-final bytes, and 165 * one format for final entries (bit 31 set). 166 * Both formats contain the number of the next state in the same bit 167 * positions. 168 * State 0 is the initial state. 
169 * 170 * Most of the time, the offset values of subsequent states are added 171 * up to a scalar value. This value will eventually be the index of 172 * the Unicode code point in a table that follows the state table. 173 * The effect is that the code points for final state table rows 174 * are contiguous. The code points of final state rows follow each other 175 * in the order of the references to those final states by previous 176 * states, etc. 177 * 178 * For some terminal states, the offset is itself the output Unicode 179 * code point (16 bits for a BMP code point or 20 bits for a supplementary 180 * code point (stored as code point minus 0x10000 so that 20 bits are enough). 181 * For others, the code point in the Unicode table is stored with either 182 * one or two code units: one for BMP code points, two for a pair of 183 * surrogates. 184 * All code points for a final state entry take up the same number of code 185 * units, regardless of whether they all actually _use_ the same number 186 * of code units. This is necessary for simple array access. 187 * 188 * An additional feature comes in with what in ICU is called "fallback" 189 * mappings: 190 * 191 * In addition to round-trippable, precise, 1:1 mappings, there are often 192 * mappings defined between similar, though not the same, characters. 193 * Typically, such mappings occur only in fromUnicode mapping tables because 194 * Unicode has a superset repertoire of most other codepages. However, it 195 * is possible to provide such mappings in the toUnicode tables, too. 196 * In this case, the fallback mappings are partly integrated into the 197 * general state tables because the structure of the encoding includes their 198 * byte sequences. 199 * For final entries in an initial state, fallback mappings are stored in 200 * the entry itself like with roundtrip mappings. 201 * For other final entries, they are stored in the code units table if 202 * the entry is for a pair of code units. 
203 * For single-unit results in the code units table, there is no space to 204 * alternatively hold a fallback mapping; in this case, the code unit 205 * is stored as U+fffe (unassigned), and the fallback mapping needs to 206 * be looked up by the scalar offset value in a separate table. 207 * 208 * "Unassigned" state entries really mean "structurally unassigned", 209 * i.e., such a byte sequence will never have a mapping result. 210 * 211 * The interpretation of the bits in each entry is as follows: 212 * 213 * Bit 31 not set, not a terminal entry ("transitional"): 214 * 30..24 next state 215 * 23..0 offset delta, to be added up 216 * 217 * Bit 31 set, terminal ("final") entry: 218 * 30..24 next state (regardless of action code) 219 * 23..20 action code: 220 * action codes 0 and 1 result in precise-mapping Unicode code points 221 * 0 valid byte sequence 222 * 19..16 not used, 0 223 * 15..0 16-bit Unicode BMP code point 224 * never U+fffe or U+ffff 225 * 1 valid byte sequence 226 * 19..0 20-bit Unicode supplementary code point 227 * never U+fffe or U+ffff 228 * 229 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 230 * 2 valid byte sequence (fallback) 231 * 19..16 not used, 0 232 * 15..0 16-bit Unicode BMP code point as fallback result 233 * 3 valid byte sequence (fallback) 234 * 19..0 20-bit Unicode supplementary code point as fallback result 235 * 236 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 237 * depending on the code units they result in 238 * 4 valid byte sequence 239 * 19..9 not used, 0 240 * 8..0 final offset delta 241 * pointing to one 16-bit code unit which may be 242 * fffe unassigned -- look for a fallback for this offset 243 * ffff illegal 244 * 5 valid byte sequence 245 * 19..9 not used, 0 246 * 8..0 final offset delta 247 * pointing to two 16-bit code units 248 * (typically UTF-16 surrogates) 249 * the result depends on the first code unit as follows: 250 * 0000..d7ff 
roundtrip BMP code point (1st alone) 251 * d800..dbff roundtrip surrogate pair (1st, 2nd) 252 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 253 * e000 roundtrip BMP code point (2nd alone) 254 * e001 fallback BMP code point (2nd alone) 255 * fffe unassigned 256 * ffff illegal 257 * (the final offset deltas are at most 255 * 2, 258 * times 2 because of storing code unit pairs) 259 * 260 * 6 unassigned byte sequence 261 * 19..16 not used, 0 262 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 263 * this does not contain a final offset delta because the main 264 * purpose of this action code is to save scalar offset values; 265 * therefore, fallback values cannot be assigned to byte 266 * sequences that result in this action code 267 * 7 illegal byte sequence 268 * 19..16 not used, 0 269 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 270 * 8 state change only 271 * 19..0 not used, 0 272 * useful for state changes in simple stateful encodings, 273 * at Shift-In/Shift-Out codes 274 * 275 * 276 * 9..15 reserved for future use 277 * current implementations will only perform a state change 278 * and ignore bits 19..0 279 * 280 * An encoding with contiguous ranges of unassigned byte sequences, like 281 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 282 * at least two states for the trail bytes: 283 * One trail byte state that results in code points, and one that only 284 * has "unassigned" and "illegal" terminal states. 285 * 286 * Note: partly by accident, this data structure supports simple stateful 287 * encodings without any additional logic. 288 * Currently, only simple Shift-In/Shift-Out schemes are handled with 289 * appropriate state tables (especially EBCDIC_STATEFUL!). 
290 * 291 * MBCS version 2 added: 292 * unassigned and illegal action codes have U+fffe and U+ffff 293 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 294 * 295 * Converting from Unicode to codepage bytes --------------------------------*** 296 * 297 * The conversion data structure for fromUnicode is designed for the known 298 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 299 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 300 * a roundtrip mapping. 301 * 302 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 303 * like in the character properties table. 304 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 305 * with the resulting bytes is at offsetFromUBytes. 306 * 307 * Beginning with version 4, single-byte codepages have a significantly different 308 * trie compared to other codepages. 309 * In all cases, the entry in stage 1 is directly the index of the block of 310 * 64 entries in stage 2. 311 * 312 * Single-byte lookup: 313 * 314 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 315 * Stage 3 contains one 16-bit word per result: 316 * Bits 15..8 indicate the kind of result: 317 * f roundtrip result 318 * c fallback result from private-use code point 319 * 8 fallback result from other code points 320 * 0 unassigned 321 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 322 * 323 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 324 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 325 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 326 * ASCII code points can be looked up with a linear array access into stage 3. 327 * See maxFastUChar and other details in ucnvmbcs.h. 
328 * 329 * Multi-byte lookup: 330 * 331 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 332 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 333 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 334 * If this test is false, then a non-zero result will be interpreted as 335 * a fallback mapping. 336 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 337 * 338 * Stage 3 contains 2, 3, or 4 bytes per result. 339 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 340 * while 3 bytes are stored as bytes in big-endian order. 341 * Leading zero bytes are ignored, and the number of bytes is counted. 342 * A zero byte mapping result is possible as a roundtrip result. 343 * For some output types, the actual result is processed from this; 344 * see ucnv_MBCSFromUnicodeWithOffsets(). 345 * 346 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 347 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 348 * 349 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 350 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 351 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 352 * ASCII code points can be looked up with a linear array access into stage 3. 353 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 354 * 355 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 356 * for compaction. 357 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 358 * may overlap by any number of entries. 
359 * 360 * MBCS version 2 added: 361 * the converter checks for known output types, which allows 362 * adding new ones without crashing an unaware converter 363 */ 364 365 /** 366 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 367 * consecutive sequences of bytes, starting from the one encoded in value, 368 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 369 * Does not currently support m:n mappings or reverse fallbacks. 370 * This function will not be called for sequences of bytes with leading zeros. 371 * 372 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 373 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 374 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 375 * not map to anything 376 * @return TRUE to continue enumeration, FALSE to stop 377 */ 378 typedef UBool U_CALLCONV 379 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 380 381 static void U_CALLCONV 382 ucnv_MBCSLoad(UConverterSharedData *sharedData, 383 UConverterLoadArgs *pArgs, 384 const uint8_t *raw, 385 UErrorCode *pErrorCode); 386 387 static void U_CALLCONV 388 ucnv_MBCSUnload(UConverterSharedData *sharedData); 389 390 static void U_CALLCONV 391 ucnv_MBCSOpen(UConverter *cnv, 392 UConverterLoadArgs *pArgs, 393 UErrorCode *pErrorCode); 394 395 static UChar32 U_CALLCONV 396 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 397 UErrorCode *pErrorCode); 398 399 static void U_CALLCONV 400 ucnv_MBCSGetStarters(const UConverter* cnv, 401 UBool starters[256], 402 UErrorCode *pErrorCode); 403 404 U_CDECL_BEGIN 405 static const char* U_CALLCONV 406 ucnv_MBCSGetName(const UConverter *cnv); 407 U_CDECL_END 408 409 static void U_CALLCONV 410 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 411 int32_t offsetIndex, 412 UErrorCode *pErrorCode); 413 414 static UChar32 U_CALLCONV 415 
ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 416 UErrorCode *pErrorCode); 417 418 static void U_CALLCONV 419 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 420 UConverterToUnicodeArgs *pToUArgs, 421 UErrorCode *pErrorCode); 422 423 static void U_CALLCONV 424 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 425 const USetAdder *sa, 426 UConverterUnicodeSet which, 427 UErrorCode *pErrorCode); 428 429 static void U_CALLCONV 430 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 431 UConverterToUnicodeArgs *pToUArgs, 432 UErrorCode *pErrorCode); 433 434 static const UConverterImpl _SBCSUTF8Impl={ 435 UCNV_MBCS, 436 437 ucnv_MBCSLoad, 438 ucnv_MBCSUnload, 439 440 ucnv_MBCSOpen, 441 NULL, 442 NULL, 443 444 ucnv_MBCSToUnicodeWithOffsets, 445 ucnv_MBCSToUnicodeWithOffsets, 446 ucnv_MBCSFromUnicodeWithOffsets, 447 ucnv_MBCSFromUnicodeWithOffsets, 448 ucnv_MBCSGetNextUChar, 449 450 ucnv_MBCSGetStarters, 451 ucnv_MBCSGetName, 452 ucnv_MBCSWriteSub, 453 NULL, 454 ucnv_MBCSGetUnicodeSet, 455 456 NULL, 457 ucnv_SBCSFromUTF8 458 }; 459 460 static const UConverterImpl _DBCSUTF8Impl={ 461 UCNV_MBCS, 462 463 ucnv_MBCSLoad, 464 ucnv_MBCSUnload, 465 466 ucnv_MBCSOpen, 467 NULL, 468 NULL, 469 470 ucnv_MBCSToUnicodeWithOffsets, 471 ucnv_MBCSToUnicodeWithOffsets, 472 ucnv_MBCSFromUnicodeWithOffsets, 473 ucnv_MBCSFromUnicodeWithOffsets, 474 ucnv_MBCSGetNextUChar, 475 476 ucnv_MBCSGetStarters, 477 ucnv_MBCSGetName, 478 ucnv_MBCSWriteSub, 479 NULL, 480 ucnv_MBCSGetUnicodeSet, 481 482 NULL, 483 ucnv_DBCSFromUTF8 484 }; 485 486 static const UConverterImpl _MBCSImpl={ 487 UCNV_MBCS, 488 489 ucnv_MBCSLoad, 490 ucnv_MBCSUnload, 491 492 ucnv_MBCSOpen, 493 NULL, 494 NULL, 495 496 ucnv_MBCSToUnicodeWithOffsets, 497 ucnv_MBCSToUnicodeWithOffsets, 498 ucnv_MBCSFromUnicodeWithOffsets, 499 ucnv_MBCSFromUnicodeWithOffsets, 500 ucnv_MBCSGetNextUChar, 501 502 ucnv_MBCSGetStarters, 503 ucnv_MBCSGetName, 504 ucnv_MBCSWriteSub, 505 NULL, 506 ucnv_MBCSGetUnicodeSet, 507 NULL, 508 NULL 
509 }; 510 511 /* Static data is in tools/makeconv/ucnvstat.c for data-based 512 * converters. Be sure to update it as well. 513 */ 514 515 const UConverterSharedData _MBCSData={ 516 sizeof(UConverterSharedData), 1, 517 NULL, NULL, FALSE, TRUE, &_MBCSImpl, 518 0, UCNV_MBCS_TABLE_INITIALIZER 519 }; 520 521 522 /* GB 18030 data ------------------------------------------------------------ */ 523 524 /* helper macros for linear values for GB 18030 four-byte sequences */ 525 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 526 527 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 528 529 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 530 531 /* 532 * Some ranges of GB 18030 where both the Unicode code points and the 533 * GB four-byte sequences are contiguous and are handled algorithmically by 534 * the special callback functions below. 535 * The values are start & end of Unicode & GB codes. 536 * 537 * Note that single surrogates are not mapped by GB 18030 538 * as of the re-released mapping tables from 2000-nov-30. 
539 */ 540 static const uint32_t 541 gb18030Ranges[14][4]={ 542 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 543 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 544 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 545 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 546 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 547 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 548 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 549 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 550 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 551 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 552 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 553 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 554 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 555 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 556 }; 557 558 /* bit flag for UConverter.options indicating GB 18030 special handling */ 559 #define _MBCS_OPTION_GB18030 0x8000 560 561 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 562 #define _MBCS_OPTION_KEIS 0x01000 563 #define _MBCS_OPTION_JEF 0x02000 564 #define _MBCS_OPTION_JIPS 0x04000 565 566 #define KEIS_SO_CHAR_1 0x0A 567 #define KEIS_SO_CHAR_2 0x42 568 #define KEIS_SI_CHAR_1 0x0A 569 #define KEIS_SI_CHAR_2 0x41 570 571 #define JEF_SO_CHAR 0x28 572 #define JEF_SI_CHAR 0x29 573 574 #define JIPS_SO_CHAR_1 0x1A 575 #define JIPS_SO_CHAR_2 0x70 576 #define JIPS_SI_CHAR_1 0x1A 577 #define JIPS_SI_CHAR_2 0x71 578 579 enum SISO_Option { 580 SI, 581 SO 582 }; 583 typedef enum SISO_Option SISO_Option; 584 585 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 586 int32_t SISOLength = 0; 587 588 switch (option) { 589 case SI: 590 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 591 value[0] = KEIS_SI_CHAR_1; 592 value[1] = KEIS_SI_CHAR_2; 593 SISOLength = 2; 594 } else if 
((cnvOption&_MBCS_OPTION_JEF)!=0) { 595 value[0] = JEF_SI_CHAR; 596 SISOLength = 1; 597 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 598 value[0] = JIPS_SI_CHAR_1; 599 value[1] = JIPS_SI_CHAR_2; 600 SISOLength = 2; 601 } else { 602 value[0] = UCNV_SI; 603 SISOLength = 1; 604 } 605 break; 606 case SO: 607 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 608 value[0] = KEIS_SO_CHAR_1; 609 value[1] = KEIS_SO_CHAR_2; 610 SISOLength = 2; 611 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 612 value[0] = JEF_SO_CHAR; 613 SISOLength = 1; 614 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 615 value[0] = JIPS_SO_CHAR_1; 616 value[1] = JIPS_SO_CHAR_2; 617 SISOLength = 2; 618 } else { 619 value[0] = UCNV_SO; 620 SISOLength = 1; 621 } 622 break; 623 default: 624 /* Should never happen. */ 625 break; 626 } 627 628 return SISOLength; 629 } 630 631 /* Miscellaneous ------------------------------------------------------------ */ 632 633 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 634 static UBool 635 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 636 int32_t state, uint32_t offset, 637 uint32_t value, 638 UConverterEnumToUCallback *callback, const void *context, 639 UErrorCode *pErrorCode) { 640 UChar32 codePoints[32]; 641 const int32_t *row; 642 const uint16_t *unicodeCodeUnits; 643 UChar32 anyCodePoints; 644 int32_t b, limit; 645 646 row=mbcsTable->stateTable[state]; 647 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 648 649 value<<=8; 650 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 651 652 b=(stateProps[state]&0x38)<<2; 653 if(b==0 && stateProps[state]>=0x40) { 654 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 655 codePoints[0]=U_SENTINEL; 656 b=1; 657 } 658 limit=((stateProps[state]&7)+1)<<5; 659 while(b<limit) { 660 int32_t entry=row[b]; 661 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 662 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 663 if(stateProps[nextState]>=0) { 664 /* recurse to 
a state with non-ignorable actions */ 665 if(!enumToU( 666 mbcsTable, stateProps, nextState, 667 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 668 value|(uint32_t)b, 669 callback, context, 670 pErrorCode)) { 671 return FALSE; 672 } 673 } 674 codePoints[b&0x1f]=U_SENTINEL; 675 } else { 676 UChar32 c; 677 int32_t action; 678 679 /* 680 * An if-else-if chain provides more reliable performance for 681 * the most common cases compared to a switch. 682 */ 683 action=MBCS_ENTRY_FINAL_ACTION(entry); 684 if(action==MBCS_STATE_VALID_DIRECT_16) { 685 /* output BMP code point */ 686 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 687 } else if(action==MBCS_STATE_VALID_16) { 688 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 689 c=unicodeCodeUnits[finalOffset]; 690 if(c<0xfffe) { 691 /* output BMP code point */ 692 } else { 693 c=U_SENTINEL; 694 } 695 } else if(action==MBCS_STATE_VALID_16_PAIR) { 696 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 697 c=unicodeCodeUnits[finalOffset++]; 698 if(c<0xd800) { 699 /* output BMP code point below 0xd800 */ 700 } else if(c<=0xdbff) { 701 /* output roundtrip or fallback supplementary code point */ 702 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 703 } else if(c==0xe000) { 704 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 705 c=unicodeCodeUnits[finalOffset]; 706 } else { 707 c=U_SENTINEL; 708 } 709 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 710 /* output supplementary code point */ 711 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 712 } else { 713 c=U_SENTINEL; 714 } 715 716 codePoints[b&0x1f]=c; 717 anyCodePoints&=c; 718 } 719 if(((++b)&0x1f)==0) { 720 if(anyCodePoints>=0) { 721 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 722 return FALSE; 723 } 724 anyCodePoints=-1; 725 } 726 } 727 } 728 return TRUE; 729 } 730 731 /* 732 * Only called if stateProps[state]==-1. 
733 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 734 * MBCS_STATE_CHANGE_ONLY. 735 */ 736 static int8_t 737 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 738 const int32_t *row; 739 int32_t min, max, entry, nextState; 740 741 row=stateTable[state]; 742 stateProps[state]=0; 743 744 /* find first non-ignorable state */ 745 for(min=0;; ++min) { 746 entry=row[min]; 747 nextState=MBCS_ENTRY_STATE(entry); 748 if(stateProps[nextState]==-1) { 749 getStateProp(stateTable, stateProps, nextState); 750 } 751 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 752 if(stateProps[nextState]>=0) { 753 break; 754 } 755 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 756 break; 757 } 758 if(min==0xff) { 759 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 760 return stateProps[state]; 761 } 762 } 763 stateProps[state]|=(int8_t)((min>>5)<<3); 764 765 /* find last non-ignorable state */ 766 for(max=0xff; min<max; --max) { 767 entry=row[max]; 768 nextState=MBCS_ENTRY_STATE(entry); 769 if(stateProps[nextState]==-1) { 770 getStateProp(stateTable, stateProps, nextState); 771 } 772 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 773 if(stateProps[nextState]>=0) { 774 break; 775 } 776 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 777 break; 778 } 779 } 780 stateProps[state]|=(int8_t)(max>>5); 781 782 /* recurse further and collect direct-state information */ 783 while(min<=max) { 784 entry=row[min]; 785 nextState=MBCS_ENTRY_STATE(entry); 786 if(stateProps[nextState]==-1) { 787 getStateProp(stateTable, stateProps, nextState); 788 } 789 if(MBCS_ENTRY_IS_FINAL(entry)) { 790 stateProps[nextState]|=0x40; 791 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 792 stateProps[state]|=0x40; 793 } 794 } 795 ++min; 796 } 797 return stateProps[state]; 798 } 799 800 /* 801 * Internal function enumerating the toUnicode data of an MBCS converter. 
802 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 803 * table, but could also be used for a future ucnv_getUnicodeSet() option 804 * that includes reverse fallbacks (after updating this function's implementation). 805 * Currently only handles roundtrip mappings. 806 * Does not currently handle extensions. 807 */ 808 static void 809 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 810 UConverterEnumToUCallback *callback, const void *context, 811 UErrorCode *pErrorCode) { 812 /* 813 * Properties for each state, to speed up the enumeration. 814 * Ignorable actions are unassigned/illegal/state-change-only: 815 * They do not lead to mappings. 816 * 817 * Bits 7..6: 818 * 1 direct/initial state (stateful converters have multiple) 819 * 0 non-initial state with transitions or with non-ignorable result actions 820 * -1 final state with only ignorable actions 821 * 822 * Bits 5..3: 823 * The lowest byte value with non-ignorable actions is 824 * value<<5 (rounded down). 825 * 826 * Bits 2..0: 827 * The highest byte value with non-ignorable actions is 828 * (value<<5)&0x1f (rounded up). 
829 */ 830 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 831 int32_t state; 832 833 uprv_memset(stateProps, -1, sizeof(stateProps)); 834 835 /* recurse from state 0 and set all stateProps */ 836 getStateProp(mbcsTable->stateTable, stateProps, 0); 837 838 for(state=0; state<mbcsTable->countStates; ++state) { 839 /*if(stateProps[state]==-1) { 840 printf("unused/unreachable <icu:state> %d\n", state); 841 }*/ 842 if(stateProps[state]>=0x40) { 843 /* start from each direct state */ 844 enumToU( 845 mbcsTable, stateProps, state, 0, 0, 846 callback, context, 847 pErrorCode); 848 } 849 } 850 } 851 852 U_CFUNC void 853 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 854 const USetAdder *sa, 855 UConverterUnicodeSet which, 856 UConverterSetFilter filter, 857 UErrorCode *pErrorCode) { 858 const UConverterMBCSTable *mbcsTable; 859 const uint16_t *table; 860 861 uint32_t st3; 862 uint16_t st1, maxStage1, st2; 863 864 UChar32 c; 865 866 /* enumerate the from-Unicode trie table */ 867 mbcsTable=&sharedData->mbcs; 868 table=mbcsTable->fromUnicodeTable; 869 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 870 maxStage1=0x440; 871 } else { 872 maxStage1=0x40; 873 } 874 875 c=0; /* keep track of the current code point while enumerating */ 876 877 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 878 const uint16_t *stage2, *stage3, *results; 879 uint16_t minValue; 880 881 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 882 883 /* 884 * Set a threshold variable for selecting which mappings to use. 885 * See ucnv_MBCSSingleFromBMPWithOffsets() and 886 * MBCS_SINGLE_RESULT_FROM_U() for details. 
887 */ 888 if(which==UCNV_ROUNDTRIP_SET) { 889 /* use only roundtrips */ 890 minValue=0xf00; 891 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 892 /* use all roundtrip and fallback results */ 893 minValue=0x800; 894 } 895 896 for(st1=0; st1<maxStage1; ++st1) { 897 st2=table[st1]; 898 if(st2>maxStage1) { 899 stage2=table+st2; 900 for(st2=0; st2<64; ++st2) { 901 if((st3=stage2[st2])!=0) { 902 /* read the stage 3 block */ 903 stage3=results+st3; 904 905 do { 906 if(*stage3++>=minValue) { 907 sa->add(sa->set, c); 908 } 909 } while((++c&0xf)!=0); 910 } else { 911 c+=16; /* empty stage 3 block */ 912 } 913 } 914 } else { 915 c+=1024; /* empty stage 2 block */ 916 } 917 } 918 } else { 919 const uint32_t *stage2; 920 const uint8_t *stage3, *bytes; 921 uint32_t st3Multiplier; 922 uint32_t value; 923 UBool useFallback; 924 925 bytes=mbcsTable->fromUnicodeBytes; 926 927 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 928 929 switch(mbcsTable->outputType) { 930 case MBCS_OUTPUT_3: 931 case MBCS_OUTPUT_4_EUC: 932 st3Multiplier=3; 933 break; 934 case MBCS_OUTPUT_4: 935 st3Multiplier=4; 936 break; 937 default: 938 st3Multiplier=2; 939 break; 940 } 941 942 for(st1=0; st1<maxStage1; ++st1) { 943 st2=table[st1]; 944 if(st2>(maxStage1>>1)) { 945 stage2=(const uint32_t *)table+st2; 946 for(st2=0; st2<64; ++st2) { 947 if((st3=stage2[st2])!=0) { 948 /* read the stage 3 block */ 949 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 950 951 /* get the roundtrip flags for the stage 3 block */ 952 st3>>=16; 953 954 /* 955 * Add code points for which the roundtrip flag is set, 956 * or which map to non-zero bytes if we use fallbacks. 957 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
958 */ 959 switch(filter) { 960 case UCNV_SET_FILTER_NONE: 961 do { 962 if(st3&1) { 963 sa->add(sa->set, c); 964 stage3+=st3Multiplier; 965 } else if(useFallback) { 966 uint8_t b=0; 967 switch(st3Multiplier) { 968 case 4: 969 b|=*stage3++; 970 U_FALLTHROUGH; 971 case 3: 972 b|=*stage3++; 973 U_FALLTHROUGH; 974 case 2: 975 b|=stage3[0]|stage3[1]; 976 stage3+=2; 977 U_FALLTHROUGH; 978 default: 979 break; 980 } 981 if(b!=0) { 982 sa->add(sa->set, c); 983 } 984 } 985 st3>>=1; 986 } while((++c&0xf)!=0); 987 break; 988 case UCNV_SET_FILTER_DBCS_ONLY: 989 /* Ignore single-byte results (<0x100). */ 990 do { 991 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 992 sa->add(sa->set, c); 993 } 994 st3>>=1; 995 stage3+=2; /* +=st3Multiplier */ 996 } while((++c&0xf)!=0); 997 break; 998 case UCNV_SET_FILTER_2022_CN: 999 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 1000 do { 1001 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 1002 sa->add(sa->set, c); 1003 } 1004 st3>>=1; 1005 stage3+=3; /* +=st3Multiplier */ 1006 } while((++c&0xf)!=0); 1007 break; 1008 case UCNV_SET_FILTER_SJIS: 1009 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 1010 do { 1011 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 1012 sa->add(sa->set, c); 1013 } 1014 st3>>=1; 1015 stage3+=2; /* +=st3Multiplier */ 1016 } while((++c&0xf)!=0); 1017 break; 1018 case UCNV_SET_FILTER_GR94DBCS: 1019 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ 1020 do { 1021 if( ((st3&1)!=0 || useFallback) && 1022 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 1023 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1024 ) { 1025 sa->add(sa->set, c); 1026 } 1027 st3>>=1; 1028 stage3+=2; /* +=st3Multiplier */ 1029 } while((++c&0xf)!=0); 1030 break; 1031 case UCNV_SET_FILTER_HZ: 1032 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ 1033 do { 1034 if( ((st3&1)!=0 || useFallback) && 1035 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1036 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1037 ) { 1038 sa->add(sa->set, c); 1039 } 1040 st3>>=1; 1041 stage3+=2; /* +=st3Multiplier */ 1042 } while((++c&0xf)!=0); 1043 break; 1044 default: 1045 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1046 return; 1047 } 1048 } else { 1049 c+=16; /* empty stage 3 block */ 1050 } 1051 } 1052 } else { 1053 c+=1024; /* empty stage 2 block */ 1054 } 1055 } 1056 } 1057 1058 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 1059 } 1060 1061 U_CFUNC void 1062 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 1063 const USetAdder *sa, 1064 UConverterUnicodeSet which, 1065 UErrorCode *pErrorCode) { 1066 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 1067 sharedData, sa, which, 1068 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 
1069 UCNV_SET_FILTER_DBCS_ONLY : 1070 UCNV_SET_FILTER_NONE, 1071 pErrorCode); 1072 } 1073 1074 static void U_CALLCONV 1075 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 1076 const USetAdder *sa, 1077 UConverterUnicodeSet which, 1078 UErrorCode *pErrorCode) { 1079 if(cnv->options&_MBCS_OPTION_GB18030) { 1080 sa->addRange(sa->set, 0, 0xd7ff); 1081 sa->addRange(sa->set, 0xe000, 0x10ffff); 1082 } else { 1083 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 1084 } 1085 } 1086 1087 /* conversion extensions for input not in the main table -------------------- */ 1088 1089 /* 1090 * Hardcoded extension handling for GB 18030. 1091 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 1092 * 1093 * In the future, conversion extensions may handle m:n mappings and delta tables, 1094 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 1095 * 1096 * If an input character cannot be mapped, then these functions set an error 1097 * code. The framework will then call the callback function. 
1098 */ 1099 1100 /* 1101 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 1102 * else return 0 after output has been written to the target 1103 */ 1104 static UChar32 1105 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 1106 UChar32 cp, 1107 const UChar **source, const UChar *sourceLimit, 1108 uint8_t **target, const uint8_t *targetLimit, 1109 int32_t **offsets, int32_t sourceIndex, 1110 UBool flush, 1111 UErrorCode *pErrorCode) { 1112 const int32_t *cx; 1113 1114 cnv->useSubChar1=FALSE; 1115 1116 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1117 ucnv_extInitialMatchFromU( 1118 cnv, cx, 1119 cp, source, sourceLimit, 1120 (char **)target, (char *)targetLimit, 1121 offsets, sourceIndex, 1122 flush, 1123 pErrorCode) 1124 ) { 1125 return 0; /* an extension mapping handled the input */ 1126 } 1127 1128 /* GB 18030 */ 1129 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 1130 const uint32_t *range; 1131 int32_t i; 1132 1133 range=gb18030Ranges[0]; 1134 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1135 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 1136 /* found the Unicode code point, output the four-byte sequence for it */ 1137 uint32_t linear; 1138 char bytes[4]; 1139 1140 /* get the linear value of the first GB 18030 code in this range */ 1141 linear=range[2]-LINEAR_18030_BASE; 1142 1143 /* add the offset from the beginning of the range */ 1144 linear+=((uint32_t)cp-range[0]); 1145 1146 /* turn this into a four-byte sequence */ 1147 bytes[3]=(char)(0x30+linear%10); linear/=10; 1148 bytes[2]=(char)(0x81+linear%126); linear/=126; 1149 bytes[1]=(char)(0x30+linear%10); linear/=10; 1150 bytes[0]=(char)(0x81+linear); 1151 1152 /* output this sequence */ 1153 ucnv_fromUWriteBytes(cnv, 1154 bytes, 4, (char **)target, (char *)targetLimit, 1155 offsets, sourceIndex, pErrorCode); 1156 return 0; 1157 } 1158 } 1159 } 1160 1161 /* no mapping */ 1162 *pErrorCode=U_INVALID_CHAR_FOUND; 1163 return cp; 1164 } 1165 1166 /* 1167 * 
Input sequence: cnv->toUBytes[0..length[ 1168 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1169 * else return 0 after output has been written to the target 1170 */ 1171 static int8_t 1172 _extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1173 int8_t length, 1174 const uint8_t **source, const uint8_t *sourceLimit, 1175 UChar **target, const UChar *targetLimit, 1176 int32_t **offsets, int32_t sourceIndex, 1177 UBool flush, 1178 UErrorCode *pErrorCode) { 1179 const int32_t *cx; 1180 1181 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1182 ucnv_extInitialMatchToU( 1183 cnv, cx, 1184 length, (const char **)source, (const char *)sourceLimit, 1185 target, targetLimit, 1186 offsets, sourceIndex, 1187 flush, 1188 pErrorCode) 1189 ) { 1190 return 0; /* an extension mapping handled the input */ 1191 } 1192 1193 /* GB 18030 */ 1194 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1195 const uint32_t *range; 1196 uint32_t linear; 1197 int32_t i; 1198 1199 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1200 range=gb18030Ranges[0]; 1201 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1202 if(range[2]<=linear && linear<=range[3]) { 1203 /* found the sequence, output the Unicode code point for it */ 1204 *pErrorCode=U_ZERO_ERROR; 1205 1206 /* add the linear difference between the input and start sequences to the start code point */ 1207 linear=range[0]+(linear-range[2]); 1208 1209 /* output this code point */ 1210 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1211 1212 return 0; 1213 } 1214 } 1215 } 1216 1217 /* no mapping */ 1218 *pErrorCode=U_INVALID_CHAR_FOUND; 1219 return length; 1220 } 1221 1222 /* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1223 1224 /* 1225 * This code modifies a standard EBCDIC<->Unicode mapping table for 1226 * OS/390 (z/OS) Unix System Services (Open Edition). 
1227 * The difference is in the mapping of Line Feed and New Line control codes: 1228 * Standard EBCDIC maps 1229 * 1230 * <U000A> \x25 |0 1231 * <U0085> \x15 |0 1232 * 1233 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1234 * mapping 1235 * 1236 * <U000A> \x15 |0 1237 * <U0085> \x25 |0 1238 * 1239 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1240 * by copying it into allocated memory and swapping the LF and NL values. 1241 * It allows to support the same EBCDIC charset in both versions without 1242 * duplicating the entire installed table. 1243 */ 1244 1245 /* standard EBCDIC codes */ 1246 #define EBCDIC_LF 0x25 1247 #define EBCDIC_NL 0x15 1248 1249 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1250 #define EBCDIC_RT_LF 0xf25 1251 #define EBCDIC_RT_NL 0xf15 1252 1253 /* Unicode code points */ 1254 #define U_LF 0x0a 1255 #define U_NL 0x85 1256 1257 static UBool 1258 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1259 UConverterMBCSTable *mbcsTable; 1260 1261 const uint16_t *table, *results; 1262 const uint8_t *bytes; 1263 1264 int32_t (*newStateTable)[256]; 1265 uint16_t *newResults; 1266 uint8_t *p; 1267 char *name; 1268 1269 uint32_t stage2Entry; 1270 uint32_t size, sizeofFromUBytes; 1271 1272 mbcsTable=&sharedData->mbcs; 1273 1274 table=mbcsTable->fromUnicodeTable; 1275 bytes=mbcsTable->fromUnicodeBytes; 1276 results=(const uint16_t *)bytes; 1277 1278 /* 1279 * Check that this is an EBCDIC table with SBCS portion - 1280 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1281 * 1282 * If not, ignore the option. Options are always ignored if they do not apply. 
1283 */ 1284 if(!( 1285 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1286 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1287 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1288 )) { 1289 return FALSE; 1290 } 1291 1292 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1293 if(!( 1294 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1295 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1296 )) { 1297 return FALSE; 1298 } 1299 } else /* MBCS_OUTPUT_2_SISO */ { 1300 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1301 if(!( 1302 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1303 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1304 )) { 1305 return FALSE; 1306 } 1307 1308 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1309 if(!( 1310 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1311 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1312 )) { 1313 return FALSE; 1314 } 1315 } 1316 1317 if(mbcsTable->fromUBytesLength>0) { 1318 /* 1319 * We _know_ the number of bytes in the fromUnicodeBytes array 1320 * starting with header.version 4.1. 1321 */ 1322 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1323 } else { 1324 /* 1325 * Otherwise: 1326 * There used to be code to enumerate the fromUnicode 1327 * trie and find the highest entry, but it was removed in ICU 3.2 1328 * because it was not tested and caused a low code coverage number. 1329 * See Jitterbug 3674. 1330 * This affects only some .cnv file formats with a header.version 1331 * below 4.1, and only when swaplfnl is requested. 1332 * 1333 * ucnvmbcs.c revision 1.99 is the last one with the 1334 * ucnv_MBCSSizeofFromUBytes() function. 1335 */ 1336 *pErrorCode=U_INVALID_FORMAT_ERROR; 1337 return FALSE; 1338 } 1339 1340 /* 1341 * The table has an appropriate format. 
1342 * Allocate and build 1343 * - a modified to-Unicode state table 1344 * - a modified from-Unicode output array 1345 * - a converter name string with the swap option appended 1346 */ 1347 size= 1348 mbcsTable->countStates*1024+ 1349 sizeofFromUBytes+ 1350 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1351 p=(uint8_t *)uprv_malloc(size); 1352 if(p==NULL) { 1353 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1354 return FALSE; 1355 } 1356 1357 /* copy and modify the to-Unicode state table */ 1358 newStateTable=(int32_t (*)[256])p; 1359 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1360 1361 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1362 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1363 1364 /* copy and modify the from-Unicode result table */ 1365 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1366 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1367 1368 /* conveniently, the table access macros work on the left side of expressions */ 1369 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1370 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1371 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1372 } else /* MBCS_OUTPUT_2_SISO */ { 1373 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1374 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1375 1376 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1377 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1378 } 1379 1380 /* set the canonical converter name */ 1381 name=(char *)newResults+sizeofFromUBytes; 1382 uprv_strcpy(name, sharedData->staticData->name); 1383 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1384 1385 /* set the pointers */ 1386 umtx_lock(NULL); 1387 if(mbcsTable->swapLFNLStateTable==NULL) { 1388 mbcsTable->swapLFNLStateTable=newStateTable; 1389 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1390 
mbcsTable->swapLFNLName=name; 1391 1392 newStateTable=NULL; 1393 } 1394 umtx_unlock(NULL); 1395 1396 /* release the allocated memory if another thread beat us to it */ 1397 if(newStateTable!=NULL) { 1398 uprv_free(newStateTable); 1399 } 1400 return TRUE; 1401 } 1402 1403 /* reconstitute omitted fromUnicode data ------------------------------------ */ 1404 1405 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1406 static UBool U_CALLCONV 1407 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1408 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1409 const uint16_t *table; 1410 uint32_t *stage2; 1411 uint8_t *bytes, *p; 1412 UChar32 c; 1413 int32_t i, st3; 1414 1415 table=mbcsTable->fromUnicodeTable; 1416 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1417 1418 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1419 switch(mbcsTable->outputType) { 1420 case MBCS_OUTPUT_3_EUC: 1421 if(value<=0xffff) { 1422 /* short sequences are stored directly */ 1423 /* code set 0 or 1 */ 1424 } else if(value<=0x8effff) { 1425 /* code set 2 */ 1426 value&=0x7fff; 1427 } else /* first byte is 0x8f */ { 1428 /* code set 3 */ 1429 value&=0xff7f; 1430 } 1431 break; 1432 case MBCS_OUTPUT_4_EUC: 1433 if(value<=0xffffff) { 1434 /* short sequences are stored directly */ 1435 /* code set 0 or 1 */ 1436 } else if(value<=0x8effffff) { 1437 /* code set 2 */ 1438 value&=0x7fffff; 1439 } else /* first byte is 0x8f */ { 1440 /* code set 3 */ 1441 value&=0xff7fff; 1442 } 1443 break; 1444 default: 1445 break; 1446 } 1447 1448 for(i=0; i<=0x1f; ++value, ++i) { 1449 c=codePoints[i]; 1450 if(c<0) { 1451 continue; 1452 } 1453 1454 /* locate the stage 2 & 3 data */ 1455 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1456 p=bytes; 1457 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1458 1459 /* write the codepage bytes into stage 3 */ 1460 switch(mbcsTable->outputType) { 1461 case MBCS_OUTPUT_3: 
1462 case MBCS_OUTPUT_4_EUC: 1463 p+=st3*3; 1464 p[0]=(uint8_t)(value>>16); 1465 p[1]=(uint8_t)(value>>8); 1466 p[2]=(uint8_t)value; 1467 break; 1468 case MBCS_OUTPUT_4: 1469 ((uint32_t *)p)[st3]=value; 1470 break; 1471 default: 1472 /* 2 bytes per character */ 1473 ((uint16_t *)p)[st3]=(uint16_t)value; 1474 break; 1475 } 1476 1477 /* set the roundtrip flag */ 1478 *stage2|=(1UL<<(16+(c&0xf))); 1479 } 1480 return TRUE; 1481 } 1482 1483 static void 1484 reconstituteData(UConverterMBCSTable *mbcsTable, 1485 uint32_t stage1Length, uint32_t stage2Length, 1486 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1487 UErrorCode *pErrorCode) { 1488 uint16_t *stage1; 1489 uint32_t *stage2; 1490 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1491 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1492 if(mbcsTable->reconstitutedData==NULL) { 1493 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1494 return; 1495 } 1496 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1497 1498 /* copy existing data and reroute the pointers */ 1499 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1500 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1501 1502 stage2=(uint32_t *)(stage1+stage1Length); 1503 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1504 mbcsTable->fromUnicodeTable+stage1Length, 1505 stage2Length*4); 1506 1507 mbcsTable->fromUnicodeTable=stage1; 1508 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length); 1509 1510 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1511 stage2=(uint32_t *)stage1; 1512 1513 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1514 { 1515 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1516 int32_t stageUTF8Index=0; 1517 int32_t st1, st2, st3, i; 1518 1519 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1520 st2=stage1[st1]; 1521 if(st2!=(int32_t)stage1Length/2) { 1522 /* each stage 2 
block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1523 for(i=0; i<16; ++i) { 1524 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1525 if(st3!=0) { 1526 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1527 st3>>=4; 1528 /* 1529 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1530 * allocated together as a single 64-block for access from the mbcsIndex 1531 */ 1532 stage2[st2++]=st3++; 1533 stage2[st2++]=st3++; 1534 stage2[st2++]=st3++; 1535 stage2[st2++]=st3; 1536 } else { 1537 /* no stage 3 block, skip */ 1538 st2+=4; 1539 } 1540 } 1541 } else { 1542 /* no stage 2 block, skip */ 1543 stageUTF8Index+=16; 1544 } 1545 } 1546 } 1547 1548 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1549 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1550 } 1551 1552 /* MBCS setup functions ----------------------------------------------------- */ 1553 1554 static void U_CALLCONV 1555 ucnv_MBCSLoad(UConverterSharedData *sharedData, 1556 UConverterLoadArgs *pArgs, 1557 const uint8_t *raw, 1558 UErrorCode *pErrorCode) { 1559 UDataInfo info; 1560 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1561 _MBCSHeader *header=(_MBCSHeader *)raw; 1562 uint32_t offset; 1563 uint32_t headerLength; 1564 UBool noFromU=FALSE; 1565 1566 if(header->version[0]==4) { 1567 headerLength=MBCS_HEADER_V4_LENGTH; 1568 } else if(header->version[0]==5 && header->version[1]>=3 && 1569 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1570 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1571 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1572 } else { 1573 *pErrorCode=U_INVALID_TABLE_FORMAT; 1574 return; 1575 } 1576 1577 mbcsTable->outputType=(uint8_t)header->flags; 1578 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1579 *pErrorCode=U_INVALID_TABLE_FORMAT; 1580 return; 1581 } 1582 1583 /* extension data, header version 4.2 and higher */ 1584 
offset=header->flags>>8; 1585 if(offset!=0) { 1586 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1587 } 1588 1589 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1590 UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER; 1591 UConverterSharedData *baseSharedData; 1592 const int32_t *extIndexes; 1593 const char *baseName; 1594 1595 /* extension-only file, load the base table and set values appropriately */ 1596 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1597 /* extension-only file without extension */ 1598 *pErrorCode=U_INVALID_TABLE_FORMAT; 1599 return; 1600 } 1601 1602 if(pArgs->nestedLoads!=1) { 1603 /* an extension table must not be loaded as a base table */ 1604 *pErrorCode=U_INVALID_TABLE_FILE; 1605 return; 1606 } 1607 1608 /* load the base table */ 1609 baseName=(const char *)header+headerLength*4; 1610 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1611 /* forbid loading this same extension-only file */ 1612 *pErrorCode=U_INVALID_TABLE_FORMAT; 1613 return; 1614 } 1615 1616 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1617 args.size=sizeof(UConverterLoadArgs); 1618 args.nestedLoads=2; 1619 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1620 args.reserved=pArgs->reserved; 1621 args.options=pArgs->options; 1622 args.pkg=pArgs->pkg; 1623 args.name=baseName; 1624 baseSharedData=ucnv_load(&args, pErrorCode); 1625 if(U_FAILURE(*pErrorCode)) { 1626 return; 1627 } 1628 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1629 baseSharedData->mbcs.baseSharedData!=NULL 1630 ) { 1631 ucnv_unload(baseSharedData); 1632 *pErrorCode=U_INVALID_TABLE_FORMAT; 1633 return; 1634 } 1635 if(pArgs->onlyTestIsLoadable) { 1636 /* 1637 * Exit as soon as we know that we can load the converter 1638 * and the format is valid and supported. 1639 * The worst that can happen in the following code is a memory 1640 * allocation error. 
1641 */ 1642 ucnv_unload(baseSharedData); 1643 return; 1644 } 1645 1646 /* copy the base table data */ 1647 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1648 1649 /* overwrite values with relevant ones for the extension converter */ 1650 mbcsTable->baseSharedData=baseSharedData; 1651 mbcsTable->extIndexes=extIndexes; 1652 1653 /* 1654 * It would be possible to share the swapLFNL data with a base converter, 1655 * but the generated name would have to be different, and the memory 1656 * would have to be free'd only once. 1657 * It is easier to just create the data for the extension converter 1658 * separately when it is requested. 1659 */ 1660 mbcsTable->swapLFNLStateTable=NULL; 1661 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1662 mbcsTable->swapLFNLName=NULL; 1663 1664 /* 1665 * The reconstitutedData must be deleted only when the base converter 1666 * is unloaded. 1667 */ 1668 mbcsTable->reconstitutedData=NULL; 1669 1670 /* 1671 * Set a special, runtime-only outputType if the extension converter 1672 * is a DBCS version of a base converter that also maps single bytes. 
1673 */ 1674 if( sharedData->staticData->conversionType==UCNV_DBCS || 1675 (sharedData->staticData->conversionType==UCNV_MBCS && 1676 sharedData->staticData->minBytesPerChar>=2) 1677 ) { 1678 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1679 /* the base converter is SI/SO-stateful */ 1680 int32_t entry; 1681 1682 /* get the dbcs state from the state table entry for SO=0x0e */ 1683 entry=mbcsTable->stateTable[0][0xe]; 1684 if( MBCS_ENTRY_IS_FINAL(entry) && 1685 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1686 MBCS_ENTRY_FINAL_STATE(entry)!=0 1687 ) { 1688 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1689 1690 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1691 } 1692 } else if( 1693 baseSharedData->staticData->conversionType==UCNV_MBCS && 1694 baseSharedData->staticData->minBytesPerChar==1 && 1695 baseSharedData->staticData->maxBytesPerChar==2 && 1696 mbcsTable->countStates<=127 1697 ) { 1698 /* non-stateful base converter, need to modify the state table */ 1699 int32_t (*newStateTable)[256]; 1700 int32_t *state; 1701 int32_t i, count; 1702 1703 /* allocate a new state table and copy the base state table contents */ 1704 count=mbcsTable->countStates; 1705 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1706 if(newStateTable==NULL) { 1707 ucnv_unload(baseSharedData); 1708 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1709 return; 1710 } 1711 1712 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1713 1714 /* change all final single-byte entries to go to a new all-illegal state */ 1715 state=newStateTable[0]; 1716 for(i=0; i<256; ++i) { 1717 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1718 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1719 } 1720 } 1721 1722 /* build the new all-illegal state */ 1723 state=newStateTable[count]; 1724 for(i=0; i<256; ++i) { 1725 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1726 } 1727 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1728 
mbcsTable->countStates=(uint8_t)(count+1); 1729 mbcsTable->stateTableOwned=TRUE; 1730 1731 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1732 } 1733 } 1734 1735 /* 1736 * unlike below for files with base tables, do not get the unicodeMask 1737 * from the sharedData; instead, use the base table's unicodeMask, 1738 * which we copied in the memcpy above; 1739 * this is necessary because the static data unicodeMask, especially 1740 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1741 */ 1742 } else { 1743 /* conversion file with a base table; an additional extension table is optional */ 1744 /* make sure that the output type is known */ 1745 switch(mbcsTable->outputType) { 1746 case MBCS_OUTPUT_1: 1747 case MBCS_OUTPUT_2: 1748 case MBCS_OUTPUT_3: 1749 case MBCS_OUTPUT_4: 1750 case MBCS_OUTPUT_3_EUC: 1751 case MBCS_OUTPUT_4_EUC: 1752 case MBCS_OUTPUT_2_SISO: 1753 /* OK */ 1754 break; 1755 default: 1756 *pErrorCode=U_INVALID_TABLE_FORMAT; 1757 return; 1758 } 1759 if(pArgs->onlyTestIsLoadable) { 1760 /* 1761 * Exit as soon as we know that we can load the converter 1762 * and the format is valid and supported. 1763 * The worst that can happen in the following code is a memory 1764 * allocation error. 
1765 */ 1766 return; 1767 } 1768 1769 mbcsTable->countStates=(uint8_t)header->countStates; 1770 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1771 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1772 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1773 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1774 1775 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1776 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1777 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1778 1779 /* 1780 * converter versions 6.1 and up contain a unicodeMask that is 1781 * used here to select the most efficient function implementations 1782 */ 1783 info.size=sizeof(UDataInfo); 1784 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1785 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1786 /* mask off possible future extensions to be safe */ 1787 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1788 } else { 1789 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1790 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1791 } 1792 1793 /* 1794 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1795 * Check for the header version, SBCS vs. MBCS, and for whether the 1796 * data structures are optimized for code points as high as what the 1797 * runtime code is designed for. 1798 * The implementation does not handle mapping tables with entries for 1799 * unpaired surrogates. 1800 */ 1801 if( header->version[1]>=3 && 1802 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1803 (mbcsTable->countStates==1 ? 
1804 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1805 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1806 ) 1807 ) { 1808 mbcsTable->utf8Friendly=TRUE; 1809 1810 if(mbcsTable->countStates==1) { 1811 /* 1812 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1813 * Build a table with indexes to each block, to be used instead of 1814 * the regular stage 1/2 table. 1815 */ 1816 int32_t i; 1817 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1818 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1819 } 1820 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1821 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1822 } else { 1823 /* 1824 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1825 * The .cnv file is prebuilt with an additional stage table with indexes 1826 * to each block. 1827 */ 1828 mbcsTable->mbcsIndex=(const uint16_t *) 1829 (mbcsTable->fromUnicodeBytes+ 1830 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1831 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1832 } 1833 } 1834 1835 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1836 { 1837 uint32_t asciiRoundtrips=0xffffffff; 1838 int32_t i; 1839 1840 for(i=0; i<0x80; ++i) { 1841 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1842 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1843 } 1844 } 1845 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1846 } 1847 1848 if(noFromU) { 1849 uint32_t stage1Length= 1850 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
1851 0x440 : 0x40; 1852 uint32_t stage2Length= 1853 (header->offsetFromUBytes-header->offsetFromUTable)/4- 1854 stage1Length/2; 1855 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1856 } 1857 } 1858 1859 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1860 if(mbcsTable->utf8Friendly) { 1861 if(mbcsTable->countStates==1) { 1862 sharedData->impl=&_SBCSUTF8Impl; 1863 } else { 1864 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1865 sharedData->impl=&_DBCSUTF8Impl; 1866 } 1867 } 1868 } 1869 1870 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1871 /* 1872 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1873 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 1874 */ 1875 mbcsTable->asciiRoundtrips=0; 1876 } 1877 } 1878 1879 static void U_CALLCONV 1880 ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1881 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1882 1883 if(mbcsTable->swapLFNLStateTable!=NULL) { 1884 uprv_free(mbcsTable->swapLFNLStateTable); 1885 } 1886 if(mbcsTable->stateTableOwned) { 1887 uprv_free((void *)mbcsTable->stateTable); 1888 } 1889 if(mbcsTable->baseSharedData!=NULL) { 1890 ucnv_unload(mbcsTable->baseSharedData); 1891 } 1892 if(mbcsTable->reconstitutedData!=NULL) { 1893 uprv_free(mbcsTable->reconstitutedData); 1894 } 1895 } 1896 1897 static void U_CALLCONV 1898 ucnv_MBCSOpen(UConverter *cnv, 1899 UConverterLoadArgs *pArgs, 1900 UErrorCode *pErrorCode) { 1901 UConverterMBCSTable *mbcsTable; 1902 const int32_t *extIndexes; 1903 uint8_t outputType; 1904 int8_t maxBytesPerUChar; 1905 1906 if(pArgs->onlyTestIsLoadable) { 1907 return; 1908 } 1909 1910 mbcsTable=&cnv->sharedData->mbcs; 1911 outputType=mbcsTable->outputType; 1912 1913 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1914 /* the swaplfnl option does not apply, remove it */ 1915 
cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1916 } 1917 1918 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1919 /* do this because double-checked locking is broken */ 1920 UBool isCached; 1921 1922 umtx_lock(NULL); 1923 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1924 umtx_unlock(NULL); 1925 1926 if(!isCached) { 1927 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1928 if(U_FAILURE(*pErrorCode)) { 1929 return; /* something went wrong */ 1930 } 1931 1932 /* the option does not apply, remove it */ 1933 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1934 } 1935 } 1936 } 1937 1938 if(uprv_strstr(pArgs->name, "18030")!=NULL) { 1939 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1940 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1941 cnv->options|=_MBCS_OPTION_GB18030; 1942 } 1943 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) { 1944 /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1945 cnv->options|=_MBCS_OPTION_KEIS; 1946 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) { 1947 /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1948 cnv->options|=_MBCS_OPTION_JEF; 1949 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) { 1950 /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1951 cnv->options|=_MBCS_OPTION_JIPS; 1952 } 1953 1954 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1955 if(outputType==MBCS_OUTPUT_2_SISO) { 1956 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1957 } 1958 1959 extIndexes=mbcsTable->extIndexes; 1960 if(extIndexes!=NULL) { 1961 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1962 if(outputType==MBCS_OUTPUT_2_SISO) { 1963 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1964 } 1965 1966 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1967 cnv->maxBytesPerUChar=maxBytesPerUChar; 1968 } 1969 } 1970 1971 #if 0 1972 /* 1973 * documentation of UConverter fields used for status 1974 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1975 */ 1976 1977 /* toUnicode */ 1978 cnv->toUnicodeStatus=0; /* offset */ 1979 cnv->mode=0; /* state */ 1980 cnv->toULength=0; /* byteIndex */ 1981 1982 /* fromUnicode */ 1983 cnv->fromUChar32=0; 1984 cnv->fromUnicodeStatus=1; /* prevLength */ 1985 #endif 1986 } 1987 1988 U_CDECL_BEGIN 1989 1990 static const char* U_CALLCONV 1991 ucnv_MBCSGetName(const UConverter *cnv) { 1992 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1993 return cnv->sharedData->mbcs.swapLFNLName; 1994 } else { 1995 return cnv->sharedData->staticData->name; 1996 } 1997 } 1998 U_CDECL_END 1999 2000 2001 /* MBCS-to-Unicode conversion functions ------------------------------------- */ 2002 2003 static UChar32 U_CALLCONV 2004 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 2005 const _MBCSToUFallback *toUFallbacks; 2006 uint32_t i, start, limit; 2007 2008 limit=mbcsTable->countToUFallbacks; 2009 if(limit>0) { 2010 /* do a binary search for the fallback mapping */ 2011 toUFallbacks=mbcsTable->toUFallbacks; 2012 start=0; 2013 while(start<limit-1) { 2014 i=(start+limit)/2; 2015 if(offset<toUFallbacks[i].offset) { 2016 limit=i; 2017 } else { 2018 start=i; 2019 } 2020 } 2021 2022 /* did we really find it? 
*/ 2023 if(offset==toUFallbacks[start].offset) { 2024 return toUFallbacks[start].codePoint; 2025 } 2026 } 2027 2028 return 0xfffe; 2029 } 2030 2031 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ 2032 static void 2033 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2034 UErrorCode *pErrorCode) { 2035 UConverter *cnv; 2036 const uint8_t *source, *sourceLimit; 2037 UChar *target; 2038 const UChar *targetLimit; 2039 int32_t *offsets; 2040 2041 const int32_t (*stateTable)[256]; 2042 2043 int32_t sourceIndex; 2044 2045 int32_t entry; 2046 UChar c; 2047 uint8_t action; 2048 2049 /* set up the local pointers */ 2050 cnv=pArgs->converter; 2051 source=(const uint8_t *)pArgs->source; 2052 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2053 target=pArgs->target; 2054 targetLimit=pArgs->targetLimit; 2055 offsets=pArgs->offsets; 2056 2057 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2058 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2059 } else { 2060 stateTable=cnv->sharedData->mbcs.stateTable; 2061 } 2062 2063 /* sourceIndex=-1 if the current character began in the previous buffer */ 2064 sourceIndex=0; 2065 2066 /* conversion loop */ 2067 while(source<sourceLimit) { 2068 /* 2069 * This following test is to see if available input would overflow the output. 2070 * It does not catch output of more than one code unit that 2071 * overflows as a result of a surrogate pair or callback output 2072 * from the last source byte. 2073 * Therefore, those situations also test for overflows and will 2074 * then break the loop, too. 
2075 */ 2076 if(target>=targetLimit) { 2077 /* target is full */ 2078 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2079 break; 2080 } 2081 2082 entry=stateTable[0][*source++]; 2083 /* MBCS_ENTRY_IS_FINAL(entry) */ 2084 2085 /* test the most common case first */ 2086 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2087 /* output BMP code point */ 2088 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2089 if(offsets!=NULL) { 2090 *offsets++=sourceIndex; 2091 } 2092 2093 /* normal end of action codes: prepare for a new character */ 2094 ++sourceIndex; 2095 continue; 2096 } 2097 2098 /* 2099 * An if-else-if chain provides more reliable performance for 2100 * the most common cases compared to a switch. 2101 */ 2102 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2103 if(action==MBCS_STATE_VALID_DIRECT_20 || 2104 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2105 ) { 2106 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2107 /* output surrogate pair */ 2108 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2109 if(offsets!=NULL) { 2110 *offsets++=sourceIndex; 2111 } 2112 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2113 if(target<targetLimit) { 2114 *target++=c; 2115 if(offsets!=NULL) { 2116 *offsets++=sourceIndex; 2117 } 2118 } else { 2119 /* target overflow */ 2120 cnv->UCharErrorBuffer[0]=c; 2121 cnv->UCharErrorBufferLength=1; 2122 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2123 break; 2124 } 2125 2126 ++sourceIndex; 2127 continue; 2128 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2129 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2130 /* output BMP code point */ 2131 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2132 if(offsets!=NULL) { 2133 *offsets++=sourceIndex; 2134 } 2135 2136 ++sourceIndex; 2137 continue; 2138 } 2139 } else if(action==MBCS_STATE_UNASSIGNED) { 2140 /* just fall through */ 2141 } else if(action==MBCS_STATE_ILLEGAL) { 2142 /* callback(illegal) */ 2143 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2144 } else { 2145 /* reserved, must never occur */ 2146 
++sourceIndex; 2147 continue; 2148 } 2149 2150 if(U_FAILURE(*pErrorCode)) { 2151 /* callback(illegal) */ 2152 break; 2153 } else /* unassigned sequences indicated with byteIndex>0 */ { 2154 /* try an extension mapping */ 2155 pArgs->source=(const char *)source; 2156 cnv->toUBytes[0]=*(source-1); 2157 cnv->toULength=_extToU(cnv, cnv->sharedData, 2158 1, &source, sourceLimit, 2159 &target, targetLimit, 2160 &offsets, sourceIndex, 2161 pArgs->flush, 2162 pErrorCode); 2163 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 2164 2165 if(U_FAILURE(*pErrorCode)) { 2166 /* not mappable or buffer overflow */ 2167 break; 2168 } 2169 } 2170 } 2171 2172 /* write back the updated pointers */ 2173 pArgs->source=(const char *)source; 2174 pArgs->target=target; 2175 pArgs->offsets=offsets; 2176 } 2177 2178 /* 2179 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2180 * that only map to and from the BMP. 2181 * In addition to single-byte optimizations, the offset calculations 2182 * become much easier. 
2183 */ 2184 static void 2185 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2186 UErrorCode *pErrorCode) { 2187 UConverter *cnv; 2188 const uint8_t *source, *sourceLimit, *lastSource; 2189 UChar *target; 2190 int32_t targetCapacity, length; 2191 int32_t *offsets; 2192 2193 const int32_t (*stateTable)[256]; 2194 2195 int32_t sourceIndex; 2196 2197 int32_t entry; 2198 uint8_t action; 2199 2200 /* set up the local pointers */ 2201 cnv=pArgs->converter; 2202 source=(const uint8_t *)pArgs->source; 2203 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2204 target=pArgs->target; 2205 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2206 offsets=pArgs->offsets; 2207 2208 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2209 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2210 } else { 2211 stateTable=cnv->sharedData->mbcs.stateTable; 2212 } 2213 2214 /* sourceIndex=-1 if the current character began in the previous buffer */ 2215 sourceIndex=0; 2216 lastSource=source; 2217 2218 /* 2219 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 2220 * for the minimum of the sourceLength and targetCapacity 2221 */ 2222 length=(int32_t)(sourceLimit-source); 2223 if(length<targetCapacity) { 2224 targetCapacity=length; 2225 } 2226 2227 #if MBCS_UNROLL_SINGLE_TO_BMP 2228 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2229 /* unroll the loop with the most common case */ 2230 unrolled: 2231 if(targetCapacity>=16) { 2232 int32_t count, loops, oredEntries; 2233 2234 loops=count=targetCapacity>>4; 2235 do { 2236 oredEntries=entry=stateTable[0][*source++]; 2237 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2238 oredEntries|=entry=stateTable[0][*source++]; 2239 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2240 oredEntries|=entry=stateTable[0][*source++]; 2241 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2242 oredEntries|=entry=stateTable[0][*source++]; 2243 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2244 oredEntries|=entry=stateTable[0][*source++]; 2245 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2246 oredEntries|=entry=stateTable[0][*source++]; 2247 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2248 oredEntries|=entry=stateTable[0][*source++]; 2249 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2250 oredEntries|=entry=stateTable[0][*source++]; 2251 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2252 oredEntries|=entry=stateTable[0][*source++]; 2253 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2254 oredEntries|=entry=stateTable[0][*source++]; 2255 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2256 oredEntries|=entry=stateTable[0][*source++]; 2257 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2258 oredEntries|=entry=stateTable[0][*source++]; 2259 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2260 oredEntries|=entry=stateTable[0][*source++]; 2261 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2262 oredEntries|=entry=stateTable[0][*source++]; 2263 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2264 oredEntries|=entry=stateTable[0][*source++]; 2265 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2266 oredEntries|=entry=stateTable[0][*source++]; 2267 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2268 2269 /* were all 16 entries really valid? 
*/ 2270 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2271 /* no, return to the first of these 16 */ 2272 source-=16; 2273 target-=16; 2274 break; 2275 } 2276 } while(--count>0); 2277 count=loops-count; 2278 targetCapacity-=16*count; 2279 2280 if(offsets!=NULL) { 2281 lastSource+=16*count; 2282 while(count>0) { 2283 *offsets++=sourceIndex++; 2284 *offsets++=sourceIndex++; 2285 *offsets++=sourceIndex++; 2286 *offsets++=sourceIndex++; 2287 *offsets++=sourceIndex++; 2288 *offsets++=sourceIndex++; 2289 *offsets++=sourceIndex++; 2290 *offsets++=sourceIndex++; 2291 *offsets++=sourceIndex++; 2292 *offsets++=sourceIndex++; 2293 *offsets++=sourceIndex++; 2294 *offsets++=sourceIndex++; 2295 *offsets++=sourceIndex++; 2296 *offsets++=sourceIndex++; 2297 *offsets++=sourceIndex++; 2298 *offsets++=sourceIndex++; 2299 --count; 2300 } 2301 } 2302 } 2303 #endif 2304 2305 /* conversion loop */ 2306 while(targetCapacity > 0 && source < sourceLimit) { 2307 entry=stateTable[0][*source++]; 2308 /* MBCS_ENTRY_IS_FINAL(entry) */ 2309 2310 /* test the most common case first */ 2311 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2312 /* output BMP code point */ 2313 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2314 --targetCapacity; 2315 continue; 2316 } 2317 2318 /* 2319 * An if-else-if chain provides more reliable performance for 2320 * the most common cases compared to a switch. 
2321 */ 2322 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2323 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2324 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2325 /* output BMP code point */ 2326 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2327 --targetCapacity; 2328 continue; 2329 } 2330 } else if(action==MBCS_STATE_UNASSIGNED) { 2331 /* just fall through */ 2332 } else if(action==MBCS_STATE_ILLEGAL) { 2333 /* callback(illegal) */ 2334 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2335 } else { 2336 /* reserved, must never occur */ 2337 continue; 2338 } 2339 2340 /* set offsets since the start or the last extension */ 2341 if(offsets!=NULL) { 2342 int32_t count=(int32_t)(source-lastSource); 2343 2344 /* predecrement: do not set the offset for the callback-causing character */ 2345 while(--count>0) { 2346 *offsets++=sourceIndex++; 2347 } 2348 /* offset and sourceIndex are now set for the current character */ 2349 } 2350 2351 if(U_FAILURE(*pErrorCode)) { 2352 /* callback(illegal) */ 2353 break; 2354 } else /* unassigned sequences indicated with byteIndex>0 */ { 2355 /* try an extension mapping */ 2356 lastSource=source; 2357 cnv->toUBytes[0]=*(source-1); 2358 cnv->toULength=_extToU(cnv, cnv->sharedData, 2359 1, &source, sourceLimit, 2360 &target, pArgs->targetLimit, 2361 &offsets, sourceIndex, 2362 pArgs->flush, 2363 pErrorCode); 2364 sourceIndex+=1+(int32_t)(source-lastSource); 2365 2366 if(U_FAILURE(*pErrorCode)) { 2367 /* not mappable or buffer overflow */ 2368 break; 2369 } 2370 2371 /* recalculate the targetCapacity after an extension mapping */ 2372 targetCapacity=(int32_t)(pArgs->targetLimit-target); 2373 length=(int32_t)(sourceLimit-source); 2374 if(length<targetCapacity) { 2375 targetCapacity=length; 2376 } 2377 } 2378 2379 #if MBCS_UNROLL_SINGLE_TO_BMP 2380 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2381 goto unrolled; 2382 #endif 2383 } 2384 2385 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2386 /* target is full 
*/ 2387 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2388 } 2389 2390 /* set offsets since the start or the last callback */ 2391 if(offsets!=NULL) { 2392 size_t count=source-lastSource; 2393 while(count>0) { 2394 *offsets++=sourceIndex++; 2395 --count; 2396 } 2397 } 2398 2399 /* write back the updated pointers */ 2400 pArgs->source=(const char *)source; 2401 pArgs->target=target; 2402 pArgs->offsets=offsets; 2403 } 2404 2405 static UBool 2406 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2407 const int32_t *row=stateTable[state]; 2408 int32_t b, entry; 2409 /* First test for final entries in this state for some commonly valid byte values. */ 2410 entry=row[0xa1]; 2411 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2412 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2413 ) { 2414 return TRUE; 2415 } 2416 entry=row[0x41]; 2417 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2418 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2419 ) { 2420 return TRUE; 2421 } 2422 /* Then test for final entries in this state. */ 2423 for(b=0; b<=0xff; ++b) { 2424 entry=row[b]; 2425 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2426 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2427 ) { 2428 return TRUE; 2429 } 2430 } 2431 /* Then recurse for transition entries. */ 2432 for(b=0; b<=0xff; ++b) { 2433 entry=row[b]; 2434 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2435 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 2436 ) { 2437 return TRUE; 2438 } 2439 } 2440 return FALSE; 2441 } 2442 2443 /* 2444 * Is byte b a single/lead byte in this state? 2445 * Recurse for transition states, because here we don't want to say that 2446 * b is a lead byte if all byte sequences that start with b are illegal. 
2447 */ 2448 static UBool 2449 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2450 const int32_t *row=stateTable[state]; 2451 int32_t entry=row[b]; 2452 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2453 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 2454 } else { 2455 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2456 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2457 return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2458 } else { 2459 return action!=MBCS_STATE_ILLEGAL; 2460 } 2461 } 2462 } 2463 2464 U_CFUNC void 2465 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2466 UErrorCode *pErrorCode) { 2467 UConverter *cnv; 2468 const uint8_t *source, *sourceLimit; 2469 UChar *target; 2470 const UChar *targetLimit; 2471 int32_t *offsets; 2472 2473 const int32_t (*stateTable)[256]; 2474 const uint16_t *unicodeCodeUnits; 2475 2476 uint32_t offset; 2477 uint8_t state; 2478 int8_t byteIndex; 2479 uint8_t *bytes; 2480 2481 int32_t sourceIndex, nextSourceIndex; 2482 2483 int32_t entry; 2484 UChar c; 2485 uint8_t action; 2486 2487 /* use optimized function if possible */ 2488 cnv=pArgs->converter; 2489 2490 if(cnv->preToULength>0) { 2491 /* 2492 * pass sourceIndex=-1 because we continue from an earlier buffer 2493 * in the future, this may change with continuous offsets 2494 */ 2495 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2496 2497 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2498 return; 2499 } 2500 } 2501 2502 if(cnv->sharedData->mbcs.countStates==1) { 2503 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2504 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2505 } else { 2506 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2507 } 2508 return; 2509 } 2510 2511 /* set up the local pointers */ 2512 source=(const uint8_t *)pArgs->source; 2513 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2514 
target=pArgs->target; 2515 targetLimit=pArgs->targetLimit; 2516 offsets=pArgs->offsets; 2517 2518 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2519 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2520 } else { 2521 stateTable=cnv->sharedData->mbcs.stateTable; 2522 } 2523 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2524 2525 /* get the converter state from UConverter */ 2526 offset=cnv->toUnicodeStatus; 2527 byteIndex=cnv->toULength; 2528 bytes=cnv->toUBytes; 2529 2530 /* 2531 * if we are in the SBCS state for a DBCS-only converter, 2532 * then load the DBCS state from the MBCS data 2533 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2534 */ 2535 if((state=(uint8_t)(cnv->mode))==0) { 2536 state=cnv->sharedData->mbcs.dbcsOnlyState; 2537 } 2538 2539 /* sourceIndex=-1 if the current character began in the previous buffer */ 2540 sourceIndex=byteIndex==0 ? 0 : -1; 2541 nextSourceIndex=0; 2542 2543 /* conversion loop */ 2544 while(source<sourceLimit) { 2545 /* 2546 * This following test is to see if available input would overflow the output. 2547 * It does not catch output of more than one code unit that 2548 * overflows as a result of a surrogate pair or callback output 2549 * from the last source byte. 2550 * Therefore, those situations also test for overflows and will 2551 * then break the loop, too. 
2552 */ 2553 if(target>=targetLimit) { 2554 /* target is full */ 2555 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2556 break; 2557 } 2558 2559 if(byteIndex==0) { 2560 /* optimized loop for 1/2-byte input and BMP output */ 2561 if(offsets==NULL) { 2562 do { 2563 entry=stateTable[state][*source]; 2564 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2565 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2566 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2567 2568 ++source; 2569 if( source<sourceLimit && 2570 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2571 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2572 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2573 ) { 2574 ++source; 2575 *target++=c; 2576 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2577 offset=0; 2578 } else { 2579 /* set the state and leave the optimized loop */ 2580 bytes[0]=*(source-1); 2581 byteIndex=1; 2582 break; 2583 } 2584 } else { 2585 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2586 /* output BMP code point */ 2587 ++source; 2588 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2589 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2590 } else { 2591 /* leave the optimized loop */ 2592 break; 2593 } 2594 } 2595 } while(source<sourceLimit && target<targetLimit); 2596 } else /* offsets!=NULL */ { 2597 do { 2598 entry=stateTable[state][*source]; 2599 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2600 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2601 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2602 2603 ++source; 2604 if( source<sourceLimit && 2605 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2606 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2607 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2608 ) { 2609 ++source; 2610 *target++=c; 2611 if(offsets!=NULL) { 2612 *offsets++=sourceIndex; 2613 sourceIndex=(nextSourceIndex+=2); 2614 } 2615 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 2616 offset=0; 2617 } else { 2618 /* set the state and leave the optimized loop */ 2619 ++nextSourceIndex; 2620 bytes[0]=*(source-1); 2621 byteIndex=1; 2622 break; 2623 } 2624 } else { 2625 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2626 /* output BMP code point */ 2627 ++source; 2628 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2629 if(offsets!=NULL) { 2630 *offsets++=sourceIndex; 2631 sourceIndex=++nextSourceIndex; 2632 } 2633 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2634 } else { 2635 /* leave the optimized loop */ 2636 break; 2637 } 2638 } 2639 } while(source<sourceLimit && target<targetLimit); 2640 } 2641 2642 /* 2643 * these tests and break statements could be put inside the loop 2644 * if C had "break outerLoop" like Java 2645 */ 2646 if(source>=sourceLimit) { 2647 break; 2648 } 2649 if(target>=targetLimit) { 2650 /* target is full */ 2651 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2652 break; 2653 } 2654 2655 ++nextSourceIndex; 2656 bytes[byteIndex++]=*source++; 2657 } else /* byteIndex>0 */ { 2658 ++nextSourceIndex; 2659 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2660 } 2661 2662 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2663 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2664 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2665 continue; 2666 } 2667 2668 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2669 cnv->mode=state; 2670 2671 /* set the next state early so that we can reuse the entry variable */ 2672 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2673 2674 /* 2675 * An if-else-if chain provides more reliable performance for 2676 * the most common cases compared to a switch. 
2677 */ 2678 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2679 if(action==MBCS_STATE_VALID_16) { 2680 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2681 c=unicodeCodeUnits[offset]; 2682 if(c<0xfffe) { 2683 /* output BMP code point */ 2684 *target++=c; 2685 if(offsets!=NULL) { 2686 *offsets++=sourceIndex; 2687 } 2688 byteIndex=0; 2689 } else if(c==0xfffe) { 2690 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2691 /* output fallback BMP code point */ 2692 *target++=(UChar)entry; 2693 if(offsets!=NULL) { 2694 *offsets++=sourceIndex; 2695 } 2696 byteIndex=0; 2697 } 2698 } else { 2699 /* callback(illegal) */ 2700 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2701 } 2702 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2703 /* output BMP code point */ 2704 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2705 if(offsets!=NULL) { 2706 *offsets++=sourceIndex; 2707 } 2708 byteIndex=0; 2709 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2710 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2711 c=unicodeCodeUnits[offset++]; 2712 if(c<0xd800) { 2713 /* output BMP code point below 0xd800 */ 2714 *target++=c; 2715 if(offsets!=NULL) { 2716 *offsets++=sourceIndex; 2717 } 2718 byteIndex=0; 2719 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2720 /* output roundtrip or fallback surrogate pair */ 2721 *target++=(UChar)(c&0xdbff); 2722 if(offsets!=NULL) { 2723 *offsets++=sourceIndex; 2724 } 2725 byteIndex=0; 2726 if(target<targetLimit) { 2727 *target++=unicodeCodeUnits[offset]; 2728 if(offsets!=NULL) { 2729 *offsets++=sourceIndex; 2730 } 2731 } else { 2732 /* target overflow */ 2733 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2734 cnv->UCharErrorBufferLength=1; 2735 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2736 2737 offset=0; 2738 break; 2739 } 2740 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2741 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2742 *target++=unicodeCodeUnits[offset]; 2743 if(offsets!=NULL) { 2744 *offsets++=sourceIndex; 2745 } 2746 byteIndex=0; 2747 } else if(c==0xffff) { 2748 /* callback(illegal) */ 2749 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2750 } 2751 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2752 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2753 ) { 2754 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2755 /* output surrogate pair */ 2756 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2757 if(offsets!=NULL) { 2758 *offsets++=sourceIndex; 2759 } 2760 byteIndex=0; 2761 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2762 if(target<targetLimit) { 2763 *target++=c; 2764 if(offsets!=NULL) { 2765 *offsets++=sourceIndex; 2766 } 2767 } else { 2768 /* target overflow */ 2769 cnv->UCharErrorBuffer[0]=c; 2770 cnv->UCharErrorBufferLength=1; 2771 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2772 2773 offset=0; 2774 break; 2775 } 2776 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2777 /* 2778 * This serves as a state change without any output. 2779 * It is useful for reading simple stateful encodings, 2780 * for example using just Shift-In/Shift-Out codes. 2781 * The 21 unused bits may later be used for more sophisticated 2782 * state transitions. 
2783 */ 2784 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2785 byteIndex=0; 2786 } else { 2787 /* SI/SO are illegal for DBCS-only conversion */ 2788 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2789 2790 /* callback(illegal) */ 2791 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2792 } 2793 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2794 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2795 /* output BMP code point */ 2796 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2797 if(offsets!=NULL) { 2798 *offsets++=sourceIndex; 2799 } 2800 byteIndex=0; 2801 } 2802 } else if(action==MBCS_STATE_UNASSIGNED) { 2803 /* just fall through */ 2804 } else if(action==MBCS_STATE_ILLEGAL) { 2805 /* callback(illegal) */ 2806 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2807 } else { 2808 /* reserved, must never occur */ 2809 byteIndex=0; 2810 } 2811 2812 /* end of action codes: prepare for a new character */ 2813 offset=0; 2814 2815 if(byteIndex==0) { 2816 sourceIndex=nextSourceIndex; 2817 } else if(U_FAILURE(*pErrorCode)) { 2818 /* callback(illegal) */ 2819 if(byteIndex>1) { 2820 /* 2821 * Ticket 5691: consistent illegal sequences: 2822 * - We include at least the first byte in the illegal sequence. 2823 * - If any of the non-initial bytes could be the start of a character, 2824 * we stop the illegal sequence before the first one of those. 2825 */ 2826 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 2827 int8_t i; 2828 for(i=1; 2829 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2830 ++i) {} 2831 if(i<byteIndex) { 2832 /* Back out some bytes. */ 2833 int8_t backOutDistance=byteIndex-i; 2834 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2835 byteIndex=i; /* length of reported illegal byte sequence */ 2836 if(backOutDistance<=bytesFromThisBuffer) { 2837 source-=backOutDistance; 2838 } else { 2839 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2840 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2841 /* preToULength is negative! */ 2842 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2843 source=(const uint8_t *)pArgs->source; 2844 } 2845 } 2846 } 2847 break; 2848 } else /* unassigned sequences indicated with byteIndex>0 */ { 2849 /* try an extension mapping */ 2850 pArgs->source=(const char *)source; 2851 byteIndex=_extToU(cnv, cnv->sharedData, 2852 byteIndex, &source, sourceLimit, 2853 &target, targetLimit, 2854 &offsets, sourceIndex, 2855 pArgs->flush, 2856 pErrorCode); 2857 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2858 2859 if(U_FAILURE(*pErrorCode)) { 2860 /* not mappable or buffer overflow */ 2861 break; 2862 } 2863 } 2864 } 2865 2866 /* set the converter state back into UConverter */ 2867 cnv->toUnicodeStatus=offset; 2868 cnv->mode=state; 2869 cnv->toULength=byteIndex; 2870 2871 /* write back the updated pointers */ 2872 pArgs->source=(const char *)source; 2873 pArgs->target=target; 2874 pArgs->offsets=offsets; 2875 } 2876 2877 /* 2878 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2879 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2880 */ 2881 static UChar32 2882 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2883 UErrorCode *pErrorCode) { 2884 UConverter *cnv; 2885 const int32_t (*stateTable)[256]; 2886 const uint8_t *source, *sourceLimit; 2887 2888 int32_t entry; 2889 uint8_t action; 2890 2891 /* set up the local pointers */ 2892 cnv=pArgs->converter; 2893 source=(const uint8_t *)pArgs->source; 2894 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2895 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2896 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2897 } else { 2898 stateTable=cnv->sharedData->mbcs.stateTable; 2899 } 2900 2901 /* conversion loop */ 2902 while(source<sourceLimit) { 2903 entry=stateTable[0][*source++]; 2904 /* MBCS_ENTRY_IS_FINAL(entry) */ 2905 2906 /* write back the updated pointer early so that we can return directly */ 2907 pArgs->source=(const char *)source; 2908 2909 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2910 /* output BMP code point */ 2911 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2912 } 2913 2914 /* 2915 * An if-else-if chain provides more reliable performance for 2916 * the most common cases compared to a switch. 
2917 */ 2918 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2919 if( action==MBCS_STATE_VALID_DIRECT_20 || 2920 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2921 ) { 2922 /* output supplementary code point */ 2923 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2924 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2925 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2926 /* output BMP code point */ 2927 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2928 } 2929 } else if(action==MBCS_STATE_UNASSIGNED) { 2930 /* just fall through */ 2931 } else if(action==MBCS_STATE_ILLEGAL) { 2932 /* callback(illegal) */ 2933 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2934 } else { 2935 /* reserved, must never occur */ 2936 continue; 2937 } 2938 2939 if(U_FAILURE(*pErrorCode)) { 2940 /* callback(illegal) */ 2941 break; 2942 } else /* unassigned sequence */ { 2943 /* defer to the generic implementation */ 2944 pArgs->source=(const char *)source-1; 2945 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2946 } 2947 } 2948 2949 /* no output because of empty input or only state changes */ 2950 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2951 return 0xffff; 2952 } 2953 2954 /* 2955 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2956 * conversion without offset handling. 2957 * 2958 * When a character does not have a mapping to Unicode, then we return to the 2959 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2960 * handling. 2961 * We also defer to the generic code in other complicated cases and have them 2962 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2963 * 2964 * All normal mappings and errors are handled here. 
2965 */ 2966 static UChar32 U_CALLCONV 2967 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2968 UErrorCode *pErrorCode) { 2969 UConverter *cnv; 2970 const uint8_t *source, *sourceLimit, *lastSource; 2971 2972 const int32_t (*stateTable)[256]; 2973 const uint16_t *unicodeCodeUnits; 2974 2975 uint32_t offset; 2976 uint8_t state; 2977 2978 int32_t entry; 2979 UChar32 c; 2980 uint8_t action; 2981 2982 /* use optimized function if possible */ 2983 cnv=pArgs->converter; 2984 2985 if(cnv->preToULength>0) { 2986 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2987 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2988 } 2989 2990 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2991 /* 2992 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2993 * with the rare case of a codepage that maps single surrogates 2994 * without adding the complexity to this already complicated function here. 2995 */ 2996 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2997 } else if(cnv->sharedData->mbcs.countStates==1) { 2998 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2999 } 3000 3001 /* set up the local pointers */ 3002 source=lastSource=(const uint8_t *)pArgs->source; 3003 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 3004 3005 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3006 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 3007 } else { 3008 stateTable=cnv->sharedData->mbcs.stateTable; 3009 } 3010 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 3011 3012 /* get the converter state from UConverter */ 3013 offset=cnv->toUnicodeStatus; 3014 3015 /* 3016 * if we are in the SBCS state for a DBCS-only converter, 3017 * then load the DBCS state from the MBCS data 3018 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 3019 */ 3020 if((state=(uint8_t)(cnv->mode))==0) { 3021 state=cnv->sharedData->mbcs.dbcsOnlyState; 3022 } 3023 3024 /* conversion loop */ 3025 c=U_SENTINEL; 3026 
while(source<sourceLimit) { 3027 entry=stateTable[state][*source++]; 3028 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3029 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3030 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3031 3032 /* optimization for 1/2-byte input and BMP output */ 3033 if( source<sourceLimit && 3034 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 3035 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 3036 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 3037 ) { 3038 ++source; 3039 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3040 /* output BMP code point */ 3041 break; 3042 } 3043 } else { 3044 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 3045 cnv->mode=state; 3046 3047 /* set the next state early so that we can reuse the entry variable */ 3048 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3049 3050 /* 3051 * An if-else-if chain provides more reliable performance for 3052 * the most common cases compared to a switch. 3053 */ 3054 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3055 if(action==MBCS_STATE_VALID_DIRECT_16) { 3056 /* output BMP code point */ 3057 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3058 break; 3059 } else if(action==MBCS_STATE_VALID_16) { 3060 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3061 c=unicodeCodeUnits[offset]; 3062 if(c<0xfffe) { 3063 /* output BMP code point */ 3064 break; 3065 } else if(c==0xfffe) { 3066 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 3067 break; 3068 } 3069 } else { 3070 /* callback(illegal) */ 3071 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3072 } 3073 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3074 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3075 c=unicodeCodeUnits[offset++]; 3076 if(c<0xd800) { 3077 /* output BMP code point below 0xd800 */ 3078 break; 3079 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 3080 /* output roundtrip or fallback supplementary code point */ 3081 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 3082 break; 3083 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3084 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3085 c=unicodeCodeUnits[offset]; 3086 break; 3087 } else if(c==0xffff) { 3088 /* callback(illegal) */ 3089 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3090 } 3091 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 3092 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 3093 ) { 3094 /* output supplementary code point */ 3095 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 3096 break; 3097 } else if(action==MBCS_STATE_CHANGE_ONLY) { 3098 /* 3099 * This serves as a state change without any output. 3100 * It is useful for reading simple stateful encodings, 3101 * for example using just Shift-In/Shift-Out codes. 3102 * The 21 unused bits may later be used for more sophisticated 3103 * state transitions. 
3104 */ 3105 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 3106 /* SI/SO are illegal for DBCS-only conversion */ 3107 state=(uint8_t)(cnv->mode); /* restore the previous state */ 3108 3109 /* callback(illegal) */ 3110 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3111 } 3112 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3113 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3114 /* output BMP code point */ 3115 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3116 break; 3117 } 3118 } else if(action==MBCS_STATE_UNASSIGNED) { 3119 /* just fall through */ 3120 } else if(action==MBCS_STATE_ILLEGAL) { 3121 /* callback(illegal) */ 3122 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3123 } else { 3124 /* reserved (must never occur), or only state change */ 3125 offset=0; 3126 lastSource=source; 3127 continue; 3128 } 3129 3130 /* end of action codes: prepare for a new character */ 3131 offset=0; 3132 3133 if(U_FAILURE(*pErrorCode)) { 3134 /* callback(illegal) */ 3135 break; 3136 } else /* unassigned sequence */ { 3137 /* defer to the generic implementation */ 3138 cnv->toUnicodeStatus=0; 3139 cnv->mode=state; 3140 pArgs->source=(const char *)lastSource; 3141 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 3142 } 3143 } 3144 } 3145 3146 if(c<0) { 3147 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3148 /* incomplete character byte sequence */ 3149 uint8_t *bytes=cnv->toUBytes; 3150 cnv->toULength=(int8_t)(source-lastSource); 3151 do { 3152 *bytes++=*lastSource++; 3153 } while(lastSource<source); 3154 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3155 } else if(U_FAILURE(*pErrorCode)) { 3156 /* callback(illegal) */ 3157 /* 3158 * Ticket 5691: consistent illegal sequences: 3159 * - We include at least the first byte in the illegal sequence. 3160 * - If any of the non-initial bytes could be the start of a character, 3161 * we stop the illegal sequence before the first one of those. 
3162 */ 3163 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3164 uint8_t *bytes=cnv->toUBytes; 3165 *bytes++=*lastSource++; /* first byte */ 3166 if(lastSource==source) { 3167 cnv->toULength=1; 3168 } else /* lastSource<source: multi-byte character */ { 3169 int8_t i; 3170 for(i=1; 3171 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3172 ++i 3173 ) { 3174 *bytes++=*lastSource++; 3175 } 3176 cnv->toULength=i; 3177 source=lastSource; 3178 } 3179 } else { 3180 /* no output because of empty input or only state changes */ 3181 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3182 } 3183 c=0xffff; 3184 } 3185 3186 /* set the converter state back into UConverter, ready for a new character */ 3187 cnv->toUnicodeStatus=0; 3188 cnv->mode=state; 3189 3190 /* write back the updated pointer */ 3191 pArgs->source=(const char *)source; 3192 return c; 3193 } 3194 3195 #if 0 3196 /* 3197 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3198 * Removal improves code coverage. 3199 */ 3200 /** 3201 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3202 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3203 * It does not handle conversion extensions (_extToU()). 3204 */ 3205 U_CFUNC UChar32 3206 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3207 uint8_t b, UBool useFallback) { 3208 int32_t entry; 3209 uint8_t action; 3210 3211 entry=sharedData->mbcs.stateTable[0][b]; 3212 /* MBCS_ENTRY_IS_FINAL(entry) */ 3213 3214 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3215 /* output BMP code point */ 3216 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3217 } 3218 3219 /* 3220 * An if-else-if chain provides more reliable performance for 3221 * the most common cases compared to a switch. 
3222 */ 3223 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3224 if(action==MBCS_STATE_VALID_DIRECT_20) { 3225 /* output supplementary code point */ 3226 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3227 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3228 if(!TO_U_USE_FALLBACK(useFallback)) { 3229 return 0xfffe; 3230 } 3231 /* output BMP code point */ 3232 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3233 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3234 if(!TO_U_USE_FALLBACK(useFallback)) { 3235 return 0xfffe; 3236 } 3237 /* output supplementary code point */ 3238 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3239 } else if(action==MBCS_STATE_UNASSIGNED) { 3240 return 0xfffe; 3241 } else if(action==MBCS_STATE_ILLEGAL) { 3242 return 0xffff; 3243 } else { 3244 /* reserved, must never occur */ 3245 return 0xffff; 3246 } 3247 } 3248 #endif 3249 3250 /* 3251 * This is a simple version of _MBCSGetNextUChar() that is used 3252 * by other converter implementations. 3253 * It only returns an "assigned" result if it consumes the entire input. 3254 * It does not use state from the converter, nor error codes. 3255 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3256 * It handles conversion extensions but not GB 18030. 3257 * 3258 * Return value: 3259 * U+fffe unassigned 3260 * U+ffff illegal 3261 * otherwise the Unicode code point 3262 */ 3263 U_CFUNC UChar32 3264 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3265 const char *source, int32_t length, 3266 UBool useFallback) { 3267 const int32_t (*stateTable)[256]; 3268 const uint16_t *unicodeCodeUnits; 3269 3270 uint32_t offset; 3271 uint8_t state, action; 3272 3273 UChar32 c; 3274 int32_t i, entry; 3275 3276 if(length<=0) { 3277 /* no input at all: "illegal" */ 3278 return 0xffff; 3279 } 3280 3281 #if 0 3282 /* 3283 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3284 * TODO In future releases, verify that this function is never called for SBCS 3285 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3286 * Removal improves code coverage. 3287 */ 3288 /* use optimized function if possible */ 3289 if(sharedData->mbcs.countStates==1) { 3290 if(length==1) { 3291 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3292 } else { 3293 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3294 } 3295 } 3296 #endif 3297 3298 /* set up the local pointers */ 3299 stateTable=sharedData->mbcs.stateTable; 3300 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3301 3302 /* converter state */ 3303 offset=0; 3304 state=sharedData->mbcs.dbcsOnlyState; 3305 3306 /* conversion loop */ 3307 for(i=0;;) { 3308 entry=stateTable[state][(uint8_t)source[i++]]; 3309 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3310 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3311 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3312 3313 if(i==length) { 3314 return 0xffff; /* truncated character */ 3315 } 3316 } else { 3317 /* 3318 * An if-else-if chain provides more reliable performance for 3319 * the most common cases compared to a switch. 
3320 */ 3321 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3322 if(action==MBCS_STATE_VALID_16) { 3323 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3324 c=unicodeCodeUnits[offset]; 3325 if(c!=0xfffe) { 3326 /* done */ 3327 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3328 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3329 /* else done with 0xfffe */ 3330 } 3331 break; 3332 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3333 /* output BMP code point */ 3334 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3335 break; 3336 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3337 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3338 c=unicodeCodeUnits[offset++]; 3339 if(c<0xd800) { 3340 /* output BMP code point below 0xd800 */ 3341 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3342 /* output roundtrip or fallback supplementary code point */ 3343 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3344 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3345 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3346 c=unicodeCodeUnits[offset]; 3347 } else if(c==0xffff) { 3348 return 0xffff; 3349 } else { 3350 c=0xfffe; 3351 } 3352 break; 3353 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3354 /* output supplementary code point */ 3355 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3356 break; 3357 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3358 if(!TO_U_USE_FALLBACK(useFallback)) { 3359 c=0xfffe; 3360 break; 3361 } 3362 /* output BMP code point */ 3363 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3364 break; 3365 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3366 if(!TO_U_USE_FALLBACK(useFallback)) { 3367 c=0xfffe; 3368 break; 3369 } 3370 /* output supplementary code point */ 3371 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3372 break; 3373 } else if(action==MBCS_STATE_UNASSIGNED) { 3374 c=0xfffe; 3375 break; 3376 } 3377 3378 /* 3379 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3380 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3381 */ 3382 return 0xffff; 3383 } 3384 } 3385 3386 if(i!=length) { 3387 /* illegal for this function: not all input consumed */ 3388 return 0xffff; 3389 } 3390 3391 if(c==0xfffe) { 3392 /* try an extension mapping */ 3393 const int32_t *cx=sharedData->mbcs.extIndexes; 3394 if(cx!=NULL) { 3395 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3396 } 3397 } 3398 3399 return c; 3400 } 3401 3402 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3403 3404 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3405 static void 3406 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3407 UErrorCode *pErrorCode) { 3408 UConverter *cnv; 3409 const UChar *source, *sourceLimit; 3410 uint8_t *target; 3411 int32_t targetCapacity; 3412 int32_t *offsets; 3413 3414 const uint16_t *table; 3415 const uint16_t *mbcsIndex; 3416 const uint8_t *bytes; 3417 3418 UChar32 c; 3419 3420 int32_t sourceIndex, nextSourceIndex; 3421 3422 uint32_t stage2Entry; 3423 uint32_t asciiRoundtrips; 3424 uint32_t value; 3425 uint8_t unicodeMask; 3426 3427 /* use optimized function if possible */ 3428 cnv=pArgs->converter; 3429 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3430 3431 /* set up the local pointers */ 3432 source=pArgs->source; 3433 sourceLimit=pArgs->sourceLimit; 3434 target=(uint8_t *)pArgs->target; 3435 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3436 offsets=pArgs->offsets; 3437 3438 table=cnv->sharedData->mbcs.fromUnicodeTable; 3439 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3440 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3441 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3442 } else { 3443 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3444 } 3445 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3446 3447 /* get the converter state from UConverter */ 3448 c=cnv->fromUChar32; 3449 3450 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 3451 sourceIndex= c==0 ? 0 : -1; 3452 nextSourceIndex=0; 3453 3454 /* conversion loop */ 3455 if(c!=0 && targetCapacity>0) { 3456 goto getTrail; 3457 } 3458 3459 while(source<sourceLimit) { 3460 /* 3461 * This following test is to see if available input would overflow the output. 3462 * It does not catch output of more than one byte that 3463 * overflows as a result of a multi-byte character or callback output 3464 * from the last source character. 3465 * Therefore, those situations also test for overflows and will 3466 * then break the loop, too. 3467 */ 3468 if(targetCapacity>0) { 3469 /* 3470 * Get a correct Unicode code point: 3471 * a single UChar for a BMP code point or 3472 * a matched surrogate pair for a "supplementary code point". 3473 */ 3474 c=*source++; 3475 ++nextSourceIndex; 3476 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3477 *target++=(uint8_t)c; 3478 if(offsets!=NULL) { 3479 *offsets++=sourceIndex; 3480 sourceIndex=nextSourceIndex; 3481 } 3482 --targetCapacity; 3483 c=0; 3484 continue; 3485 } 3486 /* 3487 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3488 * to avoid dealing with surrogates. 3489 * MBCS_FAST_MAX must be >=0xd7ff. 3490 */ 3491 if(c<=0xd7ff) { 3492 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3493 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3494 if(value==0) { 3495 goto unassigned; 3496 } 3497 /* output the value */ 3498 } else { 3499 /* 3500 * This also tests if the codepage maps single surrogates. 3501 * If it does, then surrogates are not paired but mapped separately. 3502 * Note that in this case unmatched surrogates are not detected. 
3503 */ 3504 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3505 if(U16_IS_SURROGATE_LEAD(c)) { 3506 getTrail: 3507 if(source<sourceLimit) { 3508 /* test the following code unit */ 3509 UChar trail=*source; 3510 if(U16_IS_TRAIL(trail)) { 3511 ++source; 3512 ++nextSourceIndex; 3513 c=U16_GET_SUPPLEMENTARY(c, trail); 3514 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3515 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3516 /* callback(unassigned) */ 3517 goto unassigned; 3518 } 3519 /* convert this supplementary code point */ 3520 /* exit this condition tree */ 3521 } else { 3522 /* this is an unmatched lead code unit (1st surrogate) */ 3523 /* callback(illegal) */ 3524 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3525 break; 3526 } 3527 } else { 3528 /* no more input */ 3529 break; 3530 } 3531 } else { 3532 /* this is an unmatched trail code unit (2nd surrogate) */ 3533 /* callback(illegal) */ 3534 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3535 break; 3536 } 3537 } 3538 3539 /* convert the Unicode code point in c into codepage bytes */ 3540 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3541 3542 /* get the bytes and the length for the output */ 3543 /* MBCS_OUTPUT_2 */ 3544 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3545 3546 /* is this code point assigned, or do we use fallbacks? */ 3547 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3548 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3549 ) { 3550 /* 3551 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3552 * There is no way with this data structure for fallback output 3553 * to be a zero byte. 
3554 */ 3555 3556 unassigned: 3557 /* try an extension mapping */ 3558 pArgs->source=source; 3559 c=_extFromU(cnv, cnv->sharedData, 3560 c, &source, sourceLimit, 3561 &target, target+targetCapacity, 3562 &offsets, sourceIndex, 3563 pArgs->flush, 3564 pErrorCode); 3565 nextSourceIndex+=(int32_t)(source-pArgs->source); 3566 3567 if(U_FAILURE(*pErrorCode)) { 3568 /* not mappable or buffer overflow */ 3569 break; 3570 } else { 3571 /* a mapping was written to the target, continue */ 3572 3573 /* recalculate the targetCapacity after an extension mapping */ 3574 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3575 3576 /* normal end of conversion: prepare for a new character */ 3577 sourceIndex=nextSourceIndex; 3578 continue; 3579 } 3580 } 3581 } 3582 3583 /* write the output character bytes from value and length */ 3584 /* from the first if in the loop we know that targetCapacity>0 */ 3585 if(value<=0xff) { 3586 /* this is easy because we know that there is enough space */ 3587 *target++=(uint8_t)value; 3588 if(offsets!=NULL) { 3589 *offsets++=sourceIndex; 3590 } 3591 --targetCapacity; 3592 } else /* length==2 */ { 3593 *target++=(uint8_t)(value>>8); 3594 if(2<=targetCapacity) { 3595 *target++=(uint8_t)value; 3596 if(offsets!=NULL) { 3597 *offsets++=sourceIndex; 3598 *offsets++=sourceIndex; 3599 } 3600 targetCapacity-=2; 3601 } else { 3602 if(offsets!=NULL) { 3603 *offsets++=sourceIndex; 3604 } 3605 cnv->charErrorBuffer[0]=(char)value; 3606 cnv->charErrorBufferLength=1; 3607 3608 /* target overflow */ 3609 targetCapacity=0; 3610 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3611 c=0; 3612 break; 3613 } 3614 } 3615 3616 /* normal end of conversion: prepare for a new character */ 3617 c=0; 3618 sourceIndex=nextSourceIndex; 3619 continue; 3620 } else { 3621 /* target is full */ 3622 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3623 break; 3624 } 3625 } 3626 3627 /* set the converter state back into UConverter */ 3628 cnv->fromUChar32=c; 3629 3630 /* write back the updated 
pointers */ 3631 pArgs->source=source; 3632 pArgs->target=(char *)target; 3633 pArgs->offsets=offsets; 3634 } 3635 3636 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3637 static void 3638 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3639 UErrorCode *pErrorCode) { 3640 UConverter *cnv; 3641 const UChar *source, *sourceLimit; 3642 uint8_t *target; 3643 int32_t targetCapacity; 3644 int32_t *offsets; 3645 3646 const uint16_t *table; 3647 const uint16_t *results; 3648 3649 UChar32 c; 3650 3651 int32_t sourceIndex, nextSourceIndex; 3652 3653 uint16_t value, minValue; 3654 UBool hasSupplementary; 3655 3656 /* set up the local pointers */ 3657 cnv=pArgs->converter; 3658 source=pArgs->source; 3659 sourceLimit=pArgs->sourceLimit; 3660 target=(uint8_t *)pArgs->target; 3661 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3662 offsets=pArgs->offsets; 3663 3664 table=cnv->sharedData->mbcs.fromUnicodeTable; 3665 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3666 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3667 } else { 3668 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3669 } 3670 3671 if(cnv->useFallback) { 3672 /* use all roundtrip and fallback results */ 3673 minValue=0x800; 3674 } else { 3675 /* use only roundtrips and fallbacks from private-use characters */ 3676 minValue=0xc00; 3677 } 3678 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3679 3680 /* get the converter state from UConverter */ 3681 c=cnv->fromUChar32; 3682 3683 /* sourceIndex=-1 if the current character began in the previous buffer */ 3684 sourceIndex= c==0 ? 0 : -1; 3685 nextSourceIndex=0; 3686 3687 /* conversion loop */ 3688 if(c!=0 && targetCapacity>0) { 3689 goto getTrail; 3690 } 3691 3692 while(source<sourceLimit) { 3693 /* 3694 * This following test is to see if available input would overflow the output. 
3695 * It does not catch output of more than one byte that 3696 * overflows as a result of a multi-byte character or callback output 3697 * from the last source character. 3698 * Therefore, those situations also test for overflows and will 3699 * then break the loop, too. 3700 */ 3701 if(targetCapacity>0) { 3702 /* 3703 * Get a correct Unicode code point: 3704 * a single UChar for a BMP code point or 3705 * a matched surrogate pair for a "supplementary code point". 3706 */ 3707 c=*source++; 3708 ++nextSourceIndex; 3709 if(U16_IS_SURROGATE(c)) { 3710 if(U16_IS_SURROGATE_LEAD(c)) { 3711 getTrail: 3712 if(source<sourceLimit) { 3713 /* test the following code unit */ 3714 UChar trail=*source; 3715 if(U16_IS_TRAIL(trail)) { 3716 ++source; 3717 ++nextSourceIndex; 3718 c=U16_GET_SUPPLEMENTARY(c, trail); 3719 if(!hasSupplementary) { 3720 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3721 /* callback(unassigned) */ 3722 goto unassigned; 3723 } 3724 /* convert this supplementary code point */ 3725 /* exit this condition tree */ 3726 } else { 3727 /* this is an unmatched lead code unit (1st surrogate) */ 3728 /* callback(illegal) */ 3729 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3730 break; 3731 } 3732 } else { 3733 /* no more input */ 3734 break; 3735 } 3736 } else { 3737 /* this is an unmatched trail code unit (2nd surrogate) */ 3738 /* callback(illegal) */ 3739 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3740 break; 3741 } 3742 } 3743 3744 /* convert the Unicode code point in c into codepage bytes */ 3745 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3746 3747 /* is this code point assigned, or do we use fallbacks? 
*/ 3748 if(value>=minValue) { 3749 /* assigned, write the output character bytes from value and length */ 3750 /* length==1 */ 3751 /* this is easy because we know that there is enough space */ 3752 *target++=(uint8_t)value; 3753 if(offsets!=NULL) { 3754 *offsets++=sourceIndex; 3755 } 3756 --targetCapacity; 3757 3758 /* normal end of conversion: prepare for a new character */ 3759 c=0; 3760 sourceIndex=nextSourceIndex; 3761 } else { /* unassigned */ 3762 unassigned: 3763 /* try an extension mapping */ 3764 pArgs->source=source; 3765 c=_extFromU(cnv, cnv->sharedData, 3766 c, &source, sourceLimit, 3767 &target, target+targetCapacity, 3768 &offsets, sourceIndex, 3769 pArgs->flush, 3770 pErrorCode); 3771 nextSourceIndex+=(int32_t)(source-pArgs->source); 3772 3773 if(U_FAILURE(*pErrorCode)) { 3774 /* not mappable or buffer overflow */ 3775 break; 3776 } else { 3777 /* a mapping was written to the target, continue */ 3778 3779 /* recalculate the targetCapacity after an extension mapping */ 3780 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3781 3782 /* normal end of conversion: prepare for a new character */ 3783 sourceIndex=nextSourceIndex; 3784 } 3785 } 3786 } else { 3787 /* target is full */ 3788 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3789 break; 3790 } 3791 } 3792 3793 /* set the converter state back into UConverter */ 3794 cnv->fromUChar32=c; 3795 3796 /* write back the updated pointers */ 3797 pArgs->source=source; 3798 pArgs->target=(char *)target; 3799 pArgs->offsets=offsets; 3800 } 3801 3802 /* 3803 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3804 * that map only to and from the BMP. 3805 * In addition to single-byte/state optimizations, the offset calculations 3806 * become much easier. 3807 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3808 * but measurements have shown that this diminishes performance 3809 * in more cases than it improves it. 
 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
 * for various MBCS and SBCS optimizations.
 *
 * Offsets are not written per character inside the main loop.
 * They are filled in as a run of consecutive source indexes before each
 * extension lookup and once more at the end of the function.
 */
static void
ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
                              UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit, *lastSource;
    uint8_t *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *results;

    UChar32 c;

    int32_t sourceIndex;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    /* lastSource marks the first source unit whose offset has not been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        /* c holds a lead surrogate from the previous buffer: look for its trail */
        goto getTrail;
    }

#if MBCS_UNROLL_SINGLE_FROM_BMP
    /* unrolling makes it slower on Pentium III/Windows 2000?! */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=4) {
        int32_t count, loops;
        uint16_t andedValues;

        loops=count=targetCapacity>>2;
        do {
            c=*source++;
            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;

            /* were all 4 entries really valid? */
            if(andedValues<minValue) {
                /* no, return to the first of these 4 */
                source-=4;
                target-=4;
                break;
            }
        } while(--count>0);
        count=loops-count;
        targetCapacity-=4*count;

        if(offsets!=NULL) {
            lastSource+=4*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }

        c=0;
    }
#endif

    while(targetCapacity>0) {
        /*
         * Get a correct Unicode code point:
         * a single UChar for a BMP code point or
         * a matched surrogate pair for a "supplementary code point".
         */
        c=*source++;
        /*
         * Do not immediately check for single surrogates:
         * Assume that they are unassigned and check for them in that case.
         * This speeds up the conversion of assigned characters.
         */
        /* convert the Unicode code point in c into codepage bytes */
        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
            *target++=(uint8_t)c;
            --targetCapacity;
            c=0;
            continue;
        }
        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
        /* is this code point assigned, or do we use fallbacks? */
        if(value>=minValue) {
            /* assigned, write the output character bytes from value and length */
            /* length==1 */
            /* this is easy because we know that there is enough space */
            *target++=(uint8_t)value;
            --targetCapacity;

            /* normal end of conversion: prepare for a new character */
            c=0;
            continue;
        } else if(!U16_IS_SURROGATE(c)) {
            /* normal, unassigned BMP character */
        } else if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
            if(source<sourceLimit) {
                /* test the following code unit */
                UChar trail=*source;
                if(U16_IS_TRAIL(trail)) {
                    ++source;
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                    /* this codepage does not map supplementary code points */
                    /* callback(unassigned) */
                } else {
                    /* this is an unmatched lead code unit (1st surrogate) */
                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                }
            } else {
                /* no more input */
                if (pArgs->flush) {
                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
                }
                break;
            }
        } else {
            /* this is an unmatched trail code unit (2nd surrogate) */
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            break;
        }

        /* c does not have a mapping */

        /* get the number of code units for c to correctly advance sourceIndex */
        length=U16_LENGTH(c);

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* do not set the offset for this character */
            count-=length;

            while(count>0) {
                *offsets++=sourceIndex++;
                --count;
            }
            /* offsets and sourceIndex are now set for the current character */
        }

        /* try an extension mapping */
        lastSource=source;
        /*
         * the real target limit is passed here, not target+targetCapacity:
         * targetCapacity was reduced to the remaining source length above
         */
        c=_extFromU(cnv, cnv->sharedData,
                    c, &source, sourceLimit,
                    &target, (const uint8_t *)(pArgs->targetLimit),
                    &offsets, sourceIndex,
                    pArgs->flush,
                    pErrorCode);
        /* advance past c itself plus whatever additional input _extFromU() consumed */
        sourceIndex+=length+(int32_t)(source-lastSource);
        lastSource=source;

        if(U_FAILURE(*pErrorCode)) {
            /* not mappable or buffer overflow */
            break;
        } else {
            /* a mapping was written to the target, continue */

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_FROM_BMP
        /* unrolling makes it slower on Pentium III/Windows 2000?! */
        goto unrolled;
#endif
    }

    /*
     * targetCapacity is the minimum of source length and target space, so the
     * loop may end because either ran out: report an overflow only if there is
     * input left and the target really is full
     */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
            /*
            Caller gave us a partial supplementary character,
            which this function couldn't convert in any case.
            The callback will handle the offset.
            */
            count--;
        }
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

U_CFUNC void
ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *mbcsIndex;
    const uint8_t *p, *bytes;
    uint8_t outputType;

    UChar32 c;

    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint32_t value;
    /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
    uint8_t siBytes[2] = {0, 0};
    uint8_t soBytes[2] = {0, 0};
    uint8_t siLength, soLength;
    int32_t length = 0, prevLength;
    uint8_t unicodeMask;

    cnv=pArgs->converter;

    if(cnv->preFromUFirstCP>=0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
            return;
        }
    }

    /* use optimized function if possible */
    outputType=cnv->sharedData->mbcs.outputType;
    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    } else if(outputType==MBCS_OUTPUT_2 &&
cnv->sharedData->mbcs.utf8Friendly) { 4123 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 4124 return; 4125 } 4126 4127 /* set up the local pointers */ 4128 source=pArgs->source; 4129 sourceLimit=pArgs->sourceLimit; 4130 target=(uint8_t *)pArgs->target; 4131 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 4132 offsets=pArgs->offsets; 4133 4134 table=cnv->sharedData->mbcs.fromUnicodeTable; 4135 if(cnv->sharedData->mbcs.utf8Friendly) { 4136 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 4137 } else { 4138 mbcsIndex=NULL; 4139 } 4140 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4141 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4142 } else { 4143 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 4144 } 4145 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4146 4147 /* get the converter state from UConverter */ 4148 c=cnv->fromUChar32; 4149 4150 if(outputType==MBCS_OUTPUT_2_SISO) { 4151 prevLength=cnv->fromUnicodeStatus; 4152 if(prevLength==0) { 4153 /* set the real value */ 4154 prevLength=1; 4155 } 4156 } else { 4157 /* prevent fromUnicodeStatus from being set to something non-0 */ 4158 prevLength=0; 4159 } 4160 4161 /* sourceIndex=-1 if the current character began in the previous buffer */ 4162 prevSourceIndex=-1; 4163 sourceIndex= c==0 ? 0 : -1; 4164 nextSourceIndex=0; 4165 4166 /* Get the SI/SO character for the converter */ 4167 siLength = static_cast<uint8_t>(getSISOBytes(SI, cnv->options, siBytes)); 4168 soLength = static_cast<uint8_t>(getSISOBytes(SO, cnv->options, soBytes)); 4169 4170 /* conversion loop */ 4171 /* 4172 * This is another piece of ugly code: 4173 * A goto into the loop if the converter state contains a first surrogate 4174 * from the previous function call. 4175 * It saves me to check in each loop iteration a check of if(c==0) 4176 * and duplicating the trail-surrogate-handling code in the else 4177 * branch of that check. 
4178 * I could not find any other way to get around this other than 4179 * using a function call for the conversion and callback, which would 4180 * be even more inefficient. 4181 * 4182 * Markus Scherer 2000-jul-19 4183 */ 4184 if(c!=0 && targetCapacity>0) { 4185 goto getTrail; 4186 } 4187 4188 while(source<sourceLimit) { 4189 /* 4190 * This following test is to see if available input would overflow the output. 4191 * It does not catch output of more than one byte that 4192 * overflows as a result of a multi-byte character or callback output 4193 * from the last source character. 4194 * Therefore, those situations also test for overflows and will 4195 * then break the loop, too. 4196 */ 4197 if(targetCapacity>0) { 4198 /* 4199 * Get a correct Unicode code point: 4200 * a single UChar for a BMP code point or 4201 * a matched surrogate pair for a "supplementary code point". 4202 */ 4203 c=*source++; 4204 ++nextSourceIndex; 4205 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4206 *target++=(uint8_t)c; 4207 if(offsets!=NULL) { 4208 *offsets++=sourceIndex; 4209 prevSourceIndex=sourceIndex; 4210 sourceIndex=nextSourceIndex; 4211 } 4212 --targetCapacity; 4213 c=0; 4214 continue; 4215 } 4216 /* 4217 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4218 * to avoid dealing with surrogates. 4219 * MBCS_FAST_MAX must be >=0xd7ff. 4220 */ 4221 if(c<=0xd7ff && mbcsIndex!=NULL) { 4222 value=mbcsIndex[c>>6]; 4223 4224 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4225 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4226 switch(outputType) { 4227 case MBCS_OUTPUT_2: 4228 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4229 if(value<=0xff) { 4230 if(value==0) { 4231 goto unassigned; 4232 } else { 4233 length=1; 4234 } 4235 } else { 4236 length=2; 4237 } 4238 break; 4239 case MBCS_OUTPUT_2_SISO: 4240 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4241 /* 4242 * Save the old state in the converter object 4243 * right here, then change the local prevLength state variable if necessary. 4244 * Then, if this character turns out to be unassigned or a fallback that 4245 * is not taken, the callback code must not save the new state in the converter 4246 * because the new state is for a character that is not output. 4247 * However, the callback must still restore the state from the converter 4248 * in case the callback function changed it for its output. 4249 */ 4250 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4251 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4252 if(value<=0xff) { 4253 if(value==0) { 4254 goto unassigned; 4255 } else if(prevLength<=1) { 4256 length=1; 4257 } else { 4258 /* change from double-byte mode to single-byte */ 4259 if (siLength == 1) { 4260 value|=(uint32_t)siBytes[0]<<8; 4261 length = 2; 4262 } else if (siLength == 2) { 4263 value|=(uint32_t)siBytes[1]<<8; 4264 value|=(uint32_t)siBytes[0]<<16; 4265 length = 3; 4266 } 4267 prevLength=1; 4268 } 4269 } else { 4270 if(prevLength==2) { 4271 length=2; 4272 } else { 4273 /* change from single-byte mode to double-byte */ 4274 if (soLength == 1) { 4275 value|=(uint32_t)soBytes[0]<<16; 4276 length = 3; 4277 } else if (soLength == 2) { 4278 value|=(uint32_t)soBytes[1]<<16; 4279 value|=(uint32_t)soBytes[0]<<24; 4280 length = 4; 4281 } 4282 prevLength=2; 4283 } 4284 } 4285 break; 4286 case MBCS_OUTPUT_DBCS_ONLY: 4287 /* table with single-byte results, but only DBCS mappings used */ 4288 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4289 if(value<=0xff) { 4290 /* no mapping or SBCS result, not 
taken for DBCS-only */ 4291 goto unassigned; 4292 } else { 4293 length=2; 4294 } 4295 break; 4296 case MBCS_OUTPUT_3: 4297 p=bytes+(value+(c&0x3f))*3; 4298 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4299 if(value<=0xff) { 4300 if(value==0) { 4301 goto unassigned; 4302 } else { 4303 length=1; 4304 } 4305 } else if(value<=0xffff) { 4306 length=2; 4307 } else { 4308 length=3; 4309 } 4310 break; 4311 case MBCS_OUTPUT_4: 4312 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4313 if(value<=0xff) { 4314 if(value==0) { 4315 goto unassigned; 4316 } else { 4317 length=1; 4318 } 4319 } else if(value<=0xffff) { 4320 length=2; 4321 } else if(value<=0xffffff) { 4322 length=3; 4323 } else { 4324 length=4; 4325 } 4326 break; 4327 case MBCS_OUTPUT_3_EUC: 4328 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4329 /* EUC 16-bit fixed-length representation */ 4330 if(value<=0xff) { 4331 if(value==0) { 4332 goto unassigned; 4333 } else { 4334 length=1; 4335 } 4336 } else if((value&0x8000)==0) { 4337 value|=0x8e8000; 4338 length=3; 4339 } else if((value&0x80)==0) { 4340 value|=0x8f0080; 4341 length=3; 4342 } else { 4343 length=2; 4344 } 4345 break; 4346 case MBCS_OUTPUT_4_EUC: 4347 p=bytes+(value+(c&0x3f))*3; 4348 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4349 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4350 if(value<=0xff) { 4351 if(value==0) { 4352 goto unassigned; 4353 } else { 4354 length=1; 4355 } 4356 } else if(value<=0xffff) { 4357 length=2; 4358 } else if((value&0x800000)==0) { 4359 value|=0x8e800000; 4360 length=4; 4361 } else if((value&0x8000)==0) { 4362 value|=0x8f008000; 4363 length=4; 4364 } else { 4365 length=3; 4366 } 4367 break; 4368 default: 4369 /* must not occur */ 4370 /* 4371 * To avoid compiler warnings that value & length may be 4372 * used without having been initialized, we set them here. 4373 * In reality, this is unreachable code. 
4374 * Not having a default branch also causes warnings with 4375 * some compilers. 4376 */ 4377 value=0; 4378 length=0; 4379 break; 4380 } 4381 /* output the value */ 4382 } else { 4383 /* 4384 * This also tests if the codepage maps single surrogates. 4385 * If it does, then surrogates are not paired but mapped separately. 4386 * Note that in this case unmatched surrogates are not detected. 4387 */ 4388 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4389 if(U16_IS_SURROGATE_LEAD(c)) { 4390 getTrail: 4391 if(source<sourceLimit) { 4392 /* test the following code unit */ 4393 UChar trail=*source; 4394 if(U16_IS_TRAIL(trail)) { 4395 ++source; 4396 ++nextSourceIndex; 4397 c=U16_GET_SUPPLEMENTARY(c, trail); 4398 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4399 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4400 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4401 /* callback(unassigned) */ 4402 goto unassigned; 4403 } 4404 /* convert this supplementary code point */ 4405 /* exit this condition tree */ 4406 } else { 4407 /* this is an unmatched lead code unit (1st surrogate) */ 4408 /* callback(illegal) */ 4409 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4410 break; 4411 } 4412 } else { 4413 /* no more input */ 4414 break; 4415 } 4416 } else { 4417 /* this is an unmatched trail code unit (2nd surrogate) */ 4418 /* callback(illegal) */ 4419 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4420 break; 4421 } 4422 } 4423 4424 /* convert the Unicode code point in c into codepage bytes */ 4425 4426 /* 4427 * The basic lookup is a triple-stage compact array (trie) lookup. 4428 * For details see the beginning of this file. 4429 * 4430 * Single-byte codepages are handled with a different data structure 4431 * by _MBCSSingle... functions. 4432 * 4433 * The result consists of a 32-bit value from stage 2 and 4434 * a pointer to as many bytes as are stored per character. 4435 * The pointer points to the character's bytes in stage 3. 
4436 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4437 * for that pointer, while bits 31..16 are flags for which of 4438 * the 16 characters in the block are roundtrip-assigned. 4439 * 4440 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4441 * respectively as uint32_t, in the platform encoding. 4442 * For 3-byte codepages, the bytes are always stored in big-endian order. 4443 * 4444 * For EUC encodings that use only either 0x8e or 0x8f as the first 4445 * byte of their longest byte sequences, the first two bytes in 4446 * this third stage indicate with their 7th bits whether these bytes 4447 * are to be written directly or actually need to be preceeded by 4448 * one of the two Single-Shift codes. With this, the third stage 4449 * stores one byte fewer per character than the actual maximum length of 4450 * EUC byte sequences. 4451 * 4452 * Other than that, leading zero bytes are removed and the other 4453 * bytes output. A single zero byte may be output if the "assigned" 4454 * bit in stage 2 was on. 4455 * The data structure does not support zero byte output as a fallback, 4456 * and also does not allow output of leading zeros. 4457 */ 4458 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4459 4460 /* get the bytes and the length for the output */ 4461 switch(outputType) { 4462 case MBCS_OUTPUT_2: 4463 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4464 if(value<=0xff) { 4465 length=1; 4466 } else { 4467 length=2; 4468 } 4469 break; 4470 case MBCS_OUTPUT_2_SISO: 4471 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4472 /* 4473 * Save the old state in the converter object 4474 * right here, then change the local prevLength state variable if necessary. 4475 * Then, if this character turns out to be unassigned or a fallback that 4476 * is not taken, the callback code must not save the new state in the converter 4477 * because the new state is for a character that is not output. 
4478 * However, the callback must still restore the state from the converter 4479 * in case the callback function changed it for its output. 4480 */ 4481 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4482 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4483 if(value<=0xff) { 4484 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4485 /* no mapping, leave value==0 */ 4486 length=0; 4487 } else if(prevLength<=1) { 4488 length=1; 4489 } else { 4490 /* change from double-byte mode to single-byte */ 4491 if (siLength == 1) { 4492 value|=(uint32_t)siBytes[0]<<8; 4493 length = 2; 4494 } else if (siLength == 2) { 4495 value|=(uint32_t)siBytes[1]<<8; 4496 value|=(uint32_t)siBytes[0]<<16; 4497 length = 3; 4498 } 4499 prevLength=1; 4500 } 4501 } else { 4502 if(prevLength==2) { 4503 length=2; 4504 } else { 4505 /* change from single-byte mode to double-byte */ 4506 if (soLength == 1) { 4507 value|=(uint32_t)soBytes[0]<<16; 4508 length = 3; 4509 } else if (soLength == 2) { 4510 value|=(uint32_t)soBytes[1]<<16; 4511 value|=(uint32_t)soBytes[0]<<24; 4512 length = 4; 4513 } 4514 prevLength=2; 4515 } 4516 } 4517 break; 4518 case MBCS_OUTPUT_DBCS_ONLY: 4519 /* table with single-byte results, but only DBCS mappings used */ 4520 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4521 if(value<=0xff) { 4522 /* no mapping or SBCS result, not taken for DBCS-only */ 4523 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4524 length=0; 4525 } else { 4526 length=2; 4527 } 4528 break; 4529 case MBCS_OUTPUT_3: 4530 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4531 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4532 if(value<=0xff) { 4533 length=1; 4534 } else if(value<=0xffff) { 4535 length=2; 4536 } else { 4537 length=3; 4538 } 4539 break; 4540 case MBCS_OUTPUT_4: 4541 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4542 if(value<=0xff) { 4543 length=1; 4544 } else if(value<=0xffff) { 4545 length=2; 4546 } else 
if(value<=0xffffff) { 4547 length=3; 4548 } else { 4549 length=4; 4550 } 4551 break; 4552 case MBCS_OUTPUT_3_EUC: 4553 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4554 /* EUC 16-bit fixed-length representation */ 4555 if(value<=0xff) { 4556 length=1; 4557 } else if((value&0x8000)==0) { 4558 value|=0x8e8000; 4559 length=3; 4560 } else if((value&0x80)==0) { 4561 value|=0x8f0080; 4562 length=3; 4563 } else { 4564 length=2; 4565 } 4566 break; 4567 case MBCS_OUTPUT_4_EUC: 4568 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4569 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4570 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4571 if(value<=0xff) { 4572 length=1; 4573 } else if(value<=0xffff) { 4574 length=2; 4575 } else if((value&0x800000)==0) { 4576 value|=0x8e800000; 4577 length=4; 4578 } else if((value&0x8000)==0) { 4579 value|=0x8f008000; 4580 length=4; 4581 } else { 4582 length=3; 4583 } 4584 break; 4585 default: 4586 /* must not occur */ 4587 /* 4588 * To avoid compiler warnings that value & length may be 4589 * used without having been initialized, we set them here. 4590 * In reality, this is unreachable code. 4591 * Not having a default branch also causes warnings with 4592 * some compilers. 4593 */ 4594 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4595 length=0; 4596 break; 4597 } 4598 4599 /* is this code point assigned, or do we use fallbacks? */ 4600 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4601 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4602 ) { 4603 /* 4604 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4605 * There is no way with this data structure for fallback output 4606 * to be a zero byte. 
4607 */ 4608 4609 unassigned: 4610 /* try an extension mapping */ 4611 pArgs->source=source; 4612 c=_extFromU(cnv, cnv->sharedData, 4613 c, &source, sourceLimit, 4614 &target, target+targetCapacity, 4615 &offsets, sourceIndex, 4616 pArgs->flush, 4617 pErrorCode); 4618 nextSourceIndex+=(int32_t)(source-pArgs->source); 4619 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4620 4621 if(U_FAILURE(*pErrorCode)) { 4622 /* not mappable or buffer overflow */ 4623 break; 4624 } else { 4625 /* a mapping was written to the target, continue */ 4626 4627 /* recalculate the targetCapacity after an extension mapping */ 4628 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4629 4630 /* normal end of conversion: prepare for a new character */ 4631 if(offsets!=NULL) { 4632 prevSourceIndex=sourceIndex; 4633 sourceIndex=nextSourceIndex; 4634 } 4635 continue; 4636 } 4637 } 4638 } 4639 4640 /* write the output character bytes from value and length */ 4641 /* from the first if in the loop we know that targetCapacity>0 */ 4642 if(length<=targetCapacity) { 4643 if(offsets==NULL) { 4644 switch(length) { 4645 /* each branch falls through to the next one */ 4646 case 4: 4647 *target++=(uint8_t)(value>>24); 4648 U_FALLTHROUGH; 4649 case 3: 4650 *target++=(uint8_t)(value>>16); 4651 U_FALLTHROUGH; 4652 case 2: 4653 *target++=(uint8_t)(value>>8); 4654 U_FALLTHROUGH; 4655 case 1: 4656 *target++=(uint8_t)value; 4657 U_FALLTHROUGH; 4658 default: 4659 /* will never occur */ 4660 break; 4661 } 4662 } else { 4663 switch(length) { 4664 /* each branch falls through to the next one */ 4665 case 4: 4666 *target++=(uint8_t)(value>>24); 4667 *offsets++=sourceIndex; 4668 U_FALLTHROUGH; 4669 case 3: 4670 *target++=(uint8_t)(value>>16); 4671 *offsets++=sourceIndex; 4672 U_FALLTHROUGH; 4673 case 2: 4674 *target++=(uint8_t)(value>>8); 4675 *offsets++=sourceIndex; 4676 U_FALLTHROUGH; 4677 case 1: 4678 *target++=(uint8_t)value; 4679 *offsets++=sourceIndex; 4680 U_FALLTHROUGH; 4681 default: 
4682 /* will never occur */ 4683 break; 4684 } 4685 } 4686 targetCapacity-=length; 4687 } else { 4688 uint8_t *charErrorBuffer; 4689 4690 /* 4691 * We actually do this backwards here: 4692 * In order to save an intermediate variable, we output 4693 * first to the overflow buffer what does not fit into the 4694 * regular target. 4695 */ 4696 /* we know that 1<=targetCapacity<length<=4 */ 4697 length-=targetCapacity; 4698 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4699 switch(length) { 4700 /* each branch falls through to the next one */ 4701 case 3: 4702 *charErrorBuffer++=(uint8_t)(value>>16); 4703 U_FALLTHROUGH; 4704 case 2: 4705 *charErrorBuffer++=(uint8_t)(value>>8); 4706 U_FALLTHROUGH; 4707 case 1: 4708 *charErrorBuffer=(uint8_t)value; 4709 U_FALLTHROUGH; 4710 default: 4711 /* will never occur */ 4712 break; 4713 } 4714 cnv->charErrorBufferLength=(int8_t)length; 4715 4716 /* now output what fits into the regular target */ 4717 value>>=8*length; /* length was reduced by targetCapacity */ 4718 switch(targetCapacity) { 4719 /* each branch falls through to the next one */ 4720 case 3: 4721 *target++=(uint8_t)(value>>16); 4722 if(offsets!=NULL) { 4723 *offsets++=sourceIndex; 4724 } 4725 U_FALLTHROUGH; 4726 case 2: 4727 *target++=(uint8_t)(value>>8); 4728 if(offsets!=NULL) { 4729 *offsets++=sourceIndex; 4730 } 4731 U_FALLTHROUGH; 4732 case 1: 4733 *target++=(uint8_t)value; 4734 if(offsets!=NULL) { 4735 *offsets++=sourceIndex; 4736 } 4737 U_FALLTHROUGH; 4738 default: 4739 /* will never occur */ 4740 break; 4741 } 4742 4743 /* target overflow */ 4744 targetCapacity=0; 4745 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4746 c=0; 4747 break; 4748 } 4749 4750 /* normal end of conversion: prepare for a new character */ 4751 c=0; 4752 if(offsets!=NULL) { 4753 prevSourceIndex=sourceIndex; 4754 sourceIndex=nextSourceIndex; 4755 } 4756 continue; 4757 } else { 4758 /* target is full */ 4759 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4760 break; 4761 } 4762 } 4763 4764 /* 4765 * the end 
of the input stream and detection of truncated input 4766 * are handled by the framework, but for EBCDIC_STATEFUL conversion 4767 * we need to emit an SI at the very end 4768 * 4769 * conditions: 4770 * successful 4771 * EBCDIC_STATEFUL in DBCS mode 4772 * end of input and no truncated input 4773 */ 4774 if( U_SUCCESS(*pErrorCode) && 4775 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4776 pArgs->flush && source>=sourceLimit && c==0 4777 ) { 4778 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4779 if(targetCapacity>0) { 4780 *target++=(uint8_t)siBytes[0]; 4781 if (siLength == 2) { 4782 if (targetCapacity<2) { 4783 cnv->charErrorBuffer[0]=(uint8_t)siBytes[1]; 4784 cnv->charErrorBufferLength=1; 4785 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4786 } else { 4787 *target++=(uint8_t)siBytes[1]; 4788 } 4789 } 4790 if(offsets!=NULL) { 4791 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4792 *offsets++=prevSourceIndex; 4793 } 4794 } else { 4795 /* target is full */ 4796 cnv->charErrorBuffer[0]=(uint8_t)siBytes[0]; 4797 if (siLength == 2) { 4798 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1]; 4799 } 4800 cnv->charErrorBufferLength=siLength; 4801 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4802 } 4803 prevLength=1; /* we switched into SBCS */ 4804 } 4805 4806 /* set the converter state back into UConverter */ 4807 cnv->fromUChar32=c; 4808 cnv->fromUnicodeStatus=prevLength; 4809 4810 /* write back the updated pointers */ 4811 pArgs->source=source; 4812 pArgs->target=(char *)target; 4813 pArgs->offsets=offsets; 4814 } 4815 4816 /* 4817 * This is another simple conversion function for internal use by other 4818 * conversion implementations. 4819 * It does not use the converter state nor call callbacks. 4820 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4821 * It handles conversion extensions but not GB 18030. 
4822 * 4823 * It converts one single Unicode code point into codepage bytes, encoded 4824 * as one 32-bit value. The function returns the number of bytes in *pValue: 4825 * 1..4 the number of bytes in *pValue 4826 * 0 unassigned (*pValue undefined) 4827 * -1 illegal (currently not used, *pValue undefined) 4828 * 4829 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4830 * the second to last byte in bits 15..8, etc. 4831 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4832 */ 4833 U_CFUNC int32_t 4834 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4835 UChar32 c, uint32_t *pValue, 4836 UBool useFallback) { 4837 const int32_t *cx; 4838 const uint16_t *table; 4839 #if 0 4840 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4841 const uint8_t *p; 4842 #endif 4843 uint32_t stage2Entry; 4844 uint32_t value; 4845 int32_t length; 4846 4847 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4848 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4849 table=sharedData->mbcs.fromUnicodeTable; 4850 4851 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4852 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4853 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4854 /* is this code point assigned, or do we use fallbacks? */ 4855 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4856 *pValue=value&0xff; 4857 return 1; 4858 } 4859 } else /* outputType!=MBCS_OUTPUT_1 */ { 4860 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4861 4862 /* get the bytes and the length for the output */ 4863 switch(sharedData->mbcs.outputType) { 4864 case MBCS_OUTPUT_2: 4865 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4866 if(value<=0xff) { 4867 length=1; 4868 } else { 4869 length=2; 4870 } 4871 break; 4872 #if 0 4873 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4874 case MBCS_OUTPUT_DBCS_ONLY: 4875 /* table with single-byte results, but only DBCS mappings used */ 4876 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4877 if(value<=0xff) { 4878 /* no mapping or SBCS result, not taken for DBCS-only */ 4879 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4880 length=0; 4881 } else { 4882 length=2; 4883 } 4884 break; 4885 case MBCS_OUTPUT_3: 4886 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4887 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4888 if(value<=0xff) { 4889 length=1; 4890 } else if(value<=0xffff) { 4891 length=2; 4892 } else { 4893 length=3; 4894 } 4895 break; 4896 case MBCS_OUTPUT_4: 4897 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4898 if(value<=0xff) { 4899 length=1; 4900 } else if(value<=0xffff) { 4901 length=2; 4902 } else if(value<=0xffffff) { 4903 length=3; 4904 } else { 4905 length=4; 4906 } 4907 break; 4908 case MBCS_OUTPUT_3_EUC: 4909 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4910 /* EUC 16-bit fixed-length representation */ 4911 if(value<=0xff) { 4912 length=1; 4913 } else if((value&0x8000)==0) { 4914 value|=0x8e8000; 4915 length=3; 4916 } else if((value&0x80)==0) { 4917 value|=0x8f0080; 4918 length=3; 4919 } else { 4920 length=2; 4921 } 4922 break; 4923 case 
MBCS_OUTPUT_4_EUC: 4924 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4925 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4926 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4927 if(value<=0xff) { 4928 length=1; 4929 } else if(value<=0xffff) { 4930 length=2; 4931 } else if((value&0x800000)==0) { 4932 value|=0x8e800000; 4933 length=4; 4934 } else if((value&0x8000)==0) { 4935 value|=0x8f008000; 4936 length=4; 4937 } else { 4938 length=3; 4939 } 4940 break; 4941 #endif 4942 default: 4943 /* must not occur */ 4944 return -1; 4945 } 4946 4947 /* is this code point assigned, or do we use fallbacks? */ 4948 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4949 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4950 ) { 4951 /* 4952 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4953 * There is no way with this data structure for fallback output 4954 * to be a zero byte. 4955 */ 4956 /* assigned */ 4957 *pValue=value; 4958 return length; 4959 } 4960 } 4961 } 4962 4963 cx=sharedData->mbcs.extIndexes; 4964 if(cx!=NULL) { 4965 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4966 return length>=0 ? length : -length; /* return abs(length); */ 4967 } 4968 4969 /* unassigned */ 4970 return 0; 4971 } 4972 4973 4974 #if 0 4975 /* 4976 * This function has been moved to ucnv2022.c for inlining. 4977 * This implementation is here only for documentation purposes 4978 */ 4979 4980 /** 4981 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4982 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4983 * It does not handle conversion extensions (_extFromU()). 4984 * 4985 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4986 */ 4987 U_CFUNC int32_t 4988 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4989 UChar32 c, 4990 UBool useFallback) { 4991 const uint16_t *table; 4992 int32_t value; 4993 4994 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4995 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4996 return -1; 4997 } 4998 4999 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 5000 table=sharedData->mbcs.fromUnicodeTable; 5001 5002 /* get the byte for the output */ 5003 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 5004 /* is this code point assigned, or do we use fallbacks? */ 5005 if(useFallback ? value>=0x800 : value>=0xc00) { 5006 return value&0xff; 5007 } else { 5008 return -1; 5009 } 5010 } 5011 #endif 5012 5013 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 5014 5015 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/
/*
 * Indexed by the number of bytes in a complete UTF-8 sequence.
 * The converters below accumulate raw bytes with c=(c<<6)+b, so c still
 * contains the lead-byte prefix bits and the 0x80 marker of every trail byte;
 * subtracting utf8_offsets[length] removes them and leaves the code point.
 * For example, 0x3080 is (0xC0<<6)+0x80 for 2-byte sequences.
 */
static const UChar32
utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

/*
 * Convert UTF-8 input directly to the bytes of a single-byte codepage,
 * without going through UTF-16.
 *
 * pFromUArgs supplies the target converter (cnv), the target buffer
 *            (target/targetLimit) and the flush flag; its target pointer is
 *            updated on return.
 * pToUArgs   supplies the UTF-8 converter (utf8) and the source buffer
 *            (source/sourceLimit); its source pointer is updated on return.
 *            A UTF-8 sequence that is cut off at the end of the buffer is
 *            saved in utf8->toUBytes/toULength/toUnicodeStatus/mode and is
 *            picked up again at the start of the next call.
 * pErrorCode is set to
 *            - U_BUFFER_OVERFLOW_ERROR when the target is full,
 *            - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence
 *              (the bytes are stored in utf8->toUBytes),
 *            - U_USING_DEFAULT_WARNING when an extension lookup found only a
 *              partial match (cnv->preFromUFirstCP>=0), which signals the
 *              caller to revert to pivoting,
 *            - or whatever failure _extFromU() reports, in which case the
 *              code point is left in cnv->fromUChar32.
 *
 * A lookup result is used only if it is at least minValue: 0x800 when
 * cnv->useFallback is set, otherwise 0xc00.
 */
static void U_CALLCONV
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *sbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: bytes of the current sequence that came from a previous buffer
     * toULength:    bytes of the current sequence collected so far
     * toULimit:     expected total length of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue = 0;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
    /* pick the result table that matches the converter's LF/NL swap option */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    if(utf8->toULength > 0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
        c=(UChar32)utf8->toUnicodeStatus;
    } else {
        toULength=oldToULength=toULimit=0;
        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
    // If the buffer ends with a truncated 2- or 3-byte sequence,
    // then we reduce the sourceLimit to before that,
    // and collect the remaining bytes after the conversion loop.
    {
        // Do not go back into the bytes that will be read for finishing a partial
        // sequence from the previous buffer.
        int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        if(length>0) {
            uint8_t b1=*(sourceLimit-1);
            if(U8_IS_SINGLE(b1)) {
                // common ASCII character
            } else if(U8_IS_TRAIL(b1) && length>=2) {
                uint8_t b2=*(sourceLimit-2);
                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                    // truncated 3-byte sequence
                    sourceLimit-=2;
                }
            } else if(0xc2<=b1 && b1<0xf0) {
                // truncated 2- or 3-byte sequence
                --sourceLimit;
            }
        }
    }

    /*
     * A partial sequence was saved by a previous call: clear the saved state
     * and jump into the loop body to continue collecting its trail bytes.
     */
    if(c!=0 && targetCapacity>0) {
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /*
         * Note: We could avoid the goto by duplicating some of the moreBytes
         * code, but only up to the point of collecting a complete UTF-8
         * sequence; then recurse for the toUBytes[toULength]
         * and then continue with normal conversion.
         *
         * If so, move this code to just after initializing the minimum
         * set of local variables for reading the UTF-8 input
         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
         *
         * Potential advantages:
         * - avoid the goto
         * - oldToULength could become a local variable in just those code blocks
         *   that deal with buffer boundaries
         * - possibly faster if the goto prevents some compiler optimizations
         *   (this would need measuring to confirm)
         * Disadvantage:
         * - code duplication
         */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if(U8_IS_SINGLE(b)) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    /* the byte maps to itself: copy it without a table lookup */
                    *target++=(uint8_t)b;
                    --targetCapacity;
                    continue;
                } else {
                    c=b;
                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
                }
            } else {
                if(b<0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
                        if(value>=minValue) {
                            *target++=(uint8_t)value;
                            --targetCapacity;
                            continue;
                        } else {
                            /* not usable: assemble the full code point for the extension lookup below */
                            c=(c<<6)|t1;
                        }
                    } else {
                        /* c<0 routes this byte to the general path below */
                        c=-1;
                    }
                } else if(b==0xe0) {
                    /* t1>=0x20 keeps the resulting code point at or above U+0800 */
                    if( /* handle U+0800..U+0FFF inline */
                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        c=t1;
                        source+=2;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
                        if(value>=minValue) {
                            *target++=(uint8_t)value;
                            --targetCapacity;
                            continue;
                        } else {
                            /* not usable: assemble the full code point for the extension lookup below */
                            c=(c<<6)|t2;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    /* start a new sequence at lead byte b; none of it came from a previous buffer */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                ++source;
                                ++toULength;
                                /* raw accumulation; utf8_offsets is subtracted once the sequence is complete */
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    if(toULength==toULimit) {
                        /* complete sequence: turn the accumulated bytes into the code point */
                        c-=utf8_offsets[toULength];
                        if(toULength<=3) {  /* BMP */
                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                        } else {
                            /* supplementary code point */
                            if(!hasSupplementary) {
                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                /* 0 is below either minValue, so the extension path below is taken */
                                value=0;
                            } else {
                                value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                            }
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter state before returning */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }
                }
            }

            if(value>=minValue) {
                /* output the mapping for c */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else {
                /* value<minValue means c is unassigned (unmappable) */
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     */
    /* note: the condition also restores sourceLimit to the caller's real limit */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_BYTES(b);
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/*
 * Convert UTF-8 input directly to a codepage whose results are one or two
 * bytes per character, without going through UTF-16.
 *
 * The parameters, the handling of sequences that are split across buffers,
 * and the error codes are the same as in ucnv_SBCSFromUTF8().
 * Differences visible in this function:
 * - a fast-index result of 0 is treated as "no mapping" and goes to the
 *   extension lookup at the unassigned label;
 * - results up to 0xff are written as one byte, larger ones as two bytes
 *   (high byte first);
 * - if only the first of two bytes fits, the second byte is stored in
 *   cnv->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 */
static void U_CALLCONV
ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *mbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: bytes of the current sequence that came from a previous buffer
     * toULength:    bytes of the current sequence collected so far
     * toULimit:     expected total length of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint16_t value = 0;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    /* pick the result table that matches the converter's LF/NL swap option */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    if(utf8->toULength > 0) {
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
        c=(UChar32)utf8->toUnicodeStatus;
    } else {
        toULength=oldToULength=toULimit=0;
        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
    // If the buffer ends with a truncated 2- or 3-byte sequence,
    // then we reduce the sourceLimit to before that,
    // and collect the remaining bytes after the conversion loop.
    {
        // Do not go back into the bytes that will be read for finishing a partial
        // sequence from the previous buffer.
        int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        if(length>0) {
            uint8_t b1=*(sourceLimit-1);
            if(U8_IS_SINGLE(b1)) {
                // common ASCII character
            } else if(U8_IS_TRAIL(b1) && length>=2) {
                uint8_t b2=*(sourceLimit-2);
                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                    // truncated 3-byte sequence
                    sourceLimit-=2;
                }
            } else if(0xc2<=b1 && b1<0xf0) {
                // truncated 2- or 3-byte sequence
                --sourceLimit;
            }
        }
    }

    /*
     * A partial sequence was saved by a previous call: clear the saved state
     * and jump into the loop body to continue collecting its trail bytes.
     */
    if(c!=0 && targetCapacity>0) {
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if(U8_IS_SINGLE(b)) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    /* the byte maps to itself: copy it without a table lookup */
                    *target++=b;
                    --targetCapacity;
                    continue;
                } else {
                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
                    if(value==0) {
                        c=b;
                        goto unassigned;
                    }
                }
            } else {
                if(b>=0xe0) {
                    if( /* handle U+0800..U+D7FF inline */
                        b<=0xed &&  // do not assume maxFastUChar>0xd7ff
                        U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        /* c holds only the lead and first-trail bits here; t2 is passed to the lookup separately */
                        c=((b&0xf)<<6)|(t1&0x3f);
                        source+=2;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t2;
                            goto unassigned;
                        }
                    } else {
                        /* c<0 routes this byte to the general path below */
                        c=-1;
                    }
                } else {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t1;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    /* start a new sequence at lead byte b; none of it came from a previous buffer */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                ++source;
                                ++toULength;
                                /* raw accumulation; utf8_offsets is subtracted once the sequence is complete */
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    if(toULength==toULimit) {
                        /* complete sequence: turn the accumulated bytes into the code point */
                        c-=utf8_offsets[toULength];
                        if(toULength<=3) {  /* BMP */
                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                        } else {
                            /* supplementary code point */
                            if(!hasSupplementary) {
                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                stage2Entry=0;
                            } else {
                                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                            }
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter state before returning */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }

                    /* get the bytes and the length for the output */
                    /* MBCS_OUTPUT_2 */
                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

                    /* is this code point assigned, or do we use fallbacks? */
                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                    ) {
                        goto unassigned;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(value<=0xff) {
                /* this is easy because we know that there is enough space */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else /* length==2 */ {
                /* the high byte always fits; the low byte may have to go to the overflow buffer */
                *target++=(uint8_t)(value>>8);
                if(2<=targetCapacity) {
                    *target++=(uint8_t)value;
                    targetCapacity-=2;
                } else {
                    cnv->charErrorBuffer[0]=(char)value;
                    cnv->charErrorBufferLength=1;

                    /* target overflow */
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
            continue;

unassigned:
            {
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                    continue;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     */
    /* note: the condition also restores sourceLimit to the caller's real limit */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_BYTES(b);
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Fill starters[0..255]: starters[i] is set to whether byte value i has a
 * transition entry in the state table row selected by mbcs.dbcsOnlyState.
 * The UErrorCode parameter is unnamed and not used.
 */
static void U_CALLCONV
ucnv_MBCSGetStarters(const UConverter* cnv,
                     UBool starters[256],
                     UErrorCode *) {
    const int32_t *state0;
    int i;

    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
    for(i=0; i<256; ++i) {
        /* all bytes that cause a state transition from state 0 are lead bytes */
        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
    }
}

/*
 * This is an internal function that allows other converter implementations
 * to check whether a byte is a lead byte.
5644 */ 5645 U_CFUNC UBool 5646 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5647 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5648 } 5649 5650 static void U_CALLCONV 5651 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5652 int32_t offsetIndex, 5653 UErrorCode *pErrorCode) { 5654 UConverter *cnv=pArgs->converter; 5655 char *p, *subchar; 5656 char buffer[4]; 5657 int32_t length; 5658 5659 /* first, select between subChar and subChar1 */ 5660 if( cnv->subChar1!=0 && 5661 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5662 cnv->useSubChar1 : 5663 (cnv->invalidUCharBuffer[0]<=0xff)) 5664 ) { 5665 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5666 subchar=(char *)&cnv->subChar1; 5667 length=1; 5668 } else { 5669 /* select subChar in all other cases */ 5670 subchar=(char *)cnv->subChars; 5671 length=cnv->subCharLen; 5672 } 5673 5674 /* reset the selector for the next code point */ 5675 cnv->useSubChar1=FALSE; 5676 5677 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5678 p=buffer; 5679 5680 /* fromUnicodeStatus contains prevLength */ 5681 switch(length) { 5682 case 1: 5683 if(cnv->fromUnicodeStatus==2) { 5684 /* DBCS mode and SBCS sub char: change to SBCS */ 5685 cnv->fromUnicodeStatus=1; 5686 *p++=UCNV_SI; 5687 } 5688 *p++=subchar[0]; 5689 break; 5690 case 2: 5691 if(cnv->fromUnicodeStatus<=1) { 5692 /* SBCS mode and DBCS sub char: change to DBCS */ 5693 cnv->fromUnicodeStatus=2; 5694 *p++=UCNV_SO; 5695 } 5696 *p++=subchar[0]; 5697 *p++=subchar[1]; 5698 break; 5699 default: 5700 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5701 return; 5702 } 5703 subchar=buffer; 5704 length=(int32_t)(p-buffer); 5705 } 5706 5707 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5708 } 5709 5710 U_CFUNC UConverterType 5711 ucnv_MBCSGetType(const UConverter* converter) { 5712 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by 
MBCS, but here we cheat a little */ 5713 if(converter->sharedData->mbcs.countStates==1) { 5714 return (UConverterType)UCNV_SBCS; 5715 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5716 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5717 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5718 return (UConverterType)UCNV_DBCS; 5719 } 5720 return (UConverterType)UCNV_MBCS; 5721 } 5722 5723 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 5724