1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvmbcs.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000jul03 14 * created by: Markus W. Scherer 15 * 16 * The current code in this file replaces the previous implementation 17 * of conversion code from multi-byte codepages to Unicode and back. 18 * This implementation supports the following: 19 * - legacy variable-length codepages with up to 4 bytes per character 20 * - all Unicode code points (up to 0x10ffff) 21 * - efficient distinction of unassigned vs. illegal byte sequences 22 * - it is possible in fromUnicode() to directly deal with simple 23 * stateful encodings (used for EBCDIC_STATEFUL) 24 * - it is possible to convert Unicode code points 25 * to a single zero byte (but not as a fallback except for SBCS) 26 * 27 * Remaining limitations in fromUnicode: 28 * - byte sequences must not have leading zero bytes 29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 30 * - limitation to up to 4 bytes per character 31 * 32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 33 * limitations and adds m:n character mappings and other features. 34 * See ucnv_ext.h for details. 
35 * 36 * Change history: 37 * 38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 40 * macros to ucnvmbcs.h file 41 */ 42 43 #include "unicode/utypes.h" 44 45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 46 47 #include "unicode/ucnv.h" 48 #include "unicode/ucnv_cb.h" 49 #include "unicode/udata.h" 50 #include "unicode/uset.h" 51 #include "unicode/utf8.h" 52 #include "unicode/utf16.h" 53 #include "ucnv_bld.h" 54 #include "ucnvmbcs.h" 55 #include "ucnv_ext.h" 56 #include "ucnv_cnv.h" 57 #include "cmemory.h" 58 #include "cstring.h" 59 #include "umutex.h" 60 61 /* control optimizations according to the platform */ 62 #define MBCS_UNROLL_SINGLE_TO_BMP 1 63 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 64 65 /* 66 * _MBCSHeader versions 5.3 & 4.3 67 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 68 * 69 * This version is optional. Version 5 is used for incompatible data format changes. 70 * makeconv will continue to generate version 4 files if possible. 71 * 72 * Changes from version 4: 73 * 74 * The main difference is an additional _MBCSHeader field with 75 * - the length (number of uint32_t) of the _MBCSHeader 76 * - flags for further incompatible data format changes 77 * - flags for further, backward compatible data format changes 78 * 79 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 80 * the file and needs to be reconstituted at load time. 81 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 82 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 83 * (For details about these structures see below, and see ucnvmbcs.h.) 84 * 85 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 86 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 87 * precision markers for all mappings.) 
88 * 89 * All fallbacks have been moved to the extension table, leaving only roundtrips in the 90 * omitted data that can be reconstituted from the toUnicode data. 91 * 92 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 93 * With only roundtrip mappings in the base fromUnicode data, this part is fully 94 * redundant with the mbcsIndex and will be reconstituted from that (also using the 95 * stage 1 table which contains the information about how stage 2 was compacted). 96 * 97 * The rest of the stage 2 table, the part for code points above maxFastUChar, 98 * is stored in the file and will be appended to the reconstituted part. 99 * 100 * The entire fromUBytes array is omitted from the file and will be reconstituted. 101 * This is done by enumerating all toUnicode roundtrip mappings, performing 102 * each mapping (using the stage 1 and reconstituted stage 2 tables) and 103 * writing instead of reading the byte values. 104 * 105 * _MBCSHeader version 4.3 106 * 107 * Change from version 4.2: 108 * - Optional utf8Friendly data structures, with 64-entry stage 3 block 109 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 110 * files which can be used instead of stages 1 & 2. 111 * Faster lookups for roundtrips from most commonly used characters, 112 * and lookups from UTF-8 byte sequences with a natural bit distribution. 113 * See ucnvmbcs.h for more details. 114 * 115 * Change from version 4.1: 116 * - Added an optional extension table structure at the end of the .cnv file. 117 * It is present if the upper bits of the header flags field contain a non-zero 118 * byte offset to it. 119 * Files that contain only a conversion table and no base table 120 * use the special outputType MBCS_OUTPUT_EXT_ONLY. 121 * These contain the base table name between the MBCS header and the extension 122 * data. 
123 * 124 * Change from version 4.0: 125 * - Replace header.reserved with header.fromUBytesLength so that all 126 * fields in the data have length. 127 * 128 * Changes from version 3 (for performance improvements): 129 * - new bit distribution for state table entries 130 * - reordered action codes 131 * - new data structure for single-byte fromUnicode 132 * + stage 2 only contains indexes 133 * + stage 3 stores 16 bits per character with classification bits 15..8 134 * - no multiplier for stage 1 entries 135 * - stage 2 for non-single-byte codepages contains the index and the flags in 136 * one 32-bit value 137 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 138 * 139 * For more details about old versions of the MBCS data structure, see 140 * the corresponding versions of this file. 141 * 142 * Converting stateless codepage data ---------------------------------------*** 143 * (or codepage data with simple states) to Unicode. 144 * 145 * Data structure and algorithm for converting from complex legacy codepages 146 * to Unicode. (Designed before 2000-may-22.) 147 * 148 * The basic idea is that the structure of legacy codepages can be described 149 * with state tables. 150 * When reading a byte stream, each input byte causes a state transition. 151 * Some transitions result in the output of a code point, some result in 152 * "unassigned" or "illegal" output. 153 * This is used here for character conversion. 154 * 155 * The data structure begins with a state table consisting of a row 156 * per state, with 256 entries (columns) per row for each possible input 157 * byte value. 158 * Each entry is 32 bits wide, with two formats distinguished by 159 * the sign bit (bit 31): 160 * 161 * One format for transitional entries (bit 31 not set) for non-final bytes, and 162 * one format for final entries (bit 31 set). 163 * Both formats contain the number of the next state in the same bit 164 * positions. 165 * State 0 is the initial state. 
166 * 167 * Most of the time, the offset values of subsequent states are added 168 * up to a scalar value. This value will eventually be the index of 169 * the Unicode code point in a table that follows the state table. 170 * The effect is that the code points for final state table rows 171 * are contiguous. The code points of final state rows follow each other 172 * in the order of the references to those final states by previous 173 * states, etc. 174 * 175 * For some terminal states, the offset is itself the output Unicode 176 * code point (16 bits for a BMP code point or 20 bits for a supplementary 177 * code point (stored as code point minus 0x10000 so that 20 bits are enough). 178 * For others, the code point in the Unicode table is stored with either 179 * one or two code units: one for BMP code points, two for a pair of 180 * surrogates. 181 * All code points for a final state entry take up the same number of code 182 * units, regardless of whether they all actually _use_ the same number 183 * of code units. This is necessary for simple array access. 184 * 185 * An additional feature comes in with what in ICU is called "fallback" 186 * mappings: 187 * 188 * In addition to round-trippable, precise, 1:1 mappings, there are often 189 * mappings defined between similar, though not the same, characters. 190 * Typically, such mappings occur only in fromUnicode mapping tables because 191 * Unicode has a superset repertoire of most other codepages. However, it 192 * is possible to provide such mappings in the toUnicode tables, too. 193 * In this case, the fallback mappings are partly integrated into the 194 * general state tables because the structure of the encoding includes their 195 * byte sequences. 196 * For final entries in an initial state, fallback mappings are stored in 197 * the entry itself like with roundtrip mappings. 198 * For other final entries, they are stored in the code units table if 199 * the entry is for a pair of code units. 
200 * For single-unit results in the code units table, there is no space to 201 * alternatively hold a fallback mapping; in this case, the code unit 202 * is stored as U+fffe (unassigned), and the fallback mapping needs to 203 * be looked up by the scalar offset value in a separate table. 204 * 205 * "Unassigned" state entries really mean "structurally unassigned", 206 * i.e., such a byte sequence will never have a mapping result. 207 * 208 * The interpretation of the bits in each entry is as follows: 209 * 210 * Bit 31 not set, not a terminal entry ("transitional"): 211 * 30..24 next state 212 * 23..0 offset delta, to be added up 213 * 214 * Bit 31 set, terminal ("final") entry: 215 * 30..24 next state (regardless of action code) 216 * 23..20 action code: 217 * action codes 0 and 1 result in precise-mapping Unicode code points 218 * 0 valid byte sequence 219 * 19..16 not used, 0 220 * 15..0 16-bit Unicode BMP code point 221 * never U+fffe or U+ffff 222 * 1 valid byte sequence 223 * 19..0 20-bit Unicode supplementary code point 224 * never U+fffe or U+ffff 225 * 226 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 227 * 2 valid byte sequence (fallback) 228 * 19..16 not used, 0 229 * 15..0 16-bit Unicode BMP code point as fallback result 230 * 3 valid byte sequence (fallback) 231 * 19..0 20-bit Unicode supplementary code point as fallback result 232 * 233 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 234 * depending on the code units they result in 235 * 4 valid byte sequence 236 * 19..9 not used, 0 237 * 8..0 final offset delta 238 * pointing to one 16-bit code unit which may be 239 * fffe unassigned -- look for a fallback for this offset 240 * ffff illegal 241 * 5 valid byte sequence 242 * 19..9 not used, 0 243 * 8..0 final offset delta 244 * pointing to two 16-bit code units 245 * (typically UTF-16 surrogates) 246 * the result depends on the first code unit as follows: 247 * 0000..d7ff 
roundtrip BMP code point (1st alone) 248 * d800..dbff roundtrip surrogate pair (1st, 2nd) 249 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 250 * e000 roundtrip BMP code point (2nd alone) 251 * e001 fallback BMP code point (2nd alone) 252 * fffe unassigned 253 * ffff illegal 254 * (the final offset deltas are at most 255 * 2, 255 * times 2 because of storing code unit pairs) 256 * 257 * 6 unassigned byte sequence 258 * 19..16 not used, 0 259 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 260 * this does not contain a final offset delta because the main 261 * purpose of this action code is to save scalar offset values; 262 * therefore, fallback values cannot be assigned to byte 263 * sequences that result in this action code 264 * 7 illegal byte sequence 265 * 19..16 not used, 0 266 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 267 * 8 state change only 268 * 19..0 not used, 0 269 * useful for state changes in simple stateful encodings, 270 * at Shift-In/Shift-Out codes 271 * 272 * 273 * 9..15 reserved for future use 274 * current implementations will only perform a state change 275 * and ignore bits 19..0 276 * 277 * An encoding with contiguous ranges of unassigned byte sequences, like 278 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 279 * at least two states for the trail bytes: 280 * One trail byte state that results in code points, and one that only 281 * has "unassigned" and "illegal" terminal states. 282 * 283 * Note: partly by accident, this data structure supports simple stateful 284 * encodings without any additional logic. 285 * Currently, only simple Shift-In/Shift-Out schemes are handled with 286 * appropriate state tables (especially EBCDIC_STATEFUL!). 
287 * 288 * MBCS version 2 added: 289 * unassigned and illegal action codes have U+fffe and U+ffff 290 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 291 * 292 * Converting from Unicode to codepage bytes --------------------------------*** 293 * 294 * The conversion data structure for fromUnicode is designed for the known 295 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 296 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 297 * a roundtrip mapping. 298 * 299 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 300 * like in the character properties table. 301 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 302 * with the resulting bytes is at offsetFromUBytes. 303 * 304 * Beginning with version 4, single-byte codepages have a significantly different 305 * trie compared to other codepages. 306 * In all cases, the entry in stage 1 is directly the index of the block of 307 * 64 entries in stage 2. 308 * 309 * Single-byte lookup: 310 * 311 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 312 * Stage 3 contains one 16-bit word per result: 313 * Bits 15..8 indicate the kind of result: 314 * f roundtrip result 315 * c fallback result from private-use code point 316 * 8 fallback result from other code points 317 * 0 unassigned 318 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 319 * 320 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 321 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 322 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 323 * ASCII code points can be looked up with a linear array access into stage 3. 324 * See maxFastUChar and other details in ucnvmbcs.h. 
325 * 326 * Multi-byte lookup: 327 * 328 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 329 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 330 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 331 * If this test is false, then a non-zero result will be interpreted as 332 * a fallback mapping. 333 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 334 * 335 * Stage 3 contains 2, 3, or 4 bytes per result. 336 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 337 * while 3 bytes are stored as bytes in big-endian order. 338 * Leading zero bytes are ignored, and the number of bytes is counted. 339 * A zero byte mapping result is possible as a roundtrip result. 340 * For some output types, the actual result is processed from this; 341 * see ucnv_MBCSFromUnicodeWithOffsets(). 342 * 343 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 344 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 345 * 346 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 347 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 348 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 349 * ASCII code points can be looked up with a linear array access into stage 3. 350 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 351 * 352 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 353 * for compaction. 354 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 355 * may overlap by any number of entries. 
356 * 357 * MBCS version 2 added: 358 * the converter checks for known output types, which allows 359 * adding new ones without crashing an unaware converter 360 */ 361 362 /** 363 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 364 * consecutive sequences of bytes, starting from the one encoded in value, 365 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 366 * Does not currently support m:n mappings or reverse fallbacks. 367 * This function will not be called for sequences of bytes with leading zeros. 368 * 369 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 370 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 371 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 372 * not map to anything 373 * @return TRUE to continue enumeration, FALSE to stop 374 */ 375 typedef UBool U_CALLCONV 376 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 377 378 static void 379 ucnv_MBCSLoad(UConverterSharedData *sharedData, 380 UConverterLoadArgs *pArgs, 381 const uint8_t *raw, 382 UErrorCode *pErrorCode); 383 384 static void 385 ucnv_MBCSUnload(UConverterSharedData *sharedData); 386 387 static void 388 ucnv_MBCSOpen(UConverter *cnv, 389 UConverterLoadArgs *pArgs, 390 UErrorCode *pErrorCode); 391 392 static UChar32 393 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 394 UErrorCode *pErrorCode); 395 396 static void 397 ucnv_MBCSGetStarters(const UConverter* cnv, 398 UBool starters[256], 399 UErrorCode *pErrorCode); 400 401 static const char * 402 ucnv_MBCSGetName(const UConverter *cnv); 403 404 static void 405 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 406 int32_t offsetIndex, 407 UErrorCode *pErrorCode); 408 409 static UChar32 410 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 411 UErrorCode *pErrorCode); 412 413 static void 414 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs 
*pFromUArgs, 415 UConverterToUnicodeArgs *pToUArgs, 416 UErrorCode *pErrorCode); 417 418 static void 419 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 420 const USetAdder *sa, 421 UConverterUnicodeSet which, 422 UErrorCode *pErrorCode); 423 424 static void 425 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 426 UConverterToUnicodeArgs *pToUArgs, 427 UErrorCode *pErrorCode); 428 429 static const UConverterImpl _SBCSUTF8Impl={ 430 UCNV_MBCS, 431 432 ucnv_MBCSLoad, 433 ucnv_MBCSUnload, 434 435 ucnv_MBCSOpen, 436 NULL, 437 NULL, 438 439 ucnv_MBCSToUnicodeWithOffsets, 440 ucnv_MBCSToUnicodeWithOffsets, 441 ucnv_MBCSFromUnicodeWithOffsets, 442 ucnv_MBCSFromUnicodeWithOffsets, 443 ucnv_MBCSGetNextUChar, 444 445 ucnv_MBCSGetStarters, 446 ucnv_MBCSGetName, 447 ucnv_MBCSWriteSub, 448 NULL, 449 ucnv_MBCSGetUnicodeSet, 450 451 NULL, 452 ucnv_SBCSFromUTF8 453 }; 454 455 static const UConverterImpl _DBCSUTF8Impl={ 456 UCNV_MBCS, 457 458 ucnv_MBCSLoad, 459 ucnv_MBCSUnload, 460 461 ucnv_MBCSOpen, 462 NULL, 463 NULL, 464 465 ucnv_MBCSToUnicodeWithOffsets, 466 ucnv_MBCSToUnicodeWithOffsets, 467 ucnv_MBCSFromUnicodeWithOffsets, 468 ucnv_MBCSFromUnicodeWithOffsets, 469 ucnv_MBCSGetNextUChar, 470 471 ucnv_MBCSGetStarters, 472 ucnv_MBCSGetName, 473 ucnv_MBCSWriteSub, 474 NULL, 475 ucnv_MBCSGetUnicodeSet, 476 477 NULL, 478 ucnv_DBCSFromUTF8 479 }; 480 481 static const UConverterImpl _MBCSImpl={ 482 UCNV_MBCS, 483 484 ucnv_MBCSLoad, 485 ucnv_MBCSUnload, 486 487 ucnv_MBCSOpen, 488 NULL, 489 NULL, 490 491 ucnv_MBCSToUnicodeWithOffsets, 492 ucnv_MBCSToUnicodeWithOffsets, 493 ucnv_MBCSFromUnicodeWithOffsets, 494 ucnv_MBCSFromUnicodeWithOffsets, 495 ucnv_MBCSGetNextUChar, 496 497 ucnv_MBCSGetStarters, 498 ucnv_MBCSGetName, 499 ucnv_MBCSWriteSub, 500 NULL, 501 ucnv_MBCSGetUnicodeSet, 502 NULL, 503 NULL 504 }; 505 506 507 /* Static data is in tools/makeconv/ucnvstat.c for data-based 508 * converters. Be sure to update it as well. 
509 */ 510 511 const UConverterSharedData _MBCSData={ 512 sizeof(UConverterSharedData), 1, 513 NULL, NULL, FALSE, TRUE, &_MBCSImpl, 514 0, UCNV_MBCS_TABLE_INITIALIZER 515 }; 516 517 518 /* GB 18030 data ------------------------------------------------------------ */ 519 520 /* helper macros for linear values for GB 18030 four-byte sequences */ 521 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 522 523 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 524 525 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 526 527 /* 528 * Some ranges of GB 18030 where both the Unicode code points and the 529 * GB four-byte sequences are contiguous and are handled algorithmically by 530 * the special callback functions below. 531 * The values are start & end of Unicode & GB codes. 532 * 533 * Note that single surrogates are not mapped by GB 18030 534 * as of the re-released mapping tables from 2000-nov-30. 535 */ 536 static const uint32_t 537 gb18030Ranges[14][4]={ 538 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 539 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 540 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 541 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 542 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 543 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 544 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 545 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 546 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 547 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 548 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 549 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 550 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 551 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 552 }; 553 554 /* bit flag for UConverter.options indicating GB 18030 special handling */ 555 #define 
_MBCS_OPTION_GB18030 0x8000 556 557 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 558 #define _MBCS_OPTION_KEIS 0x01000 559 #define _MBCS_OPTION_JEF 0x02000 560 #define _MBCS_OPTION_JIPS 0x04000 561 562 #define KEIS_SO_CHAR_1 0x0A 563 #define KEIS_SO_CHAR_2 0x42 564 #define KEIS_SI_CHAR_1 0x0A 565 #define KEIS_SI_CHAR_2 0x41 566 567 #define JEF_SO_CHAR 0x28 568 #define JEF_SI_CHAR 0x29 569 570 #define JIPS_SO_CHAR_1 0x1A 571 #define JIPS_SO_CHAR_2 0x70 572 #define JIPS_SI_CHAR_1 0x1A 573 #define JIPS_SI_CHAR_2 0x71 574 575 enum SISO_Option { 576 SI, 577 SO 578 }; 579 typedef enum SISO_Option SISO_Option; 580 581 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 582 int32_t SISOLength = 0; 583 584 switch (option) { 585 case SI: 586 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 587 value[0] = KEIS_SI_CHAR_1; 588 value[1] = KEIS_SI_CHAR_2; 589 SISOLength = 2; 590 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 591 value[0] = JEF_SI_CHAR; 592 SISOLength = 1; 593 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 594 value[0] = JIPS_SI_CHAR_1; 595 value[1] = JIPS_SI_CHAR_2; 596 SISOLength = 2; 597 } else { 598 value[0] = UCNV_SI; 599 SISOLength = 1; 600 } 601 break; 602 case SO: 603 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 604 value[0] = KEIS_SO_CHAR_1; 605 value[1] = KEIS_SO_CHAR_2; 606 SISOLength = 2; 607 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 608 value[0] = JEF_SO_CHAR; 609 SISOLength = 1; 610 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 611 value[0] = JIPS_SO_CHAR_1; 612 value[1] = JIPS_SO_CHAR_2; 613 SISOLength = 2; 614 } else { 615 value[0] = UCNV_SO; 616 SISOLength = 1; 617 } 618 break; 619 default: 620 /* Should never happen. 
*/ 621 break; 622 } 623 624 return SISOLength; 625 } 626 627 /* Miscellaneous ------------------------------------------------------------ */ 628 629 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 630 static UBool 631 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 632 int32_t state, uint32_t offset, 633 uint32_t value, 634 UConverterEnumToUCallback *callback, const void *context, 635 UErrorCode *pErrorCode) { 636 UChar32 codePoints[32]; 637 const int32_t *row; 638 const uint16_t *unicodeCodeUnits; 639 UChar32 anyCodePoints; 640 int32_t b, limit; 641 642 row=mbcsTable->stateTable[state]; 643 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 644 645 value<<=8; 646 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 647 648 b=(stateProps[state]&0x38)<<2; 649 if(b==0 && stateProps[state]>=0x40) { 650 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 651 codePoints[0]=U_SENTINEL; 652 b=1; 653 } 654 limit=((stateProps[state]&7)+1)<<5; 655 while(b<limit) { 656 int32_t entry=row[b]; 657 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 658 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 659 if(stateProps[nextState]>=0) { 660 /* recurse to a state with non-ignorable actions */ 661 if(!enumToU( 662 mbcsTable, stateProps, nextState, 663 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 664 value|(uint32_t)b, 665 callback, context, 666 pErrorCode)) { 667 return FALSE; 668 } 669 } 670 codePoints[b&0x1f]=U_SENTINEL; 671 } else { 672 UChar32 c; 673 int32_t action; 674 675 /* 676 * An if-else-if chain provides more reliable performance for 677 * the most common cases compared to a switch. 
678 */ 679 action=MBCS_ENTRY_FINAL_ACTION(entry); 680 if(action==MBCS_STATE_VALID_DIRECT_16) { 681 /* output BMP code point */ 682 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 683 } else if(action==MBCS_STATE_VALID_16) { 684 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 685 c=unicodeCodeUnits[finalOffset]; 686 if(c<0xfffe) { 687 /* output BMP code point */ 688 } else { 689 c=U_SENTINEL; 690 } 691 } else if(action==MBCS_STATE_VALID_16_PAIR) { 692 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 693 c=unicodeCodeUnits[finalOffset++]; 694 if(c<0xd800) { 695 /* output BMP code point below 0xd800 */ 696 } else if(c<=0xdbff) { 697 /* output roundtrip or fallback supplementary code point */ 698 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 699 } else if(c==0xe000) { 700 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 701 c=unicodeCodeUnits[finalOffset]; 702 } else { 703 c=U_SENTINEL; 704 } 705 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 706 /* output supplementary code point */ 707 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 708 } else { 709 c=U_SENTINEL; 710 } 711 712 codePoints[b&0x1f]=c; 713 anyCodePoints&=c; 714 } 715 if(((++b)&0x1f)==0) { 716 if(anyCodePoints>=0) { 717 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 718 return FALSE; 719 } 720 anyCodePoints=-1; 721 } 722 } 723 } 724 return TRUE; 725 } 726 727 /* 728 * Only called if stateProps[state]==-1. 729 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 730 * MBCS_STATE_CHANGE_ONLY. 
731 */ 732 static int8_t 733 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 734 const int32_t *row; 735 int32_t min, max, entry, nextState; 736 737 row=stateTable[state]; 738 stateProps[state]=0; 739 740 /* find first non-ignorable state */ 741 for(min=0;; ++min) { 742 entry=row[min]; 743 nextState=MBCS_ENTRY_STATE(entry); 744 if(stateProps[nextState]==-1) { 745 getStateProp(stateTable, stateProps, nextState); 746 } 747 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 748 if(stateProps[nextState]>=0) { 749 break; 750 } 751 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 752 break; 753 } 754 if(min==0xff) { 755 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 756 return stateProps[state]; 757 } 758 } 759 stateProps[state]|=(int8_t)((min>>5)<<3); 760 761 /* find last non-ignorable state */ 762 for(max=0xff; min<max; --max) { 763 entry=row[max]; 764 nextState=MBCS_ENTRY_STATE(entry); 765 if(stateProps[nextState]==-1) { 766 getStateProp(stateTable, stateProps, nextState); 767 } 768 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 769 if(stateProps[nextState]>=0) { 770 break; 771 } 772 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 773 break; 774 } 775 } 776 stateProps[state]|=(int8_t)(max>>5); 777 778 /* recurse further and collect direct-state information */ 779 while(min<=max) { 780 entry=row[min]; 781 nextState=MBCS_ENTRY_STATE(entry); 782 if(stateProps[nextState]==-1) { 783 getStateProp(stateTable, stateProps, nextState); 784 } 785 if(MBCS_ENTRY_IS_FINAL(entry)) { 786 stateProps[nextState]|=0x40; 787 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 788 stateProps[state]|=0x40; 789 } 790 } 791 ++min; 792 } 793 return stateProps[state]; 794 } 795 796 /* 797 * Internal function enumerating the toUnicode data of an MBCS converter. 
798 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 799 * table, but could also be used for a future ucnv_getUnicodeSet() option 800 * that includes reverse fallbacks (after updating this function's implementation). 801 * Currently only handles roundtrip mappings. 802 * Does not currently handle extensions. 803 */ 804 static void 805 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 806 UConverterEnumToUCallback *callback, const void *context, 807 UErrorCode *pErrorCode) { 808 /* 809 * Properties for each state, to speed up the enumeration. 810 * Ignorable actions are unassigned/illegal/state-change-only: 811 * They do not lead to mappings. 812 * 813 * Bits 7..6: 814 * 1 direct/initial state (stateful converters have multiple) 815 * 0 non-initial state with transitions or with non-ignorable result actions 816 * -1 final state with only ignorable actions 817 * 818 * Bits 5..3: 819 * The lowest byte value with non-ignorable actions is 820 * value<<5 (rounded down). 821 * 822 * Bits 2..0: 823 * The highest byte value with non-ignorable actions is 824 * (value<<5)&0x1f (rounded up). 
825 */ 826 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 827 int32_t state; 828 829 uprv_memset(stateProps, -1, sizeof(stateProps)); 830 831 /* recurse from state 0 and set all stateProps */ 832 getStateProp(mbcsTable->stateTable, stateProps, 0); 833 834 for(state=0; state<mbcsTable->countStates; ++state) { 835 /*if(stateProps[state]==-1) { 836 printf("unused/unreachable <icu:state> %d\n", state); 837 }*/ 838 if(stateProps[state]>=0x40) { 839 /* start from each direct state */ 840 enumToU( 841 mbcsTable, stateProps, state, 0, 0, 842 callback, context, 843 pErrorCode); 844 } 845 } 846 } 847 848 U_CFUNC void 849 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 850 const USetAdder *sa, 851 UConverterUnicodeSet which, 852 UConverterSetFilter filter, 853 UErrorCode *pErrorCode) { 854 const UConverterMBCSTable *mbcsTable; 855 const uint16_t *table; 856 857 uint32_t st3; 858 uint16_t st1, maxStage1, st2; 859 860 UChar32 c; 861 862 /* enumerate the from-Unicode trie table */ 863 mbcsTable=&sharedData->mbcs; 864 table=mbcsTable->fromUnicodeTable; 865 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 866 maxStage1=0x440; 867 } else { 868 maxStage1=0x40; 869 } 870 871 c=0; /* keep track of the current code point while enumerating */ 872 873 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 874 const uint16_t *stage2, *stage3, *results; 875 uint16_t minValue; 876 877 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 878 879 /* 880 * Set a threshold variable for selecting which mappings to use. 881 * See ucnv_MBCSSingleFromBMPWithOffsets() and 882 * MBCS_SINGLE_RESULT_FROM_U() for details. 
883 */ 884 if(which==UCNV_ROUNDTRIP_SET) { 885 /* use only roundtrips */ 886 minValue=0xf00; 887 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 888 /* use all roundtrip and fallback results */ 889 minValue=0x800; 890 } 891 892 for(st1=0; st1<maxStage1; ++st1) { 893 st2=table[st1]; 894 if(st2>maxStage1) { 895 stage2=table+st2; 896 for(st2=0; st2<64; ++st2) { 897 if((st3=stage2[st2])!=0) { 898 /* read the stage 3 block */ 899 stage3=results+st3; 900 901 do { 902 if(*stage3++>=minValue) { 903 sa->add(sa->set, c); 904 } 905 } while((++c&0xf)!=0); 906 } else { 907 c+=16; /* empty stage 3 block */ 908 } 909 } 910 } else { 911 c+=1024; /* empty stage 2 block */ 912 } 913 } 914 } else { 915 const uint32_t *stage2; 916 const uint8_t *stage3, *bytes; 917 uint32_t st3Multiplier; 918 uint32_t value; 919 UBool useFallback; 920 921 bytes=mbcsTable->fromUnicodeBytes; 922 923 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 924 925 switch(mbcsTable->outputType) { 926 case MBCS_OUTPUT_3: 927 case MBCS_OUTPUT_4_EUC: 928 st3Multiplier=3; 929 break; 930 case MBCS_OUTPUT_4: 931 st3Multiplier=4; 932 break; 933 default: 934 st3Multiplier=2; 935 break; 936 } 937 938 for(st1=0; st1<maxStage1; ++st1) { 939 st2=table[st1]; 940 if(st2>(maxStage1>>1)) { 941 stage2=(const uint32_t *)table+st2; 942 for(st2=0; st2<64; ++st2) { 943 if((st3=stage2[st2])!=0) { 944 /* read the stage 3 block */ 945 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 946 947 /* get the roundtrip flags for the stage 3 block */ 948 st3>>=16; 949 950 /* 951 * Add code points for which the roundtrip flag is set, 952 * or which map to non-zero bytes if we use fallbacks. 953 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
954 */ 955 switch(filter) { 956 case UCNV_SET_FILTER_NONE: 957 do { 958 if(st3&1) { 959 sa->add(sa->set, c); 960 stage3+=st3Multiplier; 961 } else if(useFallback) { 962 uint8_t b=0; 963 switch(st3Multiplier) { 964 case 4: 965 b|=*stage3++; 966 case 3: /*fall through*/ 967 b|=*stage3++; 968 case 2: /*fall through*/ 969 b|=stage3[0]|stage3[1]; 970 stage3+=2; 971 default: 972 break; 973 } 974 if(b!=0) { 975 sa->add(sa->set, c); 976 } 977 } 978 st3>>=1; 979 } while((++c&0xf)!=0); 980 break; 981 case UCNV_SET_FILTER_DBCS_ONLY: 982 /* Ignore single-byte results (<0x100). */ 983 do { 984 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 985 sa->add(sa->set, c); 986 } 987 st3>>=1; 988 stage3+=2; /* +=st3Multiplier */ 989 } while((++c&0xf)!=0); 990 break; 991 case UCNV_SET_FILTER_2022_CN: 992 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 993 do { 994 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 995 sa->add(sa->set, c); 996 } 997 st3>>=1; 998 stage3+=3; /* +=st3Multiplier */ 999 } while((++c&0xf)!=0); 1000 break; 1001 case UCNV_SET_FILTER_SJIS: 1002 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 1003 do { 1004 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 1005 sa->add(sa->set, c); 1006 } 1007 st3>>=1; 1008 stage3+=2; /* +=st3Multiplier */ 1009 } while((++c&0xf)!=0); 1010 break; 1011 case UCNV_SET_FILTER_GR94DBCS: 1012 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ 1013 do { 1014 if( ((st3&1)!=0 || useFallback) && 1015 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 1016 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1017 ) { 1018 sa->add(sa->set, c); 1019 } 1020 st3>>=1; 1021 stage3+=2; /* +=st3Multiplier */ 1022 } while((++c&0xf)!=0); 1023 break; 1024 case UCNV_SET_FILTER_HZ: 1025 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ 1026 do { 1027 if( ((st3&1)!=0 || useFallback) && 1028 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1029 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1030 ) { 1031 sa->add(sa->set, c); 1032 } 1033 st3>>=1; 1034 stage3+=2; /* +=st3Multiplier */ 1035 } while((++c&0xf)!=0); 1036 break; 1037 default: 1038 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1039 return; 1040 } 1041 } else { 1042 c+=16; /* empty stage 3 block */ 1043 } 1044 } 1045 } else { 1046 c+=1024; /* empty stage 2 block */ 1047 } 1048 } 1049 } 1050 1051 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 1052 } 1053 1054 U_CFUNC void 1055 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 1056 const USetAdder *sa, 1057 UConverterUnicodeSet which, 1058 UErrorCode *pErrorCode) { 1059 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 1060 sharedData, sa, which, 1061 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1062 UCNV_SET_FILTER_DBCS_ONLY : 1063 UCNV_SET_FILTER_NONE, 1064 pErrorCode); 1065 } 1066 1067 static void 1068 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 1069 const USetAdder *sa, 1070 UConverterUnicodeSet which, 1071 UErrorCode *pErrorCode) { 1072 if(cnv->options&_MBCS_OPTION_GB18030) { 1073 sa->addRange(sa->set, 0, 0xd7ff); 1074 sa->addRange(sa->set, 0xe000, 0x10ffff); 1075 } else { 1076 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 1077 } 1078 } 1079 1080 /* conversion extensions for input not in the main table -------------------- */ 1081 1082 /* 1083 * Hardcoded extension handling for GB 18030. 
1084 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 1085 * 1086 * In the future, conversion extensions may handle m:n mappings and delta tables, 1087 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 1088 * 1089 * If an input character cannot be mapped, then these functions set an error 1090 * code. The framework will then call the callback function. 1091 */ 1092 1093 /* 1094 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 1095 * else return 0 after output has been written to the target 1096 */ 1097 static UChar32 1098 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 1099 UChar32 cp, 1100 const UChar **source, const UChar *sourceLimit, 1101 uint8_t **target, const uint8_t *targetLimit, 1102 int32_t **offsets, int32_t sourceIndex, 1103 UBool flush, 1104 UErrorCode *pErrorCode) { 1105 const int32_t *cx; 1106 1107 cnv->useSubChar1=FALSE; 1108 1109 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1110 ucnv_extInitialMatchFromU( 1111 cnv, cx, 1112 cp, source, sourceLimit, 1113 (char **)target, (char *)targetLimit, 1114 offsets, sourceIndex, 1115 flush, 1116 pErrorCode) 1117 ) { 1118 return 0; /* an extension mapping handled the input */ 1119 } 1120 1121 /* GB 18030 */ 1122 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 1123 const uint32_t *range; 1124 int32_t i; 1125 1126 range=gb18030Ranges[0]; 1127 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1128 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 1129 /* found the Unicode code point, output the four-byte sequence for it */ 1130 uint32_t linear; 1131 char bytes[4]; 1132 1133 /* get the linear value of the first GB 18030 code in this range */ 1134 linear=range[2]-LINEAR_18030_BASE; 1135 1136 /* add the offset from the beginning of the range */ 1137 linear+=((uint32_t)cp-range[0]); 1138 1139 /* turn this into a four-byte sequence */ 1140 bytes[3]=(char)(0x30+linear%10); linear/=10; 
1141 bytes[2]=(char)(0x81+linear%126); linear/=126; 1142 bytes[1]=(char)(0x30+linear%10); linear/=10; 1143 bytes[0]=(char)(0x81+linear); 1144 1145 /* output this sequence */ 1146 ucnv_fromUWriteBytes(cnv, 1147 bytes, 4, (char **)target, (char *)targetLimit, 1148 offsets, sourceIndex, pErrorCode); 1149 return 0; 1150 } 1151 } 1152 } 1153 1154 /* no mapping */ 1155 *pErrorCode=U_INVALID_CHAR_FOUND; 1156 return cp; 1157 } 1158 1159 /* 1160 * Input sequence: cnv->toUBytes[0..length[ 1161 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1162 * else return 0 after output has been written to the target 1163 */ 1164 static int8_t 1165 _extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1166 int8_t length, 1167 const uint8_t **source, const uint8_t *sourceLimit, 1168 UChar **target, const UChar *targetLimit, 1169 int32_t **offsets, int32_t sourceIndex, 1170 UBool flush, 1171 UErrorCode *pErrorCode) { 1172 const int32_t *cx; 1173 1174 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1175 ucnv_extInitialMatchToU( 1176 cnv, cx, 1177 length, (const char **)source, (const char *)sourceLimit, 1178 target, targetLimit, 1179 offsets, sourceIndex, 1180 flush, 1181 pErrorCode) 1182 ) { 1183 return 0; /* an extension mapping handled the input */ 1184 } 1185 1186 /* GB 18030 */ 1187 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1188 const uint32_t *range; 1189 uint32_t linear; 1190 int32_t i; 1191 1192 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1193 range=gb18030Ranges[0]; 1194 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1195 if(range[2]<=linear && linear<=range[3]) { 1196 /* found the sequence, output the Unicode code point for it */ 1197 *pErrorCode=U_ZERO_ERROR; 1198 1199 /* add the linear difference between the input and start sequences to the start code point */ 1200 linear=range[0]+(linear-range[2]); 1201 1202 /* output this code point */ 1203 
ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1204 1205 return 0; 1206 } 1207 } 1208 } 1209 1210 /* no mapping */ 1211 *pErrorCode=U_INVALID_CHAR_FOUND; 1212 return length; 1213 } 1214 1215 /* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1216 1217 /* 1218 * This code modifies a standard EBCDIC<->Unicode mapping table for 1219 * OS/390 (z/OS) Unix System Services (Open Edition). 1220 * The difference is in the mapping of Line Feed and New Line control codes: 1221 * Standard EBCDIC maps 1222 * 1223 * <U000A> \x25 |0 1224 * <U0085> \x15 |0 1225 * 1226 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1227 * mapping 1228 * 1229 * <U000A> \x15 |0 1230 * <U0085> \x25 |0 1231 * 1232 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1233 * by copying it into allocated memory and swapping the LF and NL values. 1234 * It allows to support the same EBCDIC charset in both versions without 1235 * duplicating the entire installed table. 
1236 */ 1237 1238 /* standard EBCDIC codes */ 1239 #define EBCDIC_LF 0x25 1240 #define EBCDIC_NL 0x15 1241 1242 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1243 #define EBCDIC_RT_LF 0xf25 1244 #define EBCDIC_RT_NL 0xf15 1245 1246 /* Unicode code points */ 1247 #define U_LF 0x0a 1248 #define U_NL 0x85 1249 1250 static UBool 1251 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1252 UConverterMBCSTable *mbcsTable; 1253 1254 const uint16_t *table, *results; 1255 const uint8_t *bytes; 1256 1257 int32_t (*newStateTable)[256]; 1258 uint16_t *newResults; 1259 uint8_t *p; 1260 char *name; 1261 1262 uint32_t stage2Entry; 1263 uint32_t size, sizeofFromUBytes; 1264 1265 mbcsTable=&sharedData->mbcs; 1266 1267 table=mbcsTable->fromUnicodeTable; 1268 bytes=mbcsTable->fromUnicodeBytes; 1269 results=(const uint16_t *)bytes; 1270 1271 /* 1272 * Check that this is an EBCDIC table with SBCS portion - 1273 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1274 * 1275 * If not, ignore the option. Options are always ignored if they do not apply. 
1276 */ 1277 if(!( 1278 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1279 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1280 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1281 )) { 1282 return FALSE; 1283 } 1284 1285 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1286 if(!( 1287 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1288 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1289 )) { 1290 return FALSE; 1291 } 1292 } else /* MBCS_OUTPUT_2_SISO */ { 1293 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1294 if(!( 1295 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1296 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1297 )) { 1298 return FALSE; 1299 } 1300 1301 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1302 if(!( 1303 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1304 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1305 )) { 1306 return FALSE; 1307 } 1308 } 1309 1310 if(mbcsTable->fromUBytesLength>0) { 1311 /* 1312 * We _know_ the number of bytes in the fromUnicodeBytes array 1313 * starting with header.version 4.1. 1314 */ 1315 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1316 } else { 1317 /* 1318 * Otherwise: 1319 * There used to be code to enumerate the fromUnicode 1320 * trie and find the highest entry, but it was removed in ICU 3.2 1321 * because it was not tested and caused a low code coverage number. 1322 * See Jitterbug 3674. 1323 * This affects only some .cnv file formats with a header.version 1324 * below 4.1, and only when swaplfnl is requested. 1325 * 1326 * ucnvmbcs.c revision 1.99 is the last one with the 1327 * ucnv_MBCSSizeofFromUBytes() function. 1328 */ 1329 *pErrorCode=U_INVALID_FORMAT_ERROR; 1330 return FALSE; 1331 } 1332 1333 /* 1334 * The table has an appropriate format. 
1335 * Allocate and build 1336 * - a modified to-Unicode state table 1337 * - a modified from-Unicode output array 1338 * - a converter name string with the swap option appended 1339 */ 1340 size= 1341 mbcsTable->countStates*1024+ 1342 sizeofFromUBytes+ 1343 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1344 p=(uint8_t *)uprv_malloc(size); 1345 if(p==NULL) { 1346 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1347 return FALSE; 1348 } 1349 1350 /* copy and modify the to-Unicode state table */ 1351 newStateTable=(int32_t (*)[256])p; 1352 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1353 1354 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1355 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1356 1357 /* copy and modify the from-Unicode result table */ 1358 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1359 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1360 1361 /* conveniently, the table access macros work on the left side of expressions */ 1362 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1363 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1364 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1365 } else /* MBCS_OUTPUT_2_SISO */ { 1366 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1367 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1368 1369 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1370 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1371 } 1372 1373 /* set the canonical converter name */ 1374 name=(char *)newResults+sizeofFromUBytes; 1375 uprv_strcpy(name, sharedData->staticData->name); 1376 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1377 1378 /* set the pointers */ 1379 umtx_lock(NULL); 1380 if(mbcsTable->swapLFNLStateTable==NULL) { 1381 mbcsTable->swapLFNLStateTable=newStateTable; 1382 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1383 
mbcsTable->swapLFNLName=name; 1384 1385 newStateTable=NULL; 1386 } 1387 umtx_unlock(NULL); 1388 1389 /* release the allocated memory if another thread beat us to it */ 1390 if(newStateTable!=NULL) { 1391 uprv_free(newStateTable); 1392 } 1393 return TRUE; 1394 } 1395 1396 /* reconstitute omitted fromUnicode data ------------------------------------ */ 1397 1398 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1399 static UBool U_CALLCONV 1400 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1401 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1402 const uint16_t *table; 1403 uint32_t *stage2; 1404 uint8_t *bytes, *p; 1405 UChar32 c; 1406 int32_t i, st3; 1407 1408 table=mbcsTable->fromUnicodeTable; 1409 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1410 1411 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1412 switch(mbcsTable->outputType) { 1413 case MBCS_OUTPUT_3_EUC: 1414 if(value<=0xffff) { 1415 /* short sequences are stored directly */ 1416 /* code set 0 or 1 */ 1417 } else if(value<=0x8effff) { 1418 /* code set 2 */ 1419 value&=0x7fff; 1420 } else /* first byte is 0x8f */ { 1421 /* code set 3 */ 1422 value&=0xff7f; 1423 } 1424 break; 1425 case MBCS_OUTPUT_4_EUC: 1426 if(value<=0xffffff) { 1427 /* short sequences are stored directly */ 1428 /* code set 0 or 1 */ 1429 } else if(value<=0x8effffff) { 1430 /* code set 2 */ 1431 value&=0x7fffff; 1432 } else /* first byte is 0x8f */ { 1433 /* code set 3 */ 1434 value&=0xff7fff; 1435 } 1436 break; 1437 default: 1438 break; 1439 } 1440 1441 for(i=0; i<=0x1f; ++value, ++i) { 1442 c=codePoints[i]; 1443 if(c<0) { 1444 continue; 1445 } 1446 1447 /* locate the stage 2 & 3 data */ 1448 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1449 p=bytes; 1450 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1451 1452 /* write the codepage bytes into stage 3 */ 1453 switch(mbcsTable->outputType) { 1454 case MBCS_OUTPUT_3: 
1455 case MBCS_OUTPUT_4_EUC: 1456 p+=st3*3; 1457 p[0]=(uint8_t)(value>>16); 1458 p[1]=(uint8_t)(value>>8); 1459 p[2]=(uint8_t)value; 1460 break; 1461 case MBCS_OUTPUT_4: 1462 ((uint32_t *)p)[st3]=value; 1463 break; 1464 default: 1465 /* 2 bytes per character */ 1466 ((uint16_t *)p)[st3]=(uint16_t)value; 1467 break; 1468 } 1469 1470 /* set the roundtrip flag */ 1471 *stage2|=(1UL<<(16+(c&0xf))); 1472 } 1473 return TRUE; 1474 } 1475 1476 static void 1477 reconstituteData(UConverterMBCSTable *mbcsTable, 1478 uint32_t stage1Length, uint32_t stage2Length, 1479 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1480 UErrorCode *pErrorCode) { 1481 uint16_t *stage1; 1482 uint32_t *stage2; 1483 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1484 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1485 if(mbcsTable->reconstitutedData==NULL) { 1486 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1487 return; 1488 } 1489 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1490 1491 /* copy existing data and reroute the pointers */ 1492 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1493 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1494 1495 stage2=(uint32_t *)(stage1+stage1Length); 1496 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1497 mbcsTable->fromUnicodeTable+stage1Length, 1498 stage2Length*4); 1499 1500 mbcsTable->fromUnicodeTable=stage1; 1501 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length); 1502 1503 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1504 stage2=(uint32_t *)stage1; 1505 1506 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1507 { 1508 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1509 int32_t stageUTF8Index=0; 1510 int32_t st1, st2, st3, i; 1511 1512 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1513 st2=stage1[st1]; 1514 if(st2!=(int32_t)stage1Length/2) { 1515 /* each stage 2 
block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1516 for(i=0; i<16; ++i) { 1517 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1518 if(st3!=0) { 1519 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1520 st3>>=4; 1521 /* 1522 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1523 * allocated together as a single 64-block for access from the mbcsIndex 1524 */ 1525 stage2[st2++]=st3++; 1526 stage2[st2++]=st3++; 1527 stage2[st2++]=st3++; 1528 stage2[st2++]=st3; 1529 } else { 1530 /* no stage 3 block, skip */ 1531 st2+=4; 1532 } 1533 } 1534 } else { 1535 /* no stage 2 block, skip */ 1536 stageUTF8Index+=16; 1537 } 1538 } 1539 } 1540 1541 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1542 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1543 } 1544 1545 /* MBCS setup functions ----------------------------------------------------- */ 1546 1547 static void 1548 ucnv_MBCSLoad(UConverterSharedData *sharedData, 1549 UConverterLoadArgs *pArgs, 1550 const uint8_t *raw, 1551 UErrorCode *pErrorCode) { 1552 UDataInfo info; 1553 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1554 _MBCSHeader *header=(_MBCSHeader *)raw; 1555 uint32_t offset; 1556 uint32_t headerLength; 1557 UBool noFromU=FALSE; 1558 1559 if(header->version[0]==4) { 1560 headerLength=MBCS_HEADER_V4_LENGTH; 1561 } else if(header->version[0]==5 && header->version[1]>=3 && 1562 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1563 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1564 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1565 } else { 1566 *pErrorCode=U_INVALID_TABLE_FORMAT; 1567 return; 1568 } 1569 1570 mbcsTable->outputType=(uint8_t)header->flags; 1571 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1572 *pErrorCode=U_INVALID_TABLE_FORMAT; 1573 return; 1574 } 1575 1576 /* extension data, header version 4.2 and higher */ 1577 offset=header->flags>>8; 1578 
if(offset!=0) { 1579 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1580 } 1581 1582 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1583 UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER; 1584 UConverterSharedData *baseSharedData; 1585 const int32_t *extIndexes; 1586 const char *baseName; 1587 1588 /* extension-only file, load the base table and set values appropriately */ 1589 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1590 /* extension-only file without extension */ 1591 *pErrorCode=U_INVALID_TABLE_FORMAT; 1592 return; 1593 } 1594 1595 if(pArgs->nestedLoads!=1) { 1596 /* an extension table must not be loaded as a base table */ 1597 *pErrorCode=U_INVALID_TABLE_FILE; 1598 return; 1599 } 1600 1601 /* load the base table */ 1602 baseName=(const char *)header+headerLength*4; 1603 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1604 /* forbid loading this same extension-only file */ 1605 *pErrorCode=U_INVALID_TABLE_FORMAT; 1606 return; 1607 } 1608 1609 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1610 args.size=sizeof(UConverterLoadArgs); 1611 args.nestedLoads=2; 1612 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1613 args.reserved=pArgs->reserved; 1614 args.options=pArgs->options; 1615 args.pkg=pArgs->pkg; 1616 args.name=baseName; 1617 baseSharedData=ucnv_load(&args, pErrorCode); 1618 if(U_FAILURE(*pErrorCode)) { 1619 return; 1620 } 1621 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1622 baseSharedData->mbcs.baseSharedData!=NULL 1623 ) { 1624 ucnv_unload(baseSharedData); 1625 *pErrorCode=U_INVALID_TABLE_FORMAT; 1626 return; 1627 } 1628 if(pArgs->onlyTestIsLoadable) { 1629 /* 1630 * Exit as soon as we know that we can load the converter 1631 * and the format is valid and supported. 1632 * The worst that can happen in the following code is a memory 1633 * allocation error. 
1634 */ 1635 ucnv_unload(baseSharedData); 1636 return; 1637 } 1638 1639 /* copy the base table data */ 1640 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1641 1642 /* overwrite values with relevant ones for the extension converter */ 1643 mbcsTable->baseSharedData=baseSharedData; 1644 mbcsTable->extIndexes=extIndexes; 1645 1646 /* 1647 * It would be possible to share the swapLFNL data with a base converter, 1648 * but the generated name would have to be different, and the memory 1649 * would have to be free'd only once. 1650 * It is easier to just create the data for the extension converter 1651 * separately when it is requested. 1652 */ 1653 mbcsTable->swapLFNLStateTable=NULL; 1654 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1655 mbcsTable->swapLFNLName=NULL; 1656 1657 /* 1658 * The reconstitutedData must be deleted only when the base converter 1659 * is unloaded. 1660 */ 1661 mbcsTable->reconstitutedData=NULL; 1662 1663 /* 1664 * Set a special, runtime-only outputType if the extension converter 1665 * is a DBCS version of a base converter that also maps single bytes. 
1666 */ 1667 if( sharedData->staticData->conversionType==UCNV_DBCS || 1668 (sharedData->staticData->conversionType==UCNV_MBCS && 1669 sharedData->staticData->minBytesPerChar>=2) 1670 ) { 1671 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1672 /* the base converter is SI/SO-stateful */ 1673 int32_t entry; 1674 1675 /* get the dbcs state from the state table entry for SO=0x0e */ 1676 entry=mbcsTable->stateTable[0][0xe]; 1677 if( MBCS_ENTRY_IS_FINAL(entry) && 1678 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1679 MBCS_ENTRY_FINAL_STATE(entry)!=0 1680 ) { 1681 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1682 1683 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1684 } 1685 } else if( 1686 baseSharedData->staticData->conversionType==UCNV_MBCS && 1687 baseSharedData->staticData->minBytesPerChar==1 && 1688 baseSharedData->staticData->maxBytesPerChar==2 && 1689 mbcsTable->countStates<=127 1690 ) { 1691 /* non-stateful base converter, need to modify the state table */ 1692 int32_t (*newStateTable)[256]; 1693 int32_t *state; 1694 int32_t i, count; 1695 1696 /* allocate a new state table and copy the base state table contents */ 1697 count=mbcsTable->countStates; 1698 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1699 if(newStateTable==NULL) { 1700 ucnv_unload(baseSharedData); 1701 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1702 return; 1703 } 1704 1705 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1706 1707 /* change all final single-byte entries to go to a new all-illegal state */ 1708 state=newStateTable[0]; 1709 for(i=0; i<256; ++i) { 1710 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1711 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1712 } 1713 } 1714 1715 /* build the new all-illegal state */ 1716 state=newStateTable[count]; 1717 for(i=0; i<256; ++i) { 1718 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1719 } 1720 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1721 
mbcsTable->countStates=(uint8_t)(count+1); 1722 mbcsTable->stateTableOwned=TRUE; 1723 1724 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1725 } 1726 } 1727 1728 /* 1729 * unlike below for files with base tables, do not get the unicodeMask 1730 * from the sharedData; instead, use the base table's unicodeMask, 1731 * which we copied in the memcpy above; 1732 * this is necessary because the static data unicodeMask, especially 1733 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1734 */ 1735 } else { 1736 /* conversion file with a base table; an additional extension table is optional */ 1737 /* make sure that the output type is known */ 1738 switch(mbcsTable->outputType) { 1739 case MBCS_OUTPUT_1: 1740 case MBCS_OUTPUT_2: 1741 case MBCS_OUTPUT_3: 1742 case MBCS_OUTPUT_4: 1743 case MBCS_OUTPUT_3_EUC: 1744 case MBCS_OUTPUT_4_EUC: 1745 case MBCS_OUTPUT_2_SISO: 1746 /* OK */ 1747 break; 1748 default: 1749 *pErrorCode=U_INVALID_TABLE_FORMAT; 1750 return; 1751 } 1752 if(pArgs->onlyTestIsLoadable) { 1753 /* 1754 * Exit as soon as we know that we can load the converter 1755 * and the format is valid and supported. 1756 * The worst that can happen in the following code is a memory 1757 * allocation error. 
1758 */ 1759 return; 1760 } 1761 1762 mbcsTable->countStates=(uint8_t)header->countStates; 1763 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1764 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1765 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1766 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1767 1768 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1769 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1770 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1771 1772 /* 1773 * converter versions 6.1 and up contain a unicodeMask that is 1774 * used here to select the most efficient function implementations 1775 */ 1776 info.size=sizeof(UDataInfo); 1777 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1778 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1779 /* mask off possible future extensions to be safe */ 1780 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1781 } else { 1782 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1783 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1784 } 1785 1786 /* 1787 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1788 * Check for the header version, SBCS vs. MBCS, and for whether the 1789 * data structures are optimized for code points as high as what the 1790 * runtime code is designed for. 1791 * The implementation does not handle mapping tables with entries for 1792 * unpaired surrogates. 1793 */ 1794 if( header->version[1]>=3 && 1795 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1796 (mbcsTable->countStates==1 ? 
1797 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1798 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1799 ) 1800 ) { 1801 mbcsTable->utf8Friendly=TRUE; 1802 1803 if(mbcsTable->countStates==1) { 1804 /* 1805 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1806 * Build a table with indexes to each block, to be used instead of 1807 * the regular stage 1/2 table. 1808 */ 1809 int32_t i; 1810 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1811 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1812 } 1813 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1814 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1815 } else { 1816 /* 1817 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1818 * The .cnv file is prebuilt with an additional stage table with indexes 1819 * to each block. 1820 */ 1821 mbcsTable->mbcsIndex=(const uint16_t *) 1822 (mbcsTable->fromUnicodeBytes+ 1823 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1824 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1825 } 1826 } 1827 1828 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1829 { 1830 uint32_t asciiRoundtrips=0xffffffff; 1831 int32_t i; 1832 1833 for(i=0; i<0x80; ++i) { 1834 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1835 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1836 } 1837 } 1838 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1839 } 1840 1841 if(noFromU) { 1842 uint32_t stage1Length= 1843 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
/*
 * Releases what the MBCS table inside sharedData may own:
 * - swapLFNLStateTable, if one was created
 * - stateTable, but only when stateTableOwned is set
 * - reconstitutedData, if present
 * and calls ucnv_unload() on baseSharedData if there is one.
 * The pointers are not reset here.
 */
static void
ucnv_MBCSUnload(UConverterSharedData *sharedData) {
    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

    if(mbcsTable->swapLFNLStateTable!=NULL) {
        uprv_free(mbcsTable->swapLFNLStateTable);
    }
    if(mbcsTable->stateTableOwned) {
        /* stateTable is const-qualified for readers; cast it away to free it */
        uprv_free((void *)mbcsTable->stateTable);
    }
    if(mbcsTable->baseSharedData!=NULL) {
        ucnv_unload(mbcsTable->baseSharedData);
    }
    if(mbcsTable->reconstitutedData!=NULL) {
        uprv_free(mbcsTable->reconstitutedData);
    }
}

/*
 * Per-converter open function: derives cnv->options and cnv->maxBytesPerUChar
 * from the shared MBCS table and the load arguments.
 *
 * - Returns immediately, without touching cnv, if pArgs->onlyTestIsLoadable is set.
 * - Removes UCNV_OPTION_SWAP_LFNL from both pArgs->options and cnv->options
 *   when the table is DBCS-only or when _EBCDICSwapLFNL() reports (without
 *   failure) that the option does not apply.
 * - Sets _MBCS_OPTION_GB18030/KEIS/JEF/JIPS based on substrings of pArgs->name.
 * - Raises cnv->maxBytesPerUChar for SI/SO output and for extension tables.
 *
 * On failure of _EBCDICSwapLFNL() the function returns early with
 * *pErrorCode as set by that call.
 */
static void
ucnv_MBCSOpen(UConverter *cnv,
              UConverterLoadArgs *pArgs,
              UErrorCode *pErrorCode) {
    UConverterMBCSTable *mbcsTable;
    const int32_t *extIndexes;
    uint8_t outputType;
    int8_t maxBytesPerUChar;

    if(pArgs->onlyTestIsLoadable) {
        return;
    }

    mbcsTable=&cnv->sharedData->mbcs;
    outputType=mbcsTable->outputType;

    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
        /* the swaplfnl option does not apply, remove it */
        cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
    }

    if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        /* do this because double-checked locking is broken */
        UBool isCached;

        /* read the shared pointer under the global mutex */
        umtx_lock(NULL);
        isCached=mbcsTable->swapLFNLStateTable!=NULL;
        umtx_unlock(NULL);

        if(!isCached) {
            if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
                if(U_FAILURE(*pErrorCode)) {
                    return; /* something went wrong */
                }

                /* the option does not apply, remove it */
                cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
            }
        }
    }

    /*
     * Name-based special modes. Note that any name containing "18030"
     * takes the first branch, so the KEIS/JEF/JIPS checks are skipped for it
     * even if it is not a gb18030/GB18030 name.
     */
    if(uprv_strstr(pArgs->name, "18030")!=NULL) {
        if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
            /* set a flag for GB 18030 mode, which changes the callback behavior */
            cnv->options|=_MBCS_OPTION_GB18030;
        }
    } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
        /* set a flag for KEIS converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_KEIS;
    } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
        /* set a flag for JEF converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_JEF;
    } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
        /* set a flag for JIPS converter, which changes the SI/SO character sequence */
        cnv->options|=_MBCS_OPTION_JIPS;
    }

    /* fix maxBytesPerUChar depending on outputType and options etc. */
    if(outputType==MBCS_OUTPUT_2_SISO) {
        cnv->maxBytesPerUChar=3; /* SO+DBCS */
    }

    extIndexes=mbcsTable->extIndexes;
    if(extIndexes!=NULL) {
        maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
        if(outputType==MBCS_OUTPUT_2_SISO) {
            ++maxBytesPerUChar;  /* SO + multiple DBCS */
        }

        /* only ever raise the limit, never lower it */
        if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
            cnv->maxBytesPerUChar=maxBytesPerUChar;
        }
    }

#if 0
    /*
     * documentation of UConverter fields used for status
     * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
     */

    /* toUnicode */
    cnv->toUnicodeStatus=0;     /* offset */
    cnv->mode=0;                /* state */
    cnv->toULength=0;           /* byteIndex */

    /* fromUnicode */
    cnv->fromUChar32=0;
    cnv->fromUnicodeStatus=1;   /* prevLength */
#endif
}

/*
 * Returns the converter name: the swapLFNLName of the shared MBCS data when
 * the converter has UCNV_OPTION_SWAP_LFNL set and such a name exists,
 * otherwise the name from the static data.
 */
static const char *
ucnv_MBCSGetName(const UConverter *cnv) {
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
        return cnv->sharedData->mbcs.swapLFNLName;
    } else {
        return cnv->sharedData->staticData->name;
    }
}

/* MBCS-to-Unicode conversion functions ------------------------------------- */

/*
 * Looks up a toUnicode fallback for the given offset.
 *
 * Performs a binary search over mbcsTable->toUFallbacks
 * (countToUFallbacks entries) and returns the codePoint of the entry whose
 * offset field equals the argument, or 0xfffe if there is no such entry
 * (including when the table is empty).
 * The search assumes the entries are sorted by ascending offset; this is
 * presumably guaranteed by the data builder and is not checked here.
 */
static UChar32
ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
    const _MBCSToUFallback *toUFallbacks;
    uint32_t i, start, limit;

    limit=mbcsTable->countToUFallbacks;
    if(limit>0) {
        /* do a binary search for the fallback mapping */
        toUFallbacks=mbcsTable->toUFallbacks;
        start=0;
        /* invariant: the candidate, if any, is in [start, limit) */
        while(start<limit-1) {
            i=(start+limit)/2;
            if(offset<toUFallbacks[i].offset) {
                limit=i;
            } else {
                start=i;
            }
        }

        /* did we really find it? */
        if(offset==toUFallbacks[start].offset) {
            return toUFallbacks[start].codePoint;
        }
    }

    return 0xfffe;
}

/*
 * This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages.
 *
 * Every source byte is looked up in row 0 of the state table (the regular one,
 * or the swapLFNL variant when UCNV_OPTION_SWAP_LFNL is set) and handled
 * according to the action of the resulting final entry.
 *
 * The loop ends when
 * - the source is exhausted,
 * - the target is full (U_BUFFER_OVERFLOW_ERROR; the trail surrogate of a
 *   pair that does not fit is stored in cnv->UCharErrorBuffer),
 * - an illegal byte is found (U_ILLEGAL_CHAR_FOUND), or
 * - _extToU() sets a failure code for an unassigned byte.
 * In all cases the updated source/target/offsets pointers are written back to pArgs.
 *
 * NOTE(review): an MBCS_STATE_FALLBACK_DIRECT_20 entry reaches the "reserved"
 * branch when UCNV_TO_U_USE_FALLBACK(cnv) is false, so its byte is skipped,
 * whereas MBCS_STATE_FALLBACK_DIRECT_16 is then treated as unassigned.
 * Confirm that this difference is intended.
 */
static void
ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * The following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }

            /* normal end of action codes: prepare for a new character */
            ++sourceIndex;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_VALID_DIRECT_20 ||
           (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            /* from here on, entry holds the entry's value: upper bits -> lead, low 10 bits -> trail */
            entry=MBCS_ENTRY_FINAL_VALUE(entry);
            /* output surrogate pair */
            *target++=(UChar)(0xd800|(UChar)(entry>>10));
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
            if(target<targetLimit) {
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
            } else {
                /* target overflow */
                cnv->UCharErrorBuffer[0]=c;
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }

            ++sourceIndex;
            continue;
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }

                ++sourceIndex;
                continue;
            }
            /* fallbacks not used: handle like an unassigned byte below */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            ++sourceIndex;
            continue;
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember where this byte ended so that the bytes consumed by _extToU() can be counted */
            pArgs->source=(const char *)source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the unassigned byte plus whatever _extToU() consumed beyond it */
            sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
/*
 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
 * that only map to and from the BMP.
 * In addition to single-byte optimizations, the offset calculations
 * become much easier.
 *
 * Because every byte yields at most one UChar here, a single counter
 * (targetCapacity, clamped to the remaining source length) bounds the loops.
 * Offsets are not written per character; they are filled in lazily for the
 * run of bytes since lastSource whenever a byte needs special handling and
 * again at the end.
 *
 * Sets U_ILLEGAL_CHAR_FOUND for an illegal byte, U_BUFFER_OVERFLOW_ERROR when
 * source bytes remain but the target is full, or whatever failure _extToU()
 * reports. The updated source/target/offsets pointers are written back to pArgs.
 *
 * NOTE(review): the "reserved" branch consumes a byte without writing output,
 * but the final offsets loop still counts that byte; this only matters if a
 * reserved action code ever occurs.
 */
static void
ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit, *lastSource;
    UChar *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const int32_t (*stateTable)[256];

    int32_t sourceIndex;

    int32_t entry;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=0;
    /* lastSource marks the first byte for which no offset has been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

#if MBCS_UNROLL_SINGLE_TO_BMP
    /* unrolling makes it faster on Pentium III/Windows 2000 */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=16) {
        int32_t count, loops, oredEntries;

        loops=count=targetCapacity>>4;
        do {
            /*
             * Convert 16 bytes speculatively and OR all 16 entries together;
             * the combined value is then tested once instead of 16 times.
             */
            oredEntries=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            oredEntries|=entry=stateTable[0][*source++];
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

            /* were all 16 entries really valid? */
            if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
                /* no, return to the first of these 16 */
                source-=16;
                target-=16;
                break;
            }
        } while(--count>0);
        /* count becomes the number of 16-byte groups that were fully converted */
        count=loops-count;
        targetCapacity-=16*count;

        if(offsets!=NULL) {
            lastSource+=16*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }
    }
#endif

    /* conversion loop */
    while(targetCapacity > 0 && source < sourceLimit) {
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* test the most common case first */
        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            --targetCapacity;
            continue;
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                --targetCapacity;
                continue;
            }
            /* fallbacks not used: handle like an unassigned byte below */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            continue;
        }

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* predecrement: do not set the offset for the callback-causing character */
            while(--count>0) {
                *offsets++=sourceIndex++;
            }
            /* offset and sourceIndex are now set for the current character */
        }

        if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            lastSource=source;
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=_extToU(cnv, cnv->sharedData,
                                    1, &source, sourceLimit,
                                    &target, pArgs->targetLimit,
                                    &offsets, sourceIndex,
                                    pArgs->flush,
                                    pErrorCode);
            /* 1 for the unassigned byte plus whatever _extToU() consumed beyond it */
            sourceIndex+=1+(int32_t)(source-lastSource);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_TO_BMP
        /* unrolling makes it faster on Pentium III/Windows 2000 */
        /* only reached after a successful extension mapping; every other path above continues or breaks */
        goto unrolled;
#endif
    }

    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * Returns TRUE if, starting in the given state, at least one byte leads
 * (directly, or through further transition states) to a final entry whose
 * action is not MBCS_STATE_ILLEGAL; returns FALSE otherwise.
 *
 * NOTE(review): the recursion keeps no record of visited states, so it relies
 * on the state table having no cycle of transition entries without a valid
 * final entry. Presumably the table builder guarantees this; confirm.
 */
static UBool
hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
    const int32_t *row=stateTable[state];
    int32_t b, entry;
    /* First test for final entries in this state for some commonly valid byte values. */
    entry=row[0xa1];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    entry=row[0x41];
    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
    ) {
        return TRUE;
    }
    /* Then test for final entries in this state. */
    for(b=0; b<=0xff; ++b) {
        entry=row[b];
        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
        ) {
            return TRUE;
        }
    }
    /* Then recurse for transition entries. */
    for(b=0; b<=0xff; ++b) {
        entry=row[b];
        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
        ) {
            return TRUE;
        }
    }
    return FALSE;
}
/*
 * Is byte b a single/lead byte in this state?
 * Recurse for transition states, because here we don't want to say that
 * b is a lead byte if all byte sequences that start with b are illegal.
 *
 * For a final entry the result is TRUE unless the action is
 * MBCS_STATE_ILLEGAL, or it is MBCS_STATE_CHANGE_ONLY and isDBCSOnly is set.
 */
static UBool
isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
    const int32_t *row=stateTable[state];
    int32_t entry=row[b];
    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
    } else {
        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
        } else {
            return action!=MBCS_STATE_ILLEGAL;
        }
    }
}

/*
 * General MBCS-to-Unicode conversion with optional offsets.
 *
 * Outline:
 * 1. If an extension match is pending (cnv->preToULength>0), continue it via
 *    ucnv_extContinueMatchToU() and return if that fails or leaves
 *    preToULength negative.
 * 2. Single-state tables (countStates==1) are delegated to the single-byte
 *    functions (BMP-only or general, depending on UCNV_HAS_SUPPLEMENTARY).
 * 3. Otherwise the state machine is driven byte by byte. The partial-character
 *    state is carried in the converter between calls:
 *      cnv->toUnicodeStatus <-> offset, cnv->mode <-> state,
 *      cnv->toULength <-> byteIndex, cnv->toUBytes <-> bytes.
 *
 * The loop ends when the source is exhausted, the target is full
 * (U_BUFFER_OVERFLOW_ERROR), an illegal sequence is found
 * (U_ILLEGAL_CHAR_FOUND, with cnv->toULength holding the length of the reported sequence),
 * or _extToU() sets a failure code for an unassigned sequence.
 *
 * NOTE(review): an MBCS_STATE_FALLBACK_DIRECT_20 entry reaches the "reserved"
 * branch when UCNV_TO_U_USE_FALLBACK(cnv) is false, so its bytes are dropped,
 * whereas MBCS_STATE_FALLBACK_DIRECT_16 is then treated as unassigned.
 * Confirm that this difference is intended.
 */
U_CFUNC void
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    const int32_t (*stateTable)[256];
    const uint16_t *unicodeCodeUnits;

    uint32_t offset;
    uint8_t state;
    int8_t byteIndex;
    uint8_t *bytes;

    int32_t sourceIndex, nextSourceIndex;

    int32_t entry;
    UChar c;
    uint8_t action;

    /* use optimized function if possible */
    cnv=pArgs->converter;

    if(cnv->preToULength>0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);

        if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
            return;
        }
    }

    if(cnv->sharedData->mbcs.countStates==1) {
        if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    }

    /* set up the local pointers */
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->mbcs.stateTable;
    }
    unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

    /* get the converter state from UConverter */
    offset=cnv->toUnicodeStatus;
    byteIndex=cnv->toULength;
    bytes=cnv->toUBytes;

    /*
     * if we are in the SBCS state for a DBCS-only converter,
     * then load the DBCS state from the MBCS data
     * (dbcsOnlyState==0 if it is not a DBCS-only converter)
     */
    if((state=(uint8_t)(cnv->mode))==0) {
        state=cnv->sharedData->mbcs.dbcsOnlyState;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    while(source<sourceLimit) {
        /*
         * The following test is to see if available input would overflow the output.
         * It does not catch output of more than one code unit that
         * overflows as a result of a surrogate pair or callback output
         * from the last source byte.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        if(byteIndex==0) {
            /* optimized loop for 1/2-byte input and BMP output */
            /*
             * Both inner loops leave *source unconsumed and entry loaded for it
             * when they break out; the byte is then consumed below and handled
             * by the general code.
             */
            if(offsets==NULL) {
                do {
                    entry=stateTable[state][*source];
                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

                        ++source;
                        if( source<sourceLimit &&
                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
                        ) {
                            ++source;
                            *target++=c;
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                            offset=0;
                        } else {
                            /* set the state and leave the optimized loop */
                            bytes[0]=*(source-1);
                            byteIndex=1;
                            break;
                        }
                    } else {
                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                            /* output BMP code point */
                            ++source;
                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                        } else {
                            /* leave the optimized loop */
                            break;
                        }
                    }
                } while(source<sourceLimit && target<targetLimit);
            } else /* offsets!=NULL */ {
                do {
                    entry=stateTable[state][*source];
                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                        state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
                        offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

                        ++source;
                        if( source<sourceLimit &&
                            MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
                            MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
                            (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
                        ) {
                            ++source;
                            *target++=c;
                            /* this test is always true inside the offsets!=NULL branch */
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                sourceIndex=(nextSourceIndex+=2);
                            }
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                            offset=0;
                        } else {
                            /* set the state and leave the optimized loop */
                            ++nextSourceIndex;
                            bytes[0]=*(source-1);
                            byteIndex=1;
                            break;
                        }
                    } else {
                        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                            /* output BMP code point */
                            ++source;
                            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                            /* this test is always true inside the offsets!=NULL branch */
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                sourceIndex=++nextSourceIndex;
                            }
                            state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
                        } else {
                            /* leave the optimized loop */
                            break;
                        }
                    }
                } while(source<sourceLimit && target<targetLimit);
            }

            /*
             * these tests and break statements could be put inside the loop
             * if C had "break outerLoop" like Java
             */
            if(source>=sourceLimit) {
                break;
            }
            if(target>=targetLimit) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }

            /* consume the byte whose entry was already loaded in the optimized loop */
            ++nextSourceIndex;
            bytes[byteIndex++]=*source++;
        } else /* byteIndex>0 */ {
            ++nextSourceIndex;
            entry=stateTable[state][bytes[byteIndex++]=*source++];
        }

        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
            continue;
        }

        /* save the previous state for proper extension mapping with SI/SO-stateful converters */
        cnv->mode=state;

        /* set the next state early so that we can reuse the entry variable */
        state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         *
         * In every branch, byteIndex=0 marks the byte sequence as fully handled;
         * leaving byteIndex>0 routes it to the illegal/unassigned handling below.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if(action==MBCS_STATE_VALID_16) {
            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
            c=unicodeCodeUnits[offset];
            if(c<0xfffe) {
                /* output BMP code point */
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(c==0xfffe) {
                /* 0xfffe: no roundtrip mapping; look for a fallback, else treat as unassigned */
                if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
                    /* output fallback BMP code point */
                    *target++=(UChar)entry;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                    byteIndex=0;
                }
            } else {
                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_VALID_DIRECT_16) {
            /* output BMP code point */
            *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            byteIndex=0;
        } else if(action==MBCS_STATE_VALID_16_PAIR) {
            /* two code units are stored per mapping; the first one selects how to interpret them */
            offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
            c=unicodeCodeUnits[offset++];
            if(c<0xd800) {
                /* output BMP code point below 0xd800 */
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
                /* output roundtrip or fallback surrogate pair */
                *target++=(UChar)(c&0xdbff);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
                if(target<targetLimit) {
                    *target++=unicodeCodeUnits[offset];
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                } else {
                    /* target overflow */
                    cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
                    cnv->UCharErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    offset=0;
                    break;
                }
            } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
                /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
                *target++=unicodeCodeUnits[offset];
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            } else if(c==0xffff) {
                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
                  (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            /* from here on, entry holds the entry's value: upper bits -> lead, low 10 bits -> trail */
            entry=MBCS_ENTRY_FINAL_VALUE(entry);
            /* output surrogate pair */
            *target++=(UChar)(0xd800|(UChar)(entry>>10));
            if(offsets!=NULL) {
                *offsets++=sourceIndex;
            }
            byteIndex=0;
            c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
            if(target<targetLimit) {
                *target++=c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
            } else {
                /* target overflow */
                cnv->UCharErrorBuffer[0]=c;
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                offset=0;
                break;
            }
        } else if(action==MBCS_STATE_CHANGE_ONLY) {
            /*
             * This serves as a state change without any output.
             * It is useful for reading simple stateful encodings,
             * for example using just Shift-In/Shift-Out codes.
             * The 21 unused bits may later be used for more sophisticated
             * state transitions.
             */
            if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
                byteIndex=0;
            } else {
                /* SI/SO are illegal for DBCS-only conversion */
                state=(uint8_t)(cnv->mode); /* restore the previous state */

                /* callback(illegal) */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                byteIndex=0;
            }
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        } else {
            /* reserved, must never occur */
            byteIndex=0;
        }

        /* end of action codes: prepare for a new character */
        offset=0;

        if(byteIndex==0) {
            sourceIndex=nextSourceIndex;
        } else if(U_FAILURE(*pErrorCode)) {
            /* callback(illegal) */
            if(byteIndex>1) {
                /*
                 * Ticket 5691: consistent illegal sequences:
                 * - We include at least the first byte in the illegal sequence.
                 * - If any of the non-initial bytes could be the start of a character,
                 *   we stop the illegal sequence before the first one of those.
                 */
                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
                int8_t i;
                /* find the first non-initial byte that could start a character in the current state */
                for(i=1;
                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
                    ++i) {}
                if(i<byteIndex) {
                    /* Back out some bytes. */
                    int8_t backOutDistance=byteIndex-i;
                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
                    byteIndex=i;  /* length of reported illegal byte sequence */
                    if(backOutDistance<=bytesFromThisBuffer) {
                        source-=backOutDistance;
                    } else {
                        /* Back out bytes from the previous buffer: Need to replay them. */
                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                        /* preToULength is negative! */
                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
                        source=(const uint8_t *)pArgs->source;
                    }
                }
            }
            break;
        } else /* unassigned sequences indicated with byteIndex>0 */ {
            /* try an extension mapping */
            /* remember where the sequence ended so that the bytes consumed by _extToU() can be counted */
            pArgs->source=(const char *)source;
            byteIndex=_extToU(cnv, cnv->sharedData,
                              byteIndex, &source, sourceLimit,
                              &target, targetLimit,
                              &offsets, sourceIndex,
                              pArgs->flush,
                              pErrorCode);
            sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);

            if(U_FAILURE(*pErrorCode)) {
                /* not mappable or buffer overflow */
                break;
            }
        }
    }

    /* set the converter state back into UConverter */
    cnv->toUnicodeStatus=offset;
    cnv->mode=state;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
2869 */ 2870 static UChar32 2871 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2872 UErrorCode *pErrorCode) { 2873 UConverter *cnv; 2874 const int32_t (*stateTable)[256]; 2875 const uint8_t *source, *sourceLimit; 2876 2877 int32_t entry; 2878 uint8_t action; 2879 2880 /* set up the local pointers */ 2881 cnv=pArgs->converter; 2882 source=(const uint8_t *)pArgs->source; 2883 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2884 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2885 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2886 } else { 2887 stateTable=cnv->sharedData->mbcs.stateTable; 2888 } 2889 2890 /* conversion loop */ 2891 while(source<sourceLimit) { 2892 entry=stateTable[0][*source++]; 2893 /* MBCS_ENTRY_IS_FINAL(entry) */ 2894 2895 /* write back the updated pointer early so that we can return directly */ 2896 pArgs->source=(const char *)source; 2897 2898 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2899 /* output BMP code point */ 2900 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2901 } 2902 2903 /* 2904 * An if-else-if chain provides more reliable performance for 2905 * the most common cases compared to a switch. 
2906 */ 2907 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2908 if( action==MBCS_STATE_VALID_DIRECT_20 || 2909 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2910 ) { 2911 /* output supplementary code point */ 2912 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2913 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2914 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2915 /* output BMP code point */ 2916 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2917 } 2918 } else if(action==MBCS_STATE_UNASSIGNED) { 2919 /* just fall through */ 2920 } else if(action==MBCS_STATE_ILLEGAL) { 2921 /* callback(illegal) */ 2922 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2923 } else { 2924 /* reserved, must never occur */ 2925 continue; 2926 } 2927 2928 if(U_FAILURE(*pErrorCode)) { 2929 /* callback(illegal) */ 2930 break; 2931 } else /* unassigned sequence */ { 2932 /* defer to the generic implementation */ 2933 pArgs->source=(const char *)source-1; 2934 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2935 } 2936 } 2937 2938 /* no output because of empty input or only state changes */ 2939 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2940 return 0xffff; 2941 } 2942 2943 /* 2944 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2945 * conversion without offset handling. 2946 * 2947 * When a character does not have a mapping to Unicode, then we return to the 2948 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2949 * handling. 2950 * We also defer to the generic code in other complicated cases and have them 2951 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2952 * 2953 * All normal mappings and errors are handled here. 
2954 */ 2955 static UChar32 2956 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2957 UErrorCode *pErrorCode) { 2958 UConverter *cnv; 2959 const uint8_t *source, *sourceLimit, *lastSource; 2960 2961 const int32_t (*stateTable)[256]; 2962 const uint16_t *unicodeCodeUnits; 2963 2964 uint32_t offset; 2965 uint8_t state; 2966 2967 int32_t entry; 2968 UChar32 c; 2969 uint8_t action; 2970 2971 /* use optimized function if possible */ 2972 cnv=pArgs->converter; 2973 2974 if(cnv->preToULength>0) { 2975 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2976 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2977 } 2978 2979 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2980 /* 2981 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2982 * with the rare case of a codepage that maps single surrogates 2983 * without adding the complexity to this already complicated function here. 2984 */ 2985 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2986 } else if(cnv->sharedData->mbcs.countStates==1) { 2987 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2988 } 2989 2990 /* set up the local pointers */ 2991 source=lastSource=(const uint8_t *)pArgs->source; 2992 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2993 2994 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2995 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2996 } else { 2997 stateTable=cnv->sharedData->mbcs.stateTable; 2998 } 2999 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 3000 3001 /* get the converter state from UConverter */ 3002 offset=cnv->toUnicodeStatus; 3003 3004 /* 3005 * if we are in the SBCS state for a DBCS-only converter, 3006 * then load the DBCS state from the MBCS data 3007 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 3008 */ 3009 if((state=(uint8_t)(cnv->mode))==0) { 3010 state=cnv->sharedData->mbcs.dbcsOnlyState; 3011 } 3012 3013 /* conversion loop */ 3014 c=U_SENTINEL; 3015 while(source<sourceLimit) { 
3016 entry=stateTable[state][*source++]; 3017 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3018 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3019 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3020 3021 /* optimization for 1/2-byte input and BMP output */ 3022 if( source<sourceLimit && 3023 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 3024 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 3025 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 3026 ) { 3027 ++source; 3028 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3029 /* output BMP code point */ 3030 break; 3031 } 3032 } else { 3033 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 3034 cnv->mode=state; 3035 3036 /* set the next state early so that we can reuse the entry variable */ 3037 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3038 3039 /* 3040 * An if-else-if chain provides more reliable performance for 3041 * the most common cases compared to a switch. 3042 */ 3043 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3044 if(action==MBCS_STATE_VALID_DIRECT_16) { 3045 /* output BMP code point */ 3046 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3047 break; 3048 } else if(action==MBCS_STATE_VALID_16) { 3049 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3050 c=unicodeCodeUnits[offset]; 3051 if(c<0xfffe) { 3052 /* output BMP code point */ 3053 break; 3054 } else if(c==0xfffe) { 3055 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 3056 break; 3057 } 3058 } else { 3059 /* callback(illegal) */ 3060 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3061 } 3062 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3063 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3064 c=unicodeCodeUnits[offset++]; 3065 if(c<0xd800) { 3066 /* output BMP code point below 0xd800 */ 3067 break; 3068 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 3069 /* output roundtrip or fallback supplementary code point */ 3070 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 3071 break; 3072 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3073 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3074 c=unicodeCodeUnits[offset]; 3075 break; 3076 } else if(c==0xffff) { 3077 /* callback(illegal) */ 3078 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3079 } 3080 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 3081 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 3082 ) { 3083 /* output supplementary code point */ 3084 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 3085 break; 3086 } else if(action==MBCS_STATE_CHANGE_ONLY) { 3087 /* 3088 * This serves as a state change without any output. 3089 * It is useful for reading simple stateful encodings, 3090 * for example using just Shift-In/Shift-Out codes. 3091 * The 21 unused bits may later be used for more sophisticated 3092 * state transitions. 
3093 */ 3094 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 3095 /* SI/SO are illegal for DBCS-only conversion */ 3096 state=(uint8_t)(cnv->mode); /* restore the previous state */ 3097 3098 /* callback(illegal) */ 3099 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3100 } 3101 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3102 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3103 /* output BMP code point */ 3104 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3105 break; 3106 } 3107 } else if(action==MBCS_STATE_UNASSIGNED) { 3108 /* just fall through */ 3109 } else if(action==MBCS_STATE_ILLEGAL) { 3110 /* callback(illegal) */ 3111 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3112 } else { 3113 /* reserved (must never occur), or only state change */ 3114 offset=0; 3115 lastSource=source; 3116 continue; 3117 } 3118 3119 /* end of action codes: prepare for a new character */ 3120 offset=0; 3121 3122 if(U_FAILURE(*pErrorCode)) { 3123 /* callback(illegal) */ 3124 break; 3125 } else /* unassigned sequence */ { 3126 /* defer to the generic implementation */ 3127 cnv->toUnicodeStatus=0; 3128 cnv->mode=state; 3129 pArgs->source=(const char *)lastSource; 3130 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 3131 } 3132 } 3133 } 3134 3135 if(c<0) { 3136 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3137 /* incomplete character byte sequence */ 3138 uint8_t *bytes=cnv->toUBytes; 3139 cnv->toULength=(int8_t)(source-lastSource); 3140 do { 3141 *bytes++=*lastSource++; 3142 } while(lastSource<source); 3143 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3144 } else if(U_FAILURE(*pErrorCode)) { 3145 /* callback(illegal) */ 3146 /* 3147 * Ticket 5691: consistent illegal sequences: 3148 * - We include at least the first byte in the illegal sequence. 3149 * - If any of the non-initial bytes could be the start of a character, 3150 * we stop the illegal sequence before the first one of those. 
3151 */ 3152 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3153 uint8_t *bytes=cnv->toUBytes; 3154 *bytes++=*lastSource++; /* first byte */ 3155 if(lastSource==source) { 3156 cnv->toULength=1; 3157 } else /* lastSource<source: multi-byte character */ { 3158 int8_t i; 3159 for(i=1; 3160 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3161 ++i 3162 ) { 3163 *bytes++=*lastSource++; 3164 } 3165 cnv->toULength=i; 3166 source=lastSource; 3167 } 3168 } else { 3169 /* no output because of empty input or only state changes */ 3170 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3171 } 3172 c=0xffff; 3173 } 3174 3175 /* set the converter state back into UConverter, ready for a new character */ 3176 cnv->toUnicodeStatus=0; 3177 cnv->mode=state; 3178 3179 /* write back the updated pointer */ 3180 pArgs->source=(const char *)source; 3181 return c; 3182 } 3183 3184 #if 0 3185 /* 3186 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3187 * Removal improves code coverage. 3188 */ 3189 /** 3190 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3191 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3192 * It does not handle conversion extensions (_extToU()). 3193 */ 3194 U_CFUNC UChar32 3195 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3196 uint8_t b, UBool useFallback) { 3197 int32_t entry; 3198 uint8_t action; 3199 3200 entry=sharedData->mbcs.stateTable[0][b]; 3201 /* MBCS_ENTRY_IS_FINAL(entry) */ 3202 3203 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3204 /* output BMP code point */ 3205 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3206 } 3207 3208 /* 3209 * An if-else-if chain provides more reliable performance for 3210 * the most common cases compared to a switch. 
3211 */ 3212 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3213 if(action==MBCS_STATE_VALID_DIRECT_20) { 3214 /* output supplementary code point */ 3215 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3216 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3217 if(!TO_U_USE_FALLBACK(useFallback)) { 3218 return 0xfffe; 3219 } 3220 /* output BMP code point */ 3221 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3222 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3223 if(!TO_U_USE_FALLBACK(useFallback)) { 3224 return 0xfffe; 3225 } 3226 /* output supplementary code point */ 3227 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3228 } else if(action==MBCS_STATE_UNASSIGNED) { 3229 return 0xfffe; 3230 } else if(action==MBCS_STATE_ILLEGAL) { 3231 return 0xffff; 3232 } else { 3233 /* reserved, must never occur */ 3234 return 0xffff; 3235 } 3236 } 3237 #endif 3238 3239 /* 3240 * This is a simple version of _MBCSGetNextUChar() that is used 3241 * by other converter implementations. 3242 * It only returns an "assigned" result if it consumes the entire input. 3243 * It does not use state from the converter, nor error codes. 3244 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3245 * It handles conversion extensions but not GB 18030. 3246 * 3247 * Return value: 3248 * U+fffe unassigned 3249 * U+ffff illegal 3250 * otherwise the Unicode code point 3251 */ 3252 U_CFUNC UChar32 3253 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3254 const char *source, int32_t length, 3255 UBool useFallback) { 3256 const int32_t (*stateTable)[256]; 3257 const uint16_t *unicodeCodeUnits; 3258 3259 uint32_t offset; 3260 uint8_t state, action; 3261 3262 UChar32 c; 3263 int32_t i, entry; 3264 3265 if(length<=0) { 3266 /* no input at all: "illegal" */ 3267 return 0xffff; 3268 } 3269 3270 #if 0 3271 /* 3272 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3273 * TODO In future releases, verify that this function is never called for SBCS 3274 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3275 * Removal improves code coverage. 3276 */ 3277 /* use optimized function if possible */ 3278 if(sharedData->mbcs.countStates==1) { 3279 if(length==1) { 3280 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3281 } else { 3282 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3283 } 3284 } 3285 #endif 3286 3287 /* set up the local pointers */ 3288 stateTable=sharedData->mbcs.stateTable; 3289 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3290 3291 /* converter state */ 3292 offset=0; 3293 state=sharedData->mbcs.dbcsOnlyState; 3294 3295 /* conversion loop */ 3296 for(i=0;;) { 3297 entry=stateTable[state][(uint8_t)source[i++]]; 3298 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3299 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3300 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3301 3302 if(i==length) { 3303 return 0xffff; /* truncated character */ 3304 } 3305 } else { 3306 /* 3307 * An if-else-if chain provides more reliable performance for 3308 * the most common cases compared to a switch. 
3309 */ 3310 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3311 if(action==MBCS_STATE_VALID_16) { 3312 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3313 c=unicodeCodeUnits[offset]; 3314 if(c!=0xfffe) { 3315 /* done */ 3316 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3317 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3318 /* else done with 0xfffe */ 3319 } 3320 break; 3321 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3322 /* output BMP code point */ 3323 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3324 break; 3325 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3326 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3327 c=unicodeCodeUnits[offset++]; 3328 if(c<0xd800) { 3329 /* output BMP code point below 0xd800 */ 3330 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3331 /* output roundtrip or fallback supplementary code point */ 3332 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3333 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3334 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3335 c=unicodeCodeUnits[offset]; 3336 } else if(c==0xffff) { 3337 return 0xffff; 3338 } else { 3339 c=0xfffe; 3340 } 3341 break; 3342 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3343 /* output supplementary code point */ 3344 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3345 break; 3346 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3347 if(!TO_U_USE_FALLBACK(useFallback)) { 3348 c=0xfffe; 3349 break; 3350 } 3351 /* output BMP code point */ 3352 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3353 break; 3354 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3355 if(!TO_U_USE_FALLBACK(useFallback)) { 3356 c=0xfffe; 3357 break; 3358 } 3359 /* output supplementary code point */ 3360 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3361 break; 3362 } else if(action==MBCS_STATE_UNASSIGNED) { 3363 c=0xfffe; 3364 break; 3365 } 3366 3367 /* 3368 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3369 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3370 */ 3371 return 0xffff; 3372 } 3373 } 3374 3375 if(i!=length) { 3376 /* illegal for this function: not all input consumed */ 3377 return 0xffff; 3378 } 3379 3380 if(c==0xfffe) { 3381 /* try an extension mapping */ 3382 const int32_t *cx=sharedData->mbcs.extIndexes; 3383 if(cx!=NULL) { 3384 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3385 } 3386 } 3387 3388 return c; 3389 } 3390 3391 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3392 3393 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3394 static void 3395 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3396 UErrorCode *pErrorCode) { 3397 UConverter *cnv; 3398 const UChar *source, *sourceLimit; 3399 uint8_t *target; 3400 int32_t targetCapacity; 3401 int32_t *offsets; 3402 3403 const uint16_t *table; 3404 const uint16_t *mbcsIndex; 3405 const uint8_t *bytes; 3406 3407 UChar32 c; 3408 3409 int32_t sourceIndex, nextSourceIndex; 3410 3411 uint32_t stage2Entry; 3412 uint32_t asciiRoundtrips; 3413 uint32_t value; 3414 uint8_t unicodeMask; 3415 3416 /* use optimized function if possible */ 3417 cnv=pArgs->converter; 3418 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3419 3420 /* set up the local pointers */ 3421 source=pArgs->source; 3422 sourceLimit=pArgs->sourceLimit; 3423 target=(uint8_t *)pArgs->target; 3424 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3425 offsets=pArgs->offsets; 3426 3427 table=cnv->sharedData->mbcs.fromUnicodeTable; 3428 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3429 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3430 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3431 } else { 3432 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3433 } 3434 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3435 3436 /* get the converter state from UConverter */ 3437 c=cnv->fromUChar32; 3438 3439 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 3440 sourceIndex= c==0 ? 0 : -1; 3441 nextSourceIndex=0; 3442 3443 /* conversion loop */ 3444 if(c!=0 && targetCapacity>0) { 3445 goto getTrail; 3446 } 3447 3448 while(source<sourceLimit) { 3449 /* 3450 * This following test is to see if available input would overflow the output. 3451 * It does not catch output of more than one byte that 3452 * overflows as a result of a multi-byte character or callback output 3453 * from the last source character. 3454 * Therefore, those situations also test for overflows and will 3455 * then break the loop, too. 3456 */ 3457 if(targetCapacity>0) { 3458 /* 3459 * Get a correct Unicode code point: 3460 * a single UChar for a BMP code point or 3461 * a matched surrogate pair for a "supplementary code point". 3462 */ 3463 c=*source++; 3464 ++nextSourceIndex; 3465 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3466 *target++=(uint8_t)c; 3467 if(offsets!=NULL) { 3468 *offsets++=sourceIndex; 3469 sourceIndex=nextSourceIndex; 3470 } 3471 --targetCapacity; 3472 c=0; 3473 continue; 3474 } 3475 /* 3476 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3477 * to avoid dealing with surrogates. 3478 * MBCS_FAST_MAX must be >=0xd7ff. 3479 */ 3480 if(c<=0xd7ff) { 3481 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3482 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3483 if(value==0) { 3484 goto unassigned; 3485 } 3486 /* output the value */ 3487 } else { 3488 /* 3489 * This also tests if the codepage maps single surrogates. 3490 * If it does, then surrogates are not paired but mapped separately. 3491 * Note that in this case unmatched surrogates are not detected. 
3492 */ 3493 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3494 if(U16_IS_SURROGATE_LEAD(c)) { 3495 getTrail: 3496 if(source<sourceLimit) { 3497 /* test the following code unit */ 3498 UChar trail=*source; 3499 if(U16_IS_TRAIL(trail)) { 3500 ++source; 3501 ++nextSourceIndex; 3502 c=U16_GET_SUPPLEMENTARY(c, trail); 3503 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3504 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3505 /* callback(unassigned) */ 3506 goto unassigned; 3507 } 3508 /* convert this supplementary code point */ 3509 /* exit this condition tree */ 3510 } else { 3511 /* this is an unmatched lead code unit (1st surrogate) */ 3512 /* callback(illegal) */ 3513 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3514 break; 3515 } 3516 } else { 3517 /* no more input */ 3518 break; 3519 } 3520 } else { 3521 /* this is an unmatched trail code unit (2nd surrogate) */ 3522 /* callback(illegal) */ 3523 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3524 break; 3525 } 3526 } 3527 3528 /* convert the Unicode code point in c into codepage bytes */ 3529 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3530 3531 /* get the bytes and the length for the output */ 3532 /* MBCS_OUTPUT_2 */ 3533 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3534 3535 /* is this code point assigned, or do we use fallbacks? */ 3536 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3537 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3538 ) { 3539 /* 3540 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3541 * There is no way with this data structure for fallback output 3542 * to be a zero byte. 
3543 */ 3544 3545 unassigned: 3546 /* try an extension mapping */ 3547 pArgs->source=source; 3548 c=_extFromU(cnv, cnv->sharedData, 3549 c, &source, sourceLimit, 3550 &target, target+targetCapacity, 3551 &offsets, sourceIndex, 3552 pArgs->flush, 3553 pErrorCode); 3554 nextSourceIndex+=(int32_t)(source-pArgs->source); 3555 3556 if(U_FAILURE(*pErrorCode)) { 3557 /* not mappable or buffer overflow */ 3558 break; 3559 } else { 3560 /* a mapping was written to the target, continue */ 3561 3562 /* recalculate the targetCapacity after an extension mapping */ 3563 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3564 3565 /* normal end of conversion: prepare for a new character */ 3566 sourceIndex=nextSourceIndex; 3567 continue; 3568 } 3569 } 3570 } 3571 3572 /* write the output character bytes from value and length */ 3573 /* from the first if in the loop we know that targetCapacity>0 */ 3574 if(value<=0xff) { 3575 /* this is easy because we know that there is enough space */ 3576 *target++=(uint8_t)value; 3577 if(offsets!=NULL) { 3578 *offsets++=sourceIndex; 3579 } 3580 --targetCapacity; 3581 } else /* length==2 */ { 3582 *target++=(uint8_t)(value>>8); 3583 if(2<=targetCapacity) { 3584 *target++=(uint8_t)value; 3585 if(offsets!=NULL) { 3586 *offsets++=sourceIndex; 3587 *offsets++=sourceIndex; 3588 } 3589 targetCapacity-=2; 3590 } else { 3591 if(offsets!=NULL) { 3592 *offsets++=sourceIndex; 3593 } 3594 cnv->charErrorBuffer[0]=(char)value; 3595 cnv->charErrorBufferLength=1; 3596 3597 /* target overflow */ 3598 targetCapacity=0; 3599 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3600 c=0; 3601 break; 3602 } 3603 } 3604 3605 /* normal end of conversion: prepare for a new character */ 3606 c=0; 3607 sourceIndex=nextSourceIndex; 3608 continue; 3609 } else { 3610 /* target is full */ 3611 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3612 break; 3613 } 3614 } 3615 3616 /* set the converter state back into UConverter */ 3617 cnv->fromUChar32=c; 3618 3619 /* write back the updated 
pointers */ 3620 pArgs->source=source; 3621 pArgs->target=(char *)target; 3622 pArgs->offsets=offsets; 3623 } 3624 3625 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3626 static void 3627 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3628 UErrorCode *pErrorCode) { 3629 UConverter *cnv; 3630 const UChar *source, *sourceLimit; 3631 uint8_t *target; 3632 int32_t targetCapacity; 3633 int32_t *offsets; 3634 3635 const uint16_t *table; 3636 const uint16_t *results; 3637 3638 UChar32 c; 3639 3640 int32_t sourceIndex, nextSourceIndex; 3641 3642 uint16_t value, minValue; 3643 UBool hasSupplementary; 3644 3645 /* set up the local pointers */ 3646 cnv=pArgs->converter; 3647 source=pArgs->source; 3648 sourceLimit=pArgs->sourceLimit; 3649 target=(uint8_t *)pArgs->target; 3650 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3651 offsets=pArgs->offsets; 3652 3653 table=cnv->sharedData->mbcs.fromUnicodeTable; 3654 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3655 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3656 } else { 3657 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3658 } 3659 3660 if(cnv->useFallback) { 3661 /* use all roundtrip and fallback results */ 3662 minValue=0x800; 3663 } else { 3664 /* use only roundtrips and fallbacks from private-use characters */ 3665 minValue=0xc00; 3666 } 3667 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3668 3669 /* get the converter state from UConverter */ 3670 c=cnv->fromUChar32; 3671 3672 /* sourceIndex=-1 if the current character began in the previous buffer */ 3673 sourceIndex= c==0 ? 0 : -1; 3674 nextSourceIndex=0; 3675 3676 /* conversion loop */ 3677 if(c!=0 && targetCapacity>0) { 3678 goto getTrail; 3679 } 3680 3681 while(source<sourceLimit) { 3682 /* 3683 * This following test is to see if available input would overflow the output. 
3684 * It does not catch output of more than one byte that 3685 * overflows as a result of a multi-byte character or callback output 3686 * from the last source character. 3687 * Therefore, those situations also test for overflows and will 3688 * then break the loop, too. 3689 */ 3690 if(targetCapacity>0) { 3691 /* 3692 * Get a correct Unicode code point: 3693 * a single UChar for a BMP code point or 3694 * a matched surrogate pair for a "supplementary code point". 3695 */ 3696 c=*source++; 3697 ++nextSourceIndex; 3698 if(U16_IS_SURROGATE(c)) { 3699 if(U16_IS_SURROGATE_LEAD(c)) { 3700 getTrail: 3701 if(source<sourceLimit) { 3702 /* test the following code unit */ 3703 UChar trail=*source; 3704 if(U16_IS_TRAIL(trail)) { 3705 ++source; 3706 ++nextSourceIndex; 3707 c=U16_GET_SUPPLEMENTARY(c, trail); 3708 if(!hasSupplementary) { 3709 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3710 /* callback(unassigned) */ 3711 goto unassigned; 3712 } 3713 /* convert this supplementary code point */ 3714 /* exit this condition tree */ 3715 } else { 3716 /* this is an unmatched lead code unit (1st surrogate) */ 3717 /* callback(illegal) */ 3718 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3719 break; 3720 } 3721 } else { 3722 /* no more input */ 3723 break; 3724 } 3725 } else { 3726 /* this is an unmatched trail code unit (2nd surrogate) */ 3727 /* callback(illegal) */ 3728 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3729 break; 3730 } 3731 } 3732 3733 /* convert the Unicode code point in c into codepage bytes */ 3734 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3735 3736 /* is this code point assigned, or do we use fallbacks? 
*/ 3737 if(value>=minValue) { 3738 /* assigned, write the output character bytes from value and length */ 3739 /* length==1 */ 3740 /* this is easy because we know that there is enough space */ 3741 *target++=(uint8_t)value; 3742 if(offsets!=NULL) { 3743 *offsets++=sourceIndex; 3744 } 3745 --targetCapacity; 3746 3747 /* normal end of conversion: prepare for a new character */ 3748 c=0; 3749 sourceIndex=nextSourceIndex; 3750 } else { /* unassigned */ 3751 unassigned: 3752 /* try an extension mapping */ 3753 pArgs->source=source; 3754 c=_extFromU(cnv, cnv->sharedData, 3755 c, &source, sourceLimit, 3756 &target, target+targetCapacity, 3757 &offsets, sourceIndex, 3758 pArgs->flush, 3759 pErrorCode); 3760 nextSourceIndex+=(int32_t)(source-pArgs->source); 3761 3762 if(U_FAILURE(*pErrorCode)) { 3763 /* not mappable or buffer overflow */ 3764 break; 3765 } else { 3766 /* a mapping was written to the target, continue */ 3767 3768 /* recalculate the targetCapacity after an extension mapping */ 3769 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3770 3771 /* normal end of conversion: prepare for a new character */ 3772 sourceIndex=nextSourceIndex; 3773 } 3774 } 3775 } else { 3776 /* target is full */ 3777 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3778 break; 3779 } 3780 } 3781 3782 /* set the converter state back into UConverter */ 3783 cnv->fromUChar32=c; 3784 3785 /* write back the updated pointers */ 3786 pArgs->source=source; 3787 pArgs->target=(char *)target; 3788 pArgs->offsets=offsets; 3789 } 3790 3791 /* 3792 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3793 * that map only to and from the BMP. 3794 * In addition to single-byte/state optimizations, the offset calculations 3795 * become much easier. 3796 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3797 * but measurements have shown that this diminishes performance 3798 * in more cases than it improves it. 
* See SVN revision 21013 (2007-feb-06) for the last version with #if switches
 * for various MBCS and SBCS optimizations.
 *
 * pArgs       conversion arguments; source, target and offsets are read at the
 *             start and written back at the end
 * pErrorCode  receives U_ILLEGAL_CHAR_FOUND for an unmatched surrogate,
 *             U_TRUNCATED_CHAR_FOUND for a lead surrogate at the end of the
 *             input when pArgs->flush is set, U_BUFFER_OVERFLOW_ERROR when
 *             input remains but the target is full, or whatever _extFromU()
 *             reports for a code point without a mapping
 *
 * Offsets are not written per character in the main loop; they are filled in
 * afterwards from the distance between source and lastSource.
 */
static void
ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
                              UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit, *lastSource;
    uint8_t *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *results;

    UChar32 c;

    int32_t sourceIndex;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    /* lastSource marks the first source unit whose offset has not been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=(int32_t)(sourceLimit-source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        /* continue with the lead surrogate that was saved from the previous buffer */
        goto getTrail;
    }

#if MBCS_UNROLL_SINGLE_FROM_BMP
    /* unrolling makes it slower on Pentium III/Windows 2000?! */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=4) {
        int32_t count, loops;
        uint16_t andedValues;

        loops=count=targetCapacity>>2;
        do {
            c=*source++;
            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;

            /* were all 4 entries really valid? */
            if(andedValues<minValue) {
                /* no, return to the first of these 4 */
                source-=4;
                target-=4;
                break;
            }
        } while(--count>0);
        count=loops-count;
        targetCapacity-=4*count;

        if(offsets!=NULL) {
            lastSource+=4*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }

        c=0;
    }
#endif

    while(targetCapacity>0) {
        /*
         * Get a correct Unicode code point:
         * a single UChar for a BMP code point or
         * a matched surrogate pair for a "supplementary code point".
         */
        c=*source++;
        /*
         * Do not immediately check for single surrogates:
         * Assume that they are unassigned and check for them in that case.
         * This speeds up the conversion of assigned characters.
         */
        /* convert the Unicode code point in c into codepage bytes */
        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
            *target++=(uint8_t)c;
            --targetCapacity;
            c=0;
            continue;
        }
        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
        /* is this code point assigned, or do we use fallbacks? */
        if(value>=minValue) {
            /* assigned, write the output character bytes from value and length */
            /* length==1 */
            /* this is easy because we know that there is enough space */
            *target++=(uint8_t)value;
            --targetCapacity;

            /* normal end of conversion: prepare for a new character */
            c=0;
            continue;
        } else if(!U16_IS_SURROGATE(c)) {
            /* normal, unassigned BMP character */
        } else if(U16_IS_SURROGATE_LEAD(c)) {
getTrail:
            if(source<sourceLimit) {
                /* test the following code unit */
                UChar trail=*source;
                if(U16_IS_TRAIL(trail)) {
                    ++source;
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                    /* this codepage does not map supplementary code points */
                    /* callback(unassigned) */
                } else {
                    /* this is an unmatched lead code unit (1st surrogate) */
                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                }
            } else {
                /* no more input */
                if (pArgs->flush) {
                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
                }
                break;
            }
        } else {
            /* this is an unmatched trail code unit (2nd surrogate) */
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            break;
        }

        /* c does not have a mapping */

        /* get the number of code units for c to correctly advance sourceIndex */
        length=U16_LENGTH(c);

        /* set offsets since the start or the last extension */
        if(offsets!=NULL) {
            int32_t count=(int32_t)(source-lastSource);

            /* do not set the offset for this character */
            count-=length;

            while(count>0) {
                *offsets++=sourceIndex++;
                --count;
            }
            /* offsets and sourceIndex are now set for the current character */
        }

        /* try an extension mapping */
        lastSource=source;
        c=_extFromU(cnv, cnv->sharedData,
                    c, &source, sourceLimit,
                    &target, (const uint8_t *)(pArgs->targetLimit),
                    &offsets, sourceIndex,
                    pArgs->flush,
                    pErrorCode);
        /* skip the code units of c plus whatever _extFromU() consumed beyond them */
        sourceIndex+=length+(int32_t)(source-lastSource);
        lastSource=source;

        if(U_FAILURE(*pErrorCode)) {
            /* not mappable or buffer overflow */
            break;
        } else {
            /* a mapping was written to the target, continue */

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
            length=(int32_t)(sourceLimit-source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_FROM_BMP
        /* unrolling makes it slower on Pentium III/Windows 2000?! */
        goto unrolled;
#endif
    }

    /*
     * targetCapacity was clamped to the remaining source length, so the loop
     * ending does not by itself tell whether the target ran out of space
     */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        size_t count=source-lastSource;
        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
            /*
            Caller gave us a partial supplementary character,
            which this function couldn't convert in any case.
            The callback will handle the offset.
            */
            count--;
        }
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

U_CFUNC void
ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *mbcsIndex;
    const uint8_t *p, *bytes;
    uint8_t outputType;

    UChar32 c;

    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint32_t value;
    /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
    uint8_t siBytes[2] = {0, 0};
    uint8_t soBytes[2] = {0, 0};
    uint8_t siLength, soLength;
    int32_t length = 0, prevLength;
    uint8_t unicodeMask;

    cnv=pArgs->converter;

    if(cnv->preFromUFirstCP>=0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
            return;
        }
    }

    /* use optimized function if possible */
    outputType=cnv->sharedData->mbcs.outputType;
    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    } else if(outputType==MBCS_OUTPUT_2 &&
cnv->sharedData->mbcs.utf8Friendly) { 4112 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 4113 return; 4114 } 4115 4116 /* set up the local pointers */ 4117 source=pArgs->source; 4118 sourceLimit=pArgs->sourceLimit; 4119 target=(uint8_t *)pArgs->target; 4120 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 4121 offsets=pArgs->offsets; 4122 4123 table=cnv->sharedData->mbcs.fromUnicodeTable; 4124 if(cnv->sharedData->mbcs.utf8Friendly) { 4125 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 4126 } else { 4127 mbcsIndex=NULL; 4128 } 4129 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4130 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4131 } else { 4132 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 4133 } 4134 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4135 4136 /* get the converter state from UConverter */ 4137 c=cnv->fromUChar32; 4138 4139 if(outputType==MBCS_OUTPUT_2_SISO) { 4140 prevLength=cnv->fromUnicodeStatus; 4141 if(prevLength==0) { 4142 /* set the real value */ 4143 prevLength=1; 4144 } 4145 } else { 4146 /* prevent fromUnicodeStatus from being set to something non-0 */ 4147 prevLength=0; 4148 } 4149 4150 /* sourceIndex=-1 if the current character began in the previous buffer */ 4151 prevSourceIndex=-1; 4152 sourceIndex= c==0 ? 0 : -1; 4153 nextSourceIndex=0; 4154 4155 /* Get the SI/SO character for the converter */ 4156 siLength = getSISOBytes(SI, cnv->options, siBytes); 4157 soLength = getSISOBytes(SO, cnv->options, soBytes); 4158 4159 /* conversion loop */ 4160 /* 4161 * This is another piece of ugly code: 4162 * A goto into the loop if the converter state contains a first surrogate 4163 * from the previous function call. 4164 * It saves me to check in each loop iteration a check of if(c==0) 4165 * and duplicating the trail-surrogate-handling code in the else 4166 * branch of that check. 
4167 * I could not find any other way to get around this other than 4168 * using a function call for the conversion and callback, which would 4169 * be even more inefficient. 4170 * 4171 * Markus Scherer 2000-jul-19 4172 */ 4173 if(c!=0 && targetCapacity>0) { 4174 goto getTrail; 4175 } 4176 4177 while(source<sourceLimit) { 4178 /* 4179 * This following test is to see if available input would overflow the output. 4180 * It does not catch output of more than one byte that 4181 * overflows as a result of a multi-byte character or callback output 4182 * from the last source character. 4183 * Therefore, those situations also test for overflows and will 4184 * then break the loop, too. 4185 */ 4186 if(targetCapacity>0) { 4187 /* 4188 * Get a correct Unicode code point: 4189 * a single UChar for a BMP code point or 4190 * a matched surrogate pair for a "supplementary code point". 4191 */ 4192 c=*source++; 4193 ++nextSourceIndex; 4194 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4195 *target++=(uint8_t)c; 4196 if(offsets!=NULL) { 4197 *offsets++=sourceIndex; 4198 prevSourceIndex=sourceIndex; 4199 sourceIndex=nextSourceIndex; 4200 } 4201 --targetCapacity; 4202 c=0; 4203 continue; 4204 } 4205 /* 4206 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4207 * to avoid dealing with surrogates. 4208 * MBCS_FAST_MAX must be >=0xd7ff. 4209 */ 4210 if(c<=0xd7ff && mbcsIndex!=NULL) { 4211 value=mbcsIndex[c>>6]; 4212 4213 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4214 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4215 switch(outputType) { 4216 case MBCS_OUTPUT_2: 4217 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4218 if(value<=0xff) { 4219 if(value==0) { 4220 goto unassigned; 4221 } else { 4222 length=1; 4223 } 4224 } else { 4225 length=2; 4226 } 4227 break; 4228 case MBCS_OUTPUT_2_SISO: 4229 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4230 /* 4231 * Save the old state in the converter object 4232 * right here, then change the local prevLength state variable if necessary. 4233 * Then, if this character turns out to be unassigned or a fallback that 4234 * is not taken, the callback code must not save the new state in the converter 4235 * because the new state is for a character that is not output. 4236 * However, the callback must still restore the state from the converter 4237 * in case the callback function changed it for its output. 4238 */ 4239 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4240 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4241 if(value<=0xff) { 4242 if(value==0) { 4243 goto unassigned; 4244 } else if(prevLength<=1) { 4245 length=1; 4246 } else { 4247 /* change from double-byte mode to single-byte */ 4248 if (siLength == 1) { 4249 value|=(uint32_t)siBytes[0]<<8; 4250 length = 2; 4251 } else if (siLength == 2) { 4252 value|=(uint32_t)siBytes[1]<<8; 4253 value|=(uint32_t)siBytes[0]<<16; 4254 length = 3; 4255 } 4256 prevLength=1; 4257 } 4258 } else { 4259 if(prevLength==2) { 4260 length=2; 4261 } else { 4262 /* change from single-byte mode to double-byte */ 4263 if (soLength == 1) { 4264 value|=(uint32_t)soBytes[0]<<16; 4265 length = 3; 4266 } else if (soLength == 2) { 4267 value|=(uint32_t)soBytes[1]<<16; 4268 value|=(uint32_t)soBytes[0]<<24; 4269 length = 4; 4270 } 4271 prevLength=2; 4272 } 4273 } 4274 break; 4275 case MBCS_OUTPUT_DBCS_ONLY: 4276 /* table with single-byte results, but only DBCS mappings used */ 4277 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4278 if(value<=0xff) { 4279 /* no mapping or SBCS result, not 
taken for DBCS-only */ 4280 goto unassigned; 4281 } else { 4282 length=2; 4283 } 4284 break; 4285 case MBCS_OUTPUT_3: 4286 p=bytes+(value+(c&0x3f))*3; 4287 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4288 if(value<=0xff) { 4289 if(value==0) { 4290 goto unassigned; 4291 } else { 4292 length=1; 4293 } 4294 } else if(value<=0xffff) { 4295 length=2; 4296 } else { 4297 length=3; 4298 } 4299 break; 4300 case MBCS_OUTPUT_4: 4301 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4302 if(value<=0xff) { 4303 if(value==0) { 4304 goto unassigned; 4305 } else { 4306 length=1; 4307 } 4308 } else if(value<=0xffff) { 4309 length=2; 4310 } else if(value<=0xffffff) { 4311 length=3; 4312 } else { 4313 length=4; 4314 } 4315 break; 4316 case MBCS_OUTPUT_3_EUC: 4317 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4318 /* EUC 16-bit fixed-length representation */ 4319 if(value<=0xff) { 4320 if(value==0) { 4321 goto unassigned; 4322 } else { 4323 length=1; 4324 } 4325 } else if((value&0x8000)==0) { 4326 value|=0x8e8000; 4327 length=3; 4328 } else if((value&0x80)==0) { 4329 value|=0x8f0080; 4330 length=3; 4331 } else { 4332 length=2; 4333 } 4334 break; 4335 case MBCS_OUTPUT_4_EUC: 4336 p=bytes+(value+(c&0x3f))*3; 4337 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4338 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4339 if(value<=0xff) { 4340 if(value==0) { 4341 goto unassigned; 4342 } else { 4343 length=1; 4344 } 4345 } else if(value<=0xffff) { 4346 length=2; 4347 } else if((value&0x800000)==0) { 4348 value|=0x8e800000; 4349 length=4; 4350 } else if((value&0x8000)==0) { 4351 value|=0x8f008000; 4352 length=4; 4353 } else { 4354 length=3; 4355 } 4356 break; 4357 default: 4358 /* must not occur */ 4359 /* 4360 * To avoid compiler warnings that value & length may be 4361 * used without having been initialized, we set them here. 4362 * In reality, this is unreachable code. 
4363 * Not having a default branch also causes warnings with 4364 * some compilers. 4365 */ 4366 value=0; 4367 length=0; 4368 break; 4369 } 4370 /* output the value */ 4371 } else { 4372 /* 4373 * This also tests if the codepage maps single surrogates. 4374 * If it does, then surrogates are not paired but mapped separately. 4375 * Note that in this case unmatched surrogates are not detected. 4376 */ 4377 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4378 if(U16_IS_SURROGATE_LEAD(c)) { 4379 getTrail: 4380 if(source<sourceLimit) { 4381 /* test the following code unit */ 4382 UChar trail=*source; 4383 if(U16_IS_TRAIL(trail)) { 4384 ++source; 4385 ++nextSourceIndex; 4386 c=U16_GET_SUPPLEMENTARY(c, trail); 4387 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4388 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4389 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4390 /* callback(unassigned) */ 4391 goto unassigned; 4392 } 4393 /* convert this supplementary code point */ 4394 /* exit this condition tree */ 4395 } else { 4396 /* this is an unmatched lead code unit (1st surrogate) */ 4397 /* callback(illegal) */ 4398 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4399 break; 4400 } 4401 } else { 4402 /* no more input */ 4403 break; 4404 } 4405 } else { 4406 /* this is an unmatched trail code unit (2nd surrogate) */ 4407 /* callback(illegal) */ 4408 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4409 break; 4410 } 4411 } 4412 4413 /* convert the Unicode code point in c into codepage bytes */ 4414 4415 /* 4416 * The basic lookup is a triple-stage compact array (trie) lookup. 4417 * For details see the beginning of this file. 4418 * 4419 * Single-byte codepages are handled with a different data structure 4420 * by _MBCSSingle... functions. 4421 * 4422 * The result consists of a 32-bit value from stage 2 and 4423 * a pointer to as many bytes as are stored per character. 4424 * The pointer points to the character's bytes in stage 3. 
4425 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4426 * for that pointer, while bits 31..16 are flags for which of 4427 * the 16 characters in the block are roundtrip-assigned. 4428 * 4429 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4430 * respectively as uint32_t, in the platform encoding. 4431 * For 3-byte codepages, the bytes are always stored in big-endian order. 4432 * 4433 * For EUC encodings that use only either 0x8e or 0x8f as the first 4434 * byte of their longest byte sequences, the first two bytes in 4435 * this third stage indicate with their 7th bits whether these bytes 4436 * are to be written directly or actually need to be preceeded by 4437 * one of the two Single-Shift codes. With this, the third stage 4438 * stores one byte fewer per character than the actual maximum length of 4439 * EUC byte sequences. 4440 * 4441 * Other than that, leading zero bytes are removed and the other 4442 * bytes output. A single zero byte may be output if the "assigned" 4443 * bit in stage 2 was on. 4444 * The data structure does not support zero byte output as a fallback, 4445 * and also does not allow output of leading zeros. 4446 */ 4447 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4448 4449 /* get the bytes and the length for the output */ 4450 switch(outputType) { 4451 case MBCS_OUTPUT_2: 4452 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4453 if(value<=0xff) { 4454 length=1; 4455 } else { 4456 length=2; 4457 } 4458 break; 4459 case MBCS_OUTPUT_2_SISO: 4460 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4461 /* 4462 * Save the old state in the converter object 4463 * right here, then change the local prevLength state variable if necessary. 4464 * Then, if this character turns out to be unassigned or a fallback that 4465 * is not taken, the callback code must not save the new state in the converter 4466 * because the new state is for a character that is not output. 
4467 * However, the callback must still restore the state from the converter 4468 * in case the callback function changed it for its output. 4469 */ 4470 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4471 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4472 if(value<=0xff) { 4473 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4474 /* no mapping, leave value==0 */ 4475 length=0; 4476 } else if(prevLength<=1) { 4477 length=1; 4478 } else { 4479 /* change from double-byte mode to single-byte */ 4480 if (siLength == 1) { 4481 value|=(uint32_t)siBytes[0]<<8; 4482 length = 2; 4483 } else if (siLength == 2) { 4484 value|=(uint32_t)siBytes[1]<<8; 4485 value|=(uint32_t)siBytes[0]<<16; 4486 length = 3; 4487 } 4488 prevLength=1; 4489 } 4490 } else { 4491 if(prevLength==2) { 4492 length=2; 4493 } else { 4494 /* change from single-byte mode to double-byte */ 4495 if (soLength == 1) { 4496 value|=(uint32_t)soBytes[0]<<16; 4497 length = 3; 4498 } else if (soLength == 2) { 4499 value|=(uint32_t)soBytes[1]<<16; 4500 value|=(uint32_t)soBytes[0]<<24; 4501 length = 4; 4502 } 4503 prevLength=2; 4504 } 4505 } 4506 break; 4507 case MBCS_OUTPUT_DBCS_ONLY: 4508 /* table with single-byte results, but only DBCS mappings used */ 4509 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4510 if(value<=0xff) { 4511 /* no mapping or SBCS result, not taken for DBCS-only */ 4512 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4513 length=0; 4514 } else { 4515 length=2; 4516 } 4517 break; 4518 case MBCS_OUTPUT_3: 4519 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4520 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4521 if(value<=0xff) { 4522 length=1; 4523 } else if(value<=0xffff) { 4524 length=2; 4525 } else { 4526 length=3; 4527 } 4528 break; 4529 case MBCS_OUTPUT_4: 4530 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4531 if(value<=0xff) { 4532 length=1; 4533 } else if(value<=0xffff) { 4534 length=2; 4535 } else 
if(value<=0xffffff) { 4536 length=3; 4537 } else { 4538 length=4; 4539 } 4540 break; 4541 case MBCS_OUTPUT_3_EUC: 4542 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4543 /* EUC 16-bit fixed-length representation */ 4544 if(value<=0xff) { 4545 length=1; 4546 } else if((value&0x8000)==0) { 4547 value|=0x8e8000; 4548 length=3; 4549 } else if((value&0x80)==0) { 4550 value|=0x8f0080; 4551 length=3; 4552 } else { 4553 length=2; 4554 } 4555 break; 4556 case MBCS_OUTPUT_4_EUC: 4557 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4558 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4559 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4560 if(value<=0xff) { 4561 length=1; 4562 } else if(value<=0xffff) { 4563 length=2; 4564 } else if((value&0x800000)==0) { 4565 value|=0x8e800000; 4566 length=4; 4567 } else if((value&0x8000)==0) { 4568 value|=0x8f008000; 4569 length=4; 4570 } else { 4571 length=3; 4572 } 4573 break; 4574 default: 4575 /* must not occur */ 4576 /* 4577 * To avoid compiler warnings that value & length may be 4578 * used without having been initialized, we set them here. 4579 * In reality, this is unreachable code. 4580 * Not having a default branch also causes warnings with 4581 * some compilers. 4582 */ 4583 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4584 length=0; 4585 break; 4586 } 4587 4588 /* is this code point assigned, or do we use fallbacks? */ 4589 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4590 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4591 ) { 4592 /* 4593 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4594 * There is no way with this data structure for fallback output 4595 * to be a zero byte. 
4596 */ 4597 4598 unassigned: 4599 /* try an extension mapping */ 4600 pArgs->source=source; 4601 c=_extFromU(cnv, cnv->sharedData, 4602 c, &source, sourceLimit, 4603 &target, target+targetCapacity, 4604 &offsets, sourceIndex, 4605 pArgs->flush, 4606 pErrorCode); 4607 nextSourceIndex+=(int32_t)(source-pArgs->source); 4608 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4609 4610 if(U_FAILURE(*pErrorCode)) { 4611 /* not mappable or buffer overflow */ 4612 break; 4613 } else { 4614 /* a mapping was written to the target, continue */ 4615 4616 /* recalculate the targetCapacity after an extension mapping */ 4617 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4618 4619 /* normal end of conversion: prepare for a new character */ 4620 if(offsets!=NULL) { 4621 prevSourceIndex=sourceIndex; 4622 sourceIndex=nextSourceIndex; 4623 } 4624 continue; 4625 } 4626 } 4627 } 4628 4629 /* write the output character bytes from value and length */ 4630 /* from the first if in the loop we know that targetCapacity>0 */ 4631 if(length<=targetCapacity) { 4632 if(offsets==NULL) { 4633 switch(length) { 4634 /* each branch falls through to the next one */ 4635 case 4: 4636 *target++=(uint8_t)(value>>24); 4637 case 3: /*fall through*/ 4638 *target++=(uint8_t)(value>>16); 4639 case 2: /*fall through*/ 4640 *target++=(uint8_t)(value>>8); 4641 case 1: /*fall through*/ 4642 *target++=(uint8_t)value; 4643 default: 4644 /* will never occur */ 4645 break; 4646 } 4647 } else { 4648 switch(length) { 4649 /* each branch falls through to the next one */ 4650 case 4: 4651 *target++=(uint8_t)(value>>24); 4652 *offsets++=sourceIndex; 4653 case 3: /*fall through*/ 4654 *target++=(uint8_t)(value>>16); 4655 *offsets++=sourceIndex; 4656 case 2: /*fall through*/ 4657 *target++=(uint8_t)(value>>8); 4658 *offsets++=sourceIndex; 4659 case 1: /*fall through*/ 4660 *target++=(uint8_t)value; 4661 *offsets++=sourceIndex; 4662 default: 4663 /* will never occur */ 4664 break; 4665 } 4666 } 4667 
targetCapacity-=length; 4668 } else { 4669 uint8_t *charErrorBuffer; 4670 4671 /* 4672 * We actually do this backwards here: 4673 * In order to save an intermediate variable, we output 4674 * first to the overflow buffer what does not fit into the 4675 * regular target. 4676 */ 4677 /* we know that 1<=targetCapacity<length<=4 */ 4678 length-=targetCapacity; 4679 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4680 switch(length) { 4681 /* each branch falls through to the next one */ 4682 case 3: 4683 *charErrorBuffer++=(uint8_t)(value>>16); 4684 case 2: /*fall through*/ 4685 *charErrorBuffer++=(uint8_t)(value>>8); 4686 case 1: /*fall through*/ 4687 *charErrorBuffer=(uint8_t)value; 4688 default: 4689 /* will never occur */ 4690 break; 4691 } 4692 cnv->charErrorBufferLength=(int8_t)length; 4693 4694 /* now output what fits into the regular target */ 4695 value>>=8*length; /* length was reduced by targetCapacity */ 4696 switch(targetCapacity) { 4697 /* each branch falls through to the next one */ 4698 case 3: 4699 *target++=(uint8_t)(value>>16); 4700 if(offsets!=NULL) { 4701 *offsets++=sourceIndex; 4702 } 4703 case 2: /*fall through*/ 4704 *target++=(uint8_t)(value>>8); 4705 if(offsets!=NULL) { 4706 *offsets++=sourceIndex; 4707 } 4708 case 1: /*fall through*/ 4709 *target++=(uint8_t)value; 4710 if(offsets!=NULL) { 4711 *offsets++=sourceIndex; 4712 } 4713 default: 4714 /* will never occur */ 4715 break; 4716 } 4717 4718 /* target overflow */ 4719 targetCapacity=0; 4720 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4721 c=0; 4722 break; 4723 } 4724 4725 /* normal end of conversion: prepare for a new character */ 4726 c=0; 4727 if(offsets!=NULL) { 4728 prevSourceIndex=sourceIndex; 4729 sourceIndex=nextSourceIndex; 4730 } 4731 continue; 4732 } else { 4733 /* target is full */ 4734 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4735 break; 4736 } 4737 } 4738 4739 /* 4740 * the end of the input stream and detection of truncated input 4741 * are handled by the framework, but for 
EBCDIC_STATEFUL conversion 4742 * we need to emit an SI at the very end 4743 * 4744 * conditions: 4745 * successful 4746 * EBCDIC_STATEFUL in DBCS mode 4747 * end of input and no truncated input 4748 */ 4749 if( U_SUCCESS(*pErrorCode) && 4750 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4751 pArgs->flush && source>=sourceLimit && c==0 4752 ) { 4753 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4754 if(targetCapacity>0) { 4755 *target++=(uint8_t)siBytes[0]; 4756 if (siLength == 2) { 4757 if (targetCapacity<2) { 4758 cnv->charErrorBuffer[0]=(uint8_t)siBytes[1]; 4759 cnv->charErrorBufferLength=1; 4760 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4761 } else { 4762 *target++=(uint8_t)siBytes[1]; 4763 } 4764 } 4765 if(offsets!=NULL) { 4766 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4767 *offsets++=prevSourceIndex; 4768 } 4769 } else { 4770 /* target is full */ 4771 cnv->charErrorBuffer[0]=(uint8_t)siBytes[0]; 4772 if (siLength == 2) { 4773 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1]; 4774 } 4775 cnv->charErrorBufferLength=siLength; 4776 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4777 } 4778 prevLength=1; /* we switched into SBCS */ 4779 } 4780 4781 /* set the converter state back into UConverter */ 4782 cnv->fromUChar32=c; 4783 cnv->fromUnicodeStatus=prevLength; 4784 4785 /* write back the updated pointers */ 4786 pArgs->source=source; 4787 pArgs->target=(char *)target; 4788 pArgs->offsets=offsets; 4789 } 4790 4791 /* 4792 * This is another simple conversion function for internal use by other 4793 * conversion implementations. 4794 * It does not use the converter state nor call callbacks. 4795 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4796 * It handles conversion extensions but not GB 18030. 4797 * 4798 * It converts one single Unicode code point into codepage bytes, encoded 4799 * as one 32-bit value. 
The function returns the number of bytes in *pValue: 4800 * 1..4 the number of bytes in *pValue 4801 * 0 unassigned (*pValue undefined) 4802 * -1 illegal (currently not used, *pValue undefined) 4803 * 4804 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4805 * the second to last byte in bits 15..8, etc. 4806 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4807 */ 4808 U_CFUNC int32_t 4809 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4810 UChar32 c, uint32_t *pValue, 4811 UBool useFallback) { 4812 const int32_t *cx; 4813 const uint16_t *table; 4814 #if 0 4815 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4816 const uint8_t *p; 4817 #endif 4818 uint32_t stage2Entry; 4819 uint32_t value; 4820 int32_t length; 4821 4822 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4823 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4824 table=sharedData->mbcs.fromUnicodeTable; 4825 4826 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4827 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4828 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4829 /* is this code point assigned, or do we use fallbacks? */ 4830 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4831 *pValue=value&0xff; 4832 return 1; 4833 } 4834 } else /* outputType!=MBCS_OUTPUT_1 */ { 4835 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4836 4837 /* get the bytes and the length for the output */ 4838 switch(sharedData->mbcs.outputType) { 4839 case MBCS_OUTPUT_2: 4840 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4841 if(value<=0xff) { 4842 length=1; 4843 } else { 4844 length=2; 4845 } 4846 break; 4847 #if 0 4848 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4849 case MBCS_OUTPUT_DBCS_ONLY: 4850 /* table with single-byte results, but only DBCS mappings used */ 4851 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4852 if(value<=0xff) { 4853 /* no mapping or SBCS result, not taken for DBCS-only */ 4854 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4855 length=0; 4856 } else { 4857 length=2; 4858 } 4859 break; 4860 case MBCS_OUTPUT_3: 4861 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4862 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4863 if(value<=0xff) { 4864 length=1; 4865 } else if(value<=0xffff) { 4866 length=2; 4867 } else { 4868 length=3; 4869 } 4870 break; 4871 case MBCS_OUTPUT_4: 4872 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4873 if(value<=0xff) { 4874 length=1; 4875 } else if(value<=0xffff) { 4876 length=2; 4877 } else if(value<=0xffffff) { 4878 length=3; 4879 } else { 4880 length=4; 4881 } 4882 break; 4883 case MBCS_OUTPUT_3_EUC: 4884 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4885 /* EUC 16-bit fixed-length representation */ 4886 if(value<=0xff) { 4887 length=1; 4888 } else if((value&0x8000)==0) { 4889 value|=0x8e8000; 4890 length=3; 4891 } else if((value&0x80)==0) { 4892 value|=0x8f0080; 4893 length=3; 4894 } else { 4895 length=2; 4896 } 4897 break; 4898 case 
MBCS_OUTPUT_4_EUC: 4899 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4900 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4901 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4902 if(value<=0xff) { 4903 length=1; 4904 } else if(value<=0xffff) { 4905 length=2; 4906 } else if((value&0x800000)==0) { 4907 value|=0x8e800000; 4908 length=4; 4909 } else if((value&0x8000)==0) { 4910 value|=0x8f008000; 4911 length=4; 4912 } else { 4913 length=3; 4914 } 4915 break; 4916 #endif 4917 default: 4918 /* must not occur */ 4919 return -1; 4920 } 4921 4922 /* is this code point assigned, or do we use fallbacks? */ 4923 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4924 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4925 ) { 4926 /* 4927 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4928 * There is no way with this data structure for fallback output 4929 * to be a zero byte. 4930 */ 4931 /* assigned */ 4932 *pValue=value; 4933 return length; 4934 } 4935 } 4936 } 4937 4938 cx=sharedData->mbcs.extIndexes; 4939 if(cx!=NULL) { 4940 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4941 return length>=0 ? length : -length; /* return abs(length); */ 4942 } 4943 4944 /* unassigned */ 4945 return 0; 4946 } 4947 4948 4949 #if 0 4950 /* 4951 * This function has been moved to ucnv2022.c for inlining. 4952 * This implementation is here only for documentation purposes 4953 */ 4954 4955 /** 4956 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4957 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4958 * It does not handle conversion extensions (_extFromU()). 4959 * 4960 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4961 */ 4962 U_CFUNC int32_t 4963 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4964 UChar32 c, 4965 UBool useFallback) { 4966 const uint16_t *table; 4967 int32_t value; 4968 4969 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4970 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4971 return -1; 4972 } 4973 4974 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4975 table=sharedData->mbcs.fromUnicodeTable; 4976 4977 /* get the byte for the output */ 4978 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4979 /* is this code point assigned, or do we use fallbacks? */ 4980 if(useFallback ? value>=0x800 : value>=0xc00) { 4981 return value&0xff; 4982 } else { 4983 return -1; 4984 } 4985 } 4986 #endif 4987 4988 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 4989 4990 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 4991 static const UChar32 4992 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 4993 4994 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/ 4995 static const UChar32 4996 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 4997 4998 static void 4999 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 5000 UConverterToUnicodeArgs *pToUArgs, 5001 UErrorCode *pErrorCode) { 5002 UConverter *utf8, *cnv; 5003 const uint8_t *source, *sourceLimit; 5004 uint8_t *target; 5005 int32_t targetCapacity; 5006 5007 const uint16_t *table, *sbcsIndex; 5008 const uint16_t *results; 5009 5010 int8_t oldToULength, toULength, toULimit; 5011 5012 UChar32 c; 5013 uint8_t b, t1, t2; 5014 5015 uint32_t asciiRoundtrips; 5016 uint16_t value, minValue; 5017 UBool hasSupplementary; 5018 5019 /* set up the local pointers */ 5020 utf8=pToUArgs->converter; 5021 cnv=pFromUArgs->converter; 5022 source=(uint8_t *)pToUArgs->source; 5023 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 5024 target=(uint8_t *)pFromUArgs->target; 5025 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 5026 5027 table=cnv->sharedData->mbcs.fromUnicodeTable; 5028 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex; 5029 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 5030 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 5031 } else { 5032 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 5033 } 5034 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 5035 5036 if(cnv->useFallback) { 5037 /* use all roundtrip and fallback results */ 5038 minValue=0x800; 5039 } else { 5040 /* use only roundtrips and fallbacks from private-use characters */ 5041 minValue=0xc00; 5042 } 5043 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 5044 5045 /* get the converter state from the UTF-8 UConverter */ 5046 c=(UChar32)utf8->toUnicodeStatus; 5047 if(c!=0) { 5048 toULength=oldToULength=utf8->toULength; 5049 toULimit=(int8_t)utf8->mode; 5050 } else { 5051 toULength=oldToULength=toULimit=0; 5052 } 5053 5054 /* 5055 * Make sure that the last byte sequence before sourceLimit is complete 5056 * or runs into 
a lead byte. 5057 * Do not go back into the bytes that will be read for finishing a partial 5058 * sequence from the previous buffer. 5059 * In the conversion loop compare source with sourceLimit only once 5060 * per multi-byte character. 5061 */ 5062 { 5063 int32_t i, length; 5064 5065 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 5066 for(i=0; i<3 && i<length;) { 5067 b=*(sourceLimit-i-1); 5068 if(U8_IS_TRAIL(b)) { 5069 ++i; 5070 } else { 5071 if(i<U8_COUNT_TRAIL_BYTES(b)) { 5072 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 5073 sourceLimit-=i+1; 5074 } 5075 break; 5076 } 5077 } 5078 } 5079 5080 if(c!=0 && targetCapacity>0) { 5081 utf8->toUnicodeStatus=0; 5082 utf8->toULength=0; 5083 goto moreBytes; 5084 /* 5085 * Note: We could avoid the goto by duplicating some of the moreBytes 5086 * code, but only up to the point of collecting a complete UTF-8 5087 * sequence; then recurse for the toUBytes[toULength] 5088 * and then continue with normal conversion. 5089 * 5090 * If so, move this code to just after initializing the minimum 5091 * set of local variables for reading the UTF-8 input 5092 * (utf8, source, target, limits but not cnv, table, minValue, etc.). 
5093 * 5094 * Potential advantages: 5095 * - avoid the goto 5096 * - oldToULength could become a local variable in just those code blocks 5097 * that deal with buffer boundaries 5098 * - possibly faster if the goto prevents some compiler optimizations 5099 * (this would need measuring to confirm) 5100 * Disadvantage: 5101 * - code duplication 5102 */ 5103 } 5104 5105 /* conversion loop */ 5106 while(source<sourceLimit) { 5107 if(targetCapacity>0) { 5108 b=*source++; 5109 if((int8_t)b>=0) { 5110 /* convert ASCII */ 5111 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 5112 *target++=(uint8_t)b; 5113 --targetCapacity; 5114 continue; 5115 } else { 5116 c=b; 5117 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c); 5118 } 5119 } else { 5120 if(b<0xe0) { 5121 if( /* handle U+0080..U+07FF inline */ 5122 b>=0xc2 && 5123 (t1=(uint8_t)(*source-0x80)) <= 0x3f 5124 ) { 5125 c=b&0x1f; 5126 ++source; 5127 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1); 5128 if(value>=minValue) { 5129 *target++=(uint8_t)value; 5130 --targetCapacity; 5131 continue; 5132 } else { 5133 c=(c<<6)|t1; 5134 } 5135 } else { 5136 c=-1; 5137 } 5138 } else if(b==0xe0) { 5139 if( /* handle U+0800..U+0FFF inline */ 5140 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 && 5141 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5142 ) { 5143 c=t1; 5144 source+=2; 5145 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2); 5146 if(value>=minValue) { 5147 *target++=(uint8_t)value; 5148 --targetCapacity; 5149 continue; 5150 } else { 5151 c=(c<<6)|t2; 5152 } 5153 } else { 5154 c=-1; 5155 } 5156 } else { 5157 c=-1; 5158 } 5159 5160 if(c<0) { 5161 /* handle "complicated" and error cases, and continuing partial characters */ 5162 oldToULength=0; 5163 toULength=1; 5164 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 5165 c=b; 5166 moreBytes: 5167 while(toULength<toULimit) { 5168 /* 5169 * The sourceLimit may have been adjusted before the conversion loop 5170 * to stop before a truncated sequence. 
5171 * Here we need to use the real limit in case we have two truncated 5172 * sequences at the end. 5173 * See ticket #7492. 5174 */ 5175 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5176 b=*source; 5177 if(U8_IS_TRAIL(b)) { 5178 ++source; 5179 ++toULength; 5180 c=(c<<6)+b; 5181 } else { 5182 break; /* sequence too short, stop with toULength<toULimit */ 5183 } 5184 } else { 5185 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5186 source-=(toULength-oldToULength); 5187 while(oldToULength<toULength) { 5188 utf8->toUBytes[oldToULength++]=*source++; 5189 } 5190 utf8->toUnicodeStatus=c; 5191 utf8->toULength=toULength; 5192 utf8->mode=toULimit; 5193 pToUArgs->source=(char *)source; 5194 pFromUArgs->target=(char *)target; 5195 return; 5196 } 5197 } 5198 5199 if( toULength==toULimit && /* consumed all trail bytes */ 5200 (toULength==3 || toULength==2) && /* BMP */ 5201 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5202 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5203 ) { 5204 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5205 } else if( 5206 toULength==toULimit && toULength==4 && 5207 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5208 ) { 5209 /* supplementary code point */ 5210 if(!hasSupplementary) { 5211 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5212 value=0; 5213 } else { 5214 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 5215 } 5216 } else { 5217 /* error handling: illegal UTF-8 byte sequence */ 5218 source-=(toULength-oldToULength); 5219 while(oldToULength<toULength) { 5220 utf8->toUBytes[oldToULength++]=*source++; 5221 } 5222 utf8->toULength=toULength; 5223 pToUArgs->source=(char *)source; 5224 pFromUArgs->target=(char *)target; 5225 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5226 return; 5227 } 5228 } 5229 } 5230 5231 if(value>=minValue) { 5232 /* output the mapping for c */ 5233 *target++=(uint8_t)value; 5234 --targetCapacity; 5235 } else { 5236 /* 
value<minValue means c is unassigned (unmappable) */ 5237 /* 5238 * Try an extension mapping. 5239 * Pass in no source because we don't have UTF-16 input. 5240 * If we have a partial match on c, we will return and revert 5241 * to UTF-8->UTF-16->charset conversion. 5242 */ 5243 static const UChar nul=0; 5244 const UChar *noSource=&nul; 5245 c=_extFromU(cnv, cnv->sharedData, 5246 c, &noSource, noSource, 5247 &target, target+targetCapacity, 5248 NULL, -1, 5249 pFromUArgs->flush, 5250 pErrorCode); 5251 5252 if(U_FAILURE(*pErrorCode)) { 5253 /* not mappable or buffer overflow */ 5254 cnv->fromUChar32=c; 5255 break; 5256 } else if(cnv->preFromUFirstCP>=0) { 5257 /* 5258 * Partial match, return and revert to pivoting. 5259 * In normal from-UTF-16 conversion, we would just continue 5260 * but then exit the loop because the extension match would 5261 * have consumed the source. 5262 */ 5263 *pErrorCode=U_USING_DEFAULT_WARNING; 5264 break; 5265 } else { 5266 /* a mapping was written to the target, continue */ 5267 5268 /* recalculate the targetCapacity after an extension mapping */ 5269 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5270 } 5271 } 5272 } else { 5273 /* target is full */ 5274 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5275 break; 5276 } 5277 } 5278 5279 /* 5280 * The sourceLimit may have been adjusted before the conversion loop 5281 * to stop before a truncated sequence. 5282 * If so, then collect the truncated sequence now. 
5283 */ 5284 if(U_SUCCESS(*pErrorCode) && 5285 cnv->preFromUFirstCP<0 && 5286 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 5287 c=utf8->toUBytes[0]=b=*source++; 5288 toULength=1; 5289 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 5290 while(source<sourceLimit) { 5291 utf8->toUBytes[toULength++]=b=*source++; 5292 c=(c<<6)+b; 5293 } 5294 utf8->toUnicodeStatus=c; 5295 utf8->toULength=toULength; 5296 utf8->mode=toULimit; 5297 } 5298 5299 /* write back the updated pointers */ 5300 pToUArgs->source=(char *)source; 5301 pFromUArgs->target=(char *)target; 5302 } 5303 5304 static void 5305 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 5306 UConverterToUnicodeArgs *pToUArgs, 5307 UErrorCode *pErrorCode) { 5308 UConverter *utf8, *cnv; 5309 const uint8_t *source, *sourceLimit; 5310 uint8_t *target; 5311 int32_t targetCapacity; 5312 5313 const uint16_t *table, *mbcsIndex; 5314 const uint16_t *results; 5315 5316 int8_t oldToULength, toULength, toULimit; 5317 5318 UChar32 c; 5319 uint8_t b, t1, t2; 5320 5321 uint32_t stage2Entry; 5322 uint32_t asciiRoundtrips; 5323 uint16_t value; 5324 UBool hasSupplementary; 5325 5326 /* set up the local pointers */ 5327 utf8=pToUArgs->converter; 5328 cnv=pFromUArgs->converter; 5329 source=(uint8_t *)pToUArgs->source; 5330 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 5331 target=(uint8_t *)pFromUArgs->target; 5332 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 5333 5334 table=cnv->sharedData->mbcs.fromUnicodeTable; 5335 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 5336 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 5337 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 5338 } else { 5339 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 5340 } 5341 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 5342 5343 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 5344 5345 /* get the converter state from the UTF-8 UConverter */ 5346 
c=(UChar32)utf8->toUnicodeStatus; 5347 if(c!=0) { 5348 toULength=oldToULength=utf8->toULength; 5349 toULimit=(int8_t)utf8->mode; 5350 } else { 5351 toULength=oldToULength=toULimit=0; 5352 } 5353 5354 /* 5355 * Make sure that the last byte sequence before sourceLimit is complete 5356 * or runs into a lead byte. 5357 * Do not go back into the bytes that will be read for finishing a partial 5358 * sequence from the previous buffer. 5359 * In the conversion loop compare source with sourceLimit only once 5360 * per multi-byte character. 5361 */ 5362 { 5363 int32_t i, length; 5364 5365 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength); 5366 for(i=0; i<3 && i<length;) { 5367 b=*(sourceLimit-i-1); 5368 if(U8_IS_TRAIL(b)) { 5369 ++i; 5370 } else { 5371 if(i<U8_COUNT_TRAIL_BYTES(b)) { 5372 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */ 5373 sourceLimit-=i+1; 5374 } 5375 break; 5376 } 5377 } 5378 } 5379 5380 if(c!=0 && targetCapacity>0) { 5381 utf8->toUnicodeStatus=0; 5382 utf8->toULength=0; 5383 goto moreBytes; 5384 /* See note in ucnv_SBCSFromUTF8() about this goto. 
*/ 5385 } 5386 5387 /* conversion loop */ 5388 while(source<sourceLimit) { 5389 if(targetCapacity>0) { 5390 b=*source++; 5391 if((int8_t)b>=0) { 5392 /* convert ASCII */ 5393 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) { 5394 *target++=b; 5395 --targetCapacity; 5396 continue; 5397 } else { 5398 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b); 5399 if(value==0) { 5400 c=b; 5401 goto unassigned; 5402 } 5403 } 5404 } else { 5405 if(b>0xe0) { 5406 if( /* handle U+1000..U+D7FF inline */ 5407 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) || 5408 (b==0xed && (t1 <= 0x1f))) && 5409 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f 5410 ) { 5411 c=((b&0xf)<<6)|t1; 5412 source+=2; 5413 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2); 5414 if(value==0) { 5415 c=(c<<6)|t2; 5416 goto unassigned; 5417 } 5418 } else { 5419 c=-1; 5420 } 5421 } else if(b<0xe0) { 5422 if( /* handle U+0080..U+07FF inline */ 5423 b>=0xc2 && 5424 (t1=(uint8_t)(*source-0x80)) <= 0x3f 5425 ) { 5426 c=b&0x1f; 5427 ++source; 5428 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1); 5429 if(value==0) { 5430 c=(c<<6)|t1; 5431 goto unassigned; 5432 } 5433 } else { 5434 c=-1; 5435 } 5436 } else { 5437 c=-1; 5438 } 5439 5440 if(c<0) { 5441 /* handle "complicated" and error cases, and continuing partial characters */ 5442 oldToULength=0; 5443 toULength=1; 5444 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 5445 c=b; 5446 moreBytes: 5447 while(toULength<toULimit) { 5448 /* 5449 * The sourceLimit may have been adjusted before the conversion loop 5450 * to stop before a truncated sequence. 5451 * Here we need to use the real limit in case we have two truncated 5452 * sequences at the end. 5453 * See ticket #7492. 
5454 */ 5455 if(source<(uint8_t *)pToUArgs->sourceLimit) { 5456 b=*source; 5457 if(U8_IS_TRAIL(b)) { 5458 ++source; 5459 ++toULength; 5460 c=(c<<6)+b; 5461 } else { 5462 break; /* sequence too short, stop with toULength<toULimit */ 5463 } 5464 } else { 5465 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 5466 source-=(toULength-oldToULength); 5467 while(oldToULength<toULength) { 5468 utf8->toUBytes[oldToULength++]=*source++; 5469 } 5470 utf8->toUnicodeStatus=c; 5471 utf8->toULength=toULength; 5472 utf8->mode=toULimit; 5473 pToUArgs->source=(char *)source; 5474 pFromUArgs->target=(char *)target; 5475 return; 5476 } 5477 } 5478 5479 if( toULength==toULimit && /* consumed all trail bytes */ 5480 (toULength==3 || toULength==2) && /* BMP */ 5481 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 5482 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 5483 ) { 5484 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 5485 } else if( 5486 toULength==toULimit && toULength==4 && 5487 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 5488 ) { 5489 /* supplementary code point */ 5490 if(!hasSupplementary) { 5491 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 5492 stage2Entry=0; 5493 } else { 5494 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 5495 } 5496 } else { 5497 /* error handling: illegal UTF-8 byte sequence */ 5498 source-=(toULength-oldToULength); 5499 while(oldToULength<toULength) { 5500 utf8->toUBytes[oldToULength++]=*source++; 5501 } 5502 utf8->toULength=toULength; 5503 pToUArgs->source=(char *)source; 5504 pFromUArgs->target=(char *)target; 5505 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 5506 return; 5507 } 5508 5509 /* get the bytes and the length for the output */ 5510 /* MBCS_OUTPUT_2 */ 5511 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c); 5512 5513 /* is this code point assigned, or do we use fallbacks? 
*/ 5514 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 5515 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 5516 ) { 5517 goto unassigned; 5518 } 5519 } 5520 } 5521 5522 /* write the output character bytes from value and length */ 5523 /* from the first if in the loop we know that targetCapacity>0 */ 5524 if(value<=0xff) { 5525 /* this is easy because we know that there is enough space */ 5526 *target++=(uint8_t)value; 5527 --targetCapacity; 5528 } else /* length==2 */ { 5529 *target++=(uint8_t)(value>>8); 5530 if(2<=targetCapacity) { 5531 *target++=(uint8_t)value; 5532 targetCapacity-=2; 5533 } else { 5534 cnv->charErrorBuffer[0]=(char)value; 5535 cnv->charErrorBufferLength=1; 5536 5537 /* target overflow */ 5538 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5539 break; 5540 } 5541 } 5542 continue; 5543 5544 unassigned: 5545 { 5546 /* 5547 * Try an extension mapping. 5548 * Pass in no source because we don't have UTF-16 input. 5549 * If we have a partial match on c, we will return and revert 5550 * to UTF-8->UTF-16->charset conversion. 5551 */ 5552 static const UChar nul=0; 5553 const UChar *noSource=&nul; 5554 c=_extFromU(cnv, cnv->sharedData, 5555 c, &noSource, noSource, 5556 &target, target+targetCapacity, 5557 NULL, -1, 5558 pFromUArgs->flush, 5559 pErrorCode); 5560 5561 if(U_FAILURE(*pErrorCode)) { 5562 /* not mappable or buffer overflow */ 5563 cnv->fromUChar32=c; 5564 break; 5565 } else if(cnv->preFromUFirstCP>=0) { 5566 /* 5567 * Partial match, return and revert to pivoting. 5568 * In normal from-UTF-16 conversion, we would just continue 5569 * but then exit the loop because the extension match would 5570 * have consumed the source. 
5571 */ 5572 *pErrorCode=U_USING_DEFAULT_WARNING; 5573 break; 5574 } else { 5575 /* a mapping was written to the target, continue */ 5576 5577 /* recalculate the targetCapacity after an extension mapping */ 5578 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target); 5579 continue; 5580 } 5581 } 5582 } else { 5583 /* target is full */ 5584 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 5585 break; 5586 } 5587 } 5588 5589 /* 5590 * The sourceLimit may have been adjusted before the conversion loop 5591 * to stop before a truncated sequence. 5592 * If so, then collect the truncated sequence now. 5593 */ 5594 if(U_SUCCESS(*pErrorCode) && 5595 cnv->preFromUFirstCP<0 && 5596 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { 5597 c=utf8->toUBytes[0]=b=*source++; 5598 toULength=1; 5599 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 5600 while(source<sourceLimit) { 5601 utf8->toUBytes[toULength++]=b=*source++; 5602 c=(c<<6)+b; 5603 } 5604 utf8->toUnicodeStatus=c; 5605 utf8->toULength=toULength; 5606 utf8->mode=toULimit; 5607 } 5608 5609 /* write back the updated pointers */ 5610 pToUArgs->source=(char *)source; 5611 pFromUArgs->target=(char *)target; 5612 } 5613 5614 /* miscellaneous ------------------------------------------------------------ */ 5615 5616 static void 5617 ucnv_MBCSGetStarters(const UConverter* cnv, 5618 UBool starters[256], 5619 UErrorCode *) { 5620 const int32_t *state0; 5621 int i; 5622 5623 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState]; 5624 for(i=0; i<256; ++i) { 5625 /* all bytes that cause a state transition from state 0 are lead bytes */ 5626 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]); 5627 } 5628 } 5629 5630 /* 5631 * This is an internal function that allows other converter implementations 5632 * to check whether a byte is a lead byte. 
5633 */ 5634 U_CFUNC UBool 5635 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5636 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5637 } 5638 5639 static void 5640 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5641 int32_t offsetIndex, 5642 UErrorCode *pErrorCode) { 5643 UConverter *cnv=pArgs->converter; 5644 char *p, *subchar; 5645 char buffer[4]; 5646 int32_t length; 5647 5648 /* first, select between subChar and subChar1 */ 5649 if( cnv->subChar1!=0 && 5650 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5651 cnv->useSubChar1 : 5652 (cnv->invalidUCharBuffer[0]<=0xff)) 5653 ) { 5654 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5655 subchar=(char *)&cnv->subChar1; 5656 length=1; 5657 } else { 5658 /* select subChar in all other cases */ 5659 subchar=(char *)cnv->subChars; 5660 length=cnv->subCharLen; 5661 } 5662 5663 /* reset the selector for the next code point */ 5664 cnv->useSubChar1=FALSE; 5665 5666 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5667 p=buffer; 5668 5669 /* fromUnicodeStatus contains prevLength */ 5670 switch(length) { 5671 case 1: 5672 if(cnv->fromUnicodeStatus==2) { 5673 /* DBCS mode and SBCS sub char: change to SBCS */ 5674 cnv->fromUnicodeStatus=1; 5675 *p++=UCNV_SI; 5676 } 5677 *p++=subchar[0]; 5678 break; 5679 case 2: 5680 if(cnv->fromUnicodeStatus<=1) { 5681 /* SBCS mode and DBCS sub char: change to DBCS */ 5682 cnv->fromUnicodeStatus=2; 5683 *p++=UCNV_SO; 5684 } 5685 *p++=subchar[0]; 5686 *p++=subchar[1]; 5687 break; 5688 default: 5689 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5690 return; 5691 } 5692 subchar=buffer; 5693 length=(int32_t)(p-buffer); 5694 } 5695 5696 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5697 } 5698 5699 U_CFUNC UConverterType 5700 ucnv_MBCSGetType(const UConverter* converter) { 5701 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but 
here we cheat a little */ 5702 if(converter->sharedData->mbcs.countStates==1) { 5703 return (UConverterType)UCNV_SBCS; 5704 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5705 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5706 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5707 return (UConverterType)UCNV_DBCS; 5708 } 5709 return (UConverterType)UCNV_MBCS; 5710 } 5711 5712 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 5713