Home | History | Annotate | Download | only in common
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ucnvmbcs.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jul03
     16 *   created by: Markus W. Scherer
     17 *
     18 *   The current code in this file replaces the previous implementation
     19 *   of conversion code from multi-byte codepages to Unicode and back.
     20 *   This implementation supports the following:
     21 *   - legacy variable-length codepages with up to 4 bytes per character
     22 *   - all Unicode code points (up to 0x10ffff)
     23 *   - efficient distinction of unassigned vs. illegal byte sequences
     24 *   - it is possible in fromUnicode() to directly deal with simple
     25 *     stateful encodings (used for EBCDIC_STATEFUL)
     26 *   - it is possible to convert Unicode code points
     27 *     to a single zero byte (but not as a fallback except for SBCS)
     28 *
     29 *   Remaining limitations in fromUnicode:
     30 *   - byte sequences must not have leading zero bytes
     31 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
     32 *   - limitation to up to 4 bytes per character
     33 *
     34 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
     35 *   limitations and adds m:n character mappings and other features.
     36 *   See ucnv_ext.h for details.
     37 *
     38 *   Change history:
     39 *
     40 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
     41 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
     42 *                             macros to ucnvmbcs.h file
     43 */
     44 
     45 #include "unicode/utypes.h"
     46 
     47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     48 
     49 #include "unicode/ucnv.h"
     50 #include "unicode/ucnv_cb.h"
     51 #include "unicode/udata.h"
     52 #include "unicode/uset.h"
     53 #include "unicode/utf8.h"
     54 #include "unicode/utf16.h"
     55 #include "ucnv_bld.h"
     56 #include "ucnvmbcs.h"
     57 #include "ucnv_ext.h"
     58 #include "ucnv_cnv.h"
     59 #include "cmemory.h"
     60 #include "cstring.h"
     61 #include "umutex.h"
     62 #include "ustr_imp.h"
     63 
     64 /* control optimizations according to the platform */
     65 #define MBCS_UNROLL_SINGLE_TO_BMP 1
     66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
     67 
     68 /*
     69  * _MBCSHeader versions 5.3 & 4.3
     70  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
     71  *
     72  * This version is optional. Version 5 is used for incompatible data format changes.
     73  * makeconv will continue to generate version 4 files if possible.
     74  *
     75  * Changes from version 4:
     76  *
     77  * The main difference is an additional _MBCSHeader field with
     78  * - the length (number of uint32_t) of the _MBCSHeader
     79  * - flags for further incompatible data format changes
     80  * - flags for further, backward compatible data format changes
     81  *
     82  * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from
     83  * the file and needs to be reconstituted at load time.
     84  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
     85  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
     86  * (For details about these structures see below, and see ucnvmbcs.h.)
     87  *
     88  *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
     89  *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
     90  *   precision markers for all mappings.)
     91  *
     92  *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
     93  *   omitted data that can be reconstituted from the toUnicode data.
     94  *
     95  *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
     96  *   With only roundtrip mappings in the base fromUnicode data, this part is fully
     97  *   redundant with the mbcsIndex and will be reconstituted from that (also using the
     98  *   stage 1 table which contains the information about how stage 2 was compacted).
     99  *
    100  *   The rest of the stage 2 table, the part for code points above maxFastUChar,
    101  *   is stored in the file and will be appended to the reconstituted part.
    102  *
    103  *   The entire fromUBytes array is omitted from the file and will be reconstituted.
    104  *   This is done by enumerating all toUnicode roundtrip mappings, performing
    105  *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
    106  *   writing instead of reading the byte values.
    107  *
    108  * _MBCSHeader version 4.3
    109  *
    110  * Change from version 4.2:
    111  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
    112  *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
    113  *   files which can be used instead of stages 1 & 2.
    114  *   Faster lookups for roundtrips from most commonly used characters,
    115  *   and lookups from UTF-8 byte sequences with a natural bit distribution.
    116  *   See ucnvmbcs.h for more details.
    117  *
    118  * Change from version 4.1:
    119  * - Added an optional extension table structure at the end of the .cnv file.
    120  *   It is present if the upper bits of the header flags field contains a non-zero
    121  *   byte offset to it.
    122  *   Files that contain only a conversion table and no base table
    123  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
    124  *   These contain the base table name between the MBCS header and the extension
    125  *   data.
    126  *
    127  * Change from version 4.0:
    128  * - Replace header.reserved with header.fromUBytesLength so that all
    129  *   fields in the data have length.
    130  *
    131  * Changes from version 3 (for performance improvements):
    132  * - new bit distribution for state table entries
    133  * - reordered action codes
    134  * - new data structure for single-byte fromUnicode
    135  *   + stage 2 only contains indexes
    136  *   + stage 3 stores 16 bits per character with classification bits 15..8
    137  * - no multiplier for stage 1 entries
    138  * - stage 2 for non-single-byte codepages contains the index and the flags in
    139  *   one 32-bit value
    140  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
    141  *
    142  * For more details about old versions of the MBCS data structure, see
    143  * the corresponding versions of this file.
    144  *
    145  * Converting stateless codepage data ---------------------------------------***
    146  * (or codepage data with simple states) to Unicode.
    147  *
    148  * Data structure and algorithm for converting from complex legacy codepages
    149  * to Unicode. (Designed before 2000-may-22.)
    150  *
    151  * The basic idea is that the structure of legacy codepages can be described
    152  * with state tables.
    153  * When reading a byte stream, each input byte causes a state transition.
    154  * Some transitions result in the output of a code point, some result in
    155  * "unassigned" or "illegal" output.
    156  * This is used here for character conversion.
    157  *
    158  * The data structure begins with a state table consisting of a row
    159  * per state, with 256 entries (columns) per row for each possible input
    160  * byte value.
    161  * Each entry is 32 bits wide, with two formats distinguished by
    162  * the sign bit (bit 31):
    163  *
    164  * One format for transitional entries (bit 31 not set) for non-final bytes, and
    165  * one format for final entries (bit 31 set).
    166  * Both formats contain the number of the next state in the same bit
    167  * positions.
    168  * State 0 is the initial state.
    169  *
    170  * Most of the time, the offset values of subsequent states are added
    171  * up to a scalar value. This value will eventually be the index of
    172  * the Unicode code point in a table that follows the state table.
    173  * The effect is that the code points for final state table rows
    174  * are contiguous. The code points of final state rows follow each other
    175  * in the order of the references to those final states by previous
    176  * states, etc.
    177  *
    178  * For some terminal states, the offset is itself the output Unicode
    179  * code point (16 bits for a BMP code point or 20 bits for a supplementary
    180  * code point (stored as code point minus 0x10000 so that 20 bits are enough).
    181  * For others, the code point in the Unicode table is stored with either
    182  * one or two code units: one for BMP code points, two for a pair of
    183  * surrogates.
    184  * All code points for a final state entry take up the same number of code
    185  * units, regardless of whether they all actually _use_ the same number
    186  * of code units. This is necessary for simple array access.
    187  *
    188  * An additional feature comes in with what in ICU is called "fallback"
    189  * mappings:
    190  *
    191  * In addition to round-trippable, precise, 1:1 mappings, there are often
    192  * mappings defined between similar, though not the same, characters.
    193  * Typically, such mappings occur only in fromUnicode mapping tables because
    194  * Unicode has a superset repertoire of most other codepages. However, it
    195  * is possible to provide such mappings in the toUnicode tables, too.
    196  * In this case, the fallback mappings are partly integrated into the
    197  * general state tables because the structure of the encoding includes their
    198  * byte sequences.
    199  * For final entries in an initial state, fallback mappings are stored in
    200  * the entry itself like with roundtrip mappings.
    201  * For other final entries, they are stored in the code units table if
    202  * the entry is for a pair of code units.
    203  * For single-unit results in the code units table, there is no space to
    204  * alternatively hold a fallback mapping; in this case, the code unit
    205  * is stored as U+fffe (unassigned), and the fallback mapping needs to
    206  * be looked up by the scalar offset value in a separate table.
    207  *
    208  * "Unassigned" state entries really mean "structurally unassigned",
    209  * i.e., such a byte sequence will never have a mapping result.
    210  *
    211  * The interpretation of the bits in each entry is as follows:
    212  *
    213  * Bit 31 not set, not a terminal entry ("transitional"):
    214  * 30..24 next state
    215  * 23..0  offset delta, to be added up
    216  *
    217  * Bit 31 set, terminal ("final") entry:
    218  * 30..24 next state (regardless of action code)
    219  * 23..20 action code:
    220  *        action codes 0 and 1 result in precise-mapping Unicode code points
    221  *        0  valid byte sequence
    222  *           19..16 not used, 0
    223  *           15..0  16-bit Unicode BMP code point
    224  *                  never U+fffe or U+ffff
    225  *        1  valid byte sequence
    226  *           19..0  20-bit Unicode supplementary code point
    227  *                  never U+fffe or U+ffff
    228  *
    229  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
    230  *        2  valid byte sequence (fallback)
    231  *           19..16 not used, 0
    232  *           15..0  16-bit Unicode BMP code point as fallback result
    233  *        3  valid byte sequence (fallback)
    234  *           19..0  20-bit Unicode supplementary code point as fallback result
    235  *
    236  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
    237  *        depending on the code units they result in
    238  *        4  valid byte sequence
    239  *           19..9  not used, 0
    240  *            8..0  final offset delta
    241  *                  pointing to one 16-bit code unit which may be
    242  *                  fffe  unassigned -- look for a fallback for this offset
    243  *                  ffff  illegal
    244  *        5  valid byte sequence
    245  *           19..9  not used, 0
    246  *            8..0  final offset delta
    247  *                  pointing to two 16-bit code units
    248  *                  (typically UTF-16 surrogates)
    249  *                  the result depends on the first code unit as follows:
    250  *                  0000..d7ff  roundtrip BMP code point (1st alone)
    251  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
    252  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
    253  *                  e000        roundtrip BMP code point (2nd alone)
    254  *                  e001        fallback BMP code point (2nd alone)
    255  *                  fffe        unassigned
    256  *                  ffff        illegal
    257  *           (the final offset deltas are at most 255 * 2,
    258  *            times 2 because of storing code unit pairs)
    259  *
    260  *        6  unassigned byte sequence
    261  *           19..16 not used, 0
    262  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
    263  *                  this does not contain a final offset delta because the main
    264  *                  purpose of this action code is to save scalar offset values;
    265  *                  therefore, fallback values cannot be assigned to byte
    266  *                  sequences that result in this action code
    267  *        7  illegal byte sequence
    268  *           19..16 not used, 0
    269  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
    270  *        8  state change only
    271  *           19..0  not used, 0
    272  *           useful for state changes in simple stateful encodings,
    273  *           at Shift-In/Shift-Out codes
    274  *
    275  *
    276  *        9..15 reserved for future use
    277  *           current implementations will only perform a state change
    278  *           and ignore bits 19..0
    279  *
    280  * An encoding with contiguous ranges of unassigned byte sequences, like
    281  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
    282  * at least two states for the trail bytes:
    283  * One trail byte state that results in code points, and one that only
    284  * has "unassigned" and "illegal" terminal states.
    285  *
    286  * Note: partly by accident, this data structure supports simple stateful
    287  * encodings without any additional logic.
    288  * Currently, only simple Shift-In/Shift-Out schemes are handled with
    289  * appropriate state tables (especially EBCDIC_STATEFUL!).
    290  *
    291  * MBCS version 2 added:
    292  * unassigned and illegal action codes have U+fffe and U+ffff
    293  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
    294  *
    295  * Converting from Unicode to codepage bytes --------------------------------***
    296  *
    297  * The conversion data structure for fromUnicode is designed for the known
    298  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
    299  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
    300  * a roundtrip mapping.
    301  *
    302  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
    303  * like in the character properties table.
    304  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
    305  * with the resulting bytes is at offsetFromUBytes.
    306  *
    307  * Beginning with version 4, single-byte codepages have a significantly different
    308  * trie compared to other codepages.
    309  * In all cases, the entry in stage 1 is directly the index of the block of
    310  * 64 entries in stage 2.
    311  *
    312  * Single-byte lookup:
    313  *
    314  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
    315  * Stage 3 contains one 16-bit word per result:
    316  * Bits 15..8 indicate the kind of result:
    317  *    f  roundtrip result
    318  *    c  fallback result from private-use code point
    319  *    8  fallback result from other code points
    320  *    0  unassigned
    321  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
    322  *
    323  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
    324  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
    325  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    326  * ASCII code points can be looked up with a linear array access into stage 3.
    327  * See maxFastUChar and other details in ucnvmbcs.h.
    328  *
    329  * Multi-byte lookup:
    330  *
    331  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
    332  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
    333  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
    334  *             If this test is false, then a non-zero result will be interpreted as
    335  *             a fallback mapping.
    336  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
    337  *
    338  * Stage 3 contains 2, 3, or 4 bytes per result.
    339  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
    340  * while 3 bytes are stored as bytes in big-endian order.
    341  * Leading zero bytes are ignored, and the number of bytes is counted.
    342  * A zero byte mapping result is possible as a roundtrip result.
    343  * For some output types, the actual result is processed from this;
    344  * see ucnv_MBCSFromUnicodeWithOffsets().
    345  *
    346  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
    347  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
    348  *
    349  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
    350  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
    351  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    352  * ASCII code points can be looked up with a linear array access into stage 3.
    353  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
    354  *
    355  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
    356  * for compaction.
    357  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
    358  * may overlap by any number of entries.
    359  *
    360  * MBCS version 2 added:
    361  * the converter checks for known output types, which allows
    362  * adding new ones without crashing an unaware converter
    363  */
    364 
/**
 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
 * consecutive sequences of bytes, starting from the one encoded in value,
 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
 * Does not currently support m:n mappings or reverse fallbacks.
 * This function will not be called for sequences of bytes with leading zeros.
 *
 * The caller in this file (enumToU) fills codePoints[i] for 32 consecutive
 * values of the last byte and passes the byte sequence of entry 0 in value,
 * so codePoints[i] belongs to the sequence whose last byte is i higher than
 * the last byte of value.
 *
 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
 *        not map to anything
 * @return TRUE to continue enumeration, FALSE to stop
 */
typedef UBool U_CALLCONV
UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
    380 
    381 static void U_CALLCONV
    382 ucnv_MBCSLoad(UConverterSharedData *sharedData,
    383           UConverterLoadArgs *pArgs,
    384           const uint8_t *raw,
    385           UErrorCode *pErrorCode);
    386 
    387 static void U_CALLCONV
    388 ucnv_MBCSUnload(UConverterSharedData *sharedData);
    389 
    390 static void U_CALLCONV
    391 ucnv_MBCSOpen(UConverter *cnv,
    392               UConverterLoadArgs *pArgs,
    393               UErrorCode *pErrorCode);
    394 
    395 static UChar32 U_CALLCONV
    396 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
    397                   UErrorCode *pErrorCode);
    398 
    399 static void U_CALLCONV
    400 ucnv_MBCSGetStarters(const UConverter* cnv,
    401                  UBool starters[256],
    402                  UErrorCode *pErrorCode);
    403 
    404 U_CDECL_BEGIN
    405 static const char* U_CALLCONV
    406 ucnv_MBCSGetName(const UConverter *cnv);
    407 U_CDECL_END
    408 
    409 static void U_CALLCONV
    410 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
    411               int32_t offsetIndex,
    412               UErrorCode *pErrorCode);
    413 
    414 static UChar32 U_CALLCONV
    415 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
    416                   UErrorCode *pErrorCode);
    417 
    418 static void U_CALLCONV
    419 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    420                   UConverterToUnicodeArgs *pToUArgs,
    421                   UErrorCode *pErrorCode);
    422 
    423 static void U_CALLCONV
    424 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
    425                    const USetAdder *sa,
    426                    UConverterUnicodeSet which,
    427                    UErrorCode *pErrorCode);
    428 
    429 static void U_CALLCONV
    430 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    431                   UConverterToUnicodeArgs *pToUArgs,
    432                   UErrorCode *pErrorCode);
    433 
/*
 * Implementation table that is identical to _MBCSImpl except for its last
 * entry, which is ucnv_SBCSFromUTF8 instead of NULL.
 * NOTE(review): presumably chosen for single-byte tables so that UTF-8 input
 * can be converted directly -- confirm where this table is selected.
 *
 * The initializer is positional: the meaning of each slot is defined by
 * UConverterImpl, which is declared outside this file. Check that declaration
 * before inserting or reordering entries.
 */
static const UConverterImpl _SBCSUTF8Impl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    /* the same function serves both the plain and the "with offsets" slots */
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,

    NULL,
    /* the only entry that differs from _MBCSImpl */
    ucnv_SBCSFromUTF8
};
    459 
/*
 * Implementation table that is identical to _MBCSImpl except for its last
 * entry, which is ucnv_DBCSFromUTF8 instead of NULL.
 * NOTE(review): presumably chosen for double-byte tables so that UTF-8 input
 * can be converted directly -- confirm where this table is selected.
 *
 * The initializer is positional: the meaning of each slot is defined by
 * UConverterImpl, which is declared outside this file. Check that declaration
 * before inserting or reordering entries.
 */
static const UConverterImpl _DBCSUTF8Impl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    /* the same function serves both the plain and the "with offsets" slots */
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,

    NULL,
    /* the only entry that differs from _MBCSImpl */
    ucnv_DBCSFromUTF8
};
    485 
/*
 * General MBCS implementation table; this is the one referenced by _MBCSData.
 * _SBCSUTF8Impl and _DBCSUTF8Impl above repeat these entries and only replace
 * the final NULL with a FromUTF8 function.
 *
 * The initializer is positional: the meaning of each slot is defined by
 * UConverterImpl, which is declared outside this file. Check that declaration
 * before inserting or reordering entries.
 */
static const UConverterImpl _MBCSImpl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    /* the same function serves both the plain and the "with offsets" slots */
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet,
    NULL,
    NULL
};
    510 
    511 /* Static data is in tools/makeconv/ucnvstat.c for data-based
    512  * converters. Be sure to update it as well.
    513  */
    514 
/*
 * Shared-data object for the MBCS converter type (not static, so visible to
 * other translation units). It points at _MBCSImpl as its implementation table
 * and uses UCNV_MBCS_TABLE_INITIALIZER for the embedded table.
 * NOTE(review): the initializer is positional; the meaning of the remaining
 * values (1, NULL, NULL, FALSE, TRUE, 0) is defined by UConverterSharedData,
 * which is declared outside this file -- confirm there before changing them.
 */
const UConverterSharedData _MBCSData={
    sizeof(UConverterSharedData), 1,
    NULL, NULL, FALSE, TRUE, &_MBCSImpl,
    0, UCNV_MBCS_TABLE_INITIALIZER
};
    520 
    521 
    522 /* GB 18030 data ------------------------------------------------------------ */
    523 
    524 /* helper macros for linear values for GB 18030 four-byte sequences */
    525 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
    526 
    527 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
    528 
    529 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
    530 
    531 /*
    532  * Some ranges of GB 18030 where both the Unicode code points and the
    533  * GB four-byte sequences are contiguous and are handled algorithmically by
    534  * the special callback functions below.
    535  * The values are start & end of Unicode & GB codes.
    536  *
    537  * Note that single surrogates are not mapped by GB 18030
    538  * as of the re-released mapping tables from 2000-nov-30.
    539  */
/*
 * Column layout of each row:
 *   [0] start of the Unicode range
 *   [1] end of the Unicode range
 *   [2] start of the GB 18030 range, as a LINEAR() value
 *   [3] end of the GB 18030 range, as a LINEAR() value
 * The GB columns are linear values, not raw four-byte sequences, so they can
 * be compared and offset arithmetically.
 */
static const uint32_t
gb18030Ranges[14][4]={
    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
    {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
};
    557 
/* bit flag for UConverter.options indicating GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000

/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
#define _MBCS_OPTION_KEIS 0x01000
#define _MBCS_OPTION_JEF  0x02000
#define _MBCS_OPTION_JIPS 0x04000

/*
 * SO/SI bytes that getSISOBytes() writes for each of the options above
 * (presumably Shift-Out / Shift-In, matching UCNV_SO / UCNV_SI).
 */

/* KEIS: two-byte SO and SI sequences */
#define KEIS_SO_CHAR_1 0x0A
#define KEIS_SO_CHAR_2 0x42
#define KEIS_SI_CHAR_1 0x0A
#define KEIS_SI_CHAR_2 0x41

/* JEF: single-byte SO and SI */
#define JEF_SO_CHAR 0x28
#define JEF_SI_CHAR 0x29

/* JIPS: two-byte SO and SI sequences */
#define JIPS_SO_CHAR_1 0x1A
#define JIPS_SO_CHAR_2 0x70
#define JIPS_SI_CHAR_1 0x1A
#define JIPS_SI_CHAR_2 0x71
    578 
    579 enum SISO_Option {
    580     SI,
    581     SO
    582 };
    583 typedef enum SISO_Option SISO_Option;
    584 
    585 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
    586     int32_t SISOLength = 0;
    587 
    588     switch (option) {
    589         case SI:
    590             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    591                 value[0] = KEIS_SI_CHAR_1;
    592                 value[1] = KEIS_SI_CHAR_2;
    593                 SISOLength = 2;
    594             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    595                 value[0] = JEF_SI_CHAR;
    596                 SISOLength = 1;
    597             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    598                 value[0] = JIPS_SI_CHAR_1;
    599                 value[1] = JIPS_SI_CHAR_2;
    600                 SISOLength = 2;
    601             } else {
    602                 value[0] = UCNV_SI;
    603                 SISOLength = 1;
    604             }
    605             break;
    606         case SO:
    607             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    608                 value[0] = KEIS_SO_CHAR_1;
    609                 value[1] = KEIS_SO_CHAR_2;
    610                 SISOLength = 2;
    611             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    612                 value[0] = JEF_SO_CHAR;
    613                 SISOLength = 1;
    614             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    615                 value[0] = JIPS_SO_CHAR_1;
    616                 value[1] = JIPS_SO_CHAR_2;
    617                 SISOLength = 2;
    618             } else {
    619                 value[0] = UCNV_SO;
    620                 SISOLength = 1;
    621             }
    622             break;
    623         default:
    624             /* Should never happen. */
    625             break;
    626     }
    627 
    628     return SISOLength;
    629 }
    630 
    631 /* Miscellaneous ------------------------------------------------------------ */
    632 
    633 /* similar to ucnv_MBCSGetNextUChar() but recursive */
    634 static UBool
    635 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
    636         int32_t state, uint32_t offset,
    637         uint32_t value,
    638         UConverterEnumToUCallback *callback, const void *context,
    639         UErrorCode *pErrorCode) {
    640     UChar32 codePoints[32];
    641     const int32_t *row;
    642     const uint16_t *unicodeCodeUnits;
    643     UChar32 anyCodePoints;
    644     int32_t b, limit;
    645 
    646     row=mbcsTable->stateTable[state];
    647     unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
    648 
    649     value<<=8;
    650     anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
    651 
    652     b=(stateProps[state]&0x38)<<2;
    653     if(b==0 && stateProps[state]>=0x40) {
    654         /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
    655         codePoints[0]=U_SENTINEL;
    656         b=1;
    657     }
    658     limit=((stateProps[state]&7)+1)<<5;
    659     while(b<limit) {
    660         int32_t entry=row[b];
    661         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    662             int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
    663             if(stateProps[nextState]>=0) {
    664                 /* recurse to a state with non-ignorable actions */
    665                 if(!enumToU(
    666                         mbcsTable, stateProps, nextState,
    667                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
    668                         value|(uint32_t)b,
    669                         callback, context,
    670                         pErrorCode)) {
    671                     return FALSE;
    672                 }
    673             }
    674             codePoints[b&0x1f]=U_SENTINEL;
    675         } else {
    676             UChar32 c;
    677             int32_t action;
    678 
    679             /*
    680              * An if-else-if chain provides more reliable performance for
    681              * the most common cases compared to a switch.
    682              */
    683             action=MBCS_ENTRY_FINAL_ACTION(entry);
    684             if(action==MBCS_STATE_VALID_DIRECT_16) {
    685                 /* output BMP code point */
    686                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    687             } else if(action==MBCS_STATE_VALID_16) {
    688                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    689                 c=unicodeCodeUnits[finalOffset];
    690                 if(c<0xfffe) {
    691                     /* output BMP code point */
    692                 } else {
    693                     c=U_SENTINEL;
    694                 }
    695             } else if(action==MBCS_STATE_VALID_16_PAIR) {
    696                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    697                 c=unicodeCodeUnits[finalOffset++];
    698                 if(c<0xd800) {
    699                     /* output BMP code point below 0xd800 */
    700                 } else if(c<=0xdbff) {
    701                     /* output roundtrip or fallback supplementary code point */
    702                     c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
    703                 } else if(c==0xe000) {
    704                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
    705                     c=unicodeCodeUnits[finalOffset];
    706                 } else {
    707                     c=U_SENTINEL;
    708                 }
    709             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
    710                 /* output supplementary code point */
    711                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
    712             } else {
    713                 c=U_SENTINEL;
    714             }
    715 
    716             codePoints[b&0x1f]=c;
    717             anyCodePoints&=c;
    718         }
    719         if(((++b)&0x1f)==0) {
    720             if(anyCodePoints>=0) {
    721                 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
    722                     return FALSE;
    723                 }
    724                 anyCodePoints=-1;
    725             }
    726         }
    727     }
    728     return TRUE;
    729 }
    730 
    731 /*
    732  * Only called if stateProps[state]==-1.
    733  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
    734  * MBCS_STATE_CHANGE_ONLY.
    735  */
    736 static int8_t
    737 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
    738     const int32_t *row;
    739     int32_t min, max, entry, nextState;
    740 
    741     row=stateTable[state];
    742     stateProps[state]=0;
    743 
    744     /* find first non-ignorable state */
    745     for(min=0;; ++min) {
    746         entry=row[min];
    747         nextState=MBCS_ENTRY_STATE(entry);
    748         if(stateProps[nextState]==-1) {
    749             getStateProp(stateTable, stateProps, nextState);
    750         }
    751         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    752             if(stateProps[nextState]>=0) {
    753                 break;
    754             }
    755         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    756             break;
    757         }
    758         if(min==0xff) {
    759             stateProps[state]=-0x40;  /* (int8_t)0xc0 */
    760             return stateProps[state];
    761         }
    762     }
    763     stateProps[state]|=(int8_t)((min>>5)<<3);
    764 
    765     /* find last non-ignorable state */
    766     for(max=0xff; min<max; --max) {
    767         entry=row[max];
    768         nextState=MBCS_ENTRY_STATE(entry);
    769         if(stateProps[nextState]==-1) {
    770             getStateProp(stateTable, stateProps, nextState);
    771         }
    772         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    773             if(stateProps[nextState]>=0) {
    774                 break;
    775             }
    776         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    777             break;
    778         }
    779     }
    780     stateProps[state]|=(int8_t)(max>>5);
    781 
    782     /* recurse further and collect direct-state information */
    783     while(min<=max) {
    784         entry=row[min];
    785         nextState=MBCS_ENTRY_STATE(entry);
    786         if(stateProps[nextState]==-1) {
    787             getStateProp(stateTable, stateProps, nextState);
    788         }
    789         if(MBCS_ENTRY_IS_FINAL(entry)) {
    790             stateProps[nextState]|=0x40;
    791             if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
    792                 stateProps[state]|=0x40;
    793             }
    794         }
    795         ++min;
    796     }
    797     return stateProps[state];
    798 }
    799 
    800 /*
    801  * Internal function enumerating the toUnicode data of an MBCS converter.
    802  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
    803  * table, but could also be used for a future ucnv_getUnicodeSet() option
    804  * that includes reverse fallbacks (after updating this function's implementation).
    805  * Currently only handles roundtrip mappings.
    806  * Does not currently handle extensions.
    807  */
    808 static void
    809 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
    810                        UConverterEnumToUCallback *callback, const void *context,
    811                        UErrorCode *pErrorCode) {
    812     /*
    813      * Properties for each state, to speed up the enumeration.
    814      * Ignorable actions are unassigned/illegal/state-change-only:
    815      * They do not lead to mappings.
    816      *
    817      * Bits 7..6:
    818      * 1 direct/initial state (stateful converters have multiple)
    819      * 0 non-initial state with transitions or with non-ignorable result actions
    820      * -1 final state with only ignorable actions
    821      *
    822      * Bits 5..3:
    823      * The lowest byte value with non-ignorable actions is
    824      * value<<5 (rounded down).
    825      *
    826      * Bits 2..0:
    827      * The highest byte value with non-ignorable actions is
    828      * (value<<5)&0x1f (rounded up).
    829      */
    830     int8_t stateProps[MBCS_MAX_STATE_COUNT];
    831     int32_t state;
    832 
    833     uprv_memset(stateProps, -1, sizeof(stateProps));
    834 
    835     /* recurse from state 0 and set all stateProps */
    836     getStateProp(mbcsTable->stateTable, stateProps, 0);
    837 
    838     for(state=0; state<mbcsTable->countStates; ++state) {
    839         /*if(stateProps[state]==-1) {
    840             printf("unused/unreachable <icu:state> %d\n", state);
    841         }*/
    842         if(stateProps[state]>=0x40) {
    843             /* start from each direct state */
    844             enumToU(
    845                 mbcsTable, stateProps, state, 0, 0,
    846                 callback, context,
    847                 pErrorCode);
    848         }
    849     }
    850 }
    851 
    852 U_CFUNC void
    853 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
    854                                          const USetAdder *sa,
    855                                          UConverterUnicodeSet which,
    856                                          UConverterSetFilter filter,
    857                                          UErrorCode *pErrorCode) {
    858     const UConverterMBCSTable *mbcsTable;
    859     const uint16_t *table;
    860 
    861     uint32_t st3;
    862     uint16_t st1, maxStage1, st2;
    863 
    864     UChar32 c;
    865 
    866     /* enumerate the from-Unicode trie table */
    867     mbcsTable=&sharedData->mbcs;
    868     table=mbcsTable->fromUnicodeTable;
    869     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
    870         maxStage1=0x440;
    871     } else {
    872         maxStage1=0x40;
    873     }
    874 
    875     c=0; /* keep track of the current code point while enumerating */
    876 
    877     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
    878         const uint16_t *stage2, *stage3, *results;
    879         uint16_t minValue;
    880 
    881         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
    882 
    883         /*
    884          * Set a threshold variable for selecting which mappings to use.
    885          * See ucnv_MBCSSingleFromBMPWithOffsets() and
    886          * MBCS_SINGLE_RESULT_FROM_U() for details.
    887          */
    888         if(which==UCNV_ROUNDTRIP_SET) {
    889             /* use only roundtrips */
    890             minValue=0xf00;
    891         } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
    892             /* use all roundtrip and fallback results */
    893             minValue=0x800;
    894         }
    895 
    896         for(st1=0; st1<maxStage1; ++st1) {
    897             st2=table[st1];
    898             if(st2>maxStage1) {
    899                 stage2=table+st2;
    900                 for(st2=0; st2<64; ++st2) {
    901                     if((st3=stage2[st2])!=0) {
    902                         /* read the stage 3 block */
    903                         stage3=results+st3;
    904 
    905                         do {
    906                             if(*stage3++>=minValue) {
    907                                 sa->add(sa->set, c);
    908                             }
    909                         } while((++c&0xf)!=0);
    910                     } else {
    911                         c+=16; /* empty stage 3 block */
    912                     }
    913                 }
    914             } else {
    915                 c+=1024; /* empty stage 2 block */
    916             }
    917         }
    918     } else {
    919         const uint32_t *stage2;
    920         const uint8_t *stage3, *bytes;
    921         uint32_t st3Multiplier;
    922         uint32_t value;
    923         UBool useFallback;
    924 
    925         bytes=mbcsTable->fromUnicodeBytes;
    926 
    927         useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
    928 
    929         switch(mbcsTable->outputType) {
    930         case MBCS_OUTPUT_3:
    931         case MBCS_OUTPUT_4_EUC:
    932             st3Multiplier=3;
    933             break;
    934         case MBCS_OUTPUT_4:
    935             st3Multiplier=4;
    936             break;
    937         default:
    938             st3Multiplier=2;
    939             break;
    940         }
    941 
    942         for(st1=0; st1<maxStage1; ++st1) {
    943             st2=table[st1];
    944             if(st2>(maxStage1>>1)) {
    945                 stage2=(const uint32_t *)table+st2;
    946                 for(st2=0; st2<64; ++st2) {
    947                     if((st3=stage2[st2])!=0) {
    948                         /* read the stage 3 block */
    949                         stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
    950 
    951                         /* get the roundtrip flags for the stage 3 block */
    952                         st3>>=16;
    953 
    954                         /*
    955                          * Add code points for which the roundtrip flag is set,
    956                          * or which map to non-zero bytes if we use fallbacks.
    957                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
    958                          */
    959                         switch(filter) {
    960                         case UCNV_SET_FILTER_NONE:
    961                             do {
    962                                 if(st3&1) {
    963                                     sa->add(sa->set, c);
    964                                     stage3+=st3Multiplier;
    965                                 } else if(useFallback) {
    966                                     uint8_t b=0;
    967                                     switch(st3Multiplier) {
    968                                     case 4:
    969                                         b|=*stage3++;
    970                                         U_FALLTHROUGH;
    971                                     case 3:
    972                                         b|=*stage3++;
    973                                         U_FALLTHROUGH;
    974                                     case 2:
    975                                         b|=stage3[0]|stage3[1];
    976                                         stage3+=2;
    977                                         U_FALLTHROUGH;
    978                                     default:
    979                                         break;
    980                                     }
    981                                     if(b!=0) {
    982                                         sa->add(sa->set, c);
    983                                     }
    984                                 }
    985                                 st3>>=1;
    986                             } while((++c&0xf)!=0);
    987                             break;
    988                         case UCNV_SET_FILTER_DBCS_ONLY:
    989                              /* Ignore single-byte results (<0x100). */
    990                             do {
    991                                 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
    992                                     sa->add(sa->set, c);
    993                                 }
    994                                 st3>>=1;
    995                                 stage3+=2;  /* +=st3Multiplier */
    996                             } while((++c&0xf)!=0);
    997                             break;
    998                         case UCNV_SET_FILTER_2022_CN:
    999                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
   1000                             do {
   1001                                 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
   1002                                     sa->add(sa->set, c);
   1003                                 }
   1004                                 st3>>=1;
   1005                                 stage3+=3;  /* +=st3Multiplier */
   1006                             } while((++c&0xf)!=0);
   1007                             break;
   1008                         case UCNV_SET_FILTER_SJIS:
   1009                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
   1010                             do {
   1011                                 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
   1012                                     sa->add(sa->set, c);
   1013                                 }
   1014                                 st3>>=1;
   1015                                 stage3+=2;  /* +=st3Multiplier */
   1016                             } while((++c&0xf)!=0);
   1017                             break;
   1018                         case UCNV_SET_FILTER_GR94DBCS:
   1019                             /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
   1020                             do {
   1021                                 if( ((st3&1)!=0 || useFallback) &&
   1022                                     (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
   1023                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
   1024                                 ) {
   1025                                     sa->add(sa->set, c);
   1026                                 }
   1027                                 st3>>=1;
   1028                                 stage3+=2;  /* +=st3Multiplier */
   1029                             } while((++c&0xf)!=0);
   1030                             break;
   1031                         case UCNV_SET_FILTER_HZ:
   1032                             /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
   1033                             do {
   1034                                 if( ((st3&1)!=0 || useFallback) &&
   1035                                     (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
   1036                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
   1037                                 ) {
   1038                                     sa->add(sa->set, c);
   1039                                 }
   1040                                 st3>>=1;
   1041                                 stage3+=2;  /* +=st3Multiplier */
   1042                             } while((++c&0xf)!=0);
   1043                             break;
   1044                         default:
   1045                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
   1046                             return;
   1047                         }
   1048                     } else {
   1049                         c+=16; /* empty stage 3 block */
   1050                     }
   1051                 }
   1052             } else {
   1053                 c+=1024; /* empty stage 2 block */
   1054             }
   1055         }
   1056     }
   1057 
   1058     ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
   1059 }
   1060 
   1061 U_CFUNC void
   1062 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
   1063                                  const USetAdder *sa,
   1064                                  UConverterUnicodeSet which,
   1065                                  UErrorCode *pErrorCode) {
   1066     ucnv_MBCSGetFilteredUnicodeSetForUnicode(
   1067         sharedData, sa, which,
   1068         sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
   1069             UCNV_SET_FILTER_DBCS_ONLY :
   1070             UCNV_SET_FILTER_NONE,
   1071         pErrorCode);
   1072 }
   1073 
   1074 static void U_CALLCONV
   1075 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
   1076                    const USetAdder *sa,
   1077                    UConverterUnicodeSet which,
   1078                    UErrorCode *pErrorCode) {
   1079     if(cnv->options&_MBCS_OPTION_GB18030) {
   1080         sa->addRange(sa->set, 0, 0xd7ff);
   1081         sa->addRange(sa->set, 0xe000, 0x10ffff);
   1082     } else {
   1083         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
   1084     }
   1085 }
   1086 
   1087 /* conversion extensions for input not in the main table -------------------- */
   1088 
   1089 /*
   1090  * Hardcoded extension handling for GB 18030.
    1091  * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
   1092  *
   1093  * In the future, conversion extensions may handle m:n mappings and delta tables,
   1094  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
   1095  *
   1096  * If an input character cannot be mapped, then these functions set an error
   1097  * code. The framework will then call the callback function.
   1098  */
   1099 
   1100 /*
   1101  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
   1102  *         else return 0 after output has been written to the target
   1103  */
   1104 static UChar32
   1105 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
   1106           UChar32 cp,
   1107           const UChar **source, const UChar *sourceLimit,
   1108           uint8_t **target, const uint8_t *targetLimit,
   1109           int32_t **offsets, int32_t sourceIndex,
   1110           UBool flush,
   1111           UErrorCode *pErrorCode) {
   1112     const int32_t *cx;
   1113 
   1114     cnv->useSubChar1=FALSE;
   1115 
   1116     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
   1117         ucnv_extInitialMatchFromU(
   1118             cnv, cx,
   1119             cp, source, sourceLimit,
   1120             (char **)target, (char *)targetLimit,
   1121             offsets, sourceIndex,
   1122             flush,
   1123             pErrorCode)
   1124     ) {
   1125         return 0; /* an extension mapping handled the input */
   1126     }
   1127 
   1128     /* GB 18030 */
   1129     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
   1130         const uint32_t *range;
   1131         int32_t i;
   1132 
   1133         range=gb18030Ranges[0];
   1134         for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
   1135             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
   1136                 /* found the Unicode code point, output the four-byte sequence for it */
   1137                 uint32_t linear;
   1138                 char bytes[4];
   1139 
   1140                 /* get the linear value of the first GB 18030 code in this range */
   1141                 linear=range[2]-LINEAR_18030_BASE;
   1142 
   1143                 /* add the offset from the beginning of the range */
   1144                 linear+=((uint32_t)cp-range[0]);
   1145 
   1146                 /* turn this into a four-byte sequence */
   1147                 bytes[3]=(char)(0x30+linear%10); linear/=10;
   1148                 bytes[2]=(char)(0x81+linear%126); linear/=126;
   1149                 bytes[1]=(char)(0x30+linear%10); linear/=10;
   1150                 bytes[0]=(char)(0x81+linear);
   1151 
   1152                 /* output this sequence */
   1153                 ucnv_fromUWriteBytes(cnv,
   1154                                      bytes, 4, (char **)target, (char *)targetLimit,
   1155                                      offsets, sourceIndex, pErrorCode);
   1156                 return 0;
   1157             }
   1158         }
   1159     }
   1160 
   1161     /* no mapping */
   1162     *pErrorCode=U_INVALID_CHAR_FOUND;
   1163     return cp;
   1164 }
   1165 
   1166 /*
   1167  * Input sequence: cnv->toUBytes[0..length[
   1168  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
   1169  *         else return 0 after output has been written to the target
   1170  */
   1171 static int8_t
   1172 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
   1173         int8_t length,
   1174         const uint8_t **source, const uint8_t *sourceLimit,
   1175         UChar **target, const UChar *targetLimit,
   1176         int32_t **offsets, int32_t sourceIndex,
   1177         UBool flush,
   1178         UErrorCode *pErrorCode) {
   1179     const int32_t *cx;
   1180 
   1181     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
   1182         ucnv_extInitialMatchToU(
   1183             cnv, cx,
   1184             length, (const char **)source, (const char *)sourceLimit,
   1185             target, targetLimit,
   1186             offsets, sourceIndex,
   1187             flush,
   1188             pErrorCode)
   1189     ) {
   1190         return 0; /* an extension mapping handled the input */
   1191     }
   1192 
   1193     /* GB 18030 */
   1194     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
   1195         const uint32_t *range;
   1196         uint32_t linear;
   1197         int32_t i;
   1198 
   1199         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
   1200         range=gb18030Ranges[0];
   1201         for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
   1202             if(range[2]<=linear && linear<=range[3]) {
   1203                 /* found the sequence, output the Unicode code point for it */
   1204                 *pErrorCode=U_ZERO_ERROR;
   1205 
   1206                 /* add the linear difference between the input and start sequences to the start code point */
   1207                 linear=range[0]+(linear-range[2]);
   1208 
   1209                 /* output this code point */
   1210                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
   1211 
   1212                 return 0;
   1213             }
   1214         }
   1215     }
   1216 
   1217     /* no mapping */
   1218     *pErrorCode=U_INVALID_CHAR_FOUND;
   1219     return length;
   1220 }
   1221 
   1222 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
   1223 
   1224 /*
   1225  * This code modifies a standard EBCDIC<->Unicode mapping table for
   1226  * OS/390 (z/OS) Unix System Services (Open Edition).
   1227  * The difference is in the mapping of Line Feed and New Line control codes:
   1228  * Standard EBCDIC maps
   1229  *
   1230  *   <U000A> \x25 |0
   1231  *   <U0085> \x15 |0
   1232  *
   1233  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
   1234  * mapping
   1235  *
   1236  *   <U000A> \x15 |0
   1237  *   <U0085> \x25 |0
   1238  *
   1239  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
   1240  * by copying it into allocated memory and swapping the LF and NL values.
    1241  * This makes it possible to support the same EBCDIC charset in both versions
    1242  * without duplicating the entire installed table.
   1243  */
   1244 
   1245 /* standard EBCDIC codes */
   1246 #define EBCDIC_LF 0x25
   1247 #define EBCDIC_NL 0x15
   1248 
   1249 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
   1250 #define EBCDIC_RT_LF 0xf25
   1251 #define EBCDIC_RT_NL 0xf15
   1252 
   1253 /* Unicode code points */
   1254 #define U_LF 0x0a
   1255 #define U_NL 0x85
   1256 
   1257 static UBool
   1258 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
   1259     UConverterMBCSTable *mbcsTable;
   1260 
   1261     const uint16_t *table, *results;
   1262     const uint8_t *bytes;
   1263 
   1264     int32_t (*newStateTable)[256];
   1265     uint16_t *newResults;
   1266     uint8_t *p;
   1267     char *name;
   1268 
   1269     uint32_t stage2Entry;
   1270     uint32_t size, sizeofFromUBytes;
   1271 
   1272     mbcsTable=&sharedData->mbcs;
   1273 
   1274     table=mbcsTable->fromUnicodeTable;
   1275     bytes=mbcsTable->fromUnicodeBytes;
   1276     results=(const uint16_t *)bytes;
   1277 
   1278     /*
   1279      * Check that this is an EBCDIC table with SBCS portion -
   1280      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
   1281      *
   1282      * If not, ignore the option. Options are always ignored if they do not apply.
   1283      */
   1284     if(!(
   1285          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
   1286          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
   1287          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
   1288     )) {
   1289         return FALSE;
   1290     }
   1291 
   1292     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1293         if(!(
   1294              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
   1295              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
   1296         )) {
   1297             return FALSE;
   1298         }
   1299     } else /* MBCS_OUTPUT_2_SISO */ {
   1300         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1301         if(!(
   1302              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
   1303              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
   1304         )) {
   1305             return FALSE;
   1306         }
   1307 
   1308         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1309         if(!(
   1310              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
   1311              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
   1312         )) {
   1313             return FALSE;
   1314         }
   1315     }
   1316 
   1317     if(mbcsTable->fromUBytesLength>0) {
   1318         /*
   1319          * We _know_ the number of bytes in the fromUnicodeBytes array
   1320          * starting with header.version 4.1.
   1321          */
   1322         sizeofFromUBytes=mbcsTable->fromUBytesLength;
   1323     } else {
   1324         /*
   1325          * Otherwise:
   1326          * There used to be code to enumerate the fromUnicode
   1327          * trie and find the highest entry, but it was removed in ICU 3.2
   1328          * because it was not tested and caused a low code coverage number.
   1329          * See Jitterbug 3674.
   1330          * This affects only some .cnv file formats with a header.version
   1331          * below 4.1, and only when swaplfnl is requested.
   1332          *
   1333          * ucnvmbcs.c revision 1.99 is the last one with the
   1334          * ucnv_MBCSSizeofFromUBytes() function.
   1335          */
   1336         *pErrorCode=U_INVALID_FORMAT_ERROR;
   1337         return FALSE;
   1338     }
   1339 
   1340     /*
   1341      * The table has an appropriate format.
   1342      * Allocate and build
   1343      * - a modified to-Unicode state table
   1344      * - a modified from-Unicode output array
   1345      * - a converter name string with the swap option appended
   1346      */
   1347     size=
   1348         mbcsTable->countStates*1024+
   1349         sizeofFromUBytes+
   1350         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
   1351     p=(uint8_t *)uprv_malloc(size);
   1352     if(p==NULL) {
   1353         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1354         return FALSE;
   1355     }
   1356 
   1357     /* copy and modify the to-Unicode state table */
   1358     newStateTable=(int32_t (*)[256])p;
   1359     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
   1360 
   1361     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
   1362     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
   1363 
   1364     /* copy and modify the from-Unicode result table */
   1365     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
   1366     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
   1367 
   1368     /* conveniently, the table access macros work on the left side of expressions */
   1369     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1370         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
   1371         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
   1372     } else /* MBCS_OUTPUT_2_SISO */ {
   1373         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1374         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
   1375 
   1376         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1377         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
   1378     }
   1379 
   1380     /* set the canonical converter name */
   1381     name=(char *)newResults+sizeofFromUBytes;
   1382     uprv_strcpy(name, sharedData->staticData->name);
   1383     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
   1384 
   1385     /* set the pointers */
   1386     umtx_lock(NULL);
   1387     if(mbcsTable->swapLFNLStateTable==NULL) {
   1388         mbcsTable->swapLFNLStateTable=newStateTable;
   1389         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
   1390         mbcsTable->swapLFNLName=name;
   1391 
   1392         newStateTable=NULL;
   1393     }
   1394     umtx_unlock(NULL);
   1395 
   1396     /* release the allocated memory if another thread beat us to it */
   1397     if(newStateTable!=NULL) {
   1398         uprv_free(newStateTable);
   1399     }
   1400     return TRUE;
   1401 }
   1402 
   1403 /* reconstitute omitted fromUnicode data ------------------------------------ */
   1404 
   1405 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
   1406 static UBool U_CALLCONV
   1407 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
   1408     UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
   1409     const uint16_t *table;
   1410     uint32_t *stage2;
   1411     uint8_t *bytes, *p;
   1412     UChar32 c;
   1413     int32_t i, st3;
   1414 
   1415     table=mbcsTable->fromUnicodeTable;
   1416     bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
   1417 
   1418     /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
   1419     switch(mbcsTable->outputType) {
   1420     case MBCS_OUTPUT_3_EUC:
   1421         if(value<=0xffff) {
   1422             /* short sequences are stored directly */
   1423             /* code set 0 or 1 */
   1424         } else if(value<=0x8effff) {
   1425             /* code set 2 */
   1426             value&=0x7fff;
   1427         } else /* first byte is 0x8f */ {
   1428             /* code set 3 */
   1429             value&=0xff7f;
   1430         }
   1431         break;
   1432     case MBCS_OUTPUT_4_EUC:
   1433         if(value<=0xffffff) {
   1434             /* short sequences are stored directly */
   1435             /* code set 0 or 1 */
   1436         } else if(value<=0x8effffff) {
   1437             /* code set 2 */
   1438             value&=0x7fffff;
   1439         } else /* first byte is 0x8f */ {
   1440             /* code set 3 */
   1441             value&=0xff7fff;
   1442         }
   1443         break;
   1444     default:
   1445         break;
   1446     }
   1447 
   1448     for(i=0; i<=0x1f; ++value, ++i) {
   1449         c=codePoints[i];
   1450         if(c<0) {
   1451             continue;
   1452         }
   1453 
   1454         /* locate the stage 2 & 3 data */
   1455         stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
   1456         p=bytes;
   1457         st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
   1458 
   1459         /* write the codepage bytes into stage 3 */
   1460         switch(mbcsTable->outputType) {
   1461         case MBCS_OUTPUT_3:
   1462         case MBCS_OUTPUT_4_EUC:
   1463             p+=st3*3;
   1464             p[0]=(uint8_t)(value>>16);
   1465             p[1]=(uint8_t)(value>>8);
   1466             p[2]=(uint8_t)value;
   1467             break;
   1468         case MBCS_OUTPUT_4:
   1469             ((uint32_t *)p)[st3]=value;
   1470             break;
   1471         default:
   1472             /* 2 bytes per character */
   1473             ((uint16_t *)p)[st3]=(uint16_t)value;
   1474             break;
   1475         }
   1476 
   1477         /* set the roundtrip flag */
   1478         *stage2|=(1UL<<(16+(c&0xf)));
   1479     }
   1480     return TRUE;
   1481  }
   1482 
   1483 static void
   1484 reconstituteData(UConverterMBCSTable *mbcsTable,
   1485                  uint32_t stage1Length, uint32_t stage2Length,
   1486                  uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
   1487                  UErrorCode *pErrorCode) {
   1488     uint16_t *stage1;
   1489     uint32_t *stage2;
   1490     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
   1491     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
   1492     if(mbcsTable->reconstitutedData==NULL) {
   1493         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1494         return;
   1495     }
   1496     uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
   1497 
   1498     /* copy existing data and reroute the pointers */
   1499     stage1=(uint16_t *)mbcsTable->reconstitutedData;
   1500     uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
   1501 
   1502     stage2=(uint32_t *)(stage1+stage1Length);
   1503     uprv_memcpy(stage2+(fullStage2Length-stage2Length),
   1504                 mbcsTable->fromUnicodeTable+stage1Length,
   1505                 stage2Length*4);
   1506 
   1507     mbcsTable->fromUnicodeTable=stage1;
   1508     mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
   1509 
   1510     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
   1511     stage2=(uint32_t *)stage1;
   1512 
   1513     /* reconstitute the initial part of stage 2 from the mbcsIndex */
   1514     {
   1515         int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
   1516         int32_t stageUTF8Index=0;
   1517         int32_t st1, st2, st3, i;
   1518 
   1519         for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
   1520             st2=stage1[st1];
   1521             if(st2!=(int32_t)stage1Length/2) {
   1522                 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
   1523                 for(i=0; i<16; ++i) {
   1524                     st3=mbcsTable->mbcsIndex[stageUTF8Index++];
   1525                     if(st3!=0) {
   1526                         /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
   1527                         st3>>=4;
   1528                         /*
   1529                          * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
   1530                          * allocated together as a single 64-block for access from the mbcsIndex
   1531                          */
   1532                         stage2[st2++]=st3++;
   1533                         stage2[st2++]=st3++;
   1534                         stage2[st2++]=st3++;
   1535                         stage2[st2++]=st3;
   1536                     } else {
   1537                         /* no stage 3 block, skip */
   1538                         st2+=4;
   1539                     }
   1540                 }
   1541             } else {
   1542                 /* no stage 2 block, skip */
   1543                 stageUTF8Index+=16;
   1544             }
   1545         }
   1546     }
   1547 
   1548     /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
   1549     ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
   1550 }
   1551 
   1552 /* MBCS setup functions ----------------------------------------------------- */
   1553 
   1554 static void U_CALLCONV
   1555 ucnv_MBCSLoad(UConverterSharedData *sharedData,
   1556           UConverterLoadArgs *pArgs,
   1557           const uint8_t *raw,
   1558           UErrorCode *pErrorCode) {
   1559     UDataInfo info;
   1560     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1561     _MBCSHeader *header=(_MBCSHeader *)raw;
   1562     uint32_t offset;
   1563     uint32_t headerLength;
   1564     UBool noFromU=FALSE;
   1565 
   1566     if(header->version[0]==4) {
   1567         headerLength=MBCS_HEADER_V4_LENGTH;
   1568     } else if(header->version[0]==5 && header->version[1]>=3 &&
   1569               (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
   1570         headerLength=header->options&MBCS_OPT_LENGTH_MASK;
   1571         noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
   1572     } else {
   1573         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1574         return;
   1575     }
   1576 
   1577     mbcsTable->outputType=(uint8_t)header->flags;
   1578     if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
   1579         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1580         return;
   1581     }
   1582 
   1583     /* extension data, header version 4.2 and higher */
   1584     offset=header->flags>>8;
   1585     if(offset!=0) {
   1586         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
   1587     }
   1588 
   1589     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
   1590         UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
   1591         UConverterSharedData *baseSharedData;
   1592         const int32_t *extIndexes;
   1593         const char *baseName;
   1594 
   1595         /* extension-only file, load the base table and set values appropriately */
   1596         if((extIndexes=mbcsTable->extIndexes)==NULL) {
   1597             /* extension-only file without extension */
   1598             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1599             return;
   1600         }
   1601 
   1602         if(pArgs->nestedLoads!=1) {
   1603             /* an extension table must not be loaded as a base table */
   1604             *pErrorCode=U_INVALID_TABLE_FILE;
   1605             return;
   1606         }
   1607 
   1608         /* load the base table */
   1609         baseName=(const char *)header+headerLength*4;
   1610         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
   1611             /* forbid loading this same extension-only file */
   1612             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1613             return;
   1614         }
   1615 
   1616         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
   1617         args.size=sizeof(UConverterLoadArgs);
   1618         args.nestedLoads=2;
   1619         args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
   1620         args.reserved=pArgs->reserved;
   1621         args.options=pArgs->options;
   1622         args.pkg=pArgs->pkg;
   1623         args.name=baseName;
   1624         baseSharedData=ucnv_load(&args, pErrorCode);
   1625         if(U_FAILURE(*pErrorCode)) {
   1626             return;
   1627         }
   1628         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
   1629             baseSharedData->mbcs.baseSharedData!=NULL
   1630         ) {
   1631             ucnv_unload(baseSharedData);
   1632             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1633             return;
   1634         }
   1635         if(pArgs->onlyTestIsLoadable) {
   1636             /*
   1637              * Exit as soon as we know that we can load the converter
   1638              * and the format is valid and supported.
   1639              * The worst that can happen in the following code is a memory
   1640              * allocation error.
   1641              */
   1642             ucnv_unload(baseSharedData);
   1643             return;
   1644         }
   1645 
   1646         /* copy the base table data */
   1647         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
   1648 
   1649         /* overwrite values with relevant ones for the extension converter */
   1650         mbcsTable->baseSharedData=baseSharedData;
   1651         mbcsTable->extIndexes=extIndexes;
   1652 
   1653         /*
   1654          * It would be possible to share the swapLFNL data with a base converter,
   1655          * but the generated name would have to be different, and the memory
   1656          * would have to be free'd only once.
   1657          * It is easier to just create the data for the extension converter
   1658          * separately when it is requested.
   1659          */
   1660         mbcsTable->swapLFNLStateTable=NULL;
   1661         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
   1662         mbcsTable->swapLFNLName=NULL;
   1663 
   1664         /*
   1665          * The reconstitutedData must be deleted only when the base converter
   1666          * is unloaded.
   1667          */
   1668         mbcsTable->reconstitutedData=NULL;
   1669 
   1670         /*
   1671          * Set a special, runtime-only outputType if the extension converter
   1672          * is a DBCS version of a base converter that also maps single bytes.
   1673          */
   1674         if( sharedData->staticData->conversionType==UCNV_DBCS ||
   1675                 (sharedData->staticData->conversionType==UCNV_MBCS &&
   1676                  sharedData->staticData->minBytesPerChar>=2)
   1677         ) {
   1678             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
   1679                 /* the base converter is SI/SO-stateful */
   1680                 int32_t entry;
   1681 
   1682                 /* get the dbcs state from the state table entry for SO=0x0e */
   1683                 entry=mbcsTable->stateTable[0][0xe];
   1684                 if( MBCS_ENTRY_IS_FINAL(entry) &&
   1685                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
   1686                     MBCS_ENTRY_FINAL_STATE(entry)!=0
   1687                 ) {
   1688                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
   1689 
   1690                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1691                 }
   1692             } else if(
   1693                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
   1694                 baseSharedData->staticData->minBytesPerChar==1 &&
   1695                 baseSharedData->staticData->maxBytesPerChar==2 &&
   1696                 mbcsTable->countStates<=127
   1697             ) {
   1698                 /* non-stateful base converter, need to modify the state table */
   1699                 int32_t (*newStateTable)[256];
   1700                 int32_t *state;
   1701                 int32_t i, count;
   1702 
   1703                 /* allocate a new state table and copy the base state table contents */
   1704                 count=mbcsTable->countStates;
   1705                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
   1706                 if(newStateTable==NULL) {
   1707                     ucnv_unload(baseSharedData);
   1708                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1709                     return;
   1710                 }
   1711 
   1712                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
   1713 
   1714                 /* change all final single-byte entries to go to a new all-illegal state */
   1715                 state=newStateTable[0];
   1716                 for(i=0; i<256; ++i) {
   1717                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
   1718                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
   1719                     }
   1720                 }
   1721 
   1722                 /* build the new all-illegal state */
   1723                 state=newStateTable[count];
   1724                 for(i=0; i<256; ++i) {
   1725                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
   1726                 }
   1727                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
   1728                 mbcsTable->countStates=(uint8_t)(count+1);
   1729                 mbcsTable->stateTableOwned=TRUE;
   1730 
   1731                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1732             }
   1733         }
   1734 
   1735         /*
   1736          * unlike below for files with base tables, do not get the unicodeMask
   1737          * from the sharedData; instead, use the base table's unicodeMask,
   1738          * which we copied in the memcpy above;
   1739          * this is necessary because the static data unicodeMask, especially
   1740          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
   1741          */
   1742     } else {
   1743         /* conversion file with a base table; an additional extension table is optional */
   1744         /* make sure that the output type is known */
   1745         switch(mbcsTable->outputType) {
   1746         case MBCS_OUTPUT_1:
   1747         case MBCS_OUTPUT_2:
   1748         case MBCS_OUTPUT_3:
   1749         case MBCS_OUTPUT_4:
   1750         case MBCS_OUTPUT_3_EUC:
   1751         case MBCS_OUTPUT_4_EUC:
   1752         case MBCS_OUTPUT_2_SISO:
   1753             /* OK */
   1754             break;
   1755         default:
   1756             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1757             return;
   1758         }
   1759         if(pArgs->onlyTestIsLoadable) {
   1760             /*
   1761              * Exit as soon as we know that we can load the converter
   1762              * and the format is valid and supported.
   1763              * The worst that can happen in the following code is a memory
   1764              * allocation error.
   1765              */
   1766             return;
   1767         }
   1768 
   1769         mbcsTable->countStates=(uint8_t)header->countStates;
   1770         mbcsTable->countToUFallbacks=header->countToUFallbacks;
   1771         mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
   1772         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
   1773         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
   1774 
   1775         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
   1776         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
   1777         mbcsTable->fromUBytesLength=header->fromUBytesLength;
   1778 
   1779         /*
   1780          * converter versions 6.1 and up contain a unicodeMask that is
   1781          * used here to select the most efficient function implementations
   1782          */
   1783         info.size=sizeof(UDataInfo);
   1784         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
   1785         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
   1786             /* mask off possible future extensions to be safe */
   1787             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
   1788         } else {
   1789             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
   1790             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
   1791         }
   1792 
   1793         /*
   1794          * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
   1795          * Check for the header version, SBCS vs. MBCS, and for whether the
   1796          * data structures are optimized for code points as high as what the
   1797          * runtime code is designed for.
   1798          * The implementation does not handle mapping tables with entries for
   1799          * unpaired surrogates.
   1800          */
   1801         if( header->version[1]>=3 &&
   1802             (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
   1803             (mbcsTable->countStates==1 ?
   1804                 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
   1805                 (header->version[2]>=(MBCS_FAST_MAX>>8))
   1806             )
   1807         ) {
   1808             mbcsTable->utf8Friendly=TRUE;
   1809 
   1810             if(mbcsTable->countStates==1) {
   1811                 /*
   1812                  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
   1813                  * Build a table with indexes to each block, to be used instead of
   1814                  * the regular stage 1/2 table.
   1815                  */
   1816                 int32_t i;
   1817                 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
   1818                     mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
   1819                 }
   1820                 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
   1821                 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
   1822             } else {
   1823                 /*
   1824                  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
   1825                  * The .cnv file is prebuilt with an additional stage table with indexes
   1826                  * to each block.
   1827                  */
   1828                 mbcsTable->mbcsIndex=(const uint16_t *)
   1829                     (mbcsTable->fromUnicodeBytes+
   1830                      (noFromU ? 0 : mbcsTable->fromUBytesLength));
   1831                 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
   1832             }
   1833         }
   1834 
   1835         /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
   1836         {
   1837             uint32_t asciiRoundtrips=0xffffffff;
   1838             int32_t i;
   1839 
   1840             for(i=0; i<0x80; ++i) {
   1841                 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
   1842                     asciiRoundtrips&=~((uint32_t)1<<(i>>2));
   1843                 }
   1844             }
   1845             mbcsTable->asciiRoundtrips=asciiRoundtrips;
   1846         }
   1847 
   1848         if(noFromU) {
   1849             uint32_t stage1Length=
   1850                 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
   1851                     0x440 : 0x40;
   1852             uint32_t stage2Length=
   1853                 (header->offsetFromUBytes-header->offsetFromUTable)/4-
   1854                 stage1Length/2;
   1855             reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
   1856         }
   1857     }
   1858 
   1859     /* Set the impl pointer here so that it is set for both extension-only and base tables. */
   1860     if(mbcsTable->utf8Friendly) {
   1861         if(mbcsTable->countStates==1) {
   1862             sharedData->impl=&_SBCSUTF8Impl;
   1863         } else {
   1864             if(mbcsTable->outputType==MBCS_OUTPUT_2) {
   1865                 sharedData->impl=&_DBCSUTF8Impl;
   1866             }
   1867         }
   1868     }
   1869 
   1870     if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
   1871         /*
   1872          * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
   1873          * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
   1874          */
   1875         mbcsTable->asciiRoundtrips=0;
   1876     }
   1877 }
   1878 
   1879 static void U_CALLCONV
   1880 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
   1881     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1882 
   1883     if(mbcsTable->swapLFNLStateTable!=NULL) {
   1884         uprv_free(mbcsTable->swapLFNLStateTable);
   1885     }
   1886     if(mbcsTable->stateTableOwned) {
   1887         uprv_free((void *)mbcsTable->stateTable);
   1888     }
   1889     if(mbcsTable->baseSharedData!=NULL) {
   1890         ucnv_unload(mbcsTable->baseSharedData);
   1891     }
   1892     if(mbcsTable->reconstitutedData!=NULL) {
   1893         uprv_free(mbcsTable->reconstitutedData);
   1894     }
   1895 }
   1896 
   1897 static void U_CALLCONV
   1898 ucnv_MBCSOpen(UConverter *cnv,
   1899               UConverterLoadArgs *pArgs,
   1900               UErrorCode *pErrorCode) {
   1901     UConverterMBCSTable *mbcsTable;
   1902     const int32_t *extIndexes;
   1903     uint8_t outputType;
   1904     int8_t maxBytesPerUChar;
   1905 
   1906     if(pArgs->onlyTestIsLoadable) {
   1907         return;
   1908     }
   1909 
   1910     mbcsTable=&cnv->sharedData->mbcs;
   1911     outputType=mbcsTable->outputType;
   1912 
   1913     if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
   1914         /* the swaplfnl option does not apply, remove it */
   1915         cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1916     }
   1917 
   1918     if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1919         /* do this because double-checked locking is broken */
   1920         UBool isCached;
   1921 
   1922         umtx_lock(NULL);
   1923         isCached=mbcsTable->swapLFNLStateTable!=NULL;
   1924         umtx_unlock(NULL);
   1925 
   1926         if(!isCached) {
   1927             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
   1928                 if(U_FAILURE(*pErrorCode)) {
   1929                     return; /* something went wrong */
   1930                 }
   1931 
   1932                 /* the option does not apply, remove it */
   1933                 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1934             }
   1935         }
   1936     }
   1937 
   1938     if(uprv_strstr(pArgs->name, "18030")!=NULL) {
   1939         if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
   1940             /* set a flag for GB 18030 mode, which changes the callback behavior */
   1941             cnv->options|=_MBCS_OPTION_GB18030;
   1942         }
   1943     } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
   1944         /* set a flag for KEIS converter, which changes the SI/SO character sequence */
   1945         cnv->options|=_MBCS_OPTION_KEIS;
   1946     } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
   1947         /* set a flag for JEF converter, which changes the SI/SO character sequence */
   1948         cnv->options|=_MBCS_OPTION_JEF;
   1949     } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
   1950         /* set a flag for JIPS converter, which changes the SI/SO character sequence */
   1951         cnv->options|=_MBCS_OPTION_JIPS;
   1952     }
   1953 
   1954     /* fix maxBytesPerUChar depending on outputType and options etc. */
   1955     if(outputType==MBCS_OUTPUT_2_SISO) {
   1956         cnv->maxBytesPerUChar=3; /* SO+DBCS */
   1957     }
   1958 
   1959     extIndexes=mbcsTable->extIndexes;
   1960     if(extIndexes!=NULL) {
   1961         maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
   1962         if(outputType==MBCS_OUTPUT_2_SISO) {
   1963             ++maxBytesPerUChar; /* SO + multiple DBCS */
   1964         }
   1965 
   1966         if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
   1967             cnv->maxBytesPerUChar=maxBytesPerUChar;
   1968         }
   1969     }
   1970 
   1971 #if 0
   1972     /*
   1973      * documentation of UConverter fields used for status
   1974      * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
   1975      */
   1976 
   1977     /* toUnicode */
   1978     cnv->toUnicodeStatus=0;     /* offset */
   1979     cnv->mode=0;                /* state */
   1980     cnv->toULength=0;           /* byteIndex */
   1981 
   1982     /* fromUnicode */
   1983     cnv->fromUChar32=0;
   1984     cnv->fromUnicodeStatus=1;   /* prevLength */
   1985 #endif
   1986 }
   1987 
   1988 U_CDECL_BEGIN
   1989 
   1990 static const char* U_CALLCONV
   1991 ucnv_MBCSGetName(const UConverter *cnv) {
   1992     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
   1993         return cnv->sharedData->mbcs.swapLFNLName;
   1994     } else {
   1995         return cnv->sharedData->staticData->name;
   1996     }
   1997 }
   1998 U_CDECL_END
   1999 
   2000 
   2001 /* MBCS-to-Unicode conversion functions ------------------------------------- */
   2002 
   2003 static UChar32 U_CALLCONV
   2004 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
   2005     const _MBCSToUFallback *toUFallbacks;
   2006     uint32_t i, start, limit;
   2007 
   2008     limit=mbcsTable->countToUFallbacks;
   2009     if(limit>0) {
   2010         /* do a binary search for the fallback mapping */
   2011         toUFallbacks=mbcsTable->toUFallbacks;
   2012         start=0;
   2013         while(start<limit-1) {
   2014             i=(start+limit)/2;
   2015             if(offset<toUFallbacks[i].offset) {
   2016                 limit=i;
   2017             } else {
   2018                 start=i;
   2019             }
   2020         }
   2021 
   2022         /* did we really find it? */
   2023         if(offset==toUFallbacks[start].offset) {
   2024             return toUFallbacks[start].codePoint;
   2025         }
   2026     }
   2027 
   2028     return 0xfffe;
   2029 }
   2030 
   2031 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
   2032 static void
   2033 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2034                                 UErrorCode *pErrorCode) {
   2035     UConverter *cnv;
   2036     const uint8_t *source, *sourceLimit;
   2037     UChar *target;
   2038     const UChar *targetLimit;
   2039     int32_t *offsets;
   2040 
   2041     const int32_t (*stateTable)[256];
   2042 
   2043     int32_t sourceIndex;
   2044 
   2045     int32_t entry;
   2046     UChar c;
   2047     uint8_t action;
   2048 
   2049     /* set up the local pointers */
   2050     cnv=pArgs->converter;
   2051     source=(const uint8_t *)pArgs->source;
   2052     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2053     target=pArgs->target;
   2054     targetLimit=pArgs->targetLimit;
   2055     offsets=pArgs->offsets;
   2056 
   2057     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2058         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2059     } else {
   2060         stateTable=cnv->sharedData->mbcs.stateTable;
   2061     }
   2062 
   2063     /* sourceIndex=-1 if the current character began in the previous buffer */
   2064     sourceIndex=0;
   2065 
   2066     /* conversion loop */
   2067     while(source<sourceLimit) {
   2068         /*
   2069          * This following test is to see if available input would overflow the output.
   2070          * It does not catch output of more than one code unit that
   2071          * overflows as a result of a surrogate pair or callback output
   2072          * from the last source byte.
   2073          * Therefore, those situations also test for overflows and will
   2074          * then break the loop, too.
   2075          */
   2076         if(target>=targetLimit) {
   2077             /* target is full */
   2078             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2079             break;
   2080         }
   2081 
   2082         entry=stateTable[0][*source++];
   2083         /* MBCS_ENTRY_IS_FINAL(entry) */
   2084 
   2085         /* test the most common case first */
   2086         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2087             /* output BMP code point */
   2088             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2089             if(offsets!=NULL) {
   2090                 *offsets++=sourceIndex;
   2091             }
   2092 
   2093             /* normal end of action codes: prepare for a new character */
   2094             ++sourceIndex;
   2095             continue;
   2096         }
   2097 
   2098         /*
   2099          * An if-else-if chain provides more reliable performance for
   2100          * the most common cases compared to a switch.
   2101          */
   2102         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2103         if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2104            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2105         ) {
   2106             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2107             /* output surrogate pair */
   2108             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2109             if(offsets!=NULL) {
   2110                 *offsets++=sourceIndex;
   2111             }
   2112             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2113             if(target<targetLimit) {
   2114                 *target++=c;
   2115                 if(offsets!=NULL) {
   2116                     *offsets++=sourceIndex;
   2117                 }
   2118             } else {
   2119                 /* target overflow */
   2120                 cnv->UCharErrorBuffer[0]=c;
   2121                 cnv->UCharErrorBufferLength=1;
   2122                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2123                 break;
   2124             }
   2125 
   2126             ++sourceIndex;
   2127             continue;
   2128         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2129             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2130                 /* output BMP code point */
   2131                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2132                 if(offsets!=NULL) {
   2133                     *offsets++=sourceIndex;
   2134                 }
   2135 
   2136                 ++sourceIndex;
   2137                 continue;
   2138             }
   2139         } else if(action==MBCS_STATE_UNASSIGNED) {
   2140             /* just fall through */
   2141         } else if(action==MBCS_STATE_ILLEGAL) {
   2142             /* callback(illegal) */
   2143             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2144         } else {
   2145             /* reserved, must never occur */
   2146             ++sourceIndex;
   2147             continue;
   2148         }
   2149 
   2150         if(U_FAILURE(*pErrorCode)) {
   2151             /* callback(illegal) */
   2152             break;
   2153         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2154             /* try an extension mapping */
   2155             pArgs->source=(const char *)source;
   2156             cnv->toUBytes[0]=*(source-1);
   2157             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2158                                     1, &source, sourceLimit,
   2159                                     &target, targetLimit,
   2160                                     &offsets, sourceIndex,
   2161                                     pArgs->flush,
   2162                                     pErrorCode);
   2163             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
   2164 
   2165             if(U_FAILURE(*pErrorCode)) {
   2166                 /* not mappable or buffer overflow */
   2167                 break;
   2168             }
   2169         }
   2170     }
   2171 
   2172     /* write back the updated pointers */
   2173     pArgs->source=(const char *)source;
   2174     pArgs->target=target;
   2175     pArgs->offsets=offsets;
   2176 }
   2177 
   2178 /*
   2179  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
   2180  * that only map to and from the BMP.
   2181  * In addition to single-byte optimizations, the offset calculations
   2182  * become much easier.
         *
         * Each source byte between pArgs->source and pArgs->sourceLimit is looked up
         * in row 0 of the state table (the swapLFNL table if UCNV_OPTION_SWAP_LFNL is set).
         * - Valid direct-16 entries, and direct-16 fallbacks when fallbacks are enabled,
         *   produce exactly one UChar in pArgs->target.
         * - Illegal entries set U_ILLEGAL_CHAR_FOUND and stop the conversion.
         * - Entries with a reserved action consume the byte without any output.
         * - Everything else is passed to _extToU() as a one-byte sequence.
         * If input remains while the target is full, U_BUFFER_OVERFLOW_ERROR is set.
         * If pArgs->offsets is not NULL, source indexes are written for the output units.
         * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
   2183  */
   2184 static void
   2185 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
   2186                             UErrorCode *pErrorCode) {
   2187     UConverter *cnv;
   2188     const uint8_t *source, *sourceLimit, *lastSource;
   2189     UChar *target;
   2190     int32_t targetCapacity, length;
   2191     int32_t *offsets;
   2192 
   2193     const int32_t (*stateTable)[256];
   2194 
   2195     int32_t sourceIndex;
   2196 
   2197     int32_t entry;
   2198     uint8_t action;
   2199 
   2200     /* set up the local pointers */
   2201     cnv=pArgs->converter;
   2202     source=(const uint8_t *)pArgs->source;
   2203     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2204     target=pArgs->target;
   2205     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   2206     offsets=pArgs->offsets;
   2207 
   2208     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2209         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2210     } else {
   2211         stateTable=cnv->sharedData->mbcs.stateTable;
   2212     }
   2213 
   2214     /* offsets count from the start of this buffer; lastSource marks the first byte whose offset is not yet written */
   2215     sourceIndex=0;
   2216     lastSource=source;
   2217 
   2218     /*
   2219      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   2220      * for the minimum of the sourceLength and targetCapacity
   2221      */
   2222     length=(int32_t)(sourceLimit-source);
   2223     if(length<targetCapacity) {
   2224         targetCapacity=length;
   2225     }
   2226 
   2227 #if MBCS_UNROLL_SINGLE_TO_BMP
   2228     /* unrolling makes it faster on Pentium III/Windows 2000 */
   2229     /* unroll the loop with the most common case */
            /* this label is also the target of the goto at the bottom of the conversion loop */
   2230 unrolled:
   2231     if(targetCapacity>=16) {
   2232         int32_t count, loops, oredEntries;
   2233 
                /* number of complete 16-byte batches that fit into the remaining capacity */
   2234         loops=count=targetCapacity>>4;
   2235         do {
                    /*
                     * Convert 16 bytes without testing each entry: every result is written
                     * speculatively, and all 16 entries are OR-ed together so that a single
                     * test after the batch decides whether the whole batch is kept.
                     */
   2236             oredEntries=entry=stateTable[0][*source++];
   2237             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2238             oredEntries|=entry=stateTable[0][*source++];
   2239             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2240             oredEntries|=entry=stateTable[0][*source++];
   2241             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2242             oredEntries|=entry=stateTable[0][*source++];
   2243             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2244             oredEntries|=entry=stateTable[0][*source++];
   2245             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2246             oredEntries|=entry=stateTable[0][*source++];
   2247             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2248             oredEntries|=entry=stateTable[0][*source++];
   2249             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2250             oredEntries|=entry=stateTable[0][*source++];
   2251             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2252             oredEntries|=entry=stateTable[0][*source++];
   2253             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2254             oredEntries|=entry=stateTable[0][*source++];
   2255             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2256             oredEntries|=entry=stateTable[0][*source++];
   2257             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2258             oredEntries|=entry=stateTable[0][*source++];
   2259             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2260             oredEntries|=entry=stateTable[0][*source++];
   2261             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2262             oredEntries|=entry=stateTable[0][*source++];
   2263             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2264             oredEntries|=entry=stateTable[0][*source++];
   2265             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2266             oredEntries|=entry=stateTable[0][*source++];
   2267             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2268 
   2269             /* were all 16 entries really valid? */
                    /* NOTE(review): relies on the macro giving the right answer for OR-ed entries; its definition is outside this view */
   2270             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
   2271                 /* no, return to the first of these 16 */
   2272                 source-=16;
   2273                 target-=16;
   2274                 break;
   2275             }
   2276         } while(--count>0);
                /* count becomes the number of batches that were fully converted and kept */
   2277         count=loops-count;
   2278         targetCapacity-=16*count;
   2279 
   2280         if(offsets!=NULL) {
                    /* the offsets for the kept batches are written right here, so skip them in later bookkeeping */
   2281             lastSource+=16*count;
   2282             while(count>0) {
   2283                 *offsets++=sourceIndex++;
   2284                 *offsets++=sourceIndex++;
   2285                 *offsets++=sourceIndex++;
   2286                 *offsets++=sourceIndex++;
   2287                 *offsets++=sourceIndex++;
   2288                 *offsets++=sourceIndex++;
   2289                 *offsets++=sourceIndex++;
   2290                 *offsets++=sourceIndex++;
   2291                 *offsets++=sourceIndex++;
   2292                 *offsets++=sourceIndex++;
   2293                 *offsets++=sourceIndex++;
   2294                 *offsets++=sourceIndex++;
   2295                 *offsets++=sourceIndex++;
   2296                 *offsets++=sourceIndex++;
   2297                 *offsets++=sourceIndex++;
   2298                 *offsets++=sourceIndex++;
   2299                 --count;
   2300             }
   2301         }
   2302     }
   2303 #endif
   2304 
   2305     /* conversion loop */
            /* targetCapacity is already limited to the source length; source is tested as well because the reserved-action branch consumes a byte without decrementing targetCapacity */
   2306     while(targetCapacity > 0 && source < sourceLimit) {
   2307         entry=stateTable[0][*source++];
   2308         /* MBCS_ENTRY_IS_FINAL(entry) */
   2309 
   2310         /* test the most common case first */
   2311         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2312             /* output BMP code point */
                    /* no offset is written here; offsets are filled in later from source-lastSource */
   2313             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2314             --targetCapacity;
   2315             continue;
   2316         }
   2317 
   2318         /*
   2319          * An if-else-if chain provides more reliable performance for
   2320          * the most common cases compared to a switch.
   2321          */
   2322         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2323         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2324             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2325                 /* output BMP code point */
   2326                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2327                 --targetCapacity;
   2328                 continue;
   2329             }
                    /* fallbacks are disabled: fall through and treat the byte like an unassigned one */
   2330         } else if(action==MBCS_STATE_UNASSIGNED) {
   2331             /* just fall through */
   2332         } else if(action==MBCS_STATE_ILLEGAL) {
   2333             /* callback(illegal) */
   2334             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2335         } else {
   2336             /* reserved, must never occur */
                    /* the byte is consumed, nothing is written, and targetCapacity stays unchanged */
   2337             continue;
   2338         }
   2339 
   2340         /* set offsets since the start or the last extension */
   2341         if(offsets!=NULL) {
   2342             int32_t count=(int32_t)(source-lastSource);
   2343 
   2344             /* predecrement: do not set the offset for the callback-causing character */
   2345             while(--count>0) {
   2346                 *offsets++=sourceIndex++;
   2347             }
   2348             /* offset and sourceIndex are now set for the current character */
   2349         }
   2350 
   2351         if(U_FAILURE(*pErrorCode)) {
   2352             /* callback(illegal) */
   2353             break;
   2354         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2355             /* try an extension mapping */
                    /* remember where the extension starts so that the bytes it consumes can be counted below */
   2356             lastSource=source;
   2357             cnv->toUBytes[0]=*(source-1);
   2358             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2359                                     1, &source, sourceLimit,
   2360                                     &target, pArgs->targetLimit,
   2361                                     &offsets, sourceIndex,
   2362                                     pArgs->flush,
   2363                                     pErrorCode);
                    /* 1 for the byte read by this loop, plus whatever _extToU() advanced source by */
   2364             sourceIndex+=1+(int32_t)(source-lastSource);
   2365 
   2366             if(U_FAILURE(*pErrorCode)) {
   2367                 /* not mappable or buffer overflow */
   2368                 break;
   2369             }
   2370 
   2371             /* recalculate the targetCapacity after an extension mapping */
   2372             targetCapacity=(int32_t)(pArgs->targetLimit-target);
   2373             length=(int32_t)(sourceLimit-source);
   2374             if(length<targetCapacity) {
   2375                 targetCapacity=length;
   2376             }
   2377         }
   2378 
   2379 #if MBCS_UNROLL_SINGLE_TO_BMP
   2380         /* unrolling makes it faster on Pentium III/Windows 2000 */
                /* jumps out of this loop, back to the unrolled fast path above, after each extension mapping */
   2381         goto unrolled;
   2382 #endif
   2383     }
   2384 
            /* targetCapacity was capped by the source length, so compare the real pointers to detect a full target */
   2385     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
   2386         /* target is full */
   2387         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2388     }
   2389 
   2390     /* set offsets since the start or the last callback */
   2391     if(offsets!=NULL) {
                /*
                 * NOTE(review): after the illegal-byte break in the loop above, lastSource
                 * has not been advanced, so this loop runs again over bytes whose offsets
                 * were already written by the predecrement loop -- confirm that the caller
                 * tolerates these additional writes to the offsets array.
                 */
   2392         size_t count=source-lastSource;
   2393         while(count>0) {
   2394             *offsets++=sourceIndex++;
   2395             --count;
   2396         }
   2397     }
   2398 
   2399     /* write back the updated pointers */
   2400     pArgs->source=(const char *)source;
   2401     pArgs->target=target;
   2402     pArgs->offsets=offsets;
   2403 }
   2404 
   2405 static UBool
   2406 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
   2407     const int32_t *row=stateTable[state];
   2408     int32_t b, entry;
   2409     /* First test for final entries in this state for some commonly valid byte values. */
   2410     entry=row[0xa1];
   2411     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2412         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2413     ) {
   2414         return TRUE;
   2415     }
   2416     entry=row[0x41];
   2417     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2418         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2419     ) {
   2420         return TRUE;
   2421     }
   2422     /* Then test for final entries in this state. */
   2423     for(b=0; b<=0xff; ++b) {
   2424         entry=row[b];
   2425         if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2426             MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2427         ) {
   2428             return TRUE;
   2429         }
   2430     }
   2431     /* Then recurse for transition entries. */
   2432     for(b=0; b<=0xff; ++b) {
   2433         entry=row[b];
   2434         if( MBCS_ENTRY_IS_TRANSITION(entry) &&
   2435             hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
   2436         ) {
   2437             return TRUE;
   2438         }
   2439     }
   2440     return FALSE;
   2441 }
   2442 
   2443 /*
   2444  * Is byte b a single/lead byte in this state?
   2445  * Recurse for transition states, because here we don't want to say that
   2446  * b is a lead byte if all byte sequences that start with b are illegal.
   2447  */
   2448 static UBool
   2449 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
   2450     const int32_t *row=stateTable[state];
   2451     int32_t entry=row[b];
   2452     if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
   2453         return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
   2454     } else {
   2455         uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2456         if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
   2457             return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
   2458         } else {
   2459             return action!=MBCS_STATE_ILLEGAL;
   2460         }
   2461     }
   2462 }
   2463 
   2464 U_CFUNC void
   2465 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2466                           UErrorCode *pErrorCode) {
   2467     UConverter *cnv;
   2468     const uint8_t *source, *sourceLimit;
   2469     UChar *target;
   2470     const UChar *targetLimit;
   2471     int32_t *offsets;
   2472 
   2473     const int32_t (*stateTable)[256];
   2474     const uint16_t *unicodeCodeUnits;
   2475 
   2476     uint32_t offset;
   2477     uint8_t state;
   2478     int8_t byteIndex;
   2479     uint8_t *bytes;
   2480 
   2481     int32_t sourceIndex, nextSourceIndex;
   2482 
   2483     int32_t entry;
   2484     UChar c;
   2485     uint8_t action;
   2486 
   2487     /* use optimized function if possible */
   2488     cnv=pArgs->converter;
   2489 
   2490     if(cnv->preToULength>0) {
   2491         /*
   2492          * pass sourceIndex=-1 because we continue from an earlier buffer
   2493          * in the future, this may change with continuous offsets
   2494          */
   2495         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
   2496 
   2497         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
   2498             return;
   2499         }
   2500     }
   2501 
   2502     if(cnv->sharedData->mbcs.countStates==1) {
   2503         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   2504             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
   2505         } else {
   2506             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
   2507         }
   2508         return;
   2509     }
   2510 
   2511     /* set up the local pointers */
   2512     source=(const uint8_t *)pArgs->source;
   2513     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2514     target=pArgs->target;
   2515     targetLimit=pArgs->targetLimit;
   2516     offsets=pArgs->offsets;
   2517 
   2518     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2519         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2520     } else {
   2521         stateTable=cnv->sharedData->mbcs.stateTable;
   2522     }
   2523     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2524 
   2525     /* get the converter state from UConverter */
   2526     offset=cnv->toUnicodeStatus;
   2527     byteIndex=cnv->toULength;
   2528     bytes=cnv->toUBytes;
   2529 
   2530     /*
   2531      * if we are in the SBCS state for a DBCS-only converter,
   2532      * then load the DBCS state from the MBCS data
   2533      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2534      */
   2535     if((state=(uint8_t)(cnv->mode))==0) {
   2536         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2537     }
   2538 
   2539     /* sourceIndex=-1 if the current character began in the previous buffer */
   2540     sourceIndex=byteIndex==0 ? 0 : -1;
   2541     nextSourceIndex=0;
   2542 
   2543     /* conversion loop */
   2544     while(source<sourceLimit) {
   2545         /*
   2546          * This following test is to see if available input would overflow the output.
   2547          * It does not catch output of more than one code unit that
   2548          * overflows as a result of a surrogate pair or callback output
   2549          * from the last source byte.
   2550          * Therefore, those situations also test for overflows and will
   2551          * then break the loop, too.
   2552          */
   2553         if(target>=targetLimit) {
   2554             /* target is full */
   2555             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2556             break;
   2557         }
   2558 
   2559         if(byteIndex==0) {
   2560             /* optimized loop for 1/2-byte input and BMP output */
   2561             if(offsets==NULL) {
   2562                 do {
   2563                     entry=stateTable[state][*source];
   2564                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2565                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2566                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2567 
   2568                         ++source;
   2569                         if( source<sourceLimit &&
   2570                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2571                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2572                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2573                         ) {
   2574                             ++source;
   2575                             *target++=c;
   2576                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2577                             offset=0;
   2578                         } else {
   2579                             /* set the state and leave the optimized loop */
   2580                             bytes[0]=*(source-1);
   2581                             byteIndex=1;
   2582                             break;
   2583                         }
   2584                     } else {
   2585                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2586                             /* output BMP code point */
   2587                             ++source;
   2588                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2589                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2590                         } else {
   2591                             /* leave the optimized loop */
   2592                             break;
   2593                         }
   2594                     }
   2595                 } while(source<sourceLimit && target<targetLimit);
   2596             } else /* offsets!=NULL */ {
   2597                 do {
   2598                     entry=stateTable[state][*source];
   2599                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2600                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2601                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2602 
   2603                         ++source;
   2604                         if( source<sourceLimit &&
   2605                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2606                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2607                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2608                         ) {
   2609                             ++source;
   2610                             *target++=c;
   2611                             if(offsets!=NULL) {
   2612                                 *offsets++=sourceIndex;
   2613                                 sourceIndex=(nextSourceIndex+=2);
   2614                             }
   2615                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2616                             offset=0;
   2617                         } else {
   2618                             /* set the state and leave the optimized loop */
   2619                             ++nextSourceIndex;
   2620                             bytes[0]=*(source-1);
   2621                             byteIndex=1;
   2622                             break;
   2623                         }
   2624                     } else {
   2625                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2626                             /* output BMP code point */
   2627                             ++source;
   2628                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2629                             if(offsets!=NULL) {
   2630                                 *offsets++=sourceIndex;
   2631                                 sourceIndex=++nextSourceIndex;
   2632                             }
   2633                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2634                         } else {
   2635                             /* leave the optimized loop */
   2636                             break;
   2637                         }
   2638                     }
   2639                 } while(source<sourceLimit && target<targetLimit);
   2640             }
   2641 
   2642             /*
   2643              * these tests and break statements could be put inside the loop
   2644              * if C had "break outerLoop" like Java
   2645              */
   2646             if(source>=sourceLimit) {
   2647                 break;
   2648             }
   2649             if(target>=targetLimit) {
   2650                 /* target is full */
   2651                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2652                 break;
   2653             }
   2654 
   2655             ++nextSourceIndex;
   2656             bytes[byteIndex++]=*source++;
   2657         } else /* byteIndex>0 */ {
   2658             ++nextSourceIndex;
   2659             entry=stateTable[state][bytes[byteIndex++]=*source++];
   2660         }
   2661 
   2662         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2663             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2664             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2665             continue;
   2666         }
   2667 
   2668         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2669         cnv->mode=state;
   2670 
   2671         /* set the next state early so that we can reuse the entry variable */
   2672         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2673 
   2674         /*
   2675          * An if-else-if chain provides more reliable performance for
   2676          * the most common cases compared to a switch.
   2677          */
   2678         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2679         if(action==MBCS_STATE_VALID_16) {
   2680             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2681             c=unicodeCodeUnits[offset];
   2682             if(c<0xfffe) {
   2683                 /* output BMP code point */
   2684                 *target++=c;
   2685                 if(offsets!=NULL) {
   2686                     *offsets++=sourceIndex;
   2687                 }
   2688                 byteIndex=0;
   2689             } else if(c==0xfffe) {
   2690                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2691                     /* output fallback BMP code point */
   2692                     *target++=(UChar)entry;
   2693                     if(offsets!=NULL) {
   2694                         *offsets++=sourceIndex;
   2695                     }
   2696                     byteIndex=0;
   2697                 }
   2698             } else {
   2699                 /* callback(illegal) */
   2700                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2701             }
   2702         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   2703             /* output BMP code point */
   2704             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2705             if(offsets!=NULL) {
   2706                 *offsets++=sourceIndex;
   2707             }
   2708             byteIndex=0;
   2709         } else if(action==MBCS_STATE_VALID_16_PAIR) {
   2710             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2711             c=unicodeCodeUnits[offset++];
   2712             if(c<0xd800) {
   2713                 /* output BMP code point below 0xd800 */
   2714                 *target++=c;
   2715                 if(offsets!=NULL) {
   2716                     *offsets++=sourceIndex;
   2717                 }
   2718                 byteIndex=0;
   2719             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2720                 /* output roundtrip or fallback surrogate pair */
   2721                 *target++=(UChar)(c&0xdbff);
   2722                 if(offsets!=NULL) {
   2723                     *offsets++=sourceIndex;
   2724                 }
   2725                 byteIndex=0;
   2726                 if(target<targetLimit) {
   2727                     *target++=unicodeCodeUnits[offset];
   2728                     if(offsets!=NULL) {
   2729                         *offsets++=sourceIndex;
   2730                     }
   2731                 } else {
   2732                     /* target overflow */
   2733                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
   2734                     cnv->UCharErrorBufferLength=1;
   2735                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2736 
   2737                     offset=0;
   2738                     break;
   2739                 }
   2740             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2741                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2742                 *target++=unicodeCodeUnits[offset];
   2743                 if(offsets!=NULL) {
   2744                     *offsets++=sourceIndex;
   2745                 }
   2746                 byteIndex=0;
   2747             } else if(c==0xffff) {
   2748                 /* callback(illegal) */
   2749                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2750             }
   2751         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2752                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2753         ) {
   2754             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2755             /* output surrogate pair */
   2756             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2757             if(offsets!=NULL) {
   2758                 *offsets++=sourceIndex;
   2759             }
   2760             byteIndex=0;
   2761             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2762             if(target<targetLimit) {
   2763                 *target++=c;
   2764                 if(offsets!=NULL) {
   2765                     *offsets++=sourceIndex;
   2766                 }
   2767             } else {
   2768                 /* target overflow */
   2769                 cnv->UCharErrorBuffer[0]=c;
   2770                 cnv->UCharErrorBufferLength=1;
   2771                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2772 
   2773                 offset=0;
   2774                 break;
   2775             }
   2776         } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2777             /*
   2778              * This serves as a state change without any output.
   2779              * It is useful for reading simple stateful encodings,
   2780              * for example using just Shift-In/Shift-Out codes.
   2781              * The 21 unused bits may later be used for more sophisticated
   2782              * state transitions.
   2783              */
   2784             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
   2785                 byteIndex=0;
   2786             } else {
   2787                 /* SI/SO are illegal for DBCS-only conversion */
   2788                 state=(uint8_t)(cnv->mode); /* restore the previous state */
   2789 
   2790                 /* callback(illegal) */
   2791                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2792             }
   2793         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2794             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2795                 /* output BMP code point */
   2796                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2797                 if(offsets!=NULL) {
   2798                     *offsets++=sourceIndex;
   2799                 }
   2800                 byteIndex=0;
   2801             }
   2802         } else if(action==MBCS_STATE_UNASSIGNED) {
   2803             /* just fall through */
   2804         } else if(action==MBCS_STATE_ILLEGAL) {
   2805             /* callback(illegal) */
   2806             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2807         } else {
   2808             /* reserved, must never occur */
   2809             byteIndex=0;
   2810         }
   2811 
   2812         /* end of action codes: prepare for a new character */
   2813         offset=0;
   2814 
   2815         if(byteIndex==0) {
   2816             sourceIndex=nextSourceIndex;
   2817         } else if(U_FAILURE(*pErrorCode)) {
   2818             /* callback(illegal) */
   2819             if(byteIndex>1) {
   2820                 /*
   2821                  * Ticket 5691: consistent illegal sequences:
   2822                  * - We include at least the first byte in the illegal sequence.
   2823                  * - If any of the non-initial bytes could be the start of a character,
   2824                  *   we stop the illegal sequence before the first one of those.
   2825                  */
   2826                 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   2827                 int8_t i;
   2828                 for(i=1;
   2829                     i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
   2830                     ++i) {}
   2831                 if(i<byteIndex) {
   2832                     /* Back out some bytes. */
   2833                     int8_t backOutDistance=byteIndex-i;
   2834                     int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
   2835                     byteIndex=i;  /* length of reported illegal byte sequence */
   2836                     if(backOutDistance<=bytesFromThisBuffer) {
   2837                         source-=backOutDistance;
   2838                     } else {
   2839                         /* Back out bytes from the previous buffer: Need to replay them. */
   2840                         cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   2841                         /* preToULength is negative! */
   2842                         uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
   2843                         source=(const uint8_t *)pArgs->source;
   2844                     }
   2845                 }
   2846             }
   2847             break;
   2848         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2849             /* try an extension mapping */
   2850             pArgs->source=(const char *)source;
   2851             byteIndex=_extToU(cnv, cnv->sharedData,
   2852                               byteIndex, &source, sourceLimit,
   2853                               &target, targetLimit,
   2854                               &offsets, sourceIndex,
   2855                               pArgs->flush,
   2856                               pErrorCode);
   2857             sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
   2858 
   2859             if(U_FAILURE(*pErrorCode)) {
   2860                 /* not mappable or buffer overflow */
   2861                 break;
   2862             }
   2863         }
   2864     }
   2865 
   2866     /* set the converter state back into UConverter */
   2867     cnv->toUnicodeStatus=offset;
   2868     cnv->mode=state;
   2869     cnv->toULength=byteIndex;
   2870 
   2871     /* write back the updated pointers */
   2872     pArgs->source=(const char *)source;
   2873     pArgs->target=target;
   2874     pArgs->offsets=offsets;
   2875 }
   2876 
   2877 /*
   2878  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
   2879  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
   2880  */
   2881 static UChar32
   2882 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2883                         UErrorCode *pErrorCode) {
   2884     UConverter *cnv;
   2885     const int32_t (*stateTable)[256];
   2886     const uint8_t *source, *sourceLimit;
   2887 
   2888     int32_t entry;
   2889     uint8_t action;
   2890 
   2891     /* set up the local pointers */
   2892     cnv=pArgs->converter;
   2893     source=(const uint8_t *)pArgs->source;
   2894     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2895     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2896         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2897     } else {
   2898         stateTable=cnv->sharedData->mbcs.stateTable;
   2899     }
   2900 
   2901     /* conversion loop */
   2902     while(source<sourceLimit) {
   2903         entry=stateTable[0][*source++];
   2904         /* MBCS_ENTRY_IS_FINAL(entry) */
   2905 
   2906         /* write back the updated pointer early so that we can return directly */
   2907         pArgs->source=(const char *)source;
   2908 
   2909         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2910             /* output BMP code point */
   2911             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2912         }
   2913 
   2914         /*
   2915          * An if-else-if chain provides more reliable performance for
   2916          * the most common cases compared to a switch.
   2917          */
   2918         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2919         if( action==MBCS_STATE_VALID_DIRECT_20 ||
   2920             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2921         ) {
   2922             /* output supplementary code point */
   2923             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2924         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2925             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2926                 /* output BMP code point */
   2927                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2928             }
   2929         } else if(action==MBCS_STATE_UNASSIGNED) {
   2930             /* just fall through */
   2931         } else if(action==MBCS_STATE_ILLEGAL) {
   2932             /* callback(illegal) */
   2933             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2934         } else {
   2935             /* reserved, must never occur */
   2936             continue;
   2937         }
   2938 
   2939         if(U_FAILURE(*pErrorCode)) {
   2940             /* callback(illegal) */
   2941             break;
   2942         } else /* unassigned sequence */ {
   2943             /* defer to the generic implementation */
   2944             pArgs->source=(const char *)source-1;
   2945             return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2946         }
   2947     }
   2948 
   2949     /* no output because of empty input or only state changes */
   2950     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2951     return 0xffff;
   2952 }
   2953 
   2954 /*
   2955  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
   2956  * conversion without offset handling.
   2957  *
   2958  * When a character does not have a mapping to Unicode, then we return to the
   2959  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
   2960  * handling.
   2961  * We also defer to the generic code in other complicated cases and have them
   2962  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
   2963  *
   2964  * All normal mappings and errors are handled here.
   2965  */
   2966 static UChar32 U_CALLCONV
   2967 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2968                   UErrorCode *pErrorCode) {
   2969     UConverter *cnv;
   2970     const uint8_t *source, *sourceLimit, *lastSource;
   2971 
   2972     const int32_t (*stateTable)[256];
   2973     const uint16_t *unicodeCodeUnits;
   2974 
   2975     uint32_t offset;
   2976     uint8_t state;
   2977 
   2978     int32_t entry;
   2979     UChar32 c;
   2980     uint8_t action;
   2981 
   2982     /* use optimized function if possible */
   2983     cnv=pArgs->converter;
   2984 
   2985     if(cnv->preToULength>0) {
   2986         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
   2987         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2988     }
   2989 
   2990     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
   2991         /*
   2992          * Using the generic ucnv_getNextUChar() code lets us deal correctly
   2993          * with the rare case of a codepage that maps single surrogates
   2994          * without adding the complexity to this already complicated function here.
   2995          */
   2996         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2997     } else if(cnv->sharedData->mbcs.countStates==1) {
   2998         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
   2999     }
   3000 
   3001     /* set up the local pointers */
   3002     source=lastSource=(const uint8_t *)pArgs->source;
   3003     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   3004 
   3005     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3006         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   3007     } else {
   3008         stateTable=cnv->sharedData->mbcs.stateTable;
   3009     }
   3010     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   3011 
   3012     /* get the converter state from UConverter */
   3013     offset=cnv->toUnicodeStatus;
   3014 
   3015     /*
   3016      * if we are in the SBCS state for a DBCS-only converter,
   3017      * then load the DBCS state from the MBCS data
   3018      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   3019      */
   3020     if((state=(uint8_t)(cnv->mode))==0) {
   3021         state=cnv->sharedData->mbcs.dbcsOnlyState;
   3022     }
   3023 
   3024     /* conversion loop */
   3025     c=U_SENTINEL;
   3026     while(source<sourceLimit) {
   3027         entry=stateTable[state][*source++];
   3028         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3029             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3030             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3031 
   3032             /* optimization for 1/2-byte input and BMP output */
   3033             if( source<sourceLimit &&
   3034                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   3035                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   3036                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   3037             ) {
   3038                 ++source;
   3039                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   3040                 /* output BMP code point */
   3041                 break;
   3042             }
   3043         } else {
   3044             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   3045             cnv->mode=state;
   3046 
   3047             /* set the next state early so that we can reuse the entry variable */
   3048             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   3049 
   3050             /*
   3051              * An if-else-if chain provides more reliable performance for
   3052              * the most common cases compared to a switch.
   3053              */
   3054             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3055             if(action==MBCS_STATE_VALID_DIRECT_16) {
   3056                 /* output BMP code point */
   3057                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3058                 break;
   3059             } else if(action==MBCS_STATE_VALID_16) {
   3060                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3061                 c=unicodeCodeUnits[offset];
   3062                 if(c<0xfffe) {
   3063                     /* output BMP code point */
   3064                     break;
   3065                 } else if(c==0xfffe) {
   3066                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   3067                         break;
   3068                     }
   3069                 } else {
   3070                     /* callback(illegal) */
   3071                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3072                 }
   3073             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3074                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3075                 c=unicodeCodeUnits[offset++];
   3076                 if(c<0xd800) {
   3077                     /* output BMP code point below 0xd800 */
   3078                     break;
   3079                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   3080                     /* output roundtrip or fallback supplementary code point */
   3081                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
   3082                     break;
   3083                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3084                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3085                     c=unicodeCodeUnits[offset];
   3086                     break;
   3087                 } else if(c==0xffff) {
   3088                     /* callback(illegal) */
   3089                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3090                 }
   3091             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   3092                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   3093             ) {
   3094                 /* output supplementary code point */
   3095                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   3096                 break;
   3097             } else if(action==MBCS_STATE_CHANGE_ONLY) {
   3098                 /*
   3099                  * This serves as a state change without any output.
   3100                  * It is useful for reading simple stateful encodings,
   3101                  * for example using just Shift-In/Shift-Out codes.
   3102                  * The 21 unused bits may later be used for more sophisticated
   3103                  * state transitions.
   3104                  */
   3105                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
   3106                     /* SI/SO are illegal for DBCS-only conversion */
   3107                     state=(uint8_t)(cnv->mode); /* restore the previous state */
   3108 
   3109                     /* callback(illegal) */
   3110                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3111                 }
   3112             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3113                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   3114                     /* output BMP code point */
   3115                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3116                     break;
   3117                 }
   3118             } else if(action==MBCS_STATE_UNASSIGNED) {
   3119                 /* just fall through */
   3120             } else if(action==MBCS_STATE_ILLEGAL) {
   3121                 /* callback(illegal) */
   3122                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3123             } else {
   3124                 /* reserved (must never occur), or only state change */
   3125                 offset=0;
   3126                 lastSource=source;
   3127                 continue;
   3128             }
   3129 
   3130             /* end of action codes: prepare for a new character */
   3131             offset=0;
   3132 
   3133             if(U_FAILURE(*pErrorCode)) {
   3134                 /* callback(illegal) */
   3135                 break;
   3136             } else /* unassigned sequence */ {
   3137                 /* defer to the generic implementation */
   3138                 cnv->toUnicodeStatus=0;
   3139                 cnv->mode=state;
   3140                 pArgs->source=(const char *)lastSource;
   3141                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   3142             }
   3143         }
   3144     }
   3145 
   3146     if(c<0) {
   3147         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
   3148             /* incomplete character byte sequence */
   3149             uint8_t *bytes=cnv->toUBytes;
   3150             cnv->toULength=(int8_t)(source-lastSource);
   3151             do {
   3152                 *bytes++=*lastSource++;
   3153             } while(lastSource<source);
   3154             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3155         } else if(U_FAILURE(*pErrorCode)) {
   3156             /* callback(illegal) */
   3157             /*
   3158              * Ticket 5691: consistent illegal sequences:
   3159              * - We include at least the first byte in the illegal sequence.
   3160              * - If any of the non-initial bytes could be the start of a character,
   3161              *   we stop the illegal sequence before the first one of those.
   3162              */
   3163             UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   3164             uint8_t *bytes=cnv->toUBytes;
   3165             *bytes++=*lastSource++;     /* first byte */
   3166             if(lastSource==source) {
   3167                 cnv->toULength=1;
   3168             } else /* lastSource<source: multi-byte character */ {
   3169                 int8_t i;
   3170                 for(i=1;
   3171                     lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
   3172                     ++i
   3173                 ) {
   3174                     *bytes++=*lastSource++;
   3175                 }
   3176                 cnv->toULength=i;
   3177                 source=lastSource;
   3178             }
   3179         } else {
   3180             /* no output because of empty input or only state changes */
   3181             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   3182         }
   3183         c=0xffff;
   3184     }
   3185 
   3186     /* set the converter state back into UConverter, ready for a new character */
   3187     cnv->toUnicodeStatus=0;
   3188     cnv->mode=state;
   3189 
   3190     /* write back the updated pointer */
   3191     pArgs->source=(const char *)source;
   3192     return c;
   3193 }
   3194 
   3195 #if 0
   3196 /*
   3197  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3198  * Removal improves code coverage.
   3199  */
   3200 /**
   3201  * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
   3202  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3203  * It does not handle conversion extensions (_extToU()).
   3204  */
   3205 U_CFUNC UChar32
   3206 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
   3207                               uint8_t b, UBool useFallback) {
   3208     int32_t entry;
   3209     uint8_t action;
   3210 
   3211     entry=sharedData->mbcs.stateTable[0][b];
   3212     /* MBCS_ENTRY_IS_FINAL(entry) */
   3213 
   3214     if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   3215         /* output BMP code point */
   3216         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3217     }
   3218 
   3219     /*
   3220      * An if-else-if chain provides more reliable performance for
   3221      * the most common cases compared to a switch.
   3222      */
   3223     action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3224     if(action==MBCS_STATE_VALID_DIRECT_20) {
   3225         /* output supplementary code point */
   3226         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3227     } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3228         if(!TO_U_USE_FALLBACK(useFallback)) {
   3229             return 0xfffe;
   3230         }
   3231         /* output BMP code point */
   3232         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3233     } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3234         if(!TO_U_USE_FALLBACK(useFallback)) {
   3235             return 0xfffe;
   3236         }
   3237         /* output supplementary code point */
   3238         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3239     } else if(action==MBCS_STATE_UNASSIGNED) {
   3240         return 0xfffe;
   3241     } else if(action==MBCS_STATE_ILLEGAL) {
   3242         return 0xffff;
   3243     } else {
   3244         /* reserved, must never occur */
   3245         return 0xffff;
   3246     }
   3247 }
   3248 #endif
   3249 
   3250 /*
   3251  * This is a simple version of _MBCSGetNextUChar() that is used
   3252  * by other converter implementations.
   3253  * It only returns an "assigned" result if it consumes the entire input.
   3254  * It does not use state from the converter, nor error codes.
   3255  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3256  * It handles conversion extensions but not GB 18030.
   3257  *
   3258  * Return value:
   3259  * U+fffe   unassigned
   3260  * U+ffff   illegal
   3261  * otherwise the Unicode code point
   3262  */
   3263 U_CFUNC UChar32
   3264 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
   3265                         const char *source, int32_t length,
   3266                         UBool useFallback) {
   3267     const int32_t (*stateTable)[256];
   3268     const uint16_t *unicodeCodeUnits;
   3269 
   3270     uint32_t offset;
   3271     uint8_t state, action;
   3272 
   3273     UChar32 c;
   3274     int32_t i, entry;
   3275 
   3276     if(length<=0) {
   3277         /* no input at all: "illegal" */
   3278         return 0xffff;
   3279     }
   3280 
   3281 #if 0
   3282 /*
   3283  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3284  * TODO In future releases, verify that this function is never called for SBCS
   3285  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
   3286  * Removal improves code coverage.
   3287  */
   3288     /* use optimized function if possible */
   3289     if(sharedData->mbcs.countStates==1) {
   3290         if(length==1) {
   3291             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
   3292         } else {
   3293             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
   3294         }
   3295     }
   3296 #endif
   3297 
   3298     /* set up the local pointers */
   3299     stateTable=sharedData->mbcs.stateTable;
   3300     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
   3301 
   3302     /* converter state */
   3303     offset=0;
   3304     state=sharedData->mbcs.dbcsOnlyState;
   3305 
   3306     /* conversion loop */
   3307     for(i=0;;) {
   3308         entry=stateTable[state][(uint8_t)source[i++]];
   3309         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3310             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3311             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3312 
   3313             if(i==length) {
   3314                 return 0xffff; /* truncated character */
   3315             }
   3316         } else {
   3317             /*
   3318              * An if-else-if chain provides more reliable performance for
   3319              * the most common cases compared to a switch.
   3320              */
   3321             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3322             if(action==MBCS_STATE_VALID_16) {
   3323                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3324                 c=unicodeCodeUnits[offset];
   3325                 if(c!=0xfffe) {
   3326                     /* done */
   3327                 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   3328                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
   3329                 /* else done with 0xfffe */
   3330                 }
   3331                 break;
   3332             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   3333                 /* output BMP code point */
   3334                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3335                 break;
   3336             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3337                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3338                 c=unicodeCodeUnits[offset++];
   3339                 if(c<0xd800) {
   3340                     /* output BMP code point below 0xd800 */
   3341                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   3342                     /* output roundtrip or fallback supplementary code point */
   3343                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
   3344                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3345                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3346                     c=unicodeCodeUnits[offset];
   3347                 } else if(c==0xffff) {
   3348                     return 0xffff;
   3349                 } else {
   3350                     c=0xfffe;
   3351                 }
   3352                 break;
   3353             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
   3354                 /* output supplementary code point */
   3355                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3356                 break;
   3357             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3358                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3359                     c=0xfffe;
   3360                     break;
   3361                 }
   3362                 /* output BMP code point */
   3363                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3364                 break;
   3365             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3366                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3367                     c=0xfffe;
   3368                     break;
   3369                 }
   3370                 /* output supplementary code point */
   3371                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3372                 break;
   3373             } else if(action==MBCS_STATE_UNASSIGNED) {
   3374                 c=0xfffe;
   3375                 break;
   3376             }
   3377 
   3378             /*
   3379              * forbid MBCS_STATE_CHANGE_ONLY for this function,
   3380              * and MBCS_STATE_ILLEGAL and reserved action codes
   3381              */
   3382             return 0xffff;
   3383         }
   3384     }
   3385 
   3386     if(i!=length) {
   3387         /* illegal for this function: not all input consumed */
   3388         return 0xffff;
   3389     }
   3390 
   3391     if(c==0xfffe) {
   3392         /* try an extension mapping */
   3393         const int32_t *cx=sharedData->mbcs.extIndexes;
   3394         if(cx!=NULL) {
   3395             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
   3396         }
   3397     }
   3398 
   3399     return c;
   3400 }
   3401 
   3402 /* MBCS-from-Unicode conversion functions ----------------------------------- */
   3403 
   3404 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
   3405 static void
   3406 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3407                                   UErrorCode *pErrorCode) {
   3408     UConverter *cnv;
   3409     const UChar *source, *sourceLimit;
   3410     uint8_t *target;
   3411     int32_t targetCapacity;
   3412     int32_t *offsets;
   3413 
   3414     const uint16_t *table;
   3415     const uint16_t *mbcsIndex;
   3416     const uint8_t *bytes;
   3417 
   3418     UChar32 c;
   3419 
   3420     int32_t sourceIndex, nextSourceIndex;
   3421 
   3422     uint32_t stage2Entry;
   3423     uint32_t asciiRoundtrips;
   3424     uint32_t value;
   3425     uint8_t unicodeMask;
   3426 
   3427     /* use optimized function if possible */
   3428     cnv=pArgs->converter;
   3429     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3430 
   3431     /* set up the local pointers */
   3432     source=pArgs->source;
   3433     sourceLimit=pArgs->sourceLimit;
   3434     target=(uint8_t *)pArgs->target;
   3435     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3436     offsets=pArgs->offsets;
   3437 
   3438     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3439     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3440     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3441         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3442     } else {
   3443         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   3444     }
   3445     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3446 
   3447     /* get the converter state from UConverter */
   3448     c=cnv->fromUChar32;
   3449 
   3450     /* sourceIndex=-1 if the current character began in the previous buffer */
   3451     sourceIndex= c==0 ? 0 : -1;
   3452     nextSourceIndex=0;
   3453 
   3454     /* conversion loop */
   3455     if(c!=0 && targetCapacity>0) {
   3456         goto getTrail;
   3457     }
   3458 
   3459     while(source<sourceLimit) {
   3460         /*
   3461          * This following test is to see if available input would overflow the output.
   3462          * It does not catch output of more than one byte that
   3463          * overflows as a result of a multi-byte character or callback output
   3464          * from the last source character.
   3465          * Therefore, those situations also test for overflows and will
   3466          * then break the loop, too.
   3467          */
   3468         if(targetCapacity>0) {
   3469             /*
   3470              * Get a correct Unicode code point:
   3471              * a single UChar for a BMP code point or
   3472              * a matched surrogate pair for a "supplementary code point".
   3473              */
   3474             c=*source++;
   3475             ++nextSourceIndex;
   3476             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3477                 *target++=(uint8_t)c;
   3478                 if(offsets!=NULL) {
   3479                     *offsets++=sourceIndex;
   3480                     sourceIndex=nextSourceIndex;
   3481                 }
   3482                 --targetCapacity;
   3483                 c=0;
   3484                 continue;
   3485             }
   3486             /*
   3487              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   3488              * to avoid dealing with surrogates.
   3489              * MBCS_FAST_MAX must be >=0xd7ff.
   3490              */
   3491             if(c<=0xd7ff) {
   3492                 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
   3493                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   3494                 if(value==0) {
   3495                     goto unassigned;
   3496                 }
   3497                 /* output the value */
   3498             } else {
   3499                 /*
   3500                  * This also tests if the codepage maps single surrogates.
   3501                  * If it does, then surrogates are not paired but mapped separately.
   3502                  * Note that in this case unmatched surrogates are not detected.
   3503                  */
   3504                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3505                     if(U16_IS_SURROGATE_LEAD(c)) {
   3506 getTrail:
   3507                         if(source<sourceLimit) {
   3508                             /* test the following code unit */
   3509                             UChar trail=*source;
   3510                             if(U16_IS_TRAIL(trail)) {
   3511                                 ++source;
   3512                                 ++nextSourceIndex;
   3513                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   3514                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3515                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3516                                     /* callback(unassigned) */
   3517                                     goto unassigned;
   3518                                 }
   3519                                 /* convert this supplementary code point */
   3520                                 /* exit this condition tree */
   3521                             } else {
   3522                                 /* this is an unmatched lead code unit (1st surrogate) */
   3523                                 /* callback(illegal) */
   3524                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3525                                 break;
   3526                             }
   3527                         } else {
   3528                             /* no more input */
   3529                             break;
   3530                         }
   3531                     } else {
   3532                         /* this is an unmatched trail code unit (2nd surrogate) */
   3533                         /* callback(illegal) */
   3534                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3535                         break;
   3536                     }
   3537                 }
   3538 
   3539                 /* convert the Unicode code point in c into codepage bytes */
   3540                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   3541 
   3542                 /* get the bytes and the length for the output */
   3543                 /* MBCS_OUTPUT_2 */
   3544                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   3545 
   3546                 /* is this code point assigned, or do we use fallbacks? */
   3547                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   3548                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   3549                 ) {
   3550                     /*
   3551                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   3552                      * There is no way with this data structure for fallback output
   3553                      * to be a zero byte.
   3554                      */
   3555 
   3556 unassigned:
   3557                     /* try an extension mapping */
   3558                     pArgs->source=source;
   3559                     c=_extFromU(cnv, cnv->sharedData,
   3560                                 c, &source, sourceLimit,
   3561                                 &target, target+targetCapacity,
   3562                                 &offsets, sourceIndex,
   3563                                 pArgs->flush,
   3564                                 pErrorCode);
   3565                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   3566 
   3567                     if(U_FAILURE(*pErrorCode)) {
   3568                         /* not mappable or buffer overflow */
   3569                         break;
   3570                     } else {
   3571                         /* a mapping was written to the target, continue */
   3572 
   3573                         /* recalculate the targetCapacity after an extension mapping */
   3574                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3575 
   3576                         /* normal end of conversion: prepare for a new character */
   3577                         sourceIndex=nextSourceIndex;
   3578                         continue;
   3579                     }
   3580                 }
   3581             }
   3582 
   3583             /* write the output character bytes from value and length */
   3584             /* from the first if in the loop we know that targetCapacity>0 */
   3585             if(value<=0xff) {
   3586                 /* this is easy because we know that there is enough space */
   3587                 *target++=(uint8_t)value;
   3588                 if(offsets!=NULL) {
   3589                     *offsets++=sourceIndex;
   3590                 }
   3591                 --targetCapacity;
   3592             } else /* length==2 */ {
   3593                 *target++=(uint8_t)(value>>8);
   3594                 if(2<=targetCapacity) {
   3595                     *target++=(uint8_t)value;
   3596                     if(offsets!=NULL) {
   3597                         *offsets++=sourceIndex;
   3598                         *offsets++=sourceIndex;
   3599                     }
   3600                     targetCapacity-=2;
   3601                 } else {
   3602                     if(offsets!=NULL) {
   3603                         *offsets++=sourceIndex;
   3604                     }
   3605                     cnv->charErrorBuffer[0]=(char)value;
   3606                     cnv->charErrorBufferLength=1;
   3607 
   3608                     /* target overflow */
   3609                     targetCapacity=0;
   3610                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3611                     c=0;
   3612                     break;
   3613                 }
   3614             }
   3615 
   3616             /* normal end of conversion: prepare for a new character */
   3617             c=0;
   3618             sourceIndex=nextSourceIndex;
   3619             continue;
   3620         } else {
   3621             /* target is full */
   3622             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3623             break;
   3624         }
   3625     }
   3626 
   3627     /* set the converter state back into UConverter */
   3628     cnv->fromUChar32=c;
   3629 
   3630     /* write back the updated pointers */
   3631     pArgs->source=source;
   3632     pArgs->target=(char *)target;
   3633     pArgs->offsets=offsets;
   3634 }
   3635 
   3636 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
        /*
         * Reads source, target and offsets from pArgs and writes the advanced
         * pointers back into pArgs before returning.
         * Each code point whose table result is >= minValue produces exactly one
         * output byte; every other code point is handed to _extFromU().
         * Surrogate pairs are combined into one supplementary code point first.
         * When the input ends right after a lead surrogate, that code unit stays in c
         * and is saved in cnv->fromUChar32, so that a later call can resume at getTrail.
         *
         * *pErrorCode is set here to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate
         * and to U_BUFFER_OVERFLOW_ERROR when input remains but no target capacity
         * is left; _extFromU() also receives pErrorCode, and a failure reported
         * through it ends the loop.
         */
   3637 static void
   3638 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3639                                   UErrorCode *pErrorCode) {
   3640     UConverter *cnv;
   3641     const UChar *source, *sourceLimit;
   3642     uint8_t *target;
   3643     int32_t targetCapacity;
   3644     int32_t *offsets;
   3645 
   3646     const uint16_t *table;
   3647     const uint16_t *results;
   3648 
            /* current code point; non-zero on entry if the previous call saved a code unit */
   3649     UChar32 c;
   3650 
            /* sourceIndex is what gets written to offsets; nextSourceIndex counts consumed code units */
   3651     int32_t sourceIndex, nextSourceIndex;
   3652 
            /* value: table result for c; minValue: smallest result that is written as a mapping */
   3653     uint16_t value, minValue;
   3654     UBool hasSupplementary;
   3655 
   3656     /* set up the local pointers */
   3657     cnv=pArgs->converter;
   3658     source=pArgs->source;
   3659     sourceLimit=pArgs->sourceLimit;
   3660     target=(uint8_t *)pArgs->target;
   3661     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3662     offsets=pArgs->offsets;
   3663 
   3664     table=cnv->sharedData->mbcs.fromUnicodeTable;
            /* choose the result array: the swapLFNL variant if UCNV_OPTION_SWAP_LFNL is set */
   3665     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3666         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3667     } else {
   3668         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3669     }
   3670 
   3671     if(cnv->useFallback) {
   3672         /* use all roundtrip and fallback results */
   3673         minValue=0x800;
   3674     } else {
   3675         /* use only roundtrips and fallbacks from private-use characters */
   3676         minValue=0xc00;
   3677     }
            /* remember whether the unicodeMask has the UCNV_HAS_SUPPLEMENTARY bit */
   3678     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   3679 
   3680     /* get the converter state from UConverter */
   3681     c=cnv->fromUChar32;
   3682 
   3683     /* sourceIndex=-1 if the current character began in the previous buffer */
   3684     sourceIndex= c==0 ? 0 : -1;
   3685     nextSourceIndex=0;
   3686 
   3687     /* conversion loop */
            /*
             * c!=0: a code unit was saved by the previous call (see the "no more input"
             * break below); jump into the loop body to look for its trail surrogate.
             * Without target capacity the jump is skipped and the loop below reports
             * the overflow if there is input.
             */
   3688     if(c!=0 && targetCapacity>0) {
   3689         goto getTrail;
   3690     }
   3691 
   3692     while(source<sourceLimit) {
   3693         /*
   3694          * This following test is to see if available input would overflow the output.
   3695          * It does not catch output of more than one byte that
   3696          * overflows as a result of a multi-byte character or callback output
   3697          * from the last source character.
   3698          * Therefore, those situations also test for overflows and will
   3699          * then break the loop, too.
   3700          */
   3701         if(targetCapacity>0) {
   3702             /*
   3703              * Get a correct Unicode code point:
   3704              * a single UChar for a BMP code point or
   3705              * a matched surrogate pair for a "supplementary code point".
   3706              */
   3707             c=*source++;
   3708             ++nextSourceIndex;
   3709             if(U16_IS_SURROGATE(c)) {
   3710                 if(U16_IS_SURROGATE_LEAD(c)) {
   3711 getTrail:
   3712                     if(source<sourceLimit) {
   3713                         /* test the following code unit */
   3714                         UChar trail=*source;
   3715                         if(U16_IS_TRAIL(trail)) {
   3716                             ++source;
   3717                             ++nextSourceIndex;
   3718                             c=U16_GET_SUPPLEMENTARY(c, trail);
   3719                             if(!hasSupplementary) {
   3720                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3721                                 /* callback(unassigned) */
                                        /* skip the table lookup and go straight to the extension mapping */
   3722                                 goto unassigned;
   3723                             }
   3724                             /* convert this supplementary code point */
   3725                             /* exit this condition tree */
   3726                         } else {
   3727                             /* this is an unmatched lead code unit (1st surrogate) */
   3728                             /* callback(illegal) */
   3729                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3730                             break;
   3731                         }
   3732                     } else {
   3733                         /* no more input */
                                /* c keeps the lead surrogate and is stored in cnv->fromUChar32 after the loop */
   3734                         break;
   3735                     }
   3736                 } else {
   3737                     /* this is an unmatched trail code unit (2nd surrogate) */
   3738                     /* callback(illegal) */
   3739                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3740                     break;
   3741                 }
   3742             }
   3743 
   3744             /* convert the Unicode code point in c into codepage bytes */
   3745             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3746 
   3747             /* is this code point assigned, or do we use fallbacks? */
   3748             if(value>=minValue) {
   3749                 /* assigned, write the output character bytes from value and length */
   3750                 /* length==1 */
   3751                 /* this is easy because we know that there is enough space */
   3752                 *target++=(uint8_t)value;
   3753                 if(offsets!=NULL) {
   3754                     *offsets++=sourceIndex;
   3755                 }
   3756                 --targetCapacity;
   3757 
   3758                 /* normal end of conversion: prepare for a new character */
   3759                 c=0;
   3760                 sourceIndex=nextSourceIndex;
   3761             } else { /* unassigned */
   3762 unassigned:
   3763                 /* try an extension mapping */
                        /*
                         * pArgs->source records the position before the call, so that the
                         * code units consumed by _extFromU() can be added to nextSourceIndex.
                         * c receives the return value of _extFromU().
                         */
   3764                 pArgs->source=source;
   3765                 c=_extFromU(cnv, cnv->sharedData,
   3766                             c, &source, sourceLimit,
   3767                             &target, target+targetCapacity,
   3768                             &offsets, sourceIndex,
   3769                             pArgs->flush,
   3770                             pErrorCode);
   3771                 nextSourceIndex+=(int32_t)(source-pArgs->source);
   3772 
   3773                 if(U_FAILURE(*pErrorCode)) {
   3774                     /* not mappable or buffer overflow */
   3775                     break;
   3776                 } else {
   3777                     /* a mapping was written to the target, continue */
   3778 
   3779                     /* recalculate the targetCapacity after an extension mapping */
   3780                     targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3781 
   3782                     /* normal end of conversion: prepare for a new character */
   3783                     sourceIndex=nextSourceIndex;
   3784                 }
   3785             }
   3786         } else {
   3787             /* target is full */
   3788             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3789             break;
   3790         }
   3791     }
   3792 
   3793     /* set the converter state back into UConverter */
   3794     cnv->fromUChar32=c;
   3795 
   3796     /* write back the updated pointers */
   3797     pArgs->source=source;
   3798     pArgs->target=(char *)target;
   3799     pArgs->offsets=offsets;
   3800 }
   3801 
   3802 /*
   3803  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
   3804  * that map only to and from the BMP.
   3805  * In addition to single-byte/state optimizations, the offset calculations
   3806  * become much easier.
   3807  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
   3808  * but measurements have shown that this diminishes performance
   3809  * in more cases than it improves it.
   3810  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
   3811  * for various MBCS and SBCS optimizations.
   3812  */
        /*
         * Implementation notes:
         * - targetCapacity is reduced to the smaller of the source length and the
         *   real target capacity, so the main loop only counts targetCapacity down
         *   and does not compare source with sourceLimit for every code unit.
         * - Offsets are not written as each byte is produced. They are filled in
         *   as runs of consecutive sourceIndex values, counted from lastSource,
         *   before each _extFromU() call and once more after the loop
         *   (and in the unrolled section when it is compiled in).
         * - Inside the loop, a code unit is tested for being a surrogate only after
         *   its table result turned out to be below minValue.
         * - With pArgs->flush set, a lead surrogate at the end of the input sets
         *   U_TRUNCATED_CHAR_FOUND; an unmatched surrogate sets U_ILLEGAL_CHAR_FOUND.
         * - The last value of c is stored in cnv->fromUChar32, and the advanced
         *   source, target and offsets pointers are written back into pArgs.
         */
   3813 static void
   3814 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3815                               UErrorCode *pErrorCode) {
   3816     UConverter *cnv;
   3817     const UChar *source, *sourceLimit, *lastSource;
   3818     uint8_t *target;
   3819     int32_t targetCapacity, length;
   3820     int32_t *offsets;
   3821 
   3822     const uint16_t *table;
   3823     const uint16_t *results;
   3824 
   3825     UChar32 c;
   3826 
   3827     int32_t sourceIndex;
   3828 
   3829     uint32_t asciiRoundtrips;
            /* value: table result for c; minValue: smallest result that is written as a mapping */
   3830     uint16_t value, minValue;
   3831 
   3832     /* set up the local pointers */
   3833     cnv=pArgs->converter;
   3834     source=pArgs->source;
   3835     sourceLimit=pArgs->sourceLimit;
   3836     target=(uint8_t *)pArgs->target;
   3837     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3838     offsets=pArgs->offsets;
   3839 
   3840     table=cnv->sharedData->mbcs.fromUnicodeTable;
            /* choose the result array: the swapLFNL variant if UCNV_OPTION_SWAP_LFNL is set */
   3841     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3842         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3843     } else {
   3844         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3845     }
   3846     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3847 
   3848     if(cnv->useFallback) {
   3849         /* use all roundtrip and fallback results */
   3850         minValue=0x800;
   3851     } else {
   3852         /* use only roundtrips and fallbacks from private-use characters */
   3853         minValue=0xc00;
   3854     }
   3855 
   3856     /* get the converter state from UConverter */
   3857     c=cnv->fromUChar32;
   3858 
   3859     /* sourceIndex=-1 if the current character began in the previous buffer */
   3860     sourceIndex= c==0 ? 0 : -1;
            /* lastSource: start of the source code units whose offsets have not been written yet */
   3861     lastSource=source;
   3862 
   3863     /*
   3864      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   3865      * for the minimum of the sourceLength and targetCapacity
   3866      */
   3867     length=(int32_t)(sourceLimit-source);
   3868     if(length<targetCapacity) {
   3869         targetCapacity=length;
   3870     }
   3871 
   3872     /* conversion loop */
            /*
             * c!=0: a code unit was saved in cnv->fromUChar32 by an earlier call;
             * jump into the loop body to look for its trail surrogate.
             * The jump is skipped when the (clamped) targetCapacity is 0.
             */
   3873     if(c!=0 && targetCapacity>0) {
   3874         goto getTrail;
   3875     }
   3876 
   3877 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3878     /* unrolling makes it slower on Pentium III/Windows 2000?! */
   3879     /* unroll the loop with the most common case */
   3880 unrolled:
   3881     if(targetCapacity>=4) {
   3882         int32_t count, loops;
   3883         uint16_t andedValues;
   3884 
                /*
                 * Each pass writes four bytes unconditionally and ANDs the four table
                 * results together; the check at the end of the pass rewinds source
                 * and target by four if the combined value is below minValue.
                 */
   3885         loops=count=targetCapacity>>2;
   3886         do {
   3887             c=*source++;
   3888             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3889             *target++=(uint8_t)value;
   3890             c=*source++;
   3891             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3892             *target++=(uint8_t)value;
   3893             c=*source++;
   3894             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3895             *target++=(uint8_t)value;
   3896             c=*source++;
   3897             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3898             *target++=(uint8_t)value;
   3899 
   3900             /* were all 4 entries really valid? */
   3901             if(andedValues<minValue) {
   3902                 /* no, return to the first of these 4 */
   3903                 source-=4;
   3904                 target-=4;
   3905                 break;
   3906             }
   3907         } while(--count>0);
                /* count becomes the number of passes that were kept (not rewound) */
   3908         count=loops-count;
   3909         targetCapacity-=4*count;
   3910 
   3911         if(offsets!=NULL) {
                    /* write the offsets for the kept passes now and move lastSource past them */
   3912             lastSource+=4*count;
   3913             while(count>0) {
   3914                 *offsets++=sourceIndex++;
   3915                 *offsets++=sourceIndex++;
   3916                 *offsets++=sourceIndex++;
   3917                 *offsets++=sourceIndex++;
   3918                 --count;
   3919             }
   3920         }
   3921 
   3922         c=0;
   3923     }
   3924 #endif
   3925 
   3926     while(targetCapacity>0) {
   3927         /*
   3928          * Get a correct Unicode code point:
   3929          * a single UChar for a BMP code point or
   3930          * a matched surrogate pair for a "supplementary code point".
   3931          */
   3932         c=*source++;
   3933         /*
   3934          * Do not immediately check for single surrogates:
   3935          * Assume that they are unassigned and check for them in that case.
   3936          * This speeds up the conversion of assigned characters.
   3937          */
   3938         /* convert the Unicode code point in c into codepage bytes */
                /* fast path: c<=0x7f with IS_ASCII_ROUNDTRIP() true is written as the byte c, without a table lookup */
   3939         if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3940             *target++=(uint8_t)c;
   3941             --targetCapacity;
   3942             c=0;
   3943             continue;
   3944         }
   3945         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3946         /* is this code point assigned, or do we use fallbacks? */
   3947         if(value>=minValue) {
   3948             /* assigned, write the output character bytes from value and length */
   3949             /* length==1 */
   3950             /* this is easy because we know that there is enough space */
   3951             *target++=(uint8_t)value;
   3952             --targetCapacity;
   3953 
   3954             /* normal end of conversion: prepare for a new character */
   3955             c=0;
   3956             continue;
   3957         } else if(!U16_IS_SURROGATE(c)) {
   3958             /* normal, unassigned BMP character */
   3959         } else if(U16_IS_SURROGATE_LEAD(c)) {
   3960 getTrail:
   3961             if(source<sourceLimit) {
   3962                 /* test the following code unit */
   3963                 UChar trail=*source;
   3964                 if(U16_IS_TRAIL(trail)) {
   3965                     ++source;
   3966                     c=U16_GET_SUPPLEMENTARY(c, trail);
   3967                     /* this codepage does not map supplementary code points */
   3968                     /* callback(unassigned) */
                            /* falls through to the "c does not have a mapping" code below */
   3969                 } else {
   3970                     /* this is an unmatched lead code unit (1st surrogate) */
   3971                     /* callback(illegal) */
   3972                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3973                     break;
   3974                 }
   3975             } else {
   3976                 /* no more input */
                        /* c keeps the lead surrogate and is stored in cnv->fromUChar32 after the loop */
   3977                 if (pArgs->flush) {
   3978                     *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3979                 }
   3980                 break;
   3981             }
   3982         } else {
   3983             /* this is an unmatched trail code unit (2nd surrogate) */
   3984             /* callback(illegal) */
   3985             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3986             break;
   3987         }
   3988 
   3989         /* c does not have a mapping */
   3990 
   3991         /* get the number of code units for c to correctly advance sourceIndex */
   3992         length=U16_LENGTH(c);
   3993 
   3994         /* set offsets since the start or the last extension */
   3995         if(offsets!=NULL) {
   3996             int32_t count=(int32_t)(source-lastSource);
   3997 
   3998             /* do not set the offset for this character */
   3999             count-=length;
   4000 
   4001             while(count>0) {
   4002                 *offsets++=sourceIndex++;
   4003                 --count;
   4004             }
   4005             /* offsets and sourceIndex are now set for the current character */
   4006         }
   4007 
   4008         /* try an extension mapping */
                /*
                 * The real pArgs->targetLimit is passed here, not target+targetCapacity,
                 * because targetCapacity may have been reduced to the source length.
                 * lastSource marks the position before the call so that the code units
                 * consumed by _extFromU() can be added to sourceIndex.
                 */
   4009         lastSource=source;
   4010         c=_extFromU(cnv, cnv->sharedData,
   4011                     c, &source, sourceLimit,
   4012                     &target, (const uint8_t *)(pArgs->targetLimit),
   4013                     &offsets, sourceIndex,
   4014                     pArgs->flush,
   4015                     pErrorCode);
   4016         sourceIndex+=length+(int32_t)(source-lastSource);
   4017         lastSource=source;
   4018 
   4019         if(U_FAILURE(*pErrorCode)) {
   4020             /* not mappable or buffer overflow */
   4021             break;
   4022         } else {
   4023             /* a mapping was written to the target, continue */
   4024 
   4025             /* recalculate the targetCapacity after an extension mapping */
   4026             targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
                    /* clamp again to the remaining source length, as before the loop */
   4027             length=(int32_t)(sourceLimit-source);
   4028             if(length<targetCapacity) {
   4029                 targetCapacity=length;
   4030             }
   4031         }
   4032 
   4033 #if MBCS_UNROLL_SINGLE_FROM_BMP
   4034         /* unrolling makes it slower on Pentium III/Windows 2000?! */
                /* go back to the unrolled section with the recalculated targetCapacity */
   4035         goto unrolled;
   4036 #endif
   4037     }
   4038 
            /*
             * targetCapacity==0 can also mean that the source is used up (it was clamped
             * to the source length), so the overflow test compares target with the real limit.
             */
   4039     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
   4040         /* target is full */
   4041         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4042     }
   4043 
   4044     /* set offsets since the start or the last callback */
   4045     if(offsets!=NULL) {
   4046         size_t count=source-lastSource;
   4047         if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
   4048             /*
   4049             Caller gave us a partial supplementary character,
   4050             which this function couldn't convert in any case.
   4051             The callback will handle the offset.
   4052             */
   4053             count--;
   4054         }
   4055         while(count>0) {
   4056             *offsets++=sourceIndex++;
   4057             --count;
   4058         }
   4059     }
   4060 
   4061     /* set the converter state back into UConverter */
   4062     cnv->fromUChar32=c;
   4063 
   4064     /* write back the updated pointers */
   4065     pArgs->source=source;
   4066     pArgs->target=(char *)target;
   4067     pArgs->offsets=offsets;
   4068 }
   4069 
   4070 U_CFUNC void
   4071 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   4072                             UErrorCode *pErrorCode) {
   4073     UConverter *cnv;
   4074     const UChar *source, *sourceLimit;
   4075     uint8_t *target;
   4076     int32_t targetCapacity;
   4077     int32_t *offsets;
   4078 
   4079     const uint16_t *table;
   4080     const uint16_t *mbcsIndex;
   4081     const uint8_t *p, *bytes;
   4082     uint8_t outputType;
   4083 
   4084     UChar32 c;
   4085 
   4086     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
   4087 
   4088     uint32_t stage2Entry;
   4089     uint32_t asciiRoundtrips;
   4090     uint32_t value;
   4091     /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
   4092     uint8_t siBytes[2] = {0, 0};
   4093     uint8_t soBytes[2] = {0, 0};
   4094     uint8_t siLength, soLength;
   4095     int32_t length = 0, prevLength;
   4096     uint8_t unicodeMask;
   4097 
   4098     cnv=pArgs->converter;
   4099 
   4100     if(cnv->preFromUFirstCP>=0) {
   4101         /*
   4102          * pass sourceIndex=-1 because we continue from an earlier buffer
   4103          * in the future, this may change with continuous offsets
   4104          */
   4105         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
   4106 
   4107         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
   4108             return;
   4109         }
   4110     }
   4111 
   4112     /* use optimized function if possible */
   4113     outputType=cnv->sharedData->mbcs.outputType;
   4114     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   4115     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4116         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4117             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
   4118         } else {
   4119             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
   4120         }
   4121         return;
   4122     } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
   4123         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
   4124         return;
   4125     }
   4126 
   4127     /* set up the local pointers */
   4128     source=pArgs->source;
   4129     sourceLimit=pArgs->sourceLimit;
   4130     target=(uint8_t *)pArgs->target;
   4131     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   4132     offsets=pArgs->offsets;
   4133 
   4134     table=cnv->sharedData->mbcs.fromUnicodeTable;
   4135     if(cnv->sharedData->mbcs.utf8Friendly) {
   4136         mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   4137     } else {
   4138         mbcsIndex=NULL;
   4139     }
   4140     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   4141         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   4142     } else {
   4143         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   4144     }
   4145     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   4146 
   4147     /* get the converter state from UConverter */
   4148     c=cnv->fromUChar32;
   4149 
   4150     if(outputType==MBCS_OUTPUT_2_SISO) {
   4151         prevLength=cnv->fromUnicodeStatus;
   4152         if(prevLength==0) {
   4153             /* set the real value */
   4154             prevLength=1;
   4155         }
   4156     } else {
   4157         /* prevent fromUnicodeStatus from being set to something non-0 */
   4158         prevLength=0;
   4159     }
   4160 
   4161     /* sourceIndex=-1 if the current character began in the previous buffer */
   4162     prevSourceIndex=-1;
   4163     sourceIndex= c==0 ? 0 : -1;
   4164     nextSourceIndex=0;
   4165 
   4166     /* Get the SI/SO character for the converter */
   4167     siLength = getSISOBytes(SI, cnv->options, siBytes);
   4168     soLength = getSISOBytes(SO, cnv->options, soBytes);
   4169 
   4170     /* conversion loop */
   4171     /*
   4172      * This is another piece of ugly code:
   4173      * A goto into the loop if the converter state contains a first surrogate
   4174      * from the previous function call.
   4175      * It saves me to check in each loop iteration a check of if(c==0)
   4176      * and duplicating the trail-surrogate-handling code in the else
   4177      * branch of that check.
   4178      * I could not find any other way to get around this other than
   4179      * using a function call for the conversion and callback, which would
   4180      * be even more inefficient.
   4181      *
   4182      * Markus Scherer 2000-jul-19
   4183      */
   4184     if(c!=0 && targetCapacity>0) {
   4185         goto getTrail;
   4186     }
   4187 
   4188     while(source<sourceLimit) {
   4189         /*
   4190          * This following test is to see if available input would overflow the output.
   4191          * It does not catch output of more than one byte that
   4192          * overflows as a result of a multi-byte character or callback output
   4193          * from the last source character.
   4194          * Therefore, those situations also test for overflows and will
   4195          * then break the loop, too.
   4196          */
   4197         if(targetCapacity>0) {
   4198             /*
   4199              * Get a correct Unicode code point:
   4200              * a single UChar for a BMP code point or
   4201              * a matched surrogate pair for a "supplementary code point".
   4202              */
   4203             c=*source++;
   4204             ++nextSourceIndex;
   4205             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   4206                 *target++=(uint8_t)c;
   4207                 if(offsets!=NULL) {
   4208                     *offsets++=sourceIndex;
   4209                     prevSourceIndex=sourceIndex;
   4210                     sourceIndex=nextSourceIndex;
   4211                 }
   4212                 --targetCapacity;
   4213                 c=0;
   4214                 continue;
   4215             }
   4216             /*
   4217              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   4218              * to avoid dealing with surrogates.
   4219              * MBCS_FAST_MAX must be >=0xd7ff.
   4220              */
   4221             if(c<=0xd7ff && mbcsIndex!=NULL) {
   4222                 value=mbcsIndex[c>>6];
   4223 
   4224                 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
   4225                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   4226                 switch(outputType) {
   4227                 case MBCS_OUTPUT_2:
   4228                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4229                     if(value<=0xff) {
   4230                         if(value==0) {
   4231                             goto unassigned;
   4232                         } else {
   4233                             length=1;
   4234                         }
   4235                     } else {
   4236                         length=2;
   4237                     }
   4238                     break;
   4239                 case MBCS_OUTPUT_2_SISO:
   4240                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4241                     /*
   4242                      * Save the old state in the converter object
   4243                      * right here, then change the local prevLength state variable if necessary.
   4244                      * Then, if this character turns out to be unassigned or a fallback that
   4245                      * is not taken, the callback code must not save the new state in the converter
   4246                      * because the new state is for a character that is not output.
   4247                      * However, the callback must still restore the state from the converter
   4248                      * in case the callback function changed it for its output.
   4249                      */
   4250                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4251                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4252                     if(value<=0xff) {
   4253                         if(value==0) {
   4254                             goto unassigned;
   4255                         } else if(prevLength<=1) {
   4256                             length=1;
   4257                         } else {
   4258                             /* change from double-byte mode to single-byte */
   4259                             if (siLength == 1) {
   4260                                 value|=(uint32_t)siBytes[0]<<8;
   4261                                 length = 2;
   4262                             } else if (siLength == 2) {
   4263                                 value|=(uint32_t)siBytes[1]<<8;
   4264                                 value|=(uint32_t)siBytes[0]<<16;
   4265                                 length = 3;
   4266                             }
   4267                             prevLength=1;
   4268                         }
   4269                     } else {
   4270                         if(prevLength==2) {
   4271                             length=2;
   4272                         } else {
   4273                             /* change from single-byte mode to double-byte */
   4274                             if (soLength == 1) {
   4275                                 value|=(uint32_t)soBytes[0]<<16;
   4276                                 length = 3;
   4277                             } else if (soLength == 2) {
   4278                                 value|=(uint32_t)soBytes[1]<<16;
   4279                                 value|=(uint32_t)soBytes[0]<<24;
   4280                                 length = 4;
   4281                             }
   4282                             prevLength=2;
   4283                         }
   4284                     }
   4285                     break;
   4286                 case MBCS_OUTPUT_DBCS_ONLY:
   4287                     /* table with single-byte results, but only DBCS mappings used */
   4288                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4289                     if(value<=0xff) {
   4290                         /* no mapping or SBCS result, not taken for DBCS-only */
   4291                         goto unassigned;
   4292                     } else {
   4293                         length=2;
   4294                     }
   4295                     break;
   4296                 case MBCS_OUTPUT_3:
   4297                     p=bytes+(value+(c&0x3f))*3;
   4298                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4299                     if(value<=0xff) {
   4300                         if(value==0) {
   4301                             goto unassigned;
   4302                         } else {
   4303                             length=1;
   4304                         }
   4305                     } else if(value<=0xffff) {
   4306                         length=2;
   4307                     } else {
   4308                         length=3;
   4309                     }
   4310                     break;
   4311                 case MBCS_OUTPUT_4:
   4312                     value=((const uint32_t *)bytes)[value +(c&0x3f)];
   4313                     if(value<=0xff) {
   4314                         if(value==0) {
   4315                             goto unassigned;
   4316                         } else {
   4317                             length=1;
   4318                         }
   4319                     } else if(value<=0xffff) {
   4320                         length=2;
   4321                     } else if(value<=0xffffff) {
   4322                         length=3;
   4323                     } else {
   4324                         length=4;
   4325                     }
   4326                     break;
   4327                 case MBCS_OUTPUT_3_EUC:
   4328                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4329                     /* EUC 16-bit fixed-length representation */
   4330                     if(value<=0xff) {
   4331                         if(value==0) {
   4332                             goto unassigned;
   4333                         } else {
   4334                             length=1;
   4335                         }
   4336                     } else if((value&0x8000)==0) {
   4337                         value|=0x8e8000;
   4338                         length=3;
   4339                     } else if((value&0x80)==0) {
   4340                         value|=0x8f0080;
   4341                         length=3;
   4342                     } else {
   4343                         length=2;
   4344                     }
   4345                     break;
   4346                 case MBCS_OUTPUT_4_EUC:
   4347                     p=bytes+(value+(c&0x3f))*3;
   4348                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4349                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4350                     if(value<=0xff) {
   4351                         if(value==0) {
   4352                             goto unassigned;
   4353                         } else {
   4354                             length=1;
   4355                         }
   4356                     } else if(value<=0xffff) {
   4357                         length=2;
   4358                     } else if((value&0x800000)==0) {
   4359                         value|=0x8e800000;
   4360                         length=4;
   4361                     } else if((value&0x8000)==0) {
   4362                         value|=0x8f008000;
   4363                         length=4;
   4364                     } else {
   4365                         length=3;
   4366                     }
   4367                     break;
   4368                 default:
   4369                     /* must not occur */
   4370                     /*
   4371                      * To avoid compiler warnings that value & length may be
   4372                      * used without having been initialized, we set them here.
   4373                      * In reality, this is unreachable code.
   4374                      * Not having a default branch also causes warnings with
   4375                      * some compilers.
   4376                      */
   4377                     value=0;
   4378                     length=0;
   4379                     break;
   4380                 }
   4381                 /* output the value */
   4382             } else {
   4383                 /*
   4384                  * This also tests if the codepage maps single surrogates.
   4385                  * If it does, then surrogates are not paired but mapped separately.
   4386                  * Note that in this case unmatched surrogates are not detected.
   4387                  */
   4388                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4389                     if(U16_IS_SURROGATE_LEAD(c)) {
   4390 getTrail:
   4391                         if(source<sourceLimit) {
   4392                             /* test the following code unit */
   4393                             UChar trail=*source;
   4394                             if(U16_IS_TRAIL(trail)) {
   4395                                 ++source;
   4396                                 ++nextSourceIndex;
   4397                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   4398                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4399                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4400                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4401                                     /* callback(unassigned) */
   4402                                     goto unassigned;
   4403                                 }
   4404                                 /* convert this supplementary code point */
   4405                                 /* exit this condition tree */
   4406                             } else {
   4407                                 /* this is an unmatched lead code unit (1st surrogate) */
   4408                                 /* callback(illegal) */
   4409                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4410                                 break;
   4411                             }
   4412                         } else {
   4413                             /* no more input */
   4414                             break;
   4415                         }
   4416                     } else {
   4417                         /* this is an unmatched trail code unit (2nd surrogate) */
   4418                         /* callback(illegal) */
   4419                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4420                         break;
   4421                     }
   4422                 }
   4423 
   4424                 /* convert the Unicode code point in c into codepage bytes */
   4425 
   4426                 /*
   4427                  * The basic lookup is a triple-stage compact array (trie) lookup.
   4428                  * For details see the beginning of this file.
   4429                  *
   4430                  * Single-byte codepages are handled with a different data structure
   4431                  * by _MBCSSingle... functions.
   4432                  *
   4433                  * The result consists of a 32-bit value from stage 2 and
   4434                  * a pointer to as many bytes as are stored per character.
   4435                  * The pointer points to the character's bytes in stage 3.
   4436                  * Bits 15..0 of the stage 2 entry contain the stage 3 index
   4437                  * for that pointer, while bits 31..16 are flags for which of
   4438                  * the 16 characters in the block are roundtrip-assigned.
   4439                  *
   4440                  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
   4441                  * respectively as uint32_t, in the platform encoding.
   4442                  * For 3-byte codepages, the bytes are always stored in big-endian order.
   4443                  *
   4444                  * For EUC encodings that use only either 0x8e or 0x8f as the first
   4445                  * byte of their longest byte sequences, the first two bytes in
   4446                  * this third stage indicate with their 7th bits whether these bytes
   4447                  * are to be written directly or actually need to be preceded by
   4448                  * one of the two Single-Shift codes. With this, the third stage
   4449                  * stores one byte fewer per character than the actual maximum length of
   4450                  * EUC byte sequences.
   4451                  *
   4452                  * Other than that, leading zero bytes are removed and the other
   4453                  * bytes output. A single zero byte may be output if the "assigned"
   4454                  * bit in stage 2 was on.
   4455                  * The data structure does not support zero byte output as a fallback,
   4456                  * and also does not allow output of leading zeros.
   4457                  */
   4458                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4459 
   4460                 /* get the bytes and the length for the output */
   4461                 switch(outputType) {
   4462                 case MBCS_OUTPUT_2:
   4463                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4464                     if(value<=0xff) {
   4465                         length=1;
   4466                     } else {
   4467                         length=2;
   4468                     }
   4469                     break;
   4470                 case MBCS_OUTPUT_2_SISO:
   4471                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4472                     /*
   4473                      * Save the old state in the converter object
   4474                      * right here, then change the local prevLength state variable if necessary.
   4475                      * Then, if this character turns out to be unassigned or a fallback that
   4476                      * is not taken, the callback code must not save the new state in the converter
   4477                      * because the new state is for a character that is not output.
   4478                      * However, the callback must still restore the state from the converter
   4479                      * in case the callback function changed it for its output.
   4480                      */
   4481                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4482                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4483                     if(value<=0xff) {
   4484                         if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
   4485                             /* no mapping, leave value==0 */
   4486                             length=0;
   4487                         } else if(prevLength<=1) {
   4488                             length=1;
   4489                         } else {
   4490                             /* change from double-byte mode to single-byte */
   4491                             if (siLength == 1) {
   4492                                 value|=(uint32_t)siBytes[0]<<8;
   4493                                 length = 2;
   4494                             } else if (siLength == 2) {
   4495                                 value|=(uint32_t)siBytes[1]<<8;
   4496                                 value|=(uint32_t)siBytes[0]<<16;
   4497                                 length = 3;
   4498                             }
   4499                             prevLength=1;
   4500                         }
   4501                     } else {
   4502                         if(prevLength==2) {
   4503                             length=2;
   4504                         } else {
   4505                             /* change from single-byte mode to double-byte */
   4506                             if (soLength == 1) {
   4507                                 value|=(uint32_t)soBytes[0]<<16;
   4508                                 length = 3;
   4509                             } else if (soLength == 2) {
   4510                                 value|=(uint32_t)soBytes[1]<<16;
   4511                                 value|=(uint32_t)soBytes[0]<<24;
   4512                                 length = 4;
   4513                             }
   4514                             prevLength=2;
   4515                         }
   4516                     }
   4517                     break;
   4518                 case MBCS_OUTPUT_DBCS_ONLY:
   4519                     /* table with single-byte results, but only DBCS mappings used */
   4520                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4521                     if(value<=0xff) {
   4522                         /* no mapping or SBCS result, not taken for DBCS-only */
   4523                         value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4524                         length=0;
   4525                     } else {
   4526                         length=2;
   4527                     }
   4528                     break;
   4529                 case MBCS_OUTPUT_3:
   4530                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4531                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4532                     if(value<=0xff) {
   4533                         length=1;
   4534                     } else if(value<=0xffff) {
   4535                         length=2;
   4536                     } else {
   4537                         length=3;
   4538                     }
   4539                     break;
   4540                 case MBCS_OUTPUT_4:
   4541                     value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
   4542                     if(value<=0xff) {
   4543                         length=1;
   4544                     } else if(value<=0xffff) {
   4545                         length=2;
   4546                     } else if(value<=0xffffff) {
   4547                         length=3;
   4548                     } else {
   4549                         length=4;
   4550                     }
   4551                     break;
   4552                 case MBCS_OUTPUT_3_EUC:
   4553                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4554                     /* EUC 16-bit fixed-length representation */
   4555                     if(value<=0xff) {
   4556                         length=1;
   4557                     } else if((value&0x8000)==0) {
   4558                         value|=0x8e8000;
   4559                         length=3;
   4560                     } else if((value&0x80)==0) {
   4561                         value|=0x8f0080;
   4562                         length=3;
   4563                     } else {
   4564                         length=2;
   4565                     }
   4566                     break;
   4567                 case MBCS_OUTPUT_4_EUC:
   4568                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4569                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4570                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4571                     if(value<=0xff) {
   4572                         length=1;
   4573                     } else if(value<=0xffff) {
   4574                         length=2;
   4575                     } else if((value&0x800000)==0) {
   4576                         value|=0x8e800000;
   4577                         length=4;
   4578                     } else if((value&0x8000)==0) {
   4579                         value|=0x8f008000;
   4580                         length=4;
   4581                     } else {
   4582                         length=3;
   4583                     }
   4584                     break;
   4585                 default:
   4586                     /* must not occur */
   4587                     /*
   4588                      * To avoid compiler warnings that value & length may be
   4589                      * used without having been initialized, we set them here.
   4590                      * In reality, this is unreachable code.
   4591                      * Not having a default branch also causes warnings with
   4592                      * some compilers.
   4593                      */
   4594                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4595                     length=0;
   4596                     break;
   4597                 }
   4598 
   4599                 /* is this code point assigned, or do we use fallbacks? */
   4600                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
   4601                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   4602                 ) {
   4603                     /*
   4604                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4605                      * There is no way with this data structure for fallback output
   4606                      * to be a zero byte.
   4607                      */
   4608 
   4609 unassigned:
   4610                     /* try an extension mapping */
   4611                     pArgs->source=source;
   4612                     c=_extFromU(cnv, cnv->sharedData,
   4613                                 c, &source, sourceLimit,
   4614                                 &target, target+targetCapacity,
   4615                                 &offsets, sourceIndex,
   4616                                 pArgs->flush,
   4617                                 pErrorCode);
   4618                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   4619                     prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
   4620 
   4621                     if(U_FAILURE(*pErrorCode)) {
   4622                         /* not mappable or buffer overflow */
   4623                         break;
   4624                     } else {
   4625                         /* a mapping was written to the target, continue */
   4626 
   4627                         /* recalculate the targetCapacity after an extension mapping */
   4628                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   4629 
   4630                         /* normal end of conversion: prepare for a new character */
   4631                         if(offsets!=NULL) {
   4632                             prevSourceIndex=sourceIndex;
   4633                             sourceIndex=nextSourceIndex;
   4634                         }
   4635                         continue;
   4636                     }
   4637                 }
   4638             }
   4639 
   4640             /* write the output character bytes from value and length */
   4641             /* from the first if in the loop we know that targetCapacity>0 */
   4642             if(length<=targetCapacity) {
   4643                 if(offsets==NULL) {
   4644                     switch(length) {
   4645                         /* each branch falls through to the next one */
   4646                     case 4:
   4647                         *target++=(uint8_t)(value>>24);
   4648                         U_FALLTHROUGH;
   4649                     case 3:
   4650                         *target++=(uint8_t)(value>>16);
   4651                         U_FALLTHROUGH;
   4652                     case 2:
   4653                         *target++=(uint8_t)(value>>8);
   4654                         U_FALLTHROUGH;
   4655                     case 1:
   4656                         *target++=(uint8_t)value;
   4657                         U_FALLTHROUGH;
   4658                     default:
   4659                         /* will never occur */
   4660                         break;
   4661                     }
   4662                 } else {
   4663                     switch(length) {
   4664                         /* each branch falls through to the next one */
   4665                     case 4:
   4666                         *target++=(uint8_t)(value>>24);
   4667                         *offsets++=sourceIndex;
   4668                         U_FALLTHROUGH;
   4669                     case 3:
   4670                         *target++=(uint8_t)(value>>16);
   4671                         *offsets++=sourceIndex;
   4672                         U_FALLTHROUGH;
   4673                     case 2:
   4674                         *target++=(uint8_t)(value>>8);
   4675                         *offsets++=sourceIndex;
   4676                         U_FALLTHROUGH;
   4677                     case 1:
   4678                         *target++=(uint8_t)value;
   4679                         *offsets++=sourceIndex;
   4680                         U_FALLTHROUGH;
   4681                     default:
   4682                         /* will never occur */
   4683                         break;
   4684                     }
   4685                 }
   4686                 targetCapacity-=length;
   4687             } else {
   4688                 uint8_t *charErrorBuffer;
   4689 
   4690                 /*
   4691                  * We actually do this backwards here:
   4692                  * In order to save an intermediate variable, we output
   4693                  * first to the overflow buffer what does not fit into the
   4694                  * regular target.
   4695                  */
   4696                 /* we know that 1<=targetCapacity<length<=4 */
   4697                 length-=targetCapacity;
   4698                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   4699                 switch(length) {
   4700                     /* each branch falls through to the next one */
   4701                 case 3:
   4702                     *charErrorBuffer++=(uint8_t)(value>>16);
   4703                     U_FALLTHROUGH;
   4704                 case 2:
   4705                     *charErrorBuffer++=(uint8_t)(value>>8);
   4706                     U_FALLTHROUGH;
   4707                 case 1:
   4708                     *charErrorBuffer=(uint8_t)value;
   4709                     U_FALLTHROUGH;
   4710                 default:
   4711                     /* will never occur */
   4712                     break;
   4713                 }
   4714                 cnv->charErrorBufferLength=(int8_t)length;
   4715 
   4716                 /* now output what fits into the regular target */
   4717                 value>>=8*length; /* length was reduced by targetCapacity */
   4718                 switch(targetCapacity) {
   4719                     /* each branch falls through to the next one */
   4720                 case 3:
   4721                     *target++=(uint8_t)(value>>16);
   4722                     if(offsets!=NULL) {
   4723                         *offsets++=sourceIndex;
   4724                     }
   4725                     U_FALLTHROUGH;
   4726                 case 2:
   4727                     *target++=(uint8_t)(value>>8);
   4728                     if(offsets!=NULL) {
   4729                         *offsets++=sourceIndex;
   4730                     }
   4731                     U_FALLTHROUGH;
   4732                 case 1:
   4733                     *target++=(uint8_t)value;
   4734                     if(offsets!=NULL) {
   4735                         *offsets++=sourceIndex;
   4736                     }
   4737                     U_FALLTHROUGH;
   4738                 default:
   4739                     /* will never occur */
   4740                     break;
   4741                 }
   4742 
   4743                 /* target overflow */
   4744                 targetCapacity=0;
   4745                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4746                 c=0;
   4747                 break;
   4748             }
   4749 
   4750             /* normal end of conversion: prepare for a new character */
   4751             c=0;
   4752             if(offsets!=NULL) {
   4753                 prevSourceIndex=sourceIndex;
   4754                 sourceIndex=nextSourceIndex;
   4755             }
   4756             continue;
   4757         } else {
   4758             /* target is full */
   4759             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4760             break;
   4761         }
   4762     }
   4763 
   4764     /*
   4765      * the end of the input stream and detection of truncated input
   4766      * are handled by the framework, but for EBCDIC_STATEFUL conversion
   4767      * we need to emit an SI at the very end
   4768      *
   4769      * conditions:
   4770      *   successful
   4771      *   EBCDIC_STATEFUL in DBCS mode
   4772      *   end of input and no truncated input
   4773      */
   4774     if( U_SUCCESS(*pErrorCode) &&
   4775         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
   4776         pArgs->flush && source>=sourceLimit && c==0
   4777     ) {
   4778         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
   4779         if(targetCapacity>0) {
   4780             *target++=(uint8_t)siBytes[0];
   4781             if (siLength == 2) {
   4782                 if (targetCapacity<2) {
   4783                     cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
   4784                     cnv->charErrorBufferLength=1;
   4785                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4786                 } else {
   4787                     *target++=(uint8_t)siBytes[1];
   4788                 }
   4789             }
   4790             if(offsets!=NULL) {
   4791                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
   4792                 *offsets++=prevSourceIndex;
   4793             }
   4794         } else {
   4795             /* target is full */
   4796             cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
   4797             if (siLength == 2) {
   4798                 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
   4799             }
   4800             cnv->charErrorBufferLength=siLength;
   4801             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4802         }
   4803         prevLength=1; /* we switched into SBCS */
   4804     }
   4805 
   4806     /* set the converter state back into UConverter */
   4807     cnv->fromUChar32=c;
   4808     cnv->fromUnicodeStatus=prevLength;
   4809 
   4810     /* write back the updated pointers */
   4811     pArgs->source=source;
   4812     pArgs->target=(char *)target;
   4813     pArgs->offsets=offsets;
   4814 }
   4815 
   4816 /*
   4817  * This is another simple conversion function for internal use by other
   4818  * conversion implementations.
   4819  * It does not use the converter state nor call callbacks.
   4820  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4821  * It handles conversion extensions but not GB 18030.
   4822  *
   4823  * It converts one single Unicode code point into codepage bytes, encoded
   4824  * as one 32-bit value. The function returns the number of bytes in *pValue:
   4825  * 1..4 the number of bytes in *pValue
   4826  * 0    unassigned (*pValue undefined)
   4827  * -1   illegal (currently not used, *pValue undefined)
   4828  *
   4829  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
   4830  * the second to last byte in bits 15..8, etc.
   4831  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
   4832  */
   4833 U_CFUNC int32_t
   4834 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
   4835                  UChar32 c, uint32_t *pValue,
   4836                  UBool useFallback) {
   4837     const int32_t *cx;
   4838     const uint16_t *table;
   4839 #if 0
   4840 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4841     const uint8_t *p;
   4842 #endif
   4843     uint32_t stage2Entry;
   4844     uint32_t value;
   4845     int32_t length;
   4846 
   4847     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4848     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4849         table=sharedData->mbcs.fromUnicodeTable;
   4850 
   4851         /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4852         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
   4853             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4854             /* is this code point assigned, or do we use fallbacks? */
   4855             if(useFallback ? value>=0x800 : value>=0xc00) {
   4856                 *pValue=value&0xff;
   4857                 return 1;
   4858             }
   4859         } else /* outputType!=MBCS_OUTPUT_1 */ {
   4860             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4861 
   4862             /* get the bytes and the length for the output */
   4863             switch(sharedData->mbcs.outputType) {
   4864             case MBCS_OUTPUT_2:
   4865                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4866                 if(value<=0xff) {
   4867                     length=1;
   4868                 } else {
   4869                     length=2;
   4870                 }
   4871                 break;
   4872 #if 0
   4873 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4874             case MBCS_OUTPUT_DBCS_ONLY:
   4875                 /* table with single-byte results, but only DBCS mappings used */
   4876                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4877                 if(value<=0xff) {
   4878                     /* no mapping or SBCS result, not taken for DBCS-only */
   4879                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4880                     length=0;
   4881                 } else {
   4882                     length=2;
   4883                 }
   4884                 break;
   4885             case MBCS_OUTPUT_3:
   4886                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4887                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4888                 if(value<=0xff) {
   4889                     length=1;
   4890                 } else if(value<=0xffff) {
   4891                     length=2;
   4892                 } else {
   4893                     length=3;
   4894                 }
   4895                 break;
   4896             case MBCS_OUTPUT_4:
   4897                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4898                 if(value<=0xff) {
   4899                     length=1;
   4900                 } else if(value<=0xffff) {
   4901                     length=2;
   4902                 } else if(value<=0xffffff) {
   4903                     length=3;
   4904                 } else {
   4905                     length=4;
   4906                 }
   4907                 break;
   4908             case MBCS_OUTPUT_3_EUC:
   4909                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4910                 /* EUC 16-bit fixed-length representation */
   4911                 if(value<=0xff) {
   4912                     length=1;
   4913                 } else if((value&0x8000)==0) {
   4914                     value|=0x8e8000;
   4915                     length=3;
   4916                 } else if((value&0x80)==0) {
   4917                     value|=0x8f0080;
   4918                     length=3;
   4919                 } else {
   4920                     length=2;
   4921                 }
   4922                 break;
   4923             case MBCS_OUTPUT_4_EUC:
   4924                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4925                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4926                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4927                 if(value<=0xff) {
   4928                     length=1;
   4929                 } else if(value<=0xffff) {
   4930                     length=2;
   4931                 } else if((value&0x800000)==0) {
   4932                     value|=0x8e800000;
   4933                     length=4;
   4934                 } else if((value&0x8000)==0) {
   4935                     value|=0x8f008000;
   4936                     length=4;
   4937                 } else {
   4938                     length=3;
   4939                 }
   4940                 break;
   4941 #endif
   4942             default:
   4943                 /* must not occur */
   4944                 return -1;
   4945             }
   4946 
   4947             /* is this code point assigned, or do we use fallbacks? */
   4948             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   4949                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
   4950             ) {
   4951                 /*
   4952                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4953                  * There is no way with this data structure for fallback output
   4954                  * to be a zero byte.
   4955                  */
   4956                 /* assigned */
   4957                 *pValue=value;
   4958                 return length;
   4959             }
   4960         }
   4961     }
   4962 
   4963     cx=sharedData->mbcs.extIndexes;
   4964     if(cx!=NULL) {
   4965         length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
   4966         return length>=0 ? length : -length;  /* return abs(length); */
   4967     }
   4968 
   4969     /* unassigned */
   4970     return 0;
   4971 }
   4972 
   4973 
   4974 #if 0
   4975 /*
   4976  * This function has been moved to ucnv2022.c for inlining.
   4977  * This implementation is kept here, disabled, only for documentation purposes.
   4978  */
   4979 
   4980 /**
   4981  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
   4982  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4983  * It does not handle conversion extensions (_extFromU()).
   4984  *
         * @param sharedData  converter data; this function reads mbcs.unicodeMask,
         *                    mbcs.fromUnicodeTable and mbcs.fromUnicodeBytes from it
         * @param c           the code point to convert
         * @param useFallback selects the acceptance threshold for the lookup result:
         *                    0x800 if set, 0xc00 if not
         * @return the codepage byte for the code point (the low 8 bits of the lookup
         *         result), or -1 if it is unassigned.
         *         Note that a supplementary code point always yields -1 when the
         *         codepage data does not have UCNV_HAS_SUPPLEMENTARY set.
   4985  * It returns the codepage byte for the code point, or -1 if it is unassigned.
   4986  */
   4987 U_CFUNC int32_t
   4988 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
   4989                        UChar32 c,
   4990                        UBool useFallback) {
   4991     const uint16_t *table;
   4992     int32_t value;
   4993 
   4994     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4995     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4996         return -1;
   4997     }
   4998 
   4999     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   5000     table=sharedData->mbcs.fromUnicodeTable;
   5001 
   5002     /* get the 16-bit lookup result; its low byte is the output byte */
   5003     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   5004     /*
         * Is this code point assigned, or do we use fallbacks?
         * The same thresholds are used in ucnv_SBCSFromUTF8():
         * >=0x800 accepts all roundtrip and fallback results,
         * >=0xc00 accepts only roundtrips and fallbacks from private-use characters.
         */
   5005     if(useFallback ? value>=0x800 : value>=0xc00) {
   5006         return value&0xff;
   5007     } else {
   5008         return -1;
   5009     }
   5010 }
   5011 #endif
   5012 
   5013 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
   5014 
   5015 /*
         * Offsets for n-byte UTF-8 sequences, indexed by the sequence length n in bytes.
         *
         * The from-UTF-8 functions below accumulate the raw bytes of a sequence with
         * c=(c<<6)+b, without first masking off the fixed marker bits of the lead and
         * trail bytes. Each entry is the sum of those marker bits for a sequence of
         * that length, calculated with ((lead<<6)+trail)<<6+trail... using the
         * minimum lead byte (0xC0, 0xE0, 0xF0) and the trail byte 0x80.
         * Subtracting utf8_offsets[n] from the accumulated value yields the code point.
         * The entries for lengths 0 and 1 are 0.
         */
   5016 static const UChar32
   5017 utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
   5018 
   5019 static void U_CALLCONV
   5020 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5021                   UConverterToUnicodeArgs *pToUArgs,
   5022                   UErrorCode *pErrorCode) {
            /*
             * Converts UTF-8 input directly to charset bytes using the single-byte
             * from-Unicode tables of the target converter, without going through
             * UTF-16. Each mapping found in the tables writes exactly one target byte;
             * code points not found there are handed to _extFromU().
             *
             * pFromUArgs  supplies the target converter (cnv), target, targetLimit and
             *             flush; target is advanced past the bytes written.
             * pToUArgs    supplies the UTF-8 converter (utf8), source and sourceLimit;
             *             source is advanced past the bytes consumed.
             * pErrorCode  is set to
             *             - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence
             *               (the offending bytes are stored in utf8->toUBytes),
             *             - U_BUFFER_OVERFLOW_ERROR when input remains but the target
             *               is full,
             *             - U_USING_DEFAULT_WARNING when _extFromU() reports a partial
             *               match, so that the caller reverts to pivoting,
             *             - or whatever failure _extFromU() reports.
             *
             * A UTF-8 sequence that is cut off by the end of the source buffer is
             * saved in the utf8 converter (toUBytes, toUnicodeStatus, toULength, mode)
             * and resumed at the start of the next call.
             *
             * NOTE(review): the error code is not tested on entry; presumably the
             * caller only calls this with a success code - confirm.
             */
   5023     UConverter *utf8, *cnv;
   5024     const uint8_t *source, *sourceLimit;
   5025     uint8_t *target;
   5026     int32_t targetCapacity;
   5027 
   5028     const uint16_t *table, *sbcsIndex;
   5029     const uint16_t *results;
   5030 
            /*
             * oldToULength: number of bytes of the current sequence that are already
             *               stored in utf8->toUBytes from a previous call
             *               (0 for a sequence that starts in this buffer)
             * toULength:    number of bytes of the current sequence collected so far
             * toULimit:     total number of bytes expected for the current sequence
             */
   5031     int8_t oldToULength, toULength, toULimit;
   5032 
   5033     UChar32 c;
   5034     uint8_t b, t1, t2;
   5035 
   5036     uint32_t asciiRoundtrips;
   5037     uint16_t value, minValue = 0;
   5038     UBool hasSupplementary;
   5039 
   5040     /* set up the local pointers */
   5041     utf8=pToUArgs->converter;
   5042     cnv=pFromUArgs->converter;
   5043     source=(uint8_t *)pToUArgs->source;
   5044     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5045     target=(uint8_t *)pFromUArgs->target;
   5046     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5047 
   5048     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5049     sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
            /* with the LF/NL swap option, look up results in the alternate byte table */
   5050     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5051         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5052     } else {
   5053         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5054     }
   5055     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5056 
            /*
             * minValue is the smallest lookup result that counts as a usable mapping;
             * any smaller result is treated as unassigned further below.
             */
   5057     if(cnv->useFallback) {
   5058         /* use all roundtrip and fallback results */
   5059         minValue=0x800;
   5060     } else {
   5061         /* use only roundtrips and fallbacks from private-use characters */
   5062         minValue=0xc00;
   5063     }
   5064     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5065 
   5066     /* get the converter state from the UTF-8 UConverter; c!=0 means a partial sequence is pending */
   5067     c=(UChar32)utf8->toUnicodeStatus;
   5068     if(c!=0) {
   5069         toULength=oldToULength=utf8->toULength;
   5070         toULimit=(int8_t)utf8->mode;
   5071     } else {
   5072         toULength=oldToULength=toULimit=0;
   5073     }
   5074 
   5075     // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
   5076     // If the buffer ends with a truncated 2- or 3-byte sequence,
   5077     // then we reduce the sourceLimit to before that,
   5078     // and collect the remaining bytes after the conversion loop.
   5079     {
   5080         // Do not go back into the bytes that will be read for finishing a partial
   5081         // sequence from the previous buffer.
                // (toULimit-oldToULength) is the number of bytes still missing from that sequence.
   5082         int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   5083         if(length>0) {
   5084             uint8_t b1=*(sourceLimit-1);
   5085             if(U8_IS_SINGLE(b1)) {
   5086                 // common ASCII character
   5087             } else if(U8_IS_TRAIL(b1) && length>=2) {
   5088                 uint8_t b2=*(sourceLimit-2);
   5089                 if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
   5090                     // truncated 3-byte sequence
   5091                     sourceLimit-=2;
   5092                 }
   5093             } else if(0xc2<=b1 && b1<0xf0) {
   5094                 // truncated 2- or 3-byte sequence
   5095                 --sourceLimit;
   5096             }
   5097         }
   5098     }
   5099 
            /*
             * Resume a pending partial sequence by jumping into the middle of the loop.
             * The saved state is cleared first because it now lives in c, toULength,
             * oldToULength and toULimit. Without target capacity the saved state is
             * left untouched.
             */
   5100     if(c!=0 && targetCapacity>0) {
   5101         utf8->toUnicodeStatus=0;
   5102         utf8->toULength=0;
   5103         goto moreBytes;
   5104         /*
   5105          * Note: We could avoid the goto by duplicating some of the moreBytes
   5106          * code, but only up to the point of collecting a complete UTF-8
   5107          * sequence; then recurse for the toUBytes[toULength]
   5108          * and then continue with normal conversion.
   5109          *
   5110          * If so, move this code to just after initializing the minimum
   5111          * set of local variables for reading the UTF-8 input
   5112          * (utf8, source, target, limits but not cnv, table, minValue, etc.).
   5113          *
   5114          * Potential advantages:
   5115          * - avoid the goto
   5116          * - oldToULength could become a local variable in just those code blocks
   5117          *   that deal with buffer boundaries
   5118          * - possibly faster if the goto prevents some compiler optimizations
   5119          *   (this would need measuring to confirm)
   5120          * Disadvantage:
   5121          * - code duplication
   5122          */
   5123     }
   5124 
   5125     /* conversion loop */
   5126     while(source<sourceLimit) {
   5127         if(targetCapacity>0) {
   5128             b=*source++;
   5129             if(U8_IS_SINGLE(b)) {
   5130                 /* convert ASCII */
   5131                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                            /* the byte maps to itself: copy it unchanged */
   5132                     *target++=(uint8_t)b;
   5133                     --targetCapacity;
   5134                     continue;
   5135                 } else {
   5136                     c=b;
   5137                     value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
   5138                 }
   5139             } else {
                        /*
                         * The inline branches below read *source, source[0] and source[1]
                         * without comparing against sourceLimit. They rely on the
                         * sourceLimit adjustment before the loop, which keeps a truncated
                         * 2- or 3-byte sequence at the end of the buffer out of the loop.
                         */
   5140                 if(b<0xe0) {
   5141                     if( /* handle U+0080..U+07FF inline */
   5142                         b>=0xc2 &&
   5143                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5144                     ) {
   5145                         c=b&0x1f;
   5146                         ++source;
   5147                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
   5148                         if(value>=minValue) {
   5149                             *target++=(uint8_t)value;
   5150                             --targetCapacity;
   5151                             continue;
   5152                         } else {
                                    /* unassigned: assemble the full code point for the extension lookup below */
   5153                             c=(c<<6)|t1;
   5154                         }
   5155                     } else {
                                /* trail byte, lead byte below 0xc2, or missing trail byte: use the general path */
   5156                         c=-1;
   5157                     }
   5158                 } else if(b==0xe0) {
   5159                     if( /* handle U+0800..U+0FFF inline */
                                /* t1>=0x20 excludes trail bytes 0x80..0x9f, which would encode values below U+0800 */
   5160                         (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
   5161                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5162                     ) {
                                /* the lead byte 0xe0 contributes no code point bits, so the upper bits are just t1 */
   5163                         c=t1;
   5164                         source+=2;
   5165                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
   5166                         if(value>=minValue) {
   5167                             *target++=(uint8_t)value;
   5168                             --targetCapacity;
   5169                             continue;
   5170                         } else {
                                    /* unassigned: assemble the full code point for the extension lookup below */
   5171                             c=(c<<6)|t2;
   5172                         }
   5173                     } else {
   5174                         c=-1;
   5175                     }
   5176                 } else {
                            /* lead bytes above 0xe0 are not handled inline */
   5177                     c=-1;
   5178                 }
   5179 
   5180                 if(c<0) {
   5181                     /* handle "complicated" and error cases, and continuing partial characters */
   5182                     oldToULength=0;
   5183                     toULength=1;
   5184                     toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                            /* c accumulates the raw bytes; the marker bits are removed later via utf8_offsets[] */
   5185                     c=b;
   5186 moreBytes:
                            /* also entered via goto from before the loop, to resume a sequence saved by a previous call */
   5187                     while(toULength<toULimit) {
   5188                         /*
   5189                          * The sourceLimit may have been adjusted before the conversion loop
   5190                          * to stop before a truncated sequence.
   5191                          * Here we need to use the real limit in case we have two truncated
   5192                          * sequences at the end.
   5193                          * See ticket #7492.
   5194                          */
   5195                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5196                             b=*source;
   5197                             if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
   5198                                 ++source;
   5199                                 ++toULength;
   5200                                 c=(c<<6)+b;
   5201                             } else {
   5202                                 break; /* sequence too short, stop with toULength<toULimit */
   5203                             }
   5204                         } else {
   5205                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                                    /* back up to the first byte of this sequence that was read from the current buffer */
   5206                             source-=(toULength-oldToULength);
   5207                             while(oldToULength<toULength) {
   5208                                 utf8->toUBytes[oldToULength++]=*source++;
   5209                             }
   5210                             utf8->toUnicodeStatus=c;
   5211                             utf8->toULength=toULength;
   5212                             utf8->mode=toULimit;
   5213                             pToUArgs->source=(char *)source;
   5214                             pFromUArgs->target=(char *)target;
   5215                             return;
   5216                         }
   5217                     }
   5218 
   5219                     if(toULength==toULimit) {
                                /* complete sequence: turn the accumulated raw bytes into the code point */
   5220                         c-=utf8_offsets[toULength];
   5221                         if(toULength<=3) {  /* BMP */
   5222                             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5223                         } else {
   5224                             /* supplementary code point */
   5225                             if(!hasSupplementary) {
   5226                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                        /* 0 is below either minValue, so c is treated as unassigned below */
   5227                                 value=0;
   5228                             } else {
   5229                                 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5230                             }
   5231                         }
   5232                     } else {
   5233                         /* error handling: illegal UTF-8 byte sequence */
                                /* copy the bytes read from this buffer to toUBytes, after any bytes stored by a previous call */
   5234                         source-=(toULength-oldToULength);
   5235                         while(oldToULength<toULength) {
   5236                             utf8->toUBytes[oldToULength++]=*source++;
   5237                         }
   5238                         utf8->toULength=toULength;
   5239                         pToUArgs->source=(char *)source;
   5240                         pFromUArgs->target=(char *)target;
   5241                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5242                         return;
   5243                     }
   5244                 }
   5245             }
   5246 
                    /* reached with c and value set by one of the lookup branches above */
   5247             if(value>=minValue) {
   5248                 /* output the mapping for c */
   5249                 *target++=(uint8_t)value;
   5250                 --targetCapacity;
   5251             } else {
   5252                 /* value<minValue means c is unassigned (unmappable) */
   5253                 /*
   5254                  * Try an extension mapping.
   5255                  * Pass in no source because we don't have UTF-16 input.
   5256                  * If we have a partial match on c, we will return and revert
   5257                  * to UTF-8->UTF-16->charset conversion.
   5258                  */
   5259                 static const UChar nul=0;
   5260                 const UChar *noSource=&nul;
   5261                 c=_extFromU(cnv, cnv->sharedData,
   5262                             c, &noSource, noSource,
   5263                             &target, target+targetCapacity,
   5264                             NULL, -1,
   5265                             pFromUArgs->flush,
   5266                             pErrorCode);
   5267 
   5268                 if(U_FAILURE(*pErrorCode)) {
   5269                     /* not mappable or buffer overflow */
   5270                     cnv->fromUChar32=c;
   5271                     break;
   5272                 } else if(cnv->preFromUFirstCP>=0) {
   5273                     /*
   5274                      * Partial match, return and revert to pivoting.
   5275                      * In normal from-UTF-16 conversion, we would just continue
   5276                      * but then exit the loop because the extension match would
   5277                      * have consumed the source.
   5278                      */
   5279                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5280                     break;
   5281                 } else {
   5282                     /* a mapping was written to the target, continue */
   5283 
   5284                     /* recalculate the targetCapacity after an extension mapping */
   5285                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5286                 }
   5287             }
   5288         } else {
   5289             /* target is full */
   5290             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5291             break;
   5292         }
   5293     }
   5294 
   5295     /*
   5296      * The sourceLimit may have been adjusted before the conversion loop
   5297      * to stop before a truncated sequence.
   5298      * If so, then collect the truncated sequence now.
             * This is skipped after an error or warning, and while a partial
             * extension match is pending (cnv->preFromUFirstCP>=0).
             * The condition also restores sourceLimit to the real limit.
             * The bytes are stored without re-validation; presumably they are exactly
             * the ones that the check before the loop excluded.
   5299      */
   5300     if(U_SUCCESS(*pErrorCode) &&
   5301             cnv->preFromUFirstCP<0 &&
   5302             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5303         c=utf8->toUBytes[0]=b=*source++;
   5304         toULength=1;
   5305         toULimit=U8_COUNT_BYTES(b);
   5306         while(source<sourceLimit) {
   5307             utf8->toUBytes[toULength++]=b=*source++;
   5308             c=(c<<6)+b;
   5309         }
   5310         utf8->toUnicodeStatus=c;
   5311         utf8->toULength=toULength;
   5312         utf8->mode=toULimit;
   5313     }
   5314 
   5315     /* write back the updated pointers */
   5316     pToUArgs->source=(char *)source;
   5317     pFromUArgs->target=(char *)target;
   5318 }
   5319 
   5320 static void U_CALLCONV
   5321 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5322                   UConverterToUnicodeArgs *pToUArgs,
   5323                   UErrorCode *pErrorCode) {
   5324     UConverter *utf8, *cnv;
   5325     const uint8_t *source, *sourceLimit;
   5326     uint8_t *target;
   5327     int32_t targetCapacity;
   5328 
   5329     const uint16_t *table, *mbcsIndex;
   5330     const uint16_t *results;
   5331 
   5332     int8_t oldToULength, toULength, toULimit;
   5333 
   5334     UChar32 c;
   5335     uint8_t b, t1, t2;
   5336 
   5337     uint32_t stage2Entry;
   5338     uint32_t asciiRoundtrips;
   5339     uint16_t value = 0;
   5340     UBool hasSupplementary;
   5341 
   5342     /* set up the local pointers */
   5343     utf8=pToUArgs->converter;
   5344     cnv=pFromUArgs->converter;
   5345     source=(uint8_t *)pToUArgs->source;
   5346     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5347     target=(uint8_t *)pFromUArgs->target;
   5348     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5349 
   5350     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5351     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   5352     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5353         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5354     } else {
   5355         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5356     }
   5357     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5358 
   5359     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5360 
   5361     /* get the converter state from the UTF-8 UConverter */
   5362     c=(UChar32)utf8->toUnicodeStatus;
   5363     if(c!=0) {
   5364         toULength=oldToULength=utf8->toULength;
   5365         toULimit=(int8_t)utf8->mode;
   5366     } else {
   5367         toULength=oldToULength=toULimit=0;
   5368     }
   5369 
   5370     // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
   5371     // If the buffer ends with a truncated 2- or 3-byte sequence,
   5372     // then we reduce the sourceLimit to before that,
   5373     // and collect the remaining bytes after the conversion loop.
   5374     {
   5375         // Do not go back into the bytes that will be read for finishing a partial
   5376         // sequence from the previous buffer.
   5377         int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   5378         if(length>0) {
   5379             uint8_t b1=*(sourceLimit-1);
   5380             if(U8_IS_SINGLE(b1)) {
   5381                 // common ASCII character
   5382             } else if(U8_IS_TRAIL(b1) && length>=2) {
   5383                 uint8_t b2=*(sourceLimit-2);
   5384                 if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
   5385                     // truncated 3-byte sequence
   5386                     sourceLimit-=2;
   5387                 }
   5388             } else if(0xc2<=b1 && b1<0xf0) {
   5389                 // truncated 2- or 3-byte sequence
   5390                 --sourceLimit;
   5391             }
   5392         }
   5393     }
   5394 
   5395     if(c!=0 && targetCapacity>0) {
   5396         utf8->toUnicodeStatus=0;
   5397         utf8->toULength=0;
   5398         goto moreBytes;
   5399         /* See note in ucnv_SBCSFromUTF8() about this goto. */
   5400     }
   5401 
   5402     /* conversion loop */
   5403     while(source<sourceLimit) {
   5404         if(targetCapacity>0) {
   5405             b=*source++;
   5406             if(U8_IS_SINGLE(b)) {
   5407                 /* convert ASCII */
   5408                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   5409                     *target++=b;
   5410                     --targetCapacity;
   5411                     continue;
   5412                 } else {
   5413                     value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
   5414                     if(value==0) {
   5415                         c=b;
   5416                         goto unassigned;
   5417                     }
   5418                 }
   5419             } else {
   5420                 if(b>=0xe0) {
   5421                     if( /* handle U+0800..U+D7FF inline */
   5422                         b<=0xed &&  // do not assume maxFastUChar>0xd7ff
   5423                         U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
   5424                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5425                     ) {
   5426                         c=((b&0xf)<<6)|(t1&0x3f);
   5427                         source+=2;
   5428                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
   5429                         if(value==0) {
   5430                             c=(c<<6)|t2;
   5431                             goto unassigned;
   5432                         }
   5433                     } else {
   5434                         c=-1;
   5435                     }
   5436                 } else {
   5437                     if( /* handle U+0080..U+07FF inline */
   5438                         b>=0xc2 &&
   5439                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5440                     ) {
   5441                         c=b&0x1f;
   5442                         ++source;
   5443                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
   5444                         if(value==0) {
   5445                             c=(c<<6)|t1;
   5446                             goto unassigned;
   5447                         }
   5448                     } else {
   5449                         c=-1;
   5450                     }
   5451                 }
   5452 
   5453                 if(c<0) {
   5454                     /* handle "complicated" and error cases, and continuing partial characters */
   5455                     oldToULength=0;
   5456                     toULength=1;
   5457                     toULimit=U8_COUNT_BYTES_NON_ASCII(b);
   5458                     c=b;
   5459 moreBytes:
   5460                     while(toULength<toULimit) {
   5461                         /*
   5462                          * The sourceLimit may have been adjusted before the conversion loop
   5463                          * to stop before a truncated sequence.
   5464                          * Here we need to use the real limit in case we have two truncated
   5465                          * sequences at the end.
   5466                          * See ticket #7492.
   5467                          */
   5468                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5469                             b=*source;
   5470                             if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
   5471                                 ++source;
   5472                                 ++toULength;
   5473                                 c=(c<<6)+b;
   5474                             } else {
   5475                                 break; /* sequence too short, stop with toULength<toULimit */
   5476                             }
   5477                         } else {
   5478                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
   5479                             source-=(toULength-oldToULength);
   5480                             while(oldToULength<toULength) {
   5481                                 utf8->toUBytes[oldToULength++]=*source++;
   5482                             }
   5483                             utf8->toUnicodeStatus=c;
   5484                             utf8->toULength=toULength;
   5485                             utf8->mode=toULimit;
   5486                             pToUArgs->source=(char *)source;
   5487                             pFromUArgs->target=(char *)target;
   5488                             return;
   5489                         }
   5490                     }
   5491 
   5492                     if(toULength==toULimit) {
   5493                         c-=utf8_offsets[toULength];
   5494                         if(toULength<=3) {  /* BMP */
   5495                             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5496                         } else {
   5497                             /* supplementary code point */
   5498                             if(!hasSupplementary) {
   5499                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5500                                 stage2Entry=0;
   5501                             } else {
   5502                                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5503                             }
   5504                         }
   5505                     } else {
   5506                         /* error handling: illegal UTF-8 byte sequence */
   5507                         source-=(toULength-oldToULength);
   5508                         while(oldToULength<toULength) {
   5509                             utf8->toUBytes[oldToULength++]=*source++;
   5510                         }
   5511                         utf8->toULength=toULength;
   5512                         pToUArgs->source=(char *)source;
   5513                         pFromUArgs->target=(char *)target;
   5514                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5515                         return;
   5516                     }
   5517 
   5518                     /* get the bytes and the length for the output */
   5519                     /* MBCS_OUTPUT_2 */
   5520                     value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
   5521 
   5522                     /* is this code point assigned, or do we use fallbacks? */
   5523                     if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   5524                          (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   5525                     ) {
   5526                         goto unassigned;
   5527                     }
   5528                 }
   5529             }
   5530 
   5531             /* write the output character bytes from value and length */
   5532             /* from the first if in the loop we know that targetCapacity>0 */
   5533             if(value<=0xff) {
   5534                 /* this is easy because we know that there is enough space */
   5535                 *target++=(uint8_t)value;
   5536                 --targetCapacity;
   5537             } else /* length==2 */ {
   5538                 *target++=(uint8_t)(value>>8);
   5539                 if(2<=targetCapacity) {
   5540                     *target++=(uint8_t)value;
   5541                     targetCapacity-=2;
   5542                 } else {
   5543                     cnv->charErrorBuffer[0]=(char)value;
   5544                     cnv->charErrorBufferLength=1;
   5545 
   5546                     /* target overflow */
   5547                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5548                     break;
   5549                 }
   5550             }
   5551             continue;
   5552 
   5553 unassigned:
   5554             {
   5555                 /*
   5556                  * Try an extension mapping.
   5557                  * Pass in no source because we don't have UTF-16 input.
   5558                  * If we have a partial match on c, we will return and revert
   5559                  * to UTF-8->UTF-16->charset conversion.
   5560                  */
   5561                 static const UChar nul=0;
   5562                 const UChar *noSource=&nul;
   5563                 c=_extFromU(cnv, cnv->sharedData,
   5564                             c, &noSource, noSource,
   5565                             &target, target+targetCapacity,
   5566                             NULL, -1,
   5567                             pFromUArgs->flush,
   5568                             pErrorCode);
   5569 
   5570                 if(U_FAILURE(*pErrorCode)) {
   5571                     /* not mappable or buffer overflow */
   5572                     cnv->fromUChar32=c;
   5573                     break;
   5574                 } else if(cnv->preFromUFirstCP>=0) {
   5575                     /*
   5576                      * Partial match, return and revert to pivoting.
   5577                      * In normal from-UTF-16 conversion, we would just continue
   5578                      * but then exit the loop because the extension match would
   5579                      * have consumed the source.
   5580                      */
   5581                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5582                     break;
   5583                 } else {
   5584                     /* a mapping was written to the target, continue */
   5585 
   5586                     /* recalculate the targetCapacity after an extension mapping */
   5587                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5588                     continue;
   5589                 }
   5590             }
   5591         } else {
   5592             /* target is full */
   5593             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5594             break;
   5595         }
   5596     }
   5597 
   5598     /*
   5599      * The sourceLimit may have been adjusted before the conversion loop
   5600      * to stop before a truncated sequence.
   5601      * If so, then collect the truncated sequence now.
   5602      */
   5603     if(U_SUCCESS(*pErrorCode) &&
   5604             cnv->preFromUFirstCP<0 &&
   5605             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5606         c=utf8->toUBytes[0]=b=*source++;
   5607         toULength=1;
   5608         toULimit=U8_COUNT_BYTES(b);
   5609         while(source<sourceLimit) {
   5610             utf8->toUBytes[toULength++]=b=*source++;
   5611             c=(c<<6)+b;
   5612         }
   5613         utf8->toUnicodeStatus=c;
   5614         utf8->toULength=toULength;
   5615         utf8->mode=toULimit;
   5616     }
   5617 
   5618     /* write back the updated pointers */
   5619     pToUArgs->source=(char *)source;
   5620     pFromUArgs->target=(char *)target;
   5621 }
   5622 
   5623 /* miscellaneous ------------------------------------------------------------ */
   5624 
   5625 static void U_CALLCONV
   5626 ucnv_MBCSGetStarters(const UConverter* cnv,
   5627                  UBool starters[256],
   5628                  UErrorCode *) {
   5629     const int32_t *state0;
   5630     int i;
   5631 
   5632     state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
   5633     for(i=0; i<256; ++i) {
   5634         /* all bytes that cause a state transition from state 0 are lead bytes */
   5635         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
   5636     }
   5637 }
   5638 
   5639 /*
   5640  * This is an internal function that allows other converter implementations
   5641  * to check whether a byte is a lead byte.
   5642  */
   5643 U_CFUNC UBool
   5644 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
   5645     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
   5646 }
   5647 
   5648 static void U_CALLCONV
   5649 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
   5650               int32_t offsetIndex,
   5651               UErrorCode *pErrorCode) {
   5652     UConverter *cnv=pArgs->converter;
   5653     char *p, *subchar;
   5654     char buffer[4];
   5655     int32_t length;
   5656 
   5657     /* first, select between subChar and subChar1 */
   5658     if( cnv->subChar1!=0 &&
   5659         (cnv->sharedData->mbcs.extIndexes!=NULL ?
   5660             cnv->useSubChar1 :
   5661             (cnv->invalidUCharBuffer[0]<=0xff))
   5662     ) {
   5663         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
   5664         subchar=(char *)&cnv->subChar1;
   5665         length=1;
   5666     } else {
   5667         /* select subChar in all other cases */
   5668         subchar=(char *)cnv->subChars;
   5669         length=cnv->subCharLen;
   5670     }
   5671 
   5672     /* reset the selector for the next code point */
   5673     cnv->useSubChar1=FALSE;
   5674 
   5675     if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
   5676         p=buffer;
   5677 
   5678         /* fromUnicodeStatus contains prevLength */
   5679         switch(length) {
   5680         case 1:
   5681             if(cnv->fromUnicodeStatus==2) {
   5682                 /* DBCS mode and SBCS sub char: change to SBCS */
   5683                 cnv->fromUnicodeStatus=1;
   5684                 *p++=UCNV_SI;
   5685             }
   5686             *p++=subchar[0];
   5687             break;
   5688         case 2:
   5689             if(cnv->fromUnicodeStatus<=1) {
   5690                 /* SBCS mode and DBCS sub char: change to DBCS */
   5691                 cnv->fromUnicodeStatus=2;
   5692                 *p++=UCNV_SO;
   5693             }
   5694             *p++=subchar[0];
   5695             *p++=subchar[1];
   5696             break;
   5697         default:
   5698             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   5699             return;
   5700         }
   5701         subchar=buffer;
   5702         length=(int32_t)(p-buffer);
   5703     }
   5704 
   5705     ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
   5706 }
   5707 
   5708 U_CFUNC UConverterType
   5709 ucnv_MBCSGetType(const UConverter* converter) {
   5710     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
   5711     if(converter->sharedData->mbcs.countStates==1) {
   5712         return (UConverterType)UCNV_SBCS;
   5713     } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
   5714         return (UConverterType)UCNV_EBCDIC_STATEFUL;
   5715     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
   5716         return (UConverterType)UCNV_DBCS;
   5717     }
   5718     return (UConverterType)UCNV_MBCS;
   5719 }
   5720 
   5721 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   5722