Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ucnvmbcs.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jul03
     16 *   created by: Markus W. Scherer
     17 *
     18 *   The current code in this file replaces the previous implementation
     19 *   of conversion code from multi-byte codepages to Unicode and back.
     20 *   This implementation supports the following:
     21 *   - legacy variable-length codepages with up to 4 bytes per character
     22 *   - all Unicode code points (up to 0x10ffff)
     23 *   - efficient distinction of unassigned vs. illegal byte sequences
     24 *   - it is possible in fromUnicode() to directly deal with simple
     25 *     stateful encodings (used for EBCDIC_STATEFUL)
     26 *   - it is possible to convert Unicode code points
     27 *     to a single zero byte (but not as a fallback except for SBCS)
     28 *
     29 *   Remaining limitations in fromUnicode:
     30 *   - byte sequences must not have leading zero bytes
     31 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
     32 *   - limitation to up to 4 bytes per character
     33 *
     34 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
     35 *   limitations and adds m:n character mappings and other features.
     36 *   See ucnv_ext.h for details.
     37 *
     38 *   Change history:
     39 *
     40 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
     41 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
     42 *                             macros to ucnvmbcs.h file
     43 */
     44 
     45 #include "unicode/utypes.h"
     46 
     47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     48 
     49 #include "unicode/ucnv.h"
     50 #include "unicode/ucnv_cb.h"
     51 #include "unicode/udata.h"
     52 #include "unicode/uset.h"
     53 #include "unicode/utf8.h"
     54 #include "unicode/utf16.h"
     55 #include "ucnv_bld.h"
     56 #include "ucnvmbcs.h"
     57 #include "ucnv_ext.h"
     58 #include "ucnv_cnv.h"
     59 #include "cmemory.h"
     60 #include "cstring.h"
     61 #include "umutex.h"
     62 
     63 /* control optimizations according to the platform */
     64 #define MBCS_UNROLL_SINGLE_TO_BMP 1
     65 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
     66 
     67 /*
     68  * _MBCSHeader versions 5.3 & 4.3
     69  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
     70  *
     71  * This version is optional. Version 5 is used for incompatible data format changes.
     72  * makeconv will continue to generate version 4 files if possible.
     73  *
     74  * Changes from version 4:
     75  *
     76  * The main difference is an additional _MBCSHeader field with
     77  * - the length (number of uint32_t) of the _MBCSHeader
     78  * - flags for further incompatible data format changes
     79  * - flags for further, backward compatible data format changes
     80  *
     81  * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from
     82  * the file and needs to be reconstituted at load time.
     83  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
     84  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
     85  * (For details about these structures see below, and see ucnvmbcs.h.)
     86  *
     87  *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
     88  *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
     89  *   precision markers for all mappings.)
     90  *
     91  *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
     92  *   omitted data that can be reconstituted from the toUnicode data.
     93  *
     94  *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
     95  *   With only roundtrip mappings in the base fromUnicode data, this part is fully
     96  *   redundant with the mbcsIndex and will be reconstituted from that (also using the
     97  *   stage 1 table which contains the information about how stage 2 was compacted).
     98  *
     99  *   The rest of the stage 2 table, the part for code points above maxFastUChar,
    100  *   is stored in the file and will be appended to the reconstituted part.
    101  *
    102  *   The entire fromUBytes array is omitted from the file and will be reconstituted.
    103  *   This is done by enumerating all toUnicode roundtrip mappings, performing
    104  *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
    105  *   writing instead of reading the byte values.
    106  *
    107  * _MBCSHeader version 4.3
    108  *
    109  * Change from version 4.2:
    110  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
    111  *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
    112  *   files which can be used instead of stages 1 & 2.
    113  *   Faster lookups for roundtrips from most commonly used characters,
    114  *   and lookups from UTF-8 byte sequences with a natural bit distribution.
    115  *   See ucnvmbcs.h for more details.
    116  *
    117  * Change from version 4.1:
    118  * - Added an optional extension table structure at the end of the .cnv file.
    119  *   It is present if the upper bits of the header flags field contains a non-zero
    120  *   byte offset to it.
    121  *   Files that contain only a conversion table and no base table
    122  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
    123  *   These contain the base table name between the MBCS header and the extension
    124  *   data.
    125  *
    126  * Change from version 4.0:
    127  * - Replace header.reserved with header.fromUBytesLength so that all
    128  *   fields in the data have length.
    129  *
    130  * Changes from version 3 (for performance improvements):
    131  * - new bit distribution for state table entries
    132  * - reordered action codes
    133  * - new data structure for single-byte fromUnicode
    134  *   + stage 2 only contains indexes
    135  *   + stage 3 stores 16 bits per character with classification bits 15..8
    136  * - no multiplier for stage 1 entries
    137  * - stage 2 for non-single-byte codepages contains the index and the flags in
    138  *   one 32-bit value
    139  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
    140  *
    141  * For more details about old versions of the MBCS data structure, see
    142  * the corresponding versions of this file.
    143  *
    144  * Converting stateless codepage data ---------------------------------------***
    145  * (or codepage data with simple states) to Unicode.
    146  *
    147  * Data structure and algorithm for converting from complex legacy codepages
    148  * to Unicode. (Designed before 2000-may-22.)
    149  *
    150  * The basic idea is that the structure of legacy codepages can be described
    151  * with state tables.
    152  * When reading a byte stream, each input byte causes a state transition.
    153  * Some transitions result in the output of a code point, some result in
    154  * "unassigned" or "illegal" output.
    155  * This is used here for character conversion.
    156  *
    157  * The data structure begins with a state table consisting of a row
    158  * per state, with 256 entries (columns) per row for each possible input
    159  * byte value.
    160  * Each entry is 32 bits wide, with two formats distinguished by
    161  * the sign bit (bit 31):
    162  *
    163  * One format for transitional entries (bit 31 not set) for non-final bytes, and
    164  * one format for final entries (bit 31 set).
    165  * Both formats contain the number of the next state in the same bit
    166  * positions.
    167  * State 0 is the initial state.
    168  *
    169  * Most of the time, the offset values of subsequent states are added
    170  * up to a scalar value. This value will eventually be the index of
    171  * the Unicode code point in a table that follows the state table.
    172  * The effect is that the code points for final state table rows
    173  * are contiguous. The code points of final state rows follow each other
    174  * in the order of the references to those final states by previous
    175  * states, etc.
    176  *
    177  * For some terminal states, the offset is itself the output Unicode
    178  * code point (16 bits for a BMP code point or 20 bits for a supplementary
    179  * code point (stored as code point minus 0x10000 so that 20 bits are enough)).
    180  * For others, the code point in the Unicode table is stored with either
    181  * one or two code units: one for BMP code points, two for a pair of
    182  * surrogates.
    183  * All code points for a final state entry take up the same number of code
    184  * units, regardless of whether they all actually _use_ the same number
    185  * of code units. This is necessary for simple array access.
    186  *
    187  * An additional feature comes in with what in ICU is called "fallback"
    188  * mappings:
    189  *
    190  * In addition to round-trippable, precise, 1:1 mappings, there are often
    191  * mappings defined between similar, though not the same, characters.
    192  * Typically, such mappings occur only in fromUnicode mapping tables because
    193  * Unicode has a superset repertoire of most other codepages. However, it
    194  * is possible to provide such mappings in the toUnicode tables, too.
    195  * In this case, the fallback mappings are partly integrated into the
    196  * general state tables because the structure of the encoding includes their
    197  * byte sequences.
    198  * For final entries in an initial state, fallback mappings are stored in
    199  * the entry itself like with roundtrip mappings.
    200  * For other final entries, they are stored in the code units table if
    201  * the entry is for a pair of code units.
    202  * For single-unit results in the code units table, there is no space to
    203  * alternatively hold a fallback mapping; in this case, the code unit
    204  * is stored as U+fffe (unassigned), and the fallback mapping needs to
    205  * be looked up by the scalar offset value in a separate table.
    206  *
    207  * "Unassigned" state entries really mean "structurally unassigned",
    208  * i.e., such a byte sequence will never have a mapping result.
    209  *
    210  * The interpretation of the bits in each entry is as follows:
    211  *
    212  * Bit 31 not set, not a terminal entry ("transitional"):
    213  * 30..24 next state
    214  * 23..0  offset delta, to be added up
    215  *
    216  * Bit 31 set, terminal ("final") entry:
    217  * 30..24 next state (regardless of action code)
    218  * 23..20 action code:
    219  *        action codes 0 and 1 result in precise-mapping Unicode code points
    220  *        0  valid byte sequence
    221  *           19..16 not used, 0
    222  *           15..0  16-bit Unicode BMP code point
    223  *                  never U+fffe or U+ffff
    224  *        1  valid byte sequence
    225  *           19..0  20-bit Unicode supplementary code point
    226  *                  never U+fffe or U+ffff
    227  *
    228  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
    229  *        2  valid byte sequence (fallback)
    230  *           19..16 not used, 0
    231  *           15..0  16-bit Unicode BMP code point as fallback result
    232  *        3  valid byte sequence (fallback)
    233  *           19..0  20-bit Unicode supplementary code point as fallback result
    234  *
    235  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
    236  *        depending on the code units they result in
    237  *        4  valid byte sequence
    238  *           19..9  not used, 0
    239  *            8..0  final offset delta
    240  *                  pointing to one 16-bit code unit which may be
    241  *                  fffe  unassigned -- look for a fallback for this offset
    242  *                  ffff  illegal
    243  *        5  valid byte sequence
    244  *           19..9  not used, 0
    245  *            8..0  final offset delta
    246  *                  pointing to two 16-bit code units
    247  *                  (typically UTF-16 surrogates)
    248  *                  the result depends on the first code unit as follows:
    249  *                  0000..d7ff  roundtrip BMP code point (1st alone)
    250  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
    251  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
    252  *                  e000        roundtrip BMP code point (2nd alone)
    253  *                  e001        fallback BMP code point (2nd alone)
    254  *                  fffe        unassigned
    255  *                  ffff        illegal
    256  *           (the final offset deltas are at most 255 * 2,
    257  *            times 2 because of storing code unit pairs)
    258  *
    259  *        6  unassigned byte sequence
    260  *           19..16 not used, 0
    261  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
    262  *                  this does not contain a final offset delta because the main
    263  *                  purpose of this action code is to save scalar offset values;
    264  *                  therefore, fallback values cannot be assigned to byte
    265  *                  sequences that result in this action code
    266  *        7  illegal byte sequence
    267  *           19..16 not used, 0
    268  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
    269  *        8  state change only
    270  *           19..0  not used, 0
    271  *           useful for state changes in simple stateful encodings,
    272  *           at Shift-In/Shift-Out codes
    273  *
    274  *
    275  *        9..15 reserved for future use
    276  *           current implementations will only perform a state change
    277  *           and ignore bits 19..0
    278  *
    279  * An encoding with contiguous ranges of unassigned byte sequences, like
    280  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
    281  * at least two states for the trail bytes:
    282  * One trail byte state that results in code points, and one that only
    283  * has "unassigned" and "illegal" terminal states.
    284  *
    285  * Note: partly by accident, this data structure supports simple stateful
    286  * encodings without any additional logic.
    287  * Currently, only simple Shift-In/Shift-Out schemes are handled with
    288  * appropriate state tables (especially EBCDIC_STATEFUL!).
    289  *
    290  * MBCS version 2 added:
    291  * unassigned and illegal action codes have U+fffe and U+ffff
    292  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
    293  *
    294  * Converting from Unicode to codepage bytes --------------------------------***
    295  *
    296  * The conversion data structure for fromUnicode is designed for the known
    297  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
    298  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
    299  * a roundtrip mapping.
    300  *
    301  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
    302  * like in the character properties table.
    303  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
    304  * with the resulting bytes is at offsetFromUBytes.
    305  *
    306  * Beginning with version 4, single-byte codepages have a significantly different
    307  * trie compared to other codepages.
    308  * In all cases, the entry in stage 1 is directly the index of the block of
    309  * 64 entries in stage 2.
    310  *
    311  * Single-byte lookup:
    312  *
    313  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
    314  * Stage 3 contains one 16-bit word per result:
    315  * Bits 15..8 indicate the kind of result:
    316  *    f  roundtrip result
    317  *    c  fallback result from private-use code point
    318  *    8  fallback result from other code points
    319  *    0  unassigned
    320  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
    321  *
    322  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
    323  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
    324  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    325  * ASCII code points can be looked up with a linear array access into stage 3.
    326  * See maxFastUChar and other details in ucnvmbcs.h.
    327  *
    328  * Multi-byte lookup:
    329  *
    330  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
    331  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
    332  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
    333  *             If this test is false, then a non-zero result will be interpreted as
    334  *             a fallback mapping.
    335  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
    336  *
    337  * Stage 3 contains 2, 3, or 4 bytes per result.
    338  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
    339  * while 3 bytes are stored as bytes in big-endian order.
    340  * Leading zero bytes are ignored, and the number of bytes is counted.
    341  * A zero byte mapping result is possible as a roundtrip result.
    342  * For some output types, the actual result is processed from this;
    343  * see ucnv_MBCSFromUnicodeWithOffsets().
    344  *
    345  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
    346  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
    347  *
    348  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
    349  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
    350  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    351  * ASCII code points can be looked up with a linear array access into stage 3.
    352  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
    353  *
    354  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
    355  * for compaction.
    356  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
    357  * may overlap by any number of entries.
    358  *
    359  * MBCS version 2 added:
    360  * the converter checks for known output types, which allows
    361  * adding new ones without crashing an unaware converter
    362  */
    363 
    364 /**
    365  * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
    366  * consecutive sequences of bytes, starting from the one encoded in value,
    367  * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
    368  * Does not currently support m:n mappings or reverse fallbacks.
    369  * This function will not be called for sequences of bytes with leading zeros.
    370  *
    371  * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
    372  * @param value contains 1..4 bytes of the first byte sequence, right-aligned
    373  * @param codePoints resulting Unicode code points, or negative if a byte sequence does
    374  *        not map to anything
    375  * @return TRUE to continue enumeration, FALSE to stop
    376  */
    377 typedef UBool U_CALLCONV
    378 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
    379 
    380 static void U_CALLCONV
    381 ucnv_MBCSLoad(UConverterSharedData *sharedData,
    382           UConverterLoadArgs *pArgs,
    383           const uint8_t *raw,
    384           UErrorCode *pErrorCode);
    385 
    386 static void U_CALLCONV
    387 ucnv_MBCSUnload(UConverterSharedData *sharedData);
    388 
    389 static void U_CALLCONV
    390 ucnv_MBCSOpen(UConverter *cnv,
    391               UConverterLoadArgs *pArgs,
    392               UErrorCode *pErrorCode);
    393 
    394 static UChar32 U_CALLCONV
    395 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
    396                   UErrorCode *pErrorCode);
    397 
    398 static void U_CALLCONV
    399 ucnv_MBCSGetStarters(const UConverter* cnv,
    400                  UBool starters[256],
    401                  UErrorCode *pErrorCode);
    402 
    403 U_CDECL_BEGIN
    404 static const char* U_CALLCONV
    405 ucnv_MBCSGetName(const UConverter *cnv);
    406 U_CDECL_END
    407 
    408 static void U_CALLCONV
    409 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
    410               int32_t offsetIndex,
    411               UErrorCode *pErrorCode);
    412 
    413 static UChar32 U_CALLCONV
    414 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
    415                   UErrorCode *pErrorCode);
    416 
    417 static void U_CALLCONV
    418 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    419                   UConverterToUnicodeArgs *pToUArgs,
    420                   UErrorCode *pErrorCode);
    421 
    422 static void U_CALLCONV
    423 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
    424                    const USetAdder *sa,
    425                    UConverterUnicodeSet which,
    426                    UErrorCode *pErrorCode);
    427 
    428 static void U_CALLCONV
    429 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    430                   UConverterToUnicodeArgs *pToUArgs,
    431                   UErrorCode *pErrorCode);
    432 
    433 static const UConverterImpl _SBCSUTF8Impl={
            /*
             * Positional function table. Its entries are the same as those of
             * _MBCSImpl except for the last one, which is ucnv_SBCSFromUTF8
             * here and NULL there.
             * NOTE(review): the meaning of each position (including the NULL
             * entries) is fixed by the UConverterImpl declaration, which is not
             * in this file -- presumably ucnv_cnv.h; confirm before reordering.
             */
    434     UCNV_MBCS,
    435 
    436     ucnv_MBCSLoad,
    437     ucnv_MBCSUnload,
    438 
    439     ucnv_MBCSOpen,
    440     NULL,
    441     NULL,
    442 
            /* the same function is supplied for both adjacent to-Unicode entries */
    443     ucnv_MBCSToUnicodeWithOffsets,
    444     ucnv_MBCSToUnicodeWithOffsets,
            /* likewise for both adjacent from-Unicode entries */
    445     ucnv_MBCSFromUnicodeWithOffsets,
    446     ucnv_MBCSFromUnicodeWithOffsets,
    447     ucnv_MBCSGetNextUChar,
    448 
    449     ucnv_MBCSGetStarters,
    450     ucnv_MBCSGetName,
    451     ucnv_MBCSWriteSub,
    452     NULL,
    453     ucnv_MBCSGetUnicodeSet,
    454 
    455     NULL,
    456     ucnv_SBCSFromUTF8 /* the only entry that differs from _MBCSImpl */
    457 };
    458 
    459 static const UConverterImpl _DBCSUTF8Impl={
            /*
             * Positional function table. Its entries are the same as those of
             * _MBCSImpl except for the last one, which is ucnv_DBCSFromUTF8
             * here and NULL there.
             * NOTE(review): the meaning of each position (including the NULL
             * entries) is fixed by the UConverterImpl declaration, which is not
             * in this file -- presumably ucnv_cnv.h; confirm before reordering.
             */
    460     UCNV_MBCS,
    461 
    462     ucnv_MBCSLoad,
    463     ucnv_MBCSUnload,
    464 
    465     ucnv_MBCSOpen,
    466     NULL,
    467     NULL,
    468 
            /* the same function is supplied for both adjacent to-Unicode entries */
    469     ucnv_MBCSToUnicodeWithOffsets,
    470     ucnv_MBCSToUnicodeWithOffsets,
            /* likewise for both adjacent from-Unicode entries */
    471     ucnv_MBCSFromUnicodeWithOffsets,
    472     ucnv_MBCSFromUnicodeWithOffsets,
    473     ucnv_MBCSGetNextUChar,
    474 
    475     ucnv_MBCSGetStarters,
    476     ucnv_MBCSGetName,
    477     ucnv_MBCSWriteSub,
    478     NULL,
    479     ucnv_MBCSGetUnicodeSet,
    480 
    481     NULL,
    482     ucnv_DBCSFromUTF8 /* the only entry that differs from _MBCSImpl */
    483 };
    484 
    485 static const UConverterImpl _MBCSImpl={
            /*
             * Positional function table; its address is stored in _MBCSData.
             * _SBCSUTF8Impl and _DBCSUTF8Impl repeat these entries and replace
             * only the final NULL with ucnv_SBCSFromUTF8 / ucnv_DBCSFromUTF8.
             * NOTE(review): the meaning of each position (including the NULL
             * entries) is fixed by the UConverterImpl declaration, which is not
             * in this file -- presumably ucnv_cnv.h; confirm before reordering.
             */
    486     UCNV_MBCS,
    487 
    488     ucnv_MBCSLoad,
    489     ucnv_MBCSUnload,
    490 
    491     ucnv_MBCSOpen,
    492     NULL,
    493     NULL,
    494 
            /* the same function is supplied for both adjacent to-Unicode entries */
    495     ucnv_MBCSToUnicodeWithOffsets,
    496     ucnv_MBCSToUnicodeWithOffsets,
            /* likewise for both adjacent from-Unicode entries */
    497     ucnv_MBCSFromUnicodeWithOffsets,
    498     ucnv_MBCSFromUnicodeWithOffsets,
    499     ucnv_MBCSGetNextUChar,
    500 
    501     ucnv_MBCSGetStarters,
    502     ucnv_MBCSGetName,
    503     ucnv_MBCSWriteSub,
    504     NULL,
    505     ucnv_MBCSGetUnicodeSet,
    506     NULL,
    507     NULL /* the UTF-8 variants of this table put a function here */
    508 };
    509 
    510 /* Static data is in tools/makeconv/ucnvstat.c for data-based
    511  * converters. Be sure to update it as well.
    512  */
    513 
    514 const UConverterSharedData _MBCSData={
            /*
             * Positional initializer for the shared-data object (non-static,
             * so it has external linkage). The visible links are: the first
             * value is the structure's own size, the implementation pointer is
             * &_MBCSImpl, and the last value is UCNV_MBCS_TABLE_INITIALIZER.
             * NOTE(review): the meaning of the remaining values (1, NULL, NULL,
             * FALSE, TRUE, 0) depends on the UConverterSharedData field order,
             * which is declared outside this file -- confirm there.
             */
    515     sizeof(UConverterSharedData), 1,
    516     NULL, NULL, FALSE, TRUE, &_MBCSImpl,
    517     0, UCNV_MBCS_TABLE_INITIALIZER
    518 };
    519 
    520 
    521 /* GB 18030 data ------------------------------------------------------------ */
    522 
    523 /* helper macros for linear values for GB 18030 four-byte sequences */
    524 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
    525 
    526 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
    527 
    528 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
    529 
    530 /*
    531  * Some ranges of GB 18030 where both the Unicode code points and the
    532  * GB four-byte sequences are contiguous and are handled algorithmically by
    533  * the special callback functions below.
    534  * The values are start & end of Unicode & GB codes.
    535  *
    536  * Note that single surrogates are not mapped by GB 18030
    537  * as of the re-released mapping tables from 2000-nov-30.
    538  */
    539 static const uint32_t
    540 gb18030Ranges[14][4]={
            /*
             * One row per range, four values each:
             *   [0] first Unicode code point   [1] last Unicode code point
             *   [2] LINEAR() value of the first GB four-byte sequence
             *   [3] LINEAR() value of the last GB four-byte sequence
             * The GB sequences are written as packed 32-bit literals and are
             * converted to linear values by the LINEAR() macro.
             */
    541     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    542     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    543     {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
    544     {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
    545     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    546     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    547     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    548     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    549     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    550     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    551     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    552     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    553     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    554     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
    555 };
    556 
    557 /* bit flag for UConverter.options indicating GB 18030 special handling */
    558 #define _MBCS_OPTION_GB18030 0x8000
    559 
    560 /* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
    561 #define _MBCS_OPTION_KEIS 0x01000
    562 #define _MBCS_OPTION_JEF  0x02000
    563 #define _MBCS_OPTION_JIPS 0x04000
    564 
    565 #define KEIS_SO_CHAR_1 0x0A
    566 #define KEIS_SO_CHAR_2 0x42
    567 #define KEIS_SI_CHAR_1 0x0A
    568 #define KEIS_SI_CHAR_2 0x41
    569 
    570 #define JEF_SO_CHAR 0x28
    571 #define JEF_SI_CHAR 0x29
    572 
    573 #define JIPS_SO_CHAR_1 0x1A
    574 #define JIPS_SO_CHAR_2 0x70
    575 #define JIPS_SI_CHAR_1 0x1A
    576 #define JIPS_SI_CHAR_2 0x71
    577 
    578 enum SISO_Option {
    579     SI, /* selects the Shift-In byte sequence in getSISOBytes() */
    580     SO  /* selects the Shift-Out byte sequence in getSISOBytes() */
    581 };
    582 typedef enum SISO_Option SISO_Option; /* lets the type be named without the enum keyword */
    583 
    584 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
    585     int32_t SISOLength = 0;
    586 
    587     switch (option) {
    588         case SI:
    589             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    590                 value[0] = KEIS_SI_CHAR_1;
    591                 value[1] = KEIS_SI_CHAR_2;
    592                 SISOLength = 2;
    593             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    594                 value[0] = JEF_SI_CHAR;
    595                 SISOLength = 1;
    596             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    597                 value[0] = JIPS_SI_CHAR_1;
    598                 value[1] = JIPS_SI_CHAR_2;
    599                 SISOLength = 2;
    600             } else {
    601                 value[0] = UCNV_SI;
    602                 SISOLength = 1;
    603             }
    604             break;
    605         case SO:
    606             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    607                 value[0] = KEIS_SO_CHAR_1;
    608                 value[1] = KEIS_SO_CHAR_2;
    609                 SISOLength = 2;
    610             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    611                 value[0] = JEF_SO_CHAR;
    612                 SISOLength = 1;
    613             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    614                 value[0] = JIPS_SO_CHAR_1;
    615                 value[1] = JIPS_SO_CHAR_2;
    616                 SISOLength = 2;
    617             } else {
    618                 value[0] = UCNV_SO;
    619                 SISOLength = 1;
    620             }
    621             break;
    622         default:
    623             /* Should never happen. */
    624             break;
    625     }
    626 
    627     return SISOLength;
    628 }
    629 
    630 /* Miscellaneous ------------------------------------------------------------ */
    631 
    632 /* similar to ucnv_MBCSGetNextUChar() but recursive */
    633 static UBool
    634 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
    635         int32_t state, uint32_t offset,
    636         uint32_t value,
    637         UConverterEnumToUCallback *callback, const void *context,
    638         UErrorCode *pErrorCode) {
    639     UChar32 codePoints[32];
    640     const int32_t *row;
    641     const uint16_t *unicodeCodeUnits;
    642     UChar32 anyCodePoints;
    643     int32_t b, limit;
    644 
    645     row=mbcsTable->stateTable[state];
    646     unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
    647 
    648     value<<=8;
    649     anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
    650 
    651     b=(stateProps[state]&0x38)<<2;
    652     if(b==0 && stateProps[state]>=0x40) {
    653         /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
    654         codePoints[0]=U_SENTINEL;
    655         b=1;
    656     }
    657     limit=((stateProps[state]&7)+1)<<5;
    658     while(b<limit) {
    659         int32_t entry=row[b];
    660         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    661             int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
    662             if(stateProps[nextState]>=0) {
    663                 /* recurse to a state with non-ignorable actions */
    664                 if(!enumToU(
    665                         mbcsTable, stateProps, nextState,
    666                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
    667                         value|(uint32_t)b,
    668                         callback, context,
    669                         pErrorCode)) {
    670                     return FALSE;
    671                 }
    672             }
    673             codePoints[b&0x1f]=U_SENTINEL;
    674         } else {
    675             UChar32 c;
    676             int32_t action;
    677 
    678             /*
    679              * An if-else-if chain provides more reliable performance for
    680              * the most common cases compared to a switch.
    681              */
    682             action=MBCS_ENTRY_FINAL_ACTION(entry);
    683             if(action==MBCS_STATE_VALID_DIRECT_16) {
    684                 /* output BMP code point */
    685                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    686             } else if(action==MBCS_STATE_VALID_16) {
    687                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    688                 c=unicodeCodeUnits[finalOffset];
    689                 if(c<0xfffe) {
    690                     /* output BMP code point */
    691                 } else {
    692                     c=U_SENTINEL;
    693                 }
    694             } else if(action==MBCS_STATE_VALID_16_PAIR) {
    695                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    696                 c=unicodeCodeUnits[finalOffset++];
    697                 if(c<0xd800) {
    698                     /* output BMP code point below 0xd800 */
    699                 } else if(c<=0xdbff) {
    700                     /* output roundtrip or fallback supplementary code point */
    701                     c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
    702                 } else if(c==0xe000) {
    703                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
    704                     c=unicodeCodeUnits[finalOffset];
    705                 } else {
    706                     c=U_SENTINEL;
    707                 }
    708             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
    709                 /* output supplementary code point */
    710                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
    711             } else {
    712                 c=U_SENTINEL;
    713             }
    714 
    715             codePoints[b&0x1f]=c;
    716             anyCodePoints&=c;
    717         }
    718         if(((++b)&0x1f)==0) {
    719             if(anyCodePoints>=0) {
    720                 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
    721                     return FALSE;
    722                 }
    723                 anyCodePoints=-1;
    724             }
    725         }
    726     }
    727     return TRUE;
    728 }
    729 
     730 /*
          * Computes the enumeration properties of one state and stores them in
          * stateProps[state]; the bit layout is described at the stateProps
          * declaration in ucnv_MBCSEnumToUnicode().
          * While scanning the row, every state named by an entry whose stateProps
          * value is still -1 is processed by a recursive call.
          *
     731  * Only called if stateProps[state]==-1.
     732  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
     733  * MBCS_STATE_CHANGE_ONLY.
          *
          * @param stateTable the to-Unicode state table, one row of 256 entries per state
          * @param stateProps in/out array of per-state properties, indexed by state number
          * @param state the state whose row is examined
          * @return the value of stateProps[state] at the time of return
     734  */
     735 static int8_t
     736 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
     737     const int32_t *row;
     738     int32_t min, max, entry, nextState;
     739 
     740     row=stateTable[state];
     741     stateProps[state]=0;  /* no longer -1, so the ==-1 tests below will not recurse into this state again */
     742 
     743     /* find the first byte value (min) whose entry is not ignorable */
     744     for(min=0;; ++min) {
     745         entry=row[min];
     746         nextState=MBCS_ENTRY_STATE(entry);
     747         if(stateProps[nextState]==-1) {
     748             getStateProp(stateTable, stateProps, nextState);
     749         }
     750         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
     751             if(stateProps[nextState]>=0) {
     752                 break;  /* transition to a state that is not marked as ignorable-only */
     753             }
     754         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
     755             break;  /* final entry with an action below MBCS_STATE_UNASSIGNED */
     756         }
     757         if(min==0xff) {  /* all 256 entries were ignorable */
     758             stateProps[state]=-0x40;  /* (int8_t)0xc0 */
     759             return stateProps[state];
     760         }
     761     }
     762     stateProps[state]|=(int8_t)((min>>5)<<3);  /* bits 5..3: min/32 */
     763 
     764     /* find the last byte value (max) whose entry is not ignorable; stops at min at the latest */
     765     for(max=0xff; min<max; --max) {
     766         entry=row[max];
     767         nextState=MBCS_ENTRY_STATE(entry);
     768         if(stateProps[nextState]==-1) {
     769             getStateProp(stateTable, stateProps, nextState);
     770         }
     771         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
     772             if(stateProps[nextState]>=0) {
     773                 break;
     774             }
     775         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
     776             break;
     777         }
     778     }
     779     stateProps[state]|=(int8_t)(max>>5);  /* bits 2..0: max/32 */
     780 
     781     /* recurse further and collect direct-state information */
     782     while(min<=max) {
     783         entry=row[min];
     784         nextState=MBCS_ENTRY_STATE(entry);
     785         if(stateProps[nextState]==-1) {
     786             getStateProp(stateTable, stateProps, nextState);
     787         }
     788         if(MBCS_ENTRY_IS_FINAL(entry)) {
     789             stateProps[nextState]|=0x40;  /* the state named by a final entry gets the 0x40 flag */
     790             if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
     791                 stateProps[state]|=0x40;  /* this state itself has a final entry with such an action */
     792             }
     793         }
     794         ++min;
     795     }
     796     return stateProps[state];
     797 }
    798 
     799 /*
     800  * Internal function enumerating the toUnicode data of an MBCS converter.
     801  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
     802  * table, but could also be used for a future ucnv_getUnicodeSet() option
     803  * that includes reverse fallbacks (after updating this function's implementation).
     804  * Currently only handles roundtrip mappings.
     805  * Does not currently handle extensions.
          *
          * The work is done by enumToU(), which is started once for every state whose
          * stateProps value is >=0x40. enumToU() calls callback(context, value, codePoints)
          * for blocks of 32 consecutive byte values that contain at least one mapping.
          *
          * @param mbcsTable the table whose stateTable (and, via enumToU(), unicodeCodeUnits) is read
          * @param callback function passed on to enumToU()
          * @param context opaque pointer passed on to the callback
          * @param pErrorCode passed on to enumToU(); neither read nor written in this function
     806  */
     807 static void
     808 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
     809                        UConverterEnumToUCallback *callback, const void *context,
     810                        UErrorCode *pErrorCode) {
     811     /*
     812      * Properties for each state, to speed up the enumeration.
     813      * Ignorable actions are unassigned/illegal/state-change-only:
     814      * They do not lead to mappings.
     815      *
     816      * Bits 7..6:
     817      * 1 direct/initial state (stateful converters have multiple)
     818      * 0 non-initial state with transitions or with non-ignorable result actions
     819      * -1 final state with only ignorable actions
     820      *
     821      * Bits 5..3:
     822      * The lowest byte value with non-ignorable actions is
     823      * value<<5 (rounded down).
     824      *
     825      * Bits 2..0:
     826      * The highest byte value with non-ignorable actions is
     827      * (value<<5)|0x1f (rounded up).
     828      */
     829     int8_t stateProps[MBCS_MAX_STATE_COUNT];
     830     int32_t state;
     831 
     832     uprv_memset(stateProps, -1, sizeof(stateProps));  /* every entry becomes -1, the "not yet computed" marker tested by getStateProp() */
     833 
     834     /* recurse from state 0 and set all stateProps */
     835     getStateProp(mbcsTable->stateTable, stateProps, 0);
     836 
     837     for(state=0; state<mbcsTable->countStates; ++state) {
     838         /*if(stateProps[state]==-1) {
     839             printf("unused/unreachable <icu:state> %d\n", state);
     840         }*/
     841         if(stateProps[state]>=0x40) {
     842             /* start from each direct state */
                  /* NOTE(review): the UBool result of enumToU() is ignored, so the loop continues with the next state even if it returned FALSE */
     843             enumToU(
     844                 mbcsTable, stateProps, state, 0, 0,
     845                 callback, context,
     846                 pErrorCode);
     847         }
     848     }
     849 }
    850 
     851 U_CFUNC void
     852 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
     853                                          const USetAdder *sa,
     854                                          UConverterUnicodeSet which,
     855                                          UConverterSetFilter filter,
     856                                          UErrorCode *pErrorCode) {
              /*
               * Walks the from-Unicode trie of sharedData->mbcs and calls sa->add(sa->set, c)
               * for every code point c whose stored result is accepted by "which" and
               * "filter"; afterwards calls ucnv_extGetUnicodeSet() with the same arguments.
               *
               * which:      UCNV_ROUNDTRIP_SET restricts the result to roundtrip mappings;
               *             UCNV_ROUNDTRIP_AND_FALLBACK_SET also accepts fallback results.
               * filter:     only consulted for output types other than MBCS_OUTPUT_1;
               *             the MBCS_OUTPUT_1 branch never reads it.
               * pErrorCode: set to U_INTERNAL_PROGRAM_ERROR when a non-empty stage 3 block is
               *             reached with a filter value that has no case below; the function
               *             then returns without calling ucnv_extGetUnicodeSet().
               *             It is not tested on entry.
               */
     857     const UConverterMBCSTable *mbcsTable;
     858     const uint16_t *table;
     859 
     860     uint32_t st3;
     861     uint16_t st1, maxStage1, st2;
     862 
     863     UChar32 c;
     864 
     865     /* enumerate the from-Unicode trie table */
     866     mbcsTable=&sharedData->mbcs;
     867     table=mbcsTable->fromUnicodeTable;
     868     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
     869         maxStage1=0x440;  /* 0x440 stage 1 entries * 1024 code points each = 0x110000 */
     870     } else {
     871         maxStage1=0x40;  /* 0x40 stage 1 entries * 1024 code points each = 0x10000 */
     872     }
     873 
     874     c=0; /* keep track of the current code point while enumerating */
     875 
     876     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
     877         const uint16_t *stage2, *stage3, *results;
     878         uint16_t minValue;
     879 
     880         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
     881 
     882         /*
     883          * Set a threshold variable for selecting which mappings to use.
     884          * See ucnv_MBCSSingleFromBMPWithOffsets() and
     885          * MBCS_SINGLE_RESULT_FROM_U() for details.
     886          */
     887         if(which==UCNV_ROUNDTRIP_SET) {
     888             /* use only roundtrips */
     889             minValue=0xf00;
     890         } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
     891             /* use all roundtrip and fallback results */
     892             minValue=0x800;
     893         }
     894 
     895         for(st1=0; st1<maxStage1; ++st1) {
     896             st2=table[st1];
     897             if(st2>maxStage1) {
     898                 stage2=table+st2;
     899                 for(st2=0; st2<64; ++st2) {
     900                     if((st3=stage2[st2])!=0) {
     901                         /* read the stage 3 block */
     902                         stage3=results+st3;
     903 
     904                         do {
     905                             if(*stage3++>=minValue) {
     906                                 sa->add(sa->set, c);
     907                             }
     908                         } while((++c&0xf)!=0);  /* a stage 3 block covers 16 code points */
     909                     } else {
     910                         c+=16; /* empty stage 3 block */
     911                     }
     912                 }
     913             } else {
     914                 c+=1024; /* empty stage 2 block */
     915             }
     916         }
     917     } else {
     918         const uint32_t *stage2;
     919         const uint8_t *stage3, *bytes;
     920         uint32_t st3Multiplier;
     921         uint32_t value;
     922         UBool useFallback;
     923 
     924         bytes=mbcsTable->fromUnicodeBytes;
     925 
     926         useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
     927 
              /* st3Multiplier is the number of bytes by which stage3 advances per code point */
     928         switch(mbcsTable->outputType) {
     929         case MBCS_OUTPUT_3:
     930         case MBCS_OUTPUT_4_EUC:
     931             st3Multiplier=3;
     932             break;
     933         case MBCS_OUTPUT_4:
     934             st3Multiplier=4;
     935             break;
     936         default:
     937             st3Multiplier=2;
     938             break;
     939         }
     940 
     941         for(st1=0; st1<maxStage1; ++st1) {
     942             st2=table[st1];
     943             if(st2>(maxStage1>>1)) {
     944                 stage2=(const uint32_t *)table+st2;  /* st2 counts 32-bit units here */
     945                 for(st2=0; st2<64; ++st2) {
     946                     if((st3=stage2[st2])!=0) {
     947                         /* read the stage 3 block: the low 16 bits of the stage 2 entry are its index */
     948                         stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
     949 
     950                         /* get the roundtrip flags for the stage 3 block: one bit per code point, lowest bit first */
     951                         st3>>=16;
     952 
     953                         /*
     954                          * Add code points for which the roundtrip flag is set,
     955                          * or which map to non-zero bytes if we use fallbacks.
     956                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
     957                          */
     958                         switch(filter) {
     959                         case UCNV_SET_FILTER_NONE:
     960                             do {
     961                                 if(st3&1) {
     962                                     sa->add(sa->set, c);
     963                                     stage3+=st3Multiplier;
     964                                 } else if(useFallback) {
     965                                     uint8_t b=0;  /* OR of all bytes of this result; non-zero means there is a fallback */
     966                                     switch(st3Multiplier) {
     967                                     case 4:
     968                                         b|=*stage3++;
     969                                         U_FALLTHROUGH;
     970                                     case 3:
     971                                         b|=*stage3++;
     972                                         U_FALLTHROUGH;
     973                                     case 2:
     974                                         b|=stage3[0]|stage3[1];
     975                                         stage3+=2;
     976                                         U_FALLTHROUGH;
     977                                     default:
     978                                         break;
     979                                     }
     980                                     if(b!=0) {
     981                                         sa->add(sa->set, c);
     982                                     }
     983                                 }
                                      /* when neither branch runs, stage3 is not advanced; it is only read in the useFallback branch */
     984                                 st3>>=1;
     985                             } while((++c&0xf)!=0);
     986                             break;
     987                         case UCNV_SET_FILTER_DBCS_ONLY:
     988                              /* Ignore single-byte results (<0x100). */
                                   /* NOTE(review): this case and the 16-bit cases below step by 2 bytes, i.e. they assume st3Multiplier==2; not checked here */
     989                             do {
     990                                 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
     991                                     sa->add(sa->set, c);
     992                                 }
     993                                 st3>>=1;
     994                                 stage3+=2;  /* +=st3Multiplier */
     995                             } while((++c&0xf)!=0);
     996                             break;
     997                         case UCNV_SET_FILTER_2022_CN:
     998                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                                   /* only the first of the 3 result bytes is tested (0x81 or 0x82) */
     999                             do {
    1000                                 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
    1001                                     sa->add(sa->set, c);
    1002                                 }
    1003                                 st3>>=1;
    1004                                 stage3+=3;  /* +=st3Multiplier */
    1005                             } while((++c&0xf)!=0);
    1006                             break;
    1007                         case UCNV_SET_FILTER_SJIS:
    1008                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
    1009                             do {
    1010                                 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
    1011                                     sa->add(sa->set, c);
    1012                                 }
    1013                                 st3>>=1;
    1014                                 stage3+=2;  /* +=st3Multiplier */
    1015                             } while((++c&0xf)!=0);
    1016                             break;
    1017                         case UCNV_SET_FILTER_GR94DBCS:
    1018                             /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
    1019                             do {
    1020                                 if( ((st3&1)!=0 || useFallback) &&
    1021                                     (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&  /* 16-bit value in A1A1..FEFE */
    1022                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)  /* low byte in A1..FE */
    1023                                 ) {
    1024                                     sa->add(sa->set, c);
    1025                                 }
    1026                                 st3>>=1;
    1027                                 stage3+=2;  /* +=st3Multiplier */
    1028                             } while((++c&0xf)!=0);
    1029                             break;
    1030                         case UCNV_SET_FILTER_HZ:
    1031                             /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
    1032                             do {
    1033                                 if( ((st3&1)!=0 || useFallback) &&
    1034                                     (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&  /* 16-bit value in A1A1..FDFE */
    1035                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)  /* low byte in A1..FE */
    1036                                 ) {
    1037                                     sa->add(sa->set, c);
    1038                                 }
    1039                                 st3>>=1;
    1040                                 stage3+=2;  /* +=st3Multiplier */
    1041                             } while((++c&0xf)!=0);
    1042                             break;
    1043                         default:
    1044                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;  /* filter value without a case above */
    1045                             return;
    1046                         }
    1047                     } else {
    1048                         c+=16; /* empty stage 3 block */
    1049                     }
    1050                 }
    1051             } else {
    1052                 c+=1024; /* empty stage 2 block */
    1053             }
    1054         }
    1055     }
    1056 
              /* presumably adds the code points covered by the extension table (see ucnv_ext) -- confirm */
    1057     ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
    1058 }
   1059 
    1060 U_CFUNC void
    1061 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
    1062                                  const USetAdder *sa,
    1063                                  UConverterUnicodeSet which,
    1064                                  UErrorCode *pErrorCode) {
              /*
               * Convenience wrapper around ucnv_MBCSGetFilteredUnicodeSetForUnicode():
               * passes UCNV_SET_FILTER_DBCS_ONLY when the table's outputType is
               * MBCS_OUTPUT_DBCS_ONLY and UCNV_SET_FILTER_NONE otherwise.
               * All other arguments are forwarded unchanged.
               */
    1065     ucnv_MBCSGetFilteredUnicodeSetForUnicode(
    1066         sharedData, sa, which,
    1067         sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
    1068             UCNV_SET_FILTER_DBCS_ONLY :
    1069             UCNV_SET_FILTER_NONE,
    1070         pErrorCode);
    1071 }
   1072 
    1073 static void U_CALLCONV
    1074 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
    1075                    const USetAdder *sa,
    1076                    UConverterUnicodeSet which,
    1077                    UErrorCode *pErrorCode) {
              /*
               * Adds the code points that cnv can convert to the set behind sa.
               * With the _MBCS_OPTION_GB18030 option, "which" and pErrorCode are not used:
               * the ranges 0..0xd7ff and 0xe000..0x10ffff are added directly, which is
               * every code point except 0xd800..0xdfff.
               * Otherwise the shared table is enumerated via ucnv_MBCSGetUnicodeSetForUnicode().
               */
    1078     if(cnv->options&_MBCS_OPTION_GB18030) {
    1079         sa->addRange(sa->set, 0, 0xd7ff);
    1080         sa->addRange(sa->set, 0xe000, 0x10ffff);
    1081     } else {
    1082         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
    1083     }
    1084 }
   1085 
   1086 /* conversion extensions for input not in the main table -------------------- */
   1087 
   1088 /*
   1089  * Hardcoded extension handling for GB 18030.
    1090  * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
   1091  *
   1092  * In the future, conversion extensions may handle m:n mappings and delta tables,
   1093  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
   1094  *
   1095  * If an input character cannot be mapped, then these functions set an error
   1096  * code. The framework will then call the callback function.
   1097  */
   1098 
    1099 /*
          * Handles a code point that the caller passes in as cp, in two steps:
          * 1. if sharedData->mbcs.extIndexes is not NULL, try ucnv_extInitialMatchFromU();
          * 2. if cnv has the _MBCS_OPTION_GB18030 option, look cp up in gb18030Ranges and,
          *    when found, write the computed four-byte sequence with ucnv_fromUWriteBytes().
          * If neither step produces output, *pErrorCode is set to U_INVALID_CHAR_FOUND.
          * cnv->useSubChar1 is always reset to FALSE first.
          *
    1100  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
    1101  *         else return 0 after output has been written to the target
    1102  */
    1103 static UChar32
    1104 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
    1105           UChar32 cp,
    1106           const UChar **source, const UChar *sourceLimit,
    1107           uint8_t **target, const uint8_t *targetLimit,
    1108           int32_t **offsets, int32_t sourceIndex,
    1109           UBool flush,
    1110           UErrorCode *pErrorCode) {
    1111     const int32_t *cx;
    1112 
    1113     cnv->useSubChar1=FALSE;
    1114 
    1115     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
    1116         ucnv_extInitialMatchFromU(
    1117             cnv, cx,
    1118             cp, source, sourceLimit,
    1119             (char **)target, (char *)targetLimit,
    1120             offsets, sourceIndex,
    1121             flush,
    1122             pErrorCode)
    1123     ) {
    1124         return 0; /* an extension mapping handled the input */
    1125     }
    1126 
    1127     /* GB 18030 */
    1128     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
    1129         const uint32_t *range;
    1130         int32_t i;
    1131 
              /* each step of 4 moves to the next range: [0],[1] = first/last code point, [2] = linear value of the first byte sequence */
    1132         range=gb18030Ranges[0];
    1133         for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
    1134             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
    1135                 /* found the Unicode code point, output the four-byte sequence for it */
    1136                 uint32_t linear;
    1137                 char bytes[4];
    1138 
    1139                 /* get the linear value of the first GB 18030 code in this range */
    1140                 linear=range[2]-LINEAR_18030_BASE;
    1141 
    1142                 /* add the offset from the beginning of the range */
    1143                 linear+=((uint32_t)cp-range[0]);
    1144 
    1145                 /* turn this into a four-byte sequence: bytes 1 and 3 take 10 values from 0x30, bytes 0 and 2 count from 0x81 (byte 2 has 126 values) */
    1146                 bytes[3]=(char)(0x30+linear%10); linear/=10;
    1147                 bytes[2]=(char)(0x81+linear%126); linear/=126;
    1148                 bytes[1]=(char)(0x30+linear%10); linear/=10;
    1149                 bytes[0]=(char)(0x81+linear);
    1150 
    1151                 /* output this sequence */
    1152                 ucnv_fromUWriteBytes(cnv,
    1153                                      bytes, 4, (char **)target, (char *)targetLimit,
    1154                                      offsets, sourceIndex, pErrorCode);
    1155                 return 0;
    1156             }
    1157         }
    1158     }
    1159 
    1160     /* no mapping */
    1161     *pErrorCode=U_INVALID_CHAR_FOUND;
    1162     return cp;
    1163 }
   1164 
    1165 /*
          * Counterpart of _extFromU() for the toUnicode direction:
          * 1. if sharedData->mbcs.extIndexes is not NULL, try ucnv_extInitialMatchToU();
          * 2. if length is 4 and cnv has the _MBCS_OPTION_GB18030 option, look the byte
          *    sequence up in gb18030Ranges and, when found, write the code point with
          *    ucnv_toUWriteCodePoint().
          * If neither step produces output, *pErrorCode is set to U_INVALID_CHAR_FOUND.
          *
    1166  * Input sequence: cnv->toUBytes[0..length[
    1167  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
    1168  *         else return 0 after output has been written to the target
    1169  */
    1170 static int8_t
    1171 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
    1172         int8_t length,
    1173         const uint8_t **source, const uint8_t *sourceLimit,
    1174         UChar **target, const UChar *targetLimit,
    1175         int32_t **offsets, int32_t sourceIndex,
    1176         UBool flush,
    1177         UErrorCode *pErrorCode) {
    1178     const int32_t *cx;
    1179 
    1180     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
    1181         ucnv_extInitialMatchToU(
    1182             cnv, cx,
    1183             length, (const char **)source, (const char *)sourceLimit,
    1184             target, targetLimit,
    1185             offsets, sourceIndex,
    1186             flush,
    1187             pErrorCode)
    1188     ) {
    1189         return 0; /* an extension mapping handled the input */
    1190     }
    1191 
    1192     /* GB 18030 */
    1193     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
    1194         const uint32_t *range;
    1195         uint32_t linear;
    1196         int32_t i;
    1197 
              /* combine the four input bytes into one linear value that can be compared with range[2]..range[3] */
    1198         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
    1199         range=gb18030Ranges[0];
    1200         for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
    1201             if(range[2]<=linear && linear<=range[3]) {
    1202                 /* found the sequence, output the Unicode code point for it */
    1203                 *pErrorCode=U_ZERO_ERROR;  /* reset the error code before writing the result */
    1204 
    1205                 /* add the linear difference between the input and start sequences to the start code point */
    1206                 linear=range[0]+(linear-range[2]);
    1207 
    1208                 /* output this code point */
    1209                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
    1210 
    1211                 return 0;
    1212             }
    1213         }
    1214     }
    1215 
    1216     /* no mapping */
    1217     *pErrorCode=U_INVALID_CHAR_FOUND;
    1218     return length;
    1219 }
   1220 
   1221 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
   1222 
   1223 /*
   1224  * This code modifies a standard EBCDIC<->Unicode mapping table for
   1225  * OS/390 (z/OS) Unix System Services (Open Edition).
   1226  * The difference is in the mapping of Line Feed and New Line control codes:
   1227  * Standard EBCDIC maps
   1228  *
   1229  *   <U000A> \x25 |0
   1230  *   <U0085> \x15 |0
   1231  *
   1232  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
   1233  * mapping
   1234  *
   1235  *   <U000A> \x15 |0
   1236  *   <U0085> \x25 |0
   1237  *
   1238  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
   1239  * by copying it into allocated memory and swapping the LF and NL values.
    1240  * This makes it possible to support the same EBCDIC charset in both versions without
    1241  * duplicating the entire installed table.
   1242  */
   1243 
    1244 /* standard EBCDIC byte values for Line Feed (LF) and New Line (NL), as in the mappings listed above */
    1245 #define EBCDIC_LF 0x25
    1246 #define EBCDIC_NL 0x15
    1247 
    1248 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables (0xf00 | byte value) */
    1249 #define EBCDIC_RT_LF 0xf25
    1250 #define EBCDIC_RT_NL 0xf15
    1251 
    1252 /* Unicode code points for LF (U+000A) and NL (U+0085) */
    1253 #define U_LF 0x0a
    1254 #define U_NL 0x85
   1255 
    1256 static UBool
    1257 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
              /*
               * Builds LF/NL-swapped copies of the to-Unicode state table and of the
               * from-Unicode result bytes, plus a converter name with
               * UCNV_SWAP_LFNL_OPTION_STRING appended, and stores pointers to them in
               * sharedData->mbcs (swapLFNLStateTable, swapLFNLFromUnicodeBytes, swapLFNLName).
               *
               * Returns FALSE without touching *pErrorCode if the table does not have the
               * expected output type and standard LF/NL mappings.
               * Returns FALSE with *pErrorCode set to U_INVALID_FORMAT_ERROR if
               * fromUBytesLength is not greater than 0, or to U_MEMORY_ALLOCATION_ERROR
               * if the allocation fails.
               * Returns TRUE otherwise, including when swapLFNLStateTable was already set
               * at the time of the locked check (the newly built copy is then freed).
               */
    1258     UConverterMBCSTable *mbcsTable;
    1259 
    1260     const uint16_t *table, *results;
    1261     const uint8_t *bytes;
    1262 
    1263     int32_t (*newStateTable)[256];
    1264     uint16_t *newResults;
    1265     uint8_t *p;
    1266     char *name;
    1267 
    1268     uint32_t stage2Entry;
    1269     uint32_t size, sizeofFromUBytes;
    1270 
    1271     mbcsTable=&sharedData->mbcs;
    1272 
    1273     table=mbcsTable->fromUnicodeTable;
    1274     bytes=mbcsTable->fromUnicodeBytes;
    1275     results=(const uint16_t *)bytes;
    1276 
    1277     /*
    1278      * Check that this is an EBCDIC table with SBCS portion -
    1279      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
    1280      *
    1281      * If not, ignore the option. Options are always ignored if they do not apply.
    1282      */
    1283     if(!(
    1284          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
    1285          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
    1286          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
    1287     )) {
    1288         return FALSE;
    1289     }
    1290 
              /* the from-Unicode side must also hold the standard roundtrip mappings for LF and NL */
    1291     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
    1292         if(!(
    1293              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
    1294              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
    1295         )) {
    1296             return FALSE;
    1297         }
    1298     } else /* MBCS_OUTPUT_2_SISO */ {
    1299         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
    1300         if(!(
    1301              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
    1302              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
    1303         )) {
    1304             return FALSE;
    1305         }
    1306 
    1307         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
    1308         if(!(
    1309              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
    1310              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
    1311         )) {
    1312             return FALSE;
    1313         }
    1314     }
    1315 
    1316     if(mbcsTable->fromUBytesLength>0) {
    1317         /*
    1318          * We _know_ the number of bytes in the fromUnicodeBytes array
    1319          * starting with header.version 4.1.
    1320          */
    1321         sizeofFromUBytes=mbcsTable->fromUBytesLength;
    1322     } else {
    1323         /*
    1324          * Otherwise:
    1325          * There used to be code to enumerate the fromUnicode
    1326          * trie and find the highest entry, but it was removed in ICU 3.2
    1327          * because it was not tested and caused a low code coverage number.
    1328          * See Jitterbug 3674.
    1329          * This affects only some .cnv file formats with a header.version
    1330          * below 4.1, and only when swaplfnl is requested.
    1331          *
    1332          * ucnvmbcs.c revision 1.99 is the last one with the
    1333          * ucnv_MBCSSizeofFromUBytes() function.
    1334          */
    1335         *pErrorCode=U_INVALID_FORMAT_ERROR;
    1336         return FALSE;
    1337     }
    1338 
    1339     /*
    1340      * The table has an appropriate format.
    1341      * Allocate and build
    1342      * - a modified to-Unicode state table
    1343      * - a modified from-Unicode output array
    1344      * - a converter name string with the swap option appended
          *
          * All three live in one allocation, in this order; freeing the state table
          * pointer therefore frees everything.
    1345      */
    1346     size=
    1347         mbcsTable->countStates*1024+  /* 1024 bytes per state row, the size used for the memcpy below */
    1348         sizeofFromUBytes+
    1349         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
    1350     p=(uint8_t *)uprv_malloc(size);
    1351     if(p==NULL) {
    1352         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    1353         return FALSE;
    1354     }
    1355 
    1356     /* copy and modify the to-Unicode state table */
    1357     newStateTable=(int32_t (*)[256])p;
    1358     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
    1359 
    1360     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
    1361     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
    1362 
    1363     /* copy and modify the from-Unicode result table */
    1364     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];  /* first byte after the copied state rows */
    1365     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
    1366 
    1367     /* conveniently, the table access macros work on the left side of expressions */
    1368     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
    1369         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
    1370         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
    1371     } else /* MBCS_OUTPUT_2_SISO */ {
    1372         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
    1373         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
    1374 
    1375         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
    1376         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
    1377     }
    1378 
    1379     /* set the canonical converter name: it is stored right after the copied result bytes */
    1380     name=(char *)newResults+sizeofFromUBytes;
    1381     uprv_strcpy(name, sharedData->staticData->name);
    1382     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
    1383 
    1384     /* set the pointers */
          /* NOTE(review): NULL presumably selects ICU's global mutex -- confirm against umtx.h */
    1385     umtx_lock(NULL);
    1386     if(mbcsTable->swapLFNLStateTable==NULL) {
    1387         mbcsTable->swapLFNLStateTable=newStateTable;
    1388         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
    1389         mbcsTable->swapLFNLName=name;
    1390 
    1391         newStateTable=NULL;  /* ownership passed to mbcsTable; do not free below */
    1392     }
    1393     umtx_unlock(NULL);
    1394 
    1395     /* release the allocated memory if another thread beat us to it */
    1396     if(newStateTable!=NULL) {
    1397         uprv_free(newStateTable);
    1398     }
    1399     return TRUE;
    1400 }
   1401 
   1402 /* reconstitute omitted fromUnicode data ------------------------------------ */
   1403 
   1404 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
   1405 static UBool U_CALLCONV
   1406 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
   1407     UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
   1408     const uint16_t *table;
   1409     uint32_t *stage2;
   1410     uint8_t *bytes, *p;
   1411     UChar32 c;
   1412     int32_t i, st3;
   1413 
   1414     table=mbcsTable->fromUnicodeTable;
   1415     bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
   1416 
   1417     /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
   1418     switch(mbcsTable->outputType) {
   1419     case MBCS_OUTPUT_3_EUC:
   1420         if(value<=0xffff) {
   1421             /* short sequences are stored directly */
   1422             /* code set 0 or 1 */
   1423         } else if(value<=0x8effff) {
   1424             /* code set 2 */
   1425             value&=0x7fff;
   1426         } else /* first byte is 0x8f */ {
   1427             /* code set 3 */
   1428             value&=0xff7f;
   1429         }
   1430         break;
   1431     case MBCS_OUTPUT_4_EUC:
   1432         if(value<=0xffffff) {
   1433             /* short sequences are stored directly */
   1434             /* code set 0 or 1 */
   1435         } else if(value<=0x8effffff) {
   1436             /* code set 2 */
   1437             value&=0x7fffff;
   1438         } else /* first byte is 0x8f */ {
   1439             /* code set 3 */
   1440             value&=0xff7fff;
   1441         }
   1442         break;
   1443     default:
   1444         break;
   1445     }
   1446 
   1447     for(i=0; i<=0x1f; ++value, ++i) {
   1448         c=codePoints[i];
   1449         if(c<0) {
   1450             continue;
   1451         }
   1452 
   1453         /* locate the stage 2 & 3 data */
   1454         stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
   1455         p=bytes;
   1456         st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
   1457 
   1458         /* write the codepage bytes into stage 3 */
   1459         switch(mbcsTable->outputType) {
   1460         case MBCS_OUTPUT_3:
   1461         case MBCS_OUTPUT_4_EUC:
   1462             p+=st3*3;
   1463             p[0]=(uint8_t)(value>>16);
   1464             p[1]=(uint8_t)(value>>8);
   1465             p[2]=(uint8_t)value;
   1466             break;
   1467         case MBCS_OUTPUT_4:
   1468             ((uint32_t *)p)[st3]=value;
   1469             break;
   1470         default:
   1471             /* 2 bytes per character */
   1472             ((uint16_t *)p)[st3]=(uint16_t)value;
   1473             break;
   1474         }
   1475 
   1476         /* set the roundtrip flag */
   1477         *stage2|=(1UL<<(16+(c&0xf)));
   1478     }
   1479     return TRUE;
   1480  }
   1481 
   1482 static void
   1483 reconstituteData(UConverterMBCSTable *mbcsTable,
   1484                  uint32_t stage1Length, uint32_t stage2Length,
   1485                  uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
   1486                  UErrorCode *pErrorCode) {
   1487     uint16_t *stage1;
   1488     uint32_t *stage2;
   1489     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
   1490     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
   1491     if(mbcsTable->reconstitutedData==NULL) {
   1492         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1493         return;
   1494     }
   1495     uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
   1496 
   1497     /* copy existing data and reroute the pointers */
   1498     stage1=(uint16_t *)mbcsTable->reconstitutedData;
   1499     uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
   1500 
   1501     stage2=(uint32_t *)(stage1+stage1Length);
   1502     uprv_memcpy(stage2+(fullStage2Length-stage2Length),
   1503                 mbcsTable->fromUnicodeTable+stage1Length,
   1504                 stage2Length*4);
   1505 
   1506     mbcsTable->fromUnicodeTable=stage1;
   1507     mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
   1508 
   1509     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
   1510     stage2=(uint32_t *)stage1;
   1511 
   1512     /* reconstitute the initial part of stage 2 from the mbcsIndex */
   1513     {
   1514         int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
   1515         int32_t stageUTF8Index=0;
   1516         int32_t st1, st2, st3, i;
   1517 
   1518         for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
   1519             st2=stage1[st1];
   1520             if(st2!=(int32_t)stage1Length/2) {
   1521                 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
   1522                 for(i=0; i<16; ++i) {
   1523                     st3=mbcsTable->mbcsIndex[stageUTF8Index++];
   1524                     if(st3!=0) {
   1525                         /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
   1526                         st3>>=4;
   1527                         /*
   1528                          * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
   1529                          * allocated together as a single 64-block for access from the mbcsIndex
   1530                          */
   1531                         stage2[st2++]=st3++;
   1532                         stage2[st2++]=st3++;
   1533                         stage2[st2++]=st3++;
   1534                         stage2[st2++]=st3;
   1535                     } else {
   1536                         /* no stage 3 block, skip */
   1537                         st2+=4;
   1538                     }
   1539                 }
   1540             } else {
   1541                 /* no stage 2 block, skip */
   1542                 stageUTF8Index+=16;
   1543             }
   1544         }
   1545     }
   1546 
   1547     /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
   1548     ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
   1549 }
   1550 
   1551 /* MBCS setup functions ----------------------------------------------------- */
   1552 
   1553 static void U_CALLCONV
   1554 ucnv_MBCSLoad(UConverterSharedData *sharedData,
   1555           UConverterLoadArgs *pArgs,
   1556           const uint8_t *raw,
   1557           UErrorCode *pErrorCode) {
   1558     UDataInfo info;
   1559     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1560     _MBCSHeader *header=(_MBCSHeader *)raw;
   1561     uint32_t offset;
   1562     uint32_t headerLength;
   1563     UBool noFromU=FALSE;
   1564 
   1565     if(header->version[0]==4) {
   1566         headerLength=MBCS_HEADER_V4_LENGTH;
   1567     } else if(header->version[0]==5 && header->version[1]>=3 &&
   1568               (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
   1569         headerLength=header->options&MBCS_OPT_LENGTH_MASK;
   1570         noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
   1571     } else {
   1572         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1573         return;
   1574     }
   1575 
   1576     mbcsTable->outputType=(uint8_t)header->flags;
   1577     if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
   1578         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1579         return;
   1580     }
   1581 
   1582     /* extension data, header version 4.2 and higher */
   1583     offset=header->flags>>8;
   1584     if(offset!=0) {
   1585         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
   1586     }
   1587 
   1588     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
   1589         UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
   1590         UConverterSharedData *baseSharedData;
   1591         const int32_t *extIndexes;
   1592         const char *baseName;
   1593 
   1594         /* extension-only file, load the base table and set values appropriately */
   1595         if((extIndexes=mbcsTable->extIndexes)==NULL) {
   1596             /* extension-only file without extension */
   1597             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1598             return;
   1599         }
   1600 
   1601         if(pArgs->nestedLoads!=1) {
   1602             /* an extension table must not be loaded as a base table */
   1603             *pErrorCode=U_INVALID_TABLE_FILE;
   1604             return;
   1605         }
   1606 
   1607         /* load the base table */
   1608         baseName=(const char *)header+headerLength*4;
   1609         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
   1610             /* forbid loading this same extension-only file */
   1611             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1612             return;
   1613         }
   1614 
   1615         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
   1616         args.size=sizeof(UConverterLoadArgs);
   1617         args.nestedLoads=2;
   1618         args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
   1619         args.reserved=pArgs->reserved;
   1620         args.options=pArgs->options;
   1621         args.pkg=pArgs->pkg;
   1622         args.name=baseName;
   1623         baseSharedData=ucnv_load(&args, pErrorCode);
   1624         if(U_FAILURE(*pErrorCode)) {
   1625             return;
   1626         }
   1627         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
   1628             baseSharedData->mbcs.baseSharedData!=NULL
   1629         ) {
   1630             ucnv_unload(baseSharedData);
   1631             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1632             return;
   1633         }
   1634         if(pArgs->onlyTestIsLoadable) {
   1635             /*
   1636              * Exit as soon as we know that we can load the converter
   1637              * and the format is valid and supported.
   1638              * The worst that can happen in the following code is a memory
   1639              * allocation error.
   1640              */
   1641             ucnv_unload(baseSharedData);
   1642             return;
   1643         }
   1644 
   1645         /* copy the base table data */
   1646         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
   1647 
   1648         /* overwrite values with relevant ones for the extension converter */
   1649         mbcsTable->baseSharedData=baseSharedData;
   1650         mbcsTable->extIndexes=extIndexes;
   1651 
   1652         /*
   1653          * It would be possible to share the swapLFNL data with a base converter,
   1654          * but the generated name would have to be different, and the memory
   1655          * would have to be free'd only once.
   1656          * It is easier to just create the data for the extension converter
   1657          * separately when it is requested.
   1658          */
   1659         mbcsTable->swapLFNLStateTable=NULL;
   1660         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
   1661         mbcsTable->swapLFNLName=NULL;
   1662 
   1663         /*
   1664          * The reconstitutedData must be deleted only when the base converter
   1665          * is unloaded.
   1666          */
   1667         mbcsTable->reconstitutedData=NULL;
   1668 
   1669         /*
   1670          * Set a special, runtime-only outputType if the extension converter
   1671          * is a DBCS version of a base converter that also maps single bytes.
   1672          */
   1673         if( sharedData->staticData->conversionType==UCNV_DBCS ||
   1674                 (sharedData->staticData->conversionType==UCNV_MBCS &&
   1675                  sharedData->staticData->minBytesPerChar>=2)
   1676         ) {
   1677             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
   1678                 /* the base converter is SI/SO-stateful */
   1679                 int32_t entry;
   1680 
   1681                 /* get the dbcs state from the state table entry for SO=0x0e */
   1682                 entry=mbcsTable->stateTable[0][0xe];
   1683                 if( MBCS_ENTRY_IS_FINAL(entry) &&
   1684                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
   1685                     MBCS_ENTRY_FINAL_STATE(entry)!=0
   1686                 ) {
   1687                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
   1688 
   1689                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1690                 }
   1691             } else if(
   1692                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
   1693                 baseSharedData->staticData->minBytesPerChar==1 &&
   1694                 baseSharedData->staticData->maxBytesPerChar==2 &&
   1695                 mbcsTable->countStates<=127
   1696             ) {
   1697                 /* non-stateful base converter, need to modify the state table */
   1698                 int32_t (*newStateTable)[256];
   1699                 int32_t *state;
   1700                 int32_t i, count;
   1701 
   1702                 /* allocate a new state table and copy the base state table contents */
   1703                 count=mbcsTable->countStates;
   1704                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
   1705                 if(newStateTable==NULL) {
   1706                     ucnv_unload(baseSharedData);
   1707                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1708                     return;
   1709                 }
   1710 
   1711                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
   1712 
   1713                 /* change all final single-byte entries to go to a new all-illegal state */
   1714                 state=newStateTable[0];
   1715                 for(i=0; i<256; ++i) {
   1716                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
   1717                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
   1718                     }
   1719                 }
   1720 
   1721                 /* build the new all-illegal state */
   1722                 state=newStateTable[count];
   1723                 for(i=0; i<256; ++i) {
   1724                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
   1725                 }
   1726                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
   1727                 mbcsTable->countStates=(uint8_t)(count+1);
   1728                 mbcsTable->stateTableOwned=TRUE;
   1729 
   1730                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1731             }
   1732         }
   1733 
   1734         /*
   1735          * unlike below for files with base tables, do not get the unicodeMask
   1736          * from the sharedData; instead, use the base table's unicodeMask,
   1737          * which we copied in the memcpy above;
   1738          * this is necessary because the static data unicodeMask, especially
   1739          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
   1740          */
   1741     } else {
   1742         /* conversion file with a base table; an additional extension table is optional */
   1743         /* make sure that the output type is known */
   1744         switch(mbcsTable->outputType) {
   1745         case MBCS_OUTPUT_1:
   1746         case MBCS_OUTPUT_2:
   1747         case MBCS_OUTPUT_3:
   1748         case MBCS_OUTPUT_4:
   1749         case MBCS_OUTPUT_3_EUC:
   1750         case MBCS_OUTPUT_4_EUC:
   1751         case MBCS_OUTPUT_2_SISO:
   1752             /* OK */
   1753             break;
   1754         default:
   1755             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1756             return;
   1757         }
   1758         if(pArgs->onlyTestIsLoadable) {
   1759             /*
   1760              * Exit as soon as we know that we can load the converter
   1761              * and the format is valid and supported.
   1762              * The worst that can happen in the following code is a memory
   1763              * allocation error.
   1764              */
   1765             return;
   1766         }
   1767 
   1768         mbcsTable->countStates=(uint8_t)header->countStates;
   1769         mbcsTable->countToUFallbacks=header->countToUFallbacks;
   1770         mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
   1771         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
   1772         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
   1773 
   1774         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
   1775         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
   1776         mbcsTable->fromUBytesLength=header->fromUBytesLength;
   1777 
   1778         /*
   1779          * converter versions 6.1 and up contain a unicodeMask that is
   1780          * used here to select the most efficient function implementations
   1781          */
   1782         info.size=sizeof(UDataInfo);
   1783         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
   1784         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
   1785             /* mask off possible future extensions to be safe */
   1786             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
   1787         } else {
   1788             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
   1789             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
   1790         }
   1791 
   1792         /*
   1793          * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
   1794          * Check for the header version, SBCS vs. MBCS, and for whether the
   1795          * data structures are optimized for code points as high as what the
   1796          * runtime code is designed for.
   1797          * The implementation does not handle mapping tables with entries for
   1798          * unpaired surrogates.
   1799          */
   1800         if( header->version[1]>=3 &&
   1801             (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
   1802             (mbcsTable->countStates==1 ?
   1803                 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
   1804                 (header->version[2]>=(MBCS_FAST_MAX>>8))
   1805             )
   1806         ) {
   1807             mbcsTable->utf8Friendly=TRUE;
   1808 
   1809             if(mbcsTable->countStates==1) {
   1810                 /*
   1811                  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
   1812                  * Build a table with indexes to each block, to be used instead of
   1813                  * the regular stage 1/2 table.
   1814                  */
   1815                 int32_t i;
   1816                 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
   1817                     mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
   1818                 }
   1819                 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
   1820                 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
   1821             } else {
   1822                 /*
   1823                  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
   1824                  * The .cnv file is prebuilt with an additional stage table with indexes
   1825                  * to each block.
   1826                  */
   1827                 mbcsTable->mbcsIndex=(const uint16_t *)
   1828                     (mbcsTable->fromUnicodeBytes+
   1829                      (noFromU ? 0 : mbcsTable->fromUBytesLength));
   1830                 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
   1831             }
   1832         }
   1833 
   1834         /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
   1835         {
   1836             uint32_t asciiRoundtrips=0xffffffff;
   1837             int32_t i;
   1838 
   1839             for(i=0; i<0x80; ++i) {
   1840                 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
   1841                     asciiRoundtrips&=~((uint32_t)1<<(i>>2));
   1842                 }
   1843             }
   1844             mbcsTable->asciiRoundtrips=asciiRoundtrips;
   1845         }
   1846 
   1847         if(noFromU) {
   1848             uint32_t stage1Length=
   1849                 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
   1850                     0x440 : 0x40;
   1851             uint32_t stage2Length=
   1852                 (header->offsetFromUBytes-header->offsetFromUTable)/4-
   1853                 stage1Length/2;
   1854             reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
   1855         }
   1856     }
   1857 
   1858     /* Set the impl pointer here so that it is set for both extension-only and base tables. */
   1859     if(mbcsTable->utf8Friendly) {
   1860         if(mbcsTable->countStates==1) {
   1861             sharedData->impl=&_SBCSUTF8Impl;
   1862         } else {
   1863             if(mbcsTable->outputType==MBCS_OUTPUT_2) {
   1864                 sharedData->impl=&_DBCSUTF8Impl;
   1865             }
   1866         }
   1867     }
   1868 
   1869     if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
   1870         /*
   1871          * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
   1872          * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
   1873          */
   1874         mbcsTable->asciiRoundtrips=0;
   1875     }
   1876 }
   1877 
   1878 static void U_CALLCONV
   1879 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
   1880     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1881 
   1882     if(mbcsTable->swapLFNLStateTable!=NULL) {
   1883         uprv_free(mbcsTable->swapLFNLStateTable);
   1884     }
   1885     if(mbcsTable->stateTableOwned) {
   1886         uprv_free((void *)mbcsTable->stateTable);
   1887     }
   1888     if(mbcsTable->baseSharedData!=NULL) {
   1889         ucnv_unload(mbcsTable->baseSharedData);
   1890     }
   1891     if(mbcsTable->reconstitutedData!=NULL) {
   1892         uprv_free(mbcsTable->reconstitutedData);
   1893     }
   1894 }
   1895 
   1896 static void U_CALLCONV
   1897 ucnv_MBCSOpen(UConverter *cnv,
   1898               UConverterLoadArgs *pArgs,
   1899               UErrorCode *pErrorCode) {
   1900     UConverterMBCSTable *mbcsTable;
   1901     const int32_t *extIndexes;
   1902     uint8_t outputType;
   1903     int8_t maxBytesPerUChar;
   1904 
   1905     if(pArgs->onlyTestIsLoadable) {
   1906         return;
   1907     }
   1908 
   1909     mbcsTable=&cnv->sharedData->mbcs;
   1910     outputType=mbcsTable->outputType;
   1911 
   1912     if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
   1913         /* the swaplfnl option does not apply, remove it */
   1914         cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1915     }
   1916 
   1917     if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1918         /* do this because double-checked locking is broken */
   1919         UBool isCached;
   1920 
   1921         umtx_lock(NULL);
   1922         isCached=mbcsTable->swapLFNLStateTable!=NULL;
   1923         umtx_unlock(NULL);
   1924 
   1925         if(!isCached) {
   1926             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
   1927                 if(U_FAILURE(*pErrorCode)) {
   1928                     return; /* something went wrong */
   1929                 }
   1930 
   1931                 /* the option does not apply, remove it */
   1932                 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1933             }
   1934         }
   1935     }
   1936 
   1937     if(uprv_strstr(pArgs->name, "18030")!=NULL) {
   1938         if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
   1939             /* set a flag for GB 18030 mode, which changes the callback behavior */
   1940             cnv->options|=_MBCS_OPTION_GB18030;
   1941         }
   1942     } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
   1943         /* set a flag for KEIS converter, which changes the SI/SO character sequence */
   1944         cnv->options|=_MBCS_OPTION_KEIS;
   1945     } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
   1946         /* set a flag for JEF converter, which changes the SI/SO character sequence */
   1947         cnv->options|=_MBCS_OPTION_JEF;
   1948     } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
   1949         /* set a flag for JIPS converter, which changes the SI/SO character sequence */
   1950         cnv->options|=_MBCS_OPTION_JIPS;
   1951     }
   1952 
   1953     /* fix maxBytesPerUChar depending on outputType and options etc. */
   1954     if(outputType==MBCS_OUTPUT_2_SISO) {
   1955         cnv->maxBytesPerUChar=3; /* SO+DBCS */
   1956     }
   1957 
   1958     extIndexes=mbcsTable->extIndexes;
   1959     if(extIndexes!=NULL) {
   1960         maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
   1961         if(outputType==MBCS_OUTPUT_2_SISO) {
   1962             ++maxBytesPerUChar; /* SO + multiple DBCS */
   1963         }
   1964 
   1965         if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
   1966             cnv->maxBytesPerUChar=maxBytesPerUChar;
   1967         }
   1968     }
   1969 
   1970 #if 0
   1971     /*
   1972      * documentation of UConverter fields used for status
   1973      * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
   1974      */
   1975 
   1976     /* toUnicode */
   1977     cnv->toUnicodeStatus=0;     /* offset */
   1978     cnv->mode=0;                /* state */
   1979     cnv->toULength=0;           /* byteIndex */
   1980 
   1981     /* fromUnicode */
   1982     cnv->fromUChar32=0;
   1983     cnv->fromUnicodeStatus=1;   /* prevLength */
   1984 #endif
   1985 }
   1986 
   1987 U_CDECL_BEGIN
   1988 
   1989 static const char* U_CALLCONV
   1990 ucnv_MBCSGetName(const UConverter *cnv) {
   1991     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
   1992         return cnv->sharedData->mbcs.swapLFNLName;
   1993     } else {
   1994         return cnv->sharedData->staticData->name;
   1995     }
   1996 }
   1997 U_CDECL_END
   1998 
   1999 
   2000 /* MBCS-to-Unicode conversion functions ------------------------------------- */
   2001 
   2002 static UChar32 U_CALLCONV
   2003 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
   2004     const _MBCSToUFallback *toUFallbacks;
   2005     uint32_t i, start, limit;
   2006 
   2007     limit=mbcsTable->countToUFallbacks;
   2008     if(limit>0) {
   2009         /* do a binary search for the fallback mapping */
   2010         toUFallbacks=mbcsTable->toUFallbacks;
   2011         start=0;
   2012         while(start<limit-1) {
   2013             i=(start+limit)/2;
   2014             if(offset<toUFallbacks[i].offset) {
   2015                 limit=i;
   2016             } else {
   2017                 start=i;
   2018             }
   2019         }
   2020 
   2021         /* did we really find it? */
   2022         if(offset==toUFallbacks[start].offset) {
   2023             return toUFallbacks[start].codePoint;
   2024         }
   2025     }
   2026 
   2027     return 0xfffe;
   2028 }
   2029 
   2030 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
   2031 static void
   2032 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2033                                 UErrorCode *pErrorCode) {
   2034     UConverter *cnv;
   2035     const uint8_t *source, *sourceLimit;
   2036     UChar *target;
   2037     const UChar *targetLimit;
   2038     int32_t *offsets;
   2039 
   2040     const int32_t (*stateTable)[256];
   2041 
   2042     int32_t sourceIndex;
   2043 
   2044     int32_t entry;
   2045     UChar c;
   2046     uint8_t action;
   2047 
   2048     /* set up the local pointers */
   2049     cnv=pArgs->converter;
   2050     source=(const uint8_t *)pArgs->source;
   2051     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2052     target=pArgs->target;
   2053     targetLimit=pArgs->targetLimit;
   2054     offsets=pArgs->offsets;
   2055 
   2056     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2057         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2058     } else {
   2059         stateTable=cnv->sharedData->mbcs.stateTable;
   2060     }
   2061 
   2062     /* sourceIndex=-1 if the current character began in the previous buffer */
   2063     sourceIndex=0;
   2064 
   2065     /* conversion loop */
   2066     while(source<sourceLimit) {
   2067         /*
   2068          * This following test is to see if available input would overflow the output.
   2069          * It does not catch output of more than one code unit that
   2070          * overflows as a result of a surrogate pair or callback output
   2071          * from the last source byte.
   2072          * Therefore, those situations also test for overflows and will
   2073          * then break the loop, too.
   2074          */
   2075         if(target>=targetLimit) {
   2076             /* target is full */
   2077             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2078             break;
   2079         }
   2080 
   2081         entry=stateTable[0][*source++];
   2082         /* MBCS_ENTRY_IS_FINAL(entry) */
   2083 
   2084         /* test the most common case first */
   2085         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2086             /* output BMP code point */
   2087             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2088             if(offsets!=NULL) {
   2089                 *offsets++=sourceIndex;
   2090             }
   2091 
   2092             /* normal end of action codes: prepare for a new character */
   2093             ++sourceIndex;
   2094             continue;
   2095         }
   2096 
   2097         /*
   2098          * An if-else-if chain provides more reliable performance for
   2099          * the most common cases compared to a switch.
   2100          */
   2101         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2102         if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2103            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2104         ) {
   2105             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2106             /* output surrogate pair */
   2107             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2108             if(offsets!=NULL) {
   2109                 *offsets++=sourceIndex;
   2110             }
   2111             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2112             if(target<targetLimit) {
   2113                 *target++=c;
   2114                 if(offsets!=NULL) {
   2115                     *offsets++=sourceIndex;
   2116                 }
   2117             } else {
   2118                 /* target overflow */
   2119                 cnv->UCharErrorBuffer[0]=c;
   2120                 cnv->UCharErrorBufferLength=1;
   2121                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2122                 break;
   2123             }
   2124 
   2125             ++sourceIndex;
   2126             continue;
   2127         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2128             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2129                 /* output BMP code point */
   2130                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2131                 if(offsets!=NULL) {
   2132                     *offsets++=sourceIndex;
   2133                 }
   2134 
   2135                 ++sourceIndex;
   2136                 continue;
   2137             }
   2138         } else if(action==MBCS_STATE_UNASSIGNED) {
   2139             /* just fall through */
   2140         } else if(action==MBCS_STATE_ILLEGAL) {
   2141             /* callback(illegal) */
   2142             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2143         } else {
   2144             /* reserved, must never occur */
   2145             ++sourceIndex;
   2146             continue;
   2147         }
   2148 
   2149         if(U_FAILURE(*pErrorCode)) {
   2150             /* callback(illegal) */
   2151             break;
   2152         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2153             /* try an extension mapping */
   2154             pArgs->source=(const char *)source;
   2155             cnv->toUBytes[0]=*(source-1);
   2156             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2157                                     1, &source, sourceLimit,
   2158                                     &target, targetLimit,
   2159                                     &offsets, sourceIndex,
   2160                                     pArgs->flush,
   2161                                     pErrorCode);
   2162             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
   2163 
   2164             if(U_FAILURE(*pErrorCode)) {
   2165                 /* not mappable or buffer overflow */
   2166                 break;
   2167             }
   2168         }
   2169     }
   2170 
   2171     /* write back the updated pointers */
   2172     pArgs->source=(const char *)source;
   2173     pArgs->target=target;
   2174     pArgs->offsets=offsets;
   2175 }
   2176 
   2177 /*
   2178  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
   2179  * that only map to and from the BMP.
   2180  * In addition to single-byte optimizations, the offset calculations
   2181  * become much easier.
         *
         * Each source byte is looked up in row 0 of the state table (the swapLFNL
         * table when UCNV_OPTION_SWAP_LFNL is set). Entries that cannot be output
         * directly are handed to _extToU(); an illegal entry stops the conversion.
         *
         * pArgs      - in:  converter, source, sourceLimit, target, targetLimit,
         *                   offsets (may be NULL) and flush
         *              out: source, target and offsets are written back on return
         * pErrorCode - set to U_ILLEGAL_CHAR_FOUND for an illegal entry, to
         *              U_BUFFER_OVERFLOW_ERROR when input remains but the target
         *              is full, or to whatever failure _extToU() reports
   2182  */
   2183 static void
   2184 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
   2185                             UErrorCode *pErrorCode) {
   2186     UConverter *cnv;
   2187     const uint8_t *source, *sourceLimit, *lastSource;
   2188     UChar *target;
   2189     int32_t targetCapacity, length;
   2190     int32_t *offsets;
   2191 
   2192     const int32_t (*stateTable)[256];
   2193 
   2194     int32_t sourceIndex;
   2195 
   2196     int32_t entry;
   2197     uint8_t action;
   2198 
   2199     /* set up the local pointers */
   2200     cnv=pArgs->converter;
   2201     source=(const uint8_t *)pArgs->source;
   2202     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2203     target=pArgs->target;
   2204     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   2205     offsets=pArgs->offsets;
   2206 
   2207     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2208         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2209     } else {
   2210         stateTable=cnv->sharedData->mbcs.stateTable;
   2211     }
   2212 
   2213     /* offsets count from the start of this buffer; this function always starts at 0 */
   2214     sourceIndex=0;
            /* lastSource marks the first source byte whose offset has not been written yet */
   2215     lastSource=source;
   2216 
   2217     /*
   2218      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   2219      * for the minimum of the sourceLength and targetCapacity
   2220      */
   2221     length=(int32_t)(sourceLimit-source);
   2222     if(length<targetCapacity) {
   2223         targetCapacity=length;
   2224     }
   2225 
   2226 #if MBCS_UNROLL_SINGLE_TO_BMP
   2227     /* unrolling makes it faster on Pentium III/Windows 2000 */
   2228     /* unroll the loop with the most common case */
   2229 unrolled:
   2230     if(targetCapacity>=16) {
   2231         int32_t count, loops, oredEntries;
   2232 
                /* number of complete groups of 16 bytes that fit */
   2233         loops=count=targetCapacity>>4;
   2234         do {
                    /*
                     * Convert 16 bytes unconditionally and OR their entries together;
                     * validity is tested once per group below.
                     * NOTE(review): this relies on MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16
                     * failing for the OR whenever it fails for any single entry;
                     * the macro is defined elsewhere - confirm.
                     */
   2235             oredEntries=entry=stateTable[0][*source++];
   2236             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2237             oredEntries|=entry=stateTable[0][*source++];
   2238             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2239             oredEntries|=entry=stateTable[0][*source++];
   2240             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2241             oredEntries|=entry=stateTable[0][*source++];
   2242             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2243             oredEntries|=entry=stateTable[0][*source++];
   2244             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2245             oredEntries|=entry=stateTable[0][*source++];
   2246             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2247             oredEntries|=entry=stateTable[0][*source++];
   2248             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2249             oredEntries|=entry=stateTable[0][*source++];
   2250             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2251             oredEntries|=entry=stateTable[0][*source++];
   2252             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2253             oredEntries|=entry=stateTable[0][*source++];
   2254             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2255             oredEntries|=entry=stateTable[0][*source++];
   2256             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2257             oredEntries|=entry=stateTable[0][*source++];
   2258             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2259             oredEntries|=entry=stateTable[0][*source++];
   2260             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2261             oredEntries|=entry=stateTable[0][*source++];
   2262             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2263             oredEntries|=entry=stateTable[0][*source++];
   2264             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2265             oredEntries|=entry=stateTable[0][*source++];
   2266             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2267 
   2268             /* were all 16 entries really valid? */
   2269             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
   2270                 /* no, return to the first of these 16 */
                        /* the byte-at-a-time loop below reprocesses this group */
   2271                 source-=16;
   2272                 target-=16;
   2273                 break;
   2274             }
   2275         } while(--count>0);
                /* count becomes the number of groups that were converted completely */
   2276         count=loops-count;
   2277         targetCapacity-=16*count;
   2278 
   2279         if(offsets!=NULL) {
                    /* write one offset per converted byte and mark those bytes as done */
   2280             lastSource+=16*count;
   2281             while(count>0) {
   2282                 *offsets++=sourceIndex++;
   2283                 *offsets++=sourceIndex++;
   2284                 *offsets++=sourceIndex++;
   2285                 *offsets++=sourceIndex++;
   2286                 *offsets++=sourceIndex++;
   2287                 *offsets++=sourceIndex++;
   2288                 *offsets++=sourceIndex++;
   2289                 *offsets++=sourceIndex++;
   2290                 *offsets++=sourceIndex++;
   2291                 *offsets++=sourceIndex++;
   2292                 *offsets++=sourceIndex++;
   2293                 *offsets++=sourceIndex++;
   2294                 *offsets++=sourceIndex++;
   2295                 *offsets++=sourceIndex++;
   2296                 *offsets++=sourceIndex++;
   2297                 *offsets++=sourceIndex++;
   2298                 --count;
   2299             }
   2300         }
   2301     }
   2302 #endif
   2303 
   2304     /* conversion loop */
   2305     while(targetCapacity > 0 && source < sourceLimit) {
   2306         entry=stateTable[0][*source++];
   2307         /* MBCS_ENTRY_IS_FINAL(entry) */
   2308 
   2309         /* test the most common case first */
   2310         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2311             /* output BMP code point */
                    /* the offset for this byte is written later, in one of the flush loops */
   2312             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2313             --targetCapacity;
   2314             continue;
   2315         }
   2316 
   2317         /*
   2318          * An if-else-if chain provides more reliable performance for
   2319          * the most common cases compared to a switch.
   2320          */
   2321         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2322         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2323             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2324                 /* output BMP code point */
   2325                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2326                 --targetCapacity;
   2327                 continue;
   2328             }
                    /* fallbacks are off: handled below like an unassigned byte */
   2329         } else if(action==MBCS_STATE_UNASSIGNED) {
   2330             /* just fall through */
   2331         } else if(action==MBCS_STATE_ILLEGAL) {
   2332             /* callback(illegal) */
   2333             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2334         } else {
   2335             /* reserved, must never occur */
                    /*
                     * The byte is consumed without output and without decrementing
                     * targetCapacity; the source<sourceLimit test in the loop condition
                     * keeps this path from reading past the input.
                     */
   2336             continue;
   2337         }
   2338 
   2339         /* set offsets since the start or the last extension */
   2340         if(offsets!=NULL) {
   2341             int32_t count=(int32_t)(source-lastSource);
   2342 
   2343             /* predecrement: do not set the offset for the callback-causing character */
   2344             while(--count>0) {
   2345                 *offsets++=sourceIndex++;
   2346             }
   2347             /* offset and sourceIndex are now set for the current character */
   2348         }
   2349 
   2350         if(U_FAILURE(*pErrorCode)) {
   2351             /* callback(illegal) */
                    /*
                     * Offsets up to (not including) the illegal byte were written
                     * just above. Advance lastSource so that the final flush after
                     * the loop does not write offsets for the same bytes again,
                     * which could also run past the end of the offsets buffer.
                     */
                    lastSource=source;
   2352             break;
   2353         } else /* unassigned, or a fallback that is not enabled */ {
   2354             /* try an extension mapping */
   2355             lastSource=source;
                    /* hand the byte that was just consumed to the extension code */
   2356             cnv->toUBytes[0]=*(source-1);
   2357             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2358                                     1, &source, sourceLimit,
   2359                                     &target, pArgs->targetLimit,
   2360                                     &offsets, sourceIndex,
   2361                                     pArgs->flush,
   2362                                     pErrorCode);
                    /* 1 for the current byte, plus any bytes by which _extToU() advanced source */
   2363             sourceIndex+=1+(int32_t)(source-lastSource);
   2364 
   2365             if(U_FAILURE(*pErrorCode)) {
   2366                 /* not mappable or buffer overflow */
   2367                 break;
   2368             }
   2369 
   2370             /* recalculate the targetCapacity after an extension mapping */
   2371             targetCapacity=(int32_t)(pArgs->targetLimit-target);
   2372             length=(int32_t)(sourceLimit-source);
   2373             if(length<targetCapacity) {
   2374                 targetCapacity=length;
   2375             }
   2376         }
   2377 
   2378 #if MBCS_UNROLL_SINGLE_TO_BMP
   2379         /* unrolling makes it faster on Pentium III/Windows 2000 */
                /* reached only after a successful extension mapping: retry the fast path */
   2380         goto unrolled;
   2381 #endif
   2382     }
   2383 
            /* report an overflow only if nothing else failed and unread input remains */
   2384     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
   2385         /* target is full */
   2386         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2387     }
   2388 
   2389     /* set offsets since the start or the last callback */
   2390     if(offsets!=NULL) {
   2391         size_t count=source-lastSource;
   2392         while(count>0) {
   2393             *offsets++=sourceIndex++;
   2394             --count;
   2395         }
   2396     }
   2397 
   2398     /* write back the updated pointers */
   2399     pArgs->source=(const char *)source;
   2400     pArgs->target=target;
   2401     pArgs->offsets=offsets;
   2402 }
   2403 
   2404 /*
   2405  * Does this state, or any state reachable from it through transition
   2406  * entries, contain a final entry whose action is not MBCS_STATE_ILLEGAL?
   2407  */
   2408 static UBool
   2409 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
   2410     /* some commonly valid byte values, probed ahead of the full scan */
   2411     static const uint8_t probeBytes[2]={ 0xa1, 0x41 };
   2412     const int32_t *entries=stateTable[state];
   2413     int32_t i, cell;
   2414     /* Shortcut: look only at the probe bytes. */
   2415     for(i=0; i<2; ++i) {
   2416         cell=entries[probeBytes[i]];
   2417         if(MBCS_ENTRY_IS_TRANSITION(cell)) {
   2418             continue;
   2419         }
   2420         if(MBCS_ENTRY_FINAL_ACTION(cell)!=MBCS_STATE_ILLEGAL) {
   2421             return TRUE;
   2422         }
   2423     }
   2424     /* First pass over the whole row: final entries only. */
   2425     for(i=0; i<0x100; ++i) {
   2426         cell=entries[i];
   2427         if(!MBCS_ENTRY_IS_TRANSITION(cell) && MBCS_ENTRY_FINAL_ACTION(cell)!=MBCS_STATE_ILLEGAL) {
   2428             return TRUE;
   2429         }
   2430     }
   2431     /* Second pass: descend into the target state of each transition entry. */
   2432     for(i=0; i<0x100; ++i) {
   2433         cell=entries[i];
   2434         if(MBCS_ENTRY_IS_TRANSITION(cell) &&
   2435            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(cell))) {
   2436             return TRUE;
   2437         }
   2438     }
   2439     return FALSE;
   2440 }
   2441 
   2442 /*
   2443  * Can byte b begin a character in this state, either alone or as a lead byte?
   2444  * A transition entry only counts if some byte sequence that continues from it
   2445  * is not illegal, which is why the trail states are searched recursively.
   2446  */
   2447 static UBool
   2448 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
   2449     int32_t cell=stateTable[state][b];
   2450     uint8_t finalAction;
   2451 
   2452     if(MBCS_ENTRY_IS_TRANSITION(cell)) {
   2453         /* lead byte: usable only if a non-illegal continuation exists */
   2454         return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(cell));
   2455     }
   2456     finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(cell));
   2457     if(isDBCSOnly && finalAction==MBCS_STATE_CHANGE_ONLY) {
   2458         return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
   2459     }
   2460     return finalAction!=MBCS_STATE_ILLEGAL;
   2461 }
   2462 
   2463 U_CFUNC void
   2464 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2465                           UErrorCode *pErrorCode) {
   2466     UConverter *cnv;
   2467     const uint8_t *source, *sourceLimit;
   2468     UChar *target;
   2469     const UChar *targetLimit;
   2470     int32_t *offsets;
   2471 
   2472     const int32_t (*stateTable)[256];
   2473     const uint16_t *unicodeCodeUnits;
   2474 
   2475     uint32_t offset;
   2476     uint8_t state;
   2477     int8_t byteIndex;
   2478     uint8_t *bytes;
   2479 
   2480     int32_t sourceIndex, nextSourceIndex;
   2481 
   2482     int32_t entry;
   2483     UChar c;
   2484     uint8_t action;
   2485 
   2486     /* use optimized function if possible */
   2487     cnv=pArgs->converter;
   2488 
   2489     if(cnv->preToULength>0) {
   2490         /*
   2491          * pass sourceIndex=-1 because we continue from an earlier buffer
   2492          * in the future, this may change with continuous offsets
   2493          */
   2494         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
   2495 
   2496         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
   2497             return;
   2498         }
   2499     }
   2500 
   2501     if(cnv->sharedData->mbcs.countStates==1) {
   2502         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   2503             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
   2504         } else {
   2505             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
   2506         }
   2507         return;
   2508     }
   2509 
   2510     /* set up the local pointers */
   2511     source=(const uint8_t *)pArgs->source;
   2512     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2513     target=pArgs->target;
   2514     targetLimit=pArgs->targetLimit;
   2515     offsets=pArgs->offsets;
   2516 
   2517     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2518         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2519     } else {
   2520         stateTable=cnv->sharedData->mbcs.stateTable;
   2521     }
   2522     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2523 
   2524     /* get the converter state from UConverter */
   2525     offset=cnv->toUnicodeStatus;
   2526     byteIndex=cnv->toULength;
   2527     bytes=cnv->toUBytes;
   2528 
   2529     /*
   2530      * if we are in the SBCS state for a DBCS-only converter,
   2531      * then load the DBCS state from the MBCS data
   2532      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2533      */
   2534     if((state=(uint8_t)(cnv->mode))==0) {
   2535         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2536     }
   2537 
   2538     /* sourceIndex=-1 if the current character began in the previous buffer */
   2539     sourceIndex=byteIndex==0 ? 0 : -1;
   2540     nextSourceIndex=0;
   2541 
   2542     /* conversion loop */
   2543     while(source<sourceLimit) {
   2544         /*
   2545          * This following test is to see if available input would overflow the output.
   2546          * It does not catch output of more than one code unit that
   2547          * overflows as a result of a surrogate pair or callback output
   2548          * from the last source byte.
   2549          * Therefore, those situations also test for overflows and will
   2550          * then break the loop, too.
   2551          */
   2552         if(target>=targetLimit) {
   2553             /* target is full */
   2554             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2555             break;
   2556         }
   2557 
   2558         if(byteIndex==0) {
   2559             /* optimized loop for 1/2-byte input and BMP output */
   2560             if(offsets==NULL) {
   2561                 do {
   2562                     entry=stateTable[state][*source];
   2563                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2564                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2565                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2566 
   2567                         ++source;
   2568                         if( source<sourceLimit &&
   2569                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2570                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2571                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2572                         ) {
   2573                             ++source;
   2574                             *target++=c;
   2575                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2576                             offset=0;
   2577                         } else {
   2578                             /* set the state and leave the optimized loop */
   2579                             bytes[0]=*(source-1);
   2580                             byteIndex=1;
   2581                             break;
   2582                         }
   2583                     } else {
   2584                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2585                             /* output BMP code point */
   2586                             ++source;
   2587                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2588                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2589                         } else {
   2590                             /* leave the optimized loop */
   2591                             break;
   2592                         }
   2593                     }
   2594                 } while(source<sourceLimit && target<targetLimit);
   2595             } else /* offsets!=NULL */ {
   2596                 do {
   2597                     entry=stateTable[state][*source];
   2598                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2599                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2600                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2601 
   2602                         ++source;
   2603                         if( source<sourceLimit &&
   2604                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2605                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2606                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2607                         ) {
   2608                             ++source;
   2609                             *target++=c;
   2610                             if(offsets!=NULL) {
   2611                                 *offsets++=sourceIndex;
   2612                                 sourceIndex=(nextSourceIndex+=2);
   2613                             }
   2614                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2615                             offset=0;
   2616                         } else {
   2617                             /* set the state and leave the optimized loop */
   2618                             ++nextSourceIndex;
   2619                             bytes[0]=*(source-1);
   2620                             byteIndex=1;
   2621                             break;
   2622                         }
   2623                     } else {
   2624                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2625                             /* output BMP code point */
   2626                             ++source;
   2627                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2628                             if(offsets!=NULL) {
   2629                                 *offsets++=sourceIndex;
   2630                                 sourceIndex=++nextSourceIndex;
   2631                             }
   2632                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2633                         } else {
   2634                             /* leave the optimized loop */
   2635                             break;
   2636                         }
   2637                     }
   2638                 } while(source<sourceLimit && target<targetLimit);
   2639             }
   2640 
   2641             /*
   2642              * these tests and break statements could be put inside the loop
   2643              * if C had "break outerLoop" like Java
   2644              */
   2645             if(source>=sourceLimit) {
   2646                 break;
   2647             }
   2648             if(target>=targetLimit) {
   2649                 /* target is full */
   2650                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2651                 break;
   2652             }
   2653 
   2654             ++nextSourceIndex;
   2655             bytes[byteIndex++]=*source++;
   2656         } else /* byteIndex>0 */ {
   2657             ++nextSourceIndex;
   2658             entry=stateTable[state][bytes[byteIndex++]=*source++];
   2659         }
   2660 
   2661         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2662             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2663             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2664             continue;
   2665         }
   2666 
   2667         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2668         cnv->mode=state;
   2669 
   2670         /* set the next state early so that we can reuse the entry variable */
   2671         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2672 
   2673         /*
   2674          * An if-else-if chain provides more reliable performance for
   2675          * the most common cases compared to a switch.
   2676          */
   2677         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2678         if(action==MBCS_STATE_VALID_16) {
   2679             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2680             c=unicodeCodeUnits[offset];
   2681             if(c<0xfffe) {
   2682                 /* output BMP code point */
   2683                 *target++=c;
   2684                 if(offsets!=NULL) {
   2685                     *offsets++=sourceIndex;
   2686                 }
   2687                 byteIndex=0;
   2688             } else if(c==0xfffe) {
   2689                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2690                     /* output fallback BMP code point */
   2691                     *target++=(UChar)entry;
   2692                     if(offsets!=NULL) {
   2693                         *offsets++=sourceIndex;
   2694                     }
   2695                     byteIndex=0;
   2696                 }
   2697             } else {
   2698                 /* callback(illegal) */
   2699                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2700             }
   2701         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   2702             /* output BMP code point */
   2703             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2704             if(offsets!=NULL) {
   2705                 *offsets++=sourceIndex;
   2706             }
   2707             byteIndex=0;
   2708         } else if(action==MBCS_STATE_VALID_16_PAIR) {
   2709             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2710             c=unicodeCodeUnits[offset++];
   2711             if(c<0xd800) {
   2712                 /* output BMP code point below 0xd800 */
   2713                 *target++=c;
   2714                 if(offsets!=NULL) {
   2715                     *offsets++=sourceIndex;
   2716                 }
   2717                 byteIndex=0;
   2718             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2719                 /* output roundtrip or fallback surrogate pair */
   2720                 *target++=(UChar)(c&0xdbff);
   2721                 if(offsets!=NULL) {
   2722                     *offsets++=sourceIndex;
   2723                 }
   2724                 byteIndex=0;
   2725                 if(target<targetLimit) {
   2726                     *target++=unicodeCodeUnits[offset];
   2727                     if(offsets!=NULL) {
   2728                         *offsets++=sourceIndex;
   2729                     }
   2730                 } else {
   2731                     /* target overflow */
   2732                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
   2733                     cnv->UCharErrorBufferLength=1;
   2734                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2735 
   2736                     offset=0;
   2737                     break;
   2738                 }
   2739             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2740                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2741                 *target++=unicodeCodeUnits[offset];
   2742                 if(offsets!=NULL) {
   2743                     *offsets++=sourceIndex;
   2744                 }
   2745                 byteIndex=0;
   2746             } else if(c==0xffff) {
   2747                 /* callback(illegal) */
   2748                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2749             }
   2750         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2751                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2752         ) {
   2753             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2754             /* output surrogate pair */
   2755             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2756             if(offsets!=NULL) {
   2757                 *offsets++=sourceIndex;
   2758             }
   2759             byteIndex=0;
   2760             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2761             if(target<targetLimit) {
   2762                 *target++=c;
   2763                 if(offsets!=NULL) {
   2764                     *offsets++=sourceIndex;
   2765                 }
   2766             } else {
   2767                 /* target overflow */
   2768                 cnv->UCharErrorBuffer[0]=c;
   2769                 cnv->UCharErrorBufferLength=1;
   2770                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2771 
   2772                 offset=0;
   2773                 break;
   2774             }
   2775         } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2776             /*
   2777              * This serves as a state change without any output.
   2778              * It is useful for reading simple stateful encodings,
   2779              * for example using just Shift-In/Shift-Out codes.
   2780              * The 21 unused bits may later be used for more sophisticated
   2781              * state transitions.
   2782              */
   2783             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
   2784                 byteIndex=0;
   2785             } else {
   2786                 /* SI/SO are illegal for DBCS-only conversion */
   2787                 state=(uint8_t)(cnv->mode); /* restore the previous state */
   2788 
   2789                 /* callback(illegal) */
   2790                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2791             }
   2792         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2793             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2794                 /* output BMP code point */
   2795                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2796                 if(offsets!=NULL) {
   2797                     *offsets++=sourceIndex;
   2798                 }
   2799                 byteIndex=0;
   2800             }
   2801         } else if(action==MBCS_STATE_UNASSIGNED) {
   2802             /* just fall through */
   2803         } else if(action==MBCS_STATE_ILLEGAL) {
   2804             /* callback(illegal) */
   2805             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2806         } else {
   2807             /* reserved, must never occur */
   2808             byteIndex=0;
   2809         }
   2810 
   2811         /* end of action codes: prepare for a new character */
   2812         offset=0;
   2813 
   2814         if(byteIndex==0) {
   2815             sourceIndex=nextSourceIndex;
   2816         } else if(U_FAILURE(*pErrorCode)) {
   2817             /* callback(illegal) */
   2818             if(byteIndex>1) {
   2819                 /*
   2820                  * Ticket 5691: consistent illegal sequences:
   2821                  * - We include at least the first byte in the illegal sequence.
   2822                  * - If any of the non-initial bytes could be the start of a character,
   2823                  *   we stop the illegal sequence before the first one of those.
   2824                  */
   2825                 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   2826                 int8_t i;
   2827                 for(i=1;
   2828                     i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
   2829                     ++i) {}
   2830                 if(i<byteIndex) {
   2831                     /* Back out some bytes. */
   2832                     int8_t backOutDistance=byteIndex-i;
   2833                     int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
   2834                     byteIndex=i;  /* length of reported illegal byte sequence */
   2835                     if(backOutDistance<=bytesFromThisBuffer) {
   2836                         source-=backOutDistance;
   2837                     } else {
   2838                         /* Back out bytes from the previous buffer: Need to replay them. */
   2839                         cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   2840                         /* preToULength is negative! */
   2841                         uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
   2842                         source=(const uint8_t *)pArgs->source;
   2843                     }
   2844                 }
   2845             }
   2846             break;
   2847         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2848             /* try an extension mapping */
   2849             pArgs->source=(const char *)source;
   2850             byteIndex=_extToU(cnv, cnv->sharedData,
   2851                               byteIndex, &source, sourceLimit,
   2852                               &target, targetLimit,
   2853                               &offsets, sourceIndex,
   2854                               pArgs->flush,
   2855                               pErrorCode);
   2856             sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
   2857 
   2858             if(U_FAILURE(*pErrorCode)) {
   2859                 /* not mappable or buffer overflow */
   2860                 break;
   2861             }
   2862         }
   2863     }
   2864 
   2865     /* set the converter state back into UConverter */
   2866     cnv->toUnicodeStatus=offset;
   2867     cnv->mode=state;
   2868     cnv->toULength=byteIndex;
   2869 
   2870     /* write back the updated pointers */
   2871     pArgs->source=(const char *)source;
   2872     pArgs->target=target;
   2873     pArgs->offsets=offsets;
   2874 }
   2875 
   2876 /*
   2877  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
   2878  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
   2879  */
   2880 static UChar32
   2881 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2882                         UErrorCode *pErrorCode) {
   2883     UConverter *cnv;
   2884     const int32_t (*stateTable)[256];
   2885     const uint8_t *source, *sourceLimit;
   2886 
   2887     int32_t entry;
   2888     uint8_t action;
   2889 
   2890     /* set up the local pointers */
   2891     cnv=pArgs->converter;
   2892     source=(const uint8_t *)pArgs->source;
   2893     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2894     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2895         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2896     } else {
   2897         stateTable=cnv->sharedData->mbcs.stateTable;
   2898     }
   2899 
   2900     /* conversion loop */
   2901     while(source<sourceLimit) {
   2902         entry=stateTable[0][*source++];
   2903         /* MBCS_ENTRY_IS_FINAL(entry) */
   2904 
   2905         /* write back the updated pointer early so that we can return directly */
   2906         pArgs->source=(const char *)source;
   2907 
   2908         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2909             /* output BMP code point */
   2910             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2911         }
   2912 
   2913         /*
   2914          * An if-else-if chain provides more reliable performance for
   2915          * the most common cases compared to a switch.
   2916          */
   2917         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2918         if( action==MBCS_STATE_VALID_DIRECT_20 ||
   2919             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2920         ) {
   2921             /* output supplementary code point */
   2922             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2923         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2924             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2925                 /* output BMP code point */
   2926                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2927             }
   2928         } else if(action==MBCS_STATE_UNASSIGNED) {
   2929             /* just fall through */
   2930         } else if(action==MBCS_STATE_ILLEGAL) {
   2931             /* callback(illegal) */
   2932             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2933         } else {
   2934             /* reserved, must never occur */
   2935             continue;
   2936         }
   2937 
   2938         if(U_FAILURE(*pErrorCode)) {
   2939             /* callback(illegal) */
   2940             break;
   2941         } else /* unassigned sequence */ {
   2942             /* defer to the generic implementation */
   2943             pArgs->source=(const char *)source-1;
   2944             return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2945         }
   2946     }
   2947 
   2948     /* no output because of empty input or only state changes */
   2949     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2950     return 0xffff;
   2951 }
   2952 
   2953 /*
   2954  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
   2955  * conversion without offset handling.
   2956  *
   2957  * When a character does not have a mapping to Unicode, then we return to the
   2958  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
   2959  * handling.
   2960  * We also defer to the generic code in other complicated cases and have them
   2961  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
   2962  *
   2963  * All normal mappings and errors are handled here.
   2964  */
   2965 static UChar32 U_CALLCONV
   2966 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2967                   UErrorCode *pErrorCode) {
   2968     UConverter *cnv;
   2969     const uint8_t *source, *sourceLimit, *lastSource;
   2970 
   2971     const int32_t (*stateTable)[256];
   2972     const uint16_t *unicodeCodeUnits;
   2973 
   2974     uint32_t offset;
   2975     uint8_t state;
   2976 
   2977     int32_t entry;
   2978     UChar32 c;
   2979     uint8_t action;
   2980 
   2981     /* use optimized function if possible */
   2982     cnv=pArgs->converter;
   2983 
   2984     if(cnv->preToULength>0) {
   2985         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
   2986         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2987     }
   2988 
   2989     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
   2990         /*
   2991          * Using the generic ucnv_getNextUChar() code lets us deal correctly
   2992          * with the rare case of a codepage that maps single surrogates
   2993          * without adding the complexity to this already complicated function here.
   2994          */
   2995         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2996     } else if(cnv->sharedData->mbcs.countStates==1) {
   2997         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
   2998     }
   2999 
   3000     /* set up the local pointers */
   3001     source=lastSource=(const uint8_t *)pArgs->source;
   3002     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   3003 
   3004     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3005         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   3006     } else {
   3007         stateTable=cnv->sharedData->mbcs.stateTable;
   3008     }
   3009     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   3010 
   3011     /* get the converter state from UConverter */
   3012     offset=cnv->toUnicodeStatus;
   3013 
   3014     /*
   3015      * if we are in the SBCS state for a DBCS-only converter,
   3016      * then load the DBCS state from the MBCS data
   3017      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   3018      */
   3019     if((state=(uint8_t)(cnv->mode))==0) {
   3020         state=cnv->sharedData->mbcs.dbcsOnlyState;
   3021     }
   3022 
   3023     /* conversion loop */
   3024     c=U_SENTINEL;
   3025     while(source<sourceLimit) {
   3026         entry=stateTable[state][*source++];
   3027         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3028             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3029             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3030 
   3031             /* optimization for 1/2-byte input and BMP output */
   3032             if( source<sourceLimit &&
   3033                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   3034                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   3035                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   3036             ) {
   3037                 ++source;
   3038                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   3039                 /* output BMP code point */
   3040                 break;
   3041             }
   3042         } else {
   3043             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   3044             cnv->mode=state;
   3045 
   3046             /* set the next state early so that we can reuse the entry variable */
   3047             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   3048 
   3049             /*
   3050              * An if-else-if chain provides more reliable performance for
   3051              * the most common cases compared to a switch.
   3052              */
   3053             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3054             if(action==MBCS_STATE_VALID_DIRECT_16) {
   3055                 /* output BMP code point */
   3056                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3057                 break;
   3058             } else if(action==MBCS_STATE_VALID_16) {
   3059                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3060                 c=unicodeCodeUnits[offset];
   3061                 if(c<0xfffe) {
   3062                     /* output BMP code point */
   3063                     break;
   3064                 } else if(c==0xfffe) {
   3065                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   3066                         break;
   3067                     }
   3068                 } else {
   3069                     /* callback(illegal) */
   3070                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3071                 }
   3072             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3073                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3074                 c=unicodeCodeUnits[offset++];
   3075                 if(c<0xd800) {
   3076                     /* output BMP code point below 0xd800 */
   3077                     break;
   3078                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   3079                     /* output roundtrip or fallback supplementary code point */
   3080                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
   3081                     break;
   3082                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3083                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3084                     c=unicodeCodeUnits[offset];
   3085                     break;
   3086                 } else if(c==0xffff) {
   3087                     /* callback(illegal) */
   3088                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3089                 }
   3090             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   3091                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   3092             ) {
   3093                 /* output supplementary code point */
   3094                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   3095                 break;
   3096             } else if(action==MBCS_STATE_CHANGE_ONLY) {
   3097                 /*
   3098                  * This serves as a state change without any output.
   3099                  * It is useful for reading simple stateful encodings,
   3100                  * for example using just Shift-In/Shift-Out codes.
   3101                  * The 21 unused bits may later be used for more sophisticated
   3102                  * state transitions.
   3103                  */
   3104                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
   3105                     /* SI/SO are illegal for DBCS-only conversion */
   3106                     state=(uint8_t)(cnv->mode); /* restore the previous state */
   3107 
   3108                     /* callback(illegal) */
   3109                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3110                 }
   3111             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3112                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   3113                     /* output BMP code point */
   3114                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3115                     break;
   3116                 }
   3117             } else if(action==MBCS_STATE_UNASSIGNED) {
   3118                 /* just fall through */
   3119             } else if(action==MBCS_STATE_ILLEGAL) {
   3120                 /* callback(illegal) */
   3121                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3122             } else {
   3123                 /* reserved (must never occur), or only state change */
   3124                 offset=0;
   3125                 lastSource=source;
   3126                 continue;
   3127             }
   3128 
   3129             /* end of action codes: prepare for a new character */
   3130             offset=0;
   3131 
   3132             if(U_FAILURE(*pErrorCode)) {
   3133                 /* callback(illegal) */
   3134                 break;
   3135             } else /* unassigned sequence */ {
   3136                 /* defer to the generic implementation */
   3137                 cnv->toUnicodeStatus=0;
   3138                 cnv->mode=state;
   3139                 pArgs->source=(const char *)lastSource;
   3140                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   3141             }
   3142         }
   3143     }
   3144 
   3145     if(c<0) {
   3146         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
   3147             /* incomplete character byte sequence */
   3148             uint8_t *bytes=cnv->toUBytes;
   3149             cnv->toULength=(int8_t)(source-lastSource);
   3150             do {
   3151                 *bytes++=*lastSource++;
   3152             } while(lastSource<source);
   3153             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3154         } else if(U_FAILURE(*pErrorCode)) {
   3155             /* callback(illegal) */
   3156             /*
   3157              * Ticket 5691: consistent illegal sequences:
   3158              * - We include at least the first byte in the illegal sequence.
   3159              * - If any of the non-initial bytes could be the start of a character,
   3160              *   we stop the illegal sequence before the first one of those.
   3161              */
   3162             UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   3163             uint8_t *bytes=cnv->toUBytes;
   3164             *bytes++=*lastSource++;     /* first byte */
   3165             if(lastSource==source) {
   3166                 cnv->toULength=1;
   3167             } else /* lastSource<source: multi-byte character */ {
   3168                 int8_t i;
   3169                 for(i=1;
   3170                     lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
   3171                     ++i
   3172                 ) {
   3173                     *bytes++=*lastSource++;
   3174                 }
   3175                 cnv->toULength=i;
   3176                 source=lastSource;
   3177             }
   3178         } else {
   3179             /* no output because of empty input or only state changes */
   3180             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   3181         }
   3182         c=0xffff;
   3183     }
   3184 
   3185     /* set the converter state back into UConverter, ready for a new character */
   3186     cnv->toUnicodeStatus=0;
   3187     cnv->mode=state;
   3188 
   3189     /* write back the updated pointer */
   3190     pArgs->source=(const char *)source;
   3191     return c;
   3192 }
   3193 
   3194 #if 0
   3195 /*
   3196  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3197  * Removal improves code coverage.
   3198  */
   3199 /**
   3200  * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
   3201  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3202  * It does not handle conversion extensions (_extToU()).
   3203  */
   3204 U_CFUNC UChar32
   3205 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
   3206                               uint8_t b, UBool useFallback) {
   3207     int32_t entry;
   3208     uint8_t action;
   3209 
   3210     entry=sharedData->mbcs.stateTable[0][b];
   3211     /* MBCS_ENTRY_IS_FINAL(entry) */
   3212 
   3213     if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   3214         /* output BMP code point */
   3215         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3216     }
   3217 
   3218     /*
   3219      * An if-else-if chain provides more reliable performance for
   3220      * the most common cases compared to a switch.
   3221      */
   3222     action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3223     if(action==MBCS_STATE_VALID_DIRECT_20) {
   3224         /* output supplementary code point */
   3225         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3226     } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3227         if(!TO_U_USE_FALLBACK(useFallback)) {
   3228             return 0xfffe;
   3229         }
   3230         /* output BMP code point */
   3231         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3232     } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3233         if(!TO_U_USE_FALLBACK(useFallback)) {
   3234             return 0xfffe;
   3235         }
   3236         /* output supplementary code point */
   3237         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3238     } else if(action==MBCS_STATE_UNASSIGNED) {
   3239         return 0xfffe;
   3240     } else if(action==MBCS_STATE_ILLEGAL) {
   3241         return 0xffff;
   3242     } else {
   3243         /* reserved, must never occur */
   3244         return 0xffff;
   3245     }
   3246 }
   3247 #endif
   3248 
   3249 /*
   3250  * This is a simple version of _MBCSGetNextUChar() that is used
   3251  * by other converter implementations.
   3252  * It only returns an "assigned" result if it consumes the entire input.
   3253  * It does not use state from the converter, nor error codes.
   3254  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3255  * It handles conversion extensions but not GB 18030.
   3256  *
   3257  * Return value:
   3258  * U+fffe   unassigned
   3259  * U+ffff   illegal
   3260  * otherwise the Unicode code point
   3261  */
   3262 U_CFUNC UChar32
   3263 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
   3264                         const char *source, int32_t length,
   3265                         UBool useFallback) {
   3266     const int32_t (*stateTable)[256];
   3267     const uint16_t *unicodeCodeUnits;
   3268 
   3269     uint32_t offset;
   3270     uint8_t state, action;
   3271 
   3272     UChar32 c;
   3273     int32_t i, entry;
   3274 
   3275     if(length<=0) {
   3276         /* no input at all: "illegal" */
   3277         return 0xffff;
   3278     }
   3279 
   3280 #if 0
   3281 /*
   3282  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3283  * TODO In future releases, verify that this function is never called for SBCS
   3284  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
   3285  * Removal improves code coverage.
   3286  */
   3287     /* use optimized function if possible */
   3288     if(sharedData->mbcs.countStates==1) {
   3289         if(length==1) {
   3290             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
   3291         } else {
   3292             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
   3293         }
   3294     }
   3295 #endif
   3296 
   3297     /* set up the local pointers */
   3298     stateTable=sharedData->mbcs.stateTable;
   3299     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
   3300 
   3301     /* converter state */
   3302     offset=0;
   3303     state=sharedData->mbcs.dbcsOnlyState;
   3304 
   3305     /* conversion loop */
   3306     for(i=0;;) {
   3307         entry=stateTable[state][(uint8_t)source[i++]];
   3308         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3309             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3310             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3311 
   3312             if(i==length) {
   3313                 return 0xffff; /* truncated character */
   3314             }
   3315         } else {
   3316             /*
   3317              * An if-else-if chain provides more reliable performance for
   3318              * the most common cases compared to a switch.
   3319              */
   3320             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3321             if(action==MBCS_STATE_VALID_16) {
   3322                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3323                 c=unicodeCodeUnits[offset];
   3324                 if(c!=0xfffe) {
   3325                     /* done */
   3326                 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   3327                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
   3328                 /* else done with 0xfffe */
   3329                 }
   3330                 break;
   3331             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   3332                 /* output BMP code point */
   3333                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3334                 break;
   3335             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3336                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3337                 c=unicodeCodeUnits[offset++];
   3338                 if(c<0xd800) {
   3339                     /* output BMP code point below 0xd800 */
   3340                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   3341                     /* output roundtrip or fallback supplementary code point */
   3342                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
   3343                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3344                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3345                     c=unicodeCodeUnits[offset];
   3346                 } else if(c==0xffff) {
   3347                     return 0xffff;
   3348                 } else {
   3349                     c=0xfffe;
   3350                 }
   3351                 break;
   3352             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
   3353                 /* output supplementary code point */
   3354                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3355                 break;
   3356             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3357                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3358                     c=0xfffe;
   3359                     break;
   3360                 }
   3361                 /* output BMP code point */
   3362                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3363                 break;
   3364             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3365                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3366                     c=0xfffe;
   3367                     break;
   3368                 }
   3369                 /* output supplementary code point */
   3370                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3371                 break;
   3372             } else if(action==MBCS_STATE_UNASSIGNED) {
   3373                 c=0xfffe;
   3374                 break;
   3375             }
   3376 
   3377             /*
   3378              * forbid MBCS_STATE_CHANGE_ONLY for this function,
   3379              * and MBCS_STATE_ILLEGAL and reserved action codes
   3380              */
   3381             return 0xffff;
   3382         }
   3383     }
   3384 
   3385     if(i!=length) {
   3386         /* illegal for this function: not all input consumed */
   3387         return 0xffff;
   3388     }
   3389 
   3390     if(c==0xfffe) {
   3391         /* try an extension mapping */
   3392         const int32_t *cx=sharedData->mbcs.extIndexes;
   3393         if(cx!=NULL) {
   3394             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
   3395         }
   3396     }
   3397 
   3398     return c;
   3399 }
   3400 
   3401 /* MBCS-from-Unicode conversion functions ----------------------------------- */
   3402 
   3403 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
   3404 static void
   3405 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3406                                   UErrorCode *pErrorCode) {
   3407     UConverter *cnv;
   3408     const UChar *source, *sourceLimit;
   3409     uint8_t *target;
   3410     int32_t targetCapacity;
   3411     int32_t *offsets;
   3412 
   3413     const uint16_t *table;
   3414     const uint16_t *mbcsIndex;
   3415     const uint8_t *bytes;
   3416 
   3417     UChar32 c;
   3418 
   3419     int32_t sourceIndex, nextSourceIndex;
   3420 
   3421     uint32_t stage2Entry;
   3422     uint32_t asciiRoundtrips;
   3423     uint32_t value;
   3424     uint8_t unicodeMask;
   3425 
   3426     /* use optimized function if possible */
   3427     cnv=pArgs->converter;
   3428     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3429 
   3430     /* set up the local pointers */
   3431     source=pArgs->source;
   3432     sourceLimit=pArgs->sourceLimit;
   3433     target=(uint8_t *)pArgs->target;
   3434     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3435     offsets=pArgs->offsets;
   3436 
   3437     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3438     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3439     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3440         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3441     } else {
   3442         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   3443     }
   3444     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3445 
   3446     /* get the converter state from UConverter */
   3447     c=cnv->fromUChar32;
   3448 
   3449     /* sourceIndex=-1 if the current character began in the previous buffer */
   3450     sourceIndex= c==0 ? 0 : -1;
   3451     nextSourceIndex=0;
   3452 
   3453     /* conversion loop */
   3454     if(c!=0 && targetCapacity>0) {
   3455         goto getTrail;
   3456     }
   3457 
   3458     while(source<sourceLimit) {
   3459         /*
   3460          * This following test is to see if available input would overflow the output.
   3461          * It does not catch output of more than one byte that
   3462          * overflows as a result of a multi-byte character or callback output
   3463          * from the last source character.
   3464          * Therefore, those situations also test for overflows and will
   3465          * then break the loop, too.
   3466          */
   3467         if(targetCapacity>0) {
   3468             /*
   3469              * Get a correct Unicode code point:
   3470              * a single UChar for a BMP code point or
   3471              * a matched surrogate pair for a "supplementary code point".
   3472              */
   3473             c=*source++;
   3474             ++nextSourceIndex;
   3475             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3476                 *target++=(uint8_t)c;
   3477                 if(offsets!=NULL) {
   3478                     *offsets++=sourceIndex;
   3479                     sourceIndex=nextSourceIndex;
   3480                 }
   3481                 --targetCapacity;
   3482                 c=0;
   3483                 continue;
   3484             }
   3485             /*
   3486              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   3487              * to avoid dealing with surrogates.
   3488              * MBCS_FAST_MAX must be >=0xd7ff.
   3489              */
   3490             if(c<=0xd7ff) {
   3491                 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
   3492                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   3493                 if(value==0) {
   3494                     goto unassigned;
   3495                 }
   3496                 /* output the value */
   3497             } else {
   3498                 /*
   3499                  * This also tests if the codepage maps single surrogates.
   3500                  * If it does, then surrogates are not paired but mapped separately.
   3501                  * Note that in this case unmatched surrogates are not detected.
   3502                  */
   3503                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3504                     if(U16_IS_SURROGATE_LEAD(c)) {
   3505 getTrail:
   3506                         if(source<sourceLimit) {
   3507                             /* test the following code unit */
   3508                             UChar trail=*source;
   3509                             if(U16_IS_TRAIL(trail)) {
   3510                                 ++source;
   3511                                 ++nextSourceIndex;
   3512                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   3513                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3514                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3515                                     /* callback(unassigned) */
   3516                                     goto unassigned;
   3517                                 }
   3518                                 /* convert this supplementary code point */
   3519                                 /* exit this condition tree */
   3520                             } else {
   3521                                 /* this is an unmatched lead code unit (1st surrogate) */
   3522                                 /* callback(illegal) */
   3523                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3524                                 break;
   3525                             }
   3526                         } else {
   3527                             /* no more input */
   3528                             break;
   3529                         }
   3530                     } else {
   3531                         /* this is an unmatched trail code unit (2nd surrogate) */
   3532                         /* callback(illegal) */
   3533                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3534                         break;
   3535                     }
   3536                 }
   3537 
   3538                 /* convert the Unicode code point in c into codepage bytes */
   3539                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   3540 
   3541                 /* get the bytes and the length for the output */
   3542                 /* MBCS_OUTPUT_2 */
   3543                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   3544 
   3545                 /* is this code point assigned, or do we use fallbacks? */
   3546                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   3547                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   3548                 ) {
   3549                     /*
   3550                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   3551                      * There is no way with this data structure for fallback output
   3552                      * to be a zero byte.
   3553                      */
   3554 
   3555 unassigned:
   3556                     /* try an extension mapping */
   3557                     pArgs->source=source;
   3558                     c=_extFromU(cnv, cnv->sharedData,
   3559                                 c, &source, sourceLimit,
   3560                                 &target, target+targetCapacity,
   3561                                 &offsets, sourceIndex,
   3562                                 pArgs->flush,
   3563                                 pErrorCode);
   3564                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   3565 
   3566                     if(U_FAILURE(*pErrorCode)) {
   3567                         /* not mappable or buffer overflow */
   3568                         break;
   3569                     } else {
   3570                         /* a mapping was written to the target, continue */
   3571 
   3572                         /* recalculate the targetCapacity after an extension mapping */
   3573                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3574 
   3575                         /* normal end of conversion: prepare for a new character */
   3576                         sourceIndex=nextSourceIndex;
   3577                         continue;
   3578                     }
   3579                 }
   3580             }
   3581 
   3582             /* write the output character bytes from value and length */
   3583             /* from the first if in the loop we know that targetCapacity>0 */
   3584             if(value<=0xff) {
   3585                 /* this is easy because we know that there is enough space */
   3586                 *target++=(uint8_t)value;
   3587                 if(offsets!=NULL) {
   3588                     *offsets++=sourceIndex;
   3589                 }
   3590                 --targetCapacity;
   3591             } else /* length==2 */ {
   3592                 *target++=(uint8_t)(value>>8);
   3593                 if(2<=targetCapacity) {
   3594                     *target++=(uint8_t)value;
   3595                     if(offsets!=NULL) {
   3596                         *offsets++=sourceIndex;
   3597                         *offsets++=sourceIndex;
   3598                     }
   3599                     targetCapacity-=2;
   3600                 } else {
   3601                     if(offsets!=NULL) {
   3602                         *offsets++=sourceIndex;
   3603                     }
   3604                     cnv->charErrorBuffer[0]=(char)value;
   3605                     cnv->charErrorBufferLength=1;
   3606 
   3607                     /* target overflow */
   3608                     targetCapacity=0;
   3609                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3610                     c=0;
   3611                     break;
   3612                 }
   3613             }
   3614 
   3615             /* normal end of conversion: prepare for a new character */
   3616             c=0;
   3617             sourceIndex=nextSourceIndex;
   3618             continue;
   3619         } else {
   3620             /* target is full */
   3621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3622             break;
   3623         }
   3624     }
   3625 
   3626     /* set the converter state back into UConverter */
   3627     cnv->fromUChar32=c;
   3628 
   3629     /* write back the updated pointers */
   3630     pArgs->source=source;
   3631     pArgs->target=(char *)target;
   3632     pArgs->offsets=offsets;
   3633 }
   3634 
   3635 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
        /*
         * Converts the UTF-16 text in pArgs->source..pArgs->sourceLimit into
         * single codepage bytes at pArgs->target. Surrogate pairs are combined
         * into supplementary code points before the table lookup.
         *
         * pArgs      - in/out conversion arguments; source, target and offsets
         *              are written back with the updated positions on return.
         * pErrorCode - in/out error code. This function sets it to
         *              U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
         *              U_BUFFER_OVERFLOW_ERROR when the target is full while
         *              input remains; _extFromU() may set it as well.
         *
         * If offsets is not NULL, the source index of the originating character
         * is stored for each byte that this function writes directly.
         * A lead surrogate at the very end of the input is left in c and saved
         * in cnv->fromUChar32; the next call resumes with it at getTrail.
         */
   3636 static void
   3637 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3638                                   UErrorCode *pErrorCode) {
   3639     UConverter *cnv;
   3640     const UChar *source, *sourceLimit;
   3641     uint8_t *target;
   3642     int32_t targetCapacity;
   3643     int32_t *offsets;
   3644 
   3645     const uint16_t *table;
   3646     const uint16_t *results;
   3647 
   3648     UChar32 c;
   3649 
   3650     int32_t sourceIndex, nextSourceIndex;
   3651 
   3652     uint16_t value, minValue;
   3653     UBool hasSupplementary;
   3654 
   3655     /* set up the local pointers */
   3656     cnv=pArgs->converter;
   3657     source=pArgs->source;
   3658     sourceLimit=pArgs->sourceLimit;
   3659     target=(uint8_t *)pArgs->target;
   3660     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3661     offsets=pArgs->offsets;
   3662 
            /* pick the result table; a separate one is used when the LF/NL swap option is set */
   3663     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3664     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3665         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3666     } else {
   3667         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3668     }
   3669 
            /* a table result is written only if value>=minValue (tested in the loop below) */
   3670     if(cnv->useFallback) {
   3671         /* use all roundtrip and fallback results */
   3672         minValue=0x800;
   3673     } else {
   3674         /* use only roundtrips and fallbacks from private-use characters */
   3675         minValue=0xc00;
   3676     }
            /* non-zero if the UCNV_HAS_SUPPLEMENTARY bit is set in the codepage's unicodeMask */
   3677     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   3678 
   3679     /* get the converter state from UConverter */
   3680     c=cnv->fromUChar32;
   3681 
   3682     /* sourceIndex=-1 if the current character began in the previous buffer */
   3683     sourceIndex= c==0 ? 0 : -1;
            /* nextSourceIndex counts the code units consumed from this buffer */
   3684     nextSourceIndex=0;
   3685 
   3686     /* conversion loop */
            /*
             * c!=0: a code unit was saved in the converter by an earlier call.
             * Jump straight into the loop body to look for its trail surrogate.
             * With no target capacity the jump is skipped and the loop below
             * reports the overflow if there is input.
             */
   3687     if(c!=0 && targetCapacity>0) {
   3688         goto getTrail;
   3689     }
   3690 
   3691     while(source<sourceLimit) {
   3692         /*
   3693          * This following test is to see if available input would overflow the output.
   3694          * It does not catch output of more than one byte that
   3695          * overflows as a result of a multi-byte character or callback output
   3696          * from the last source character.
   3697          * Therefore, those situations also test for overflows and will
   3698          * then break the loop, too.
   3699          */
   3700         if(targetCapacity>0) {
   3701             /*
   3702              * Get a correct Unicode code point:
   3703              * a single UChar for a BMP code point or
   3704              * a matched surrogate pair for a "supplementary code point".
   3705              */
   3706             c=*source++;
   3707             ++nextSourceIndex;
   3708             if(U16_IS_SURROGATE(c)) {
   3709                 if(U16_IS_SURROGATE_LEAD(c)) {
   3710 getTrail:
   3711                     if(source<sourceLimit) {
   3712                         /* test the following code unit */
   3713                         UChar trail=*source;
   3714                         if(U16_IS_TRAIL(trail)) {
   3715                             ++source;
   3716                             ++nextSourceIndex;
   3717                             c=U16_GET_SUPPLEMENTARY(c, trail);
   3718                             if(!hasSupplementary) {
   3719                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3720                                 /* callback(unassigned) */
   3721                                 goto unassigned;
   3722                             }
   3723                             /* convert this supplementary code point */
   3724                             /* exit this condition tree */
   3725                         } else {
   3726                             /* this is an unmatched lead code unit (1st surrogate) */
   3727                             /* callback(illegal) */
   3728                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3729                             break;
   3730                         }
   3731                     } else {
   3732                         /* no more input */
                                /* c still holds the lead surrogate; it is saved in cnv->fromUChar32 after the loop */
   3733                         break;
   3734                     }
   3735                 } else {
   3736                     /* this is an unmatched trail code unit (2nd surrogate) */
   3737                     /* callback(illegal) */
   3738                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3739                     break;
   3740                 }
   3741             }
   3742 
   3743             /* convert the Unicode code point in c into codepage bytes */
   3744             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3745 
   3746             /* is this code point assigned, or do we use fallbacks? */
   3747             if(value>=minValue) {
   3748                 /* assigned, write the output character bytes from value and length */
   3749                 /* length==1 */
   3750                 /* this is easy because we know that there is enough space */
                        /* only the low 8 bits of value are the output byte */
   3751                 *target++=(uint8_t)value;
   3752                 if(offsets!=NULL) {
   3753                     *offsets++=sourceIndex;
   3754                 }
   3755                 --targetCapacity;
   3756 
   3757                 /* normal end of conversion: prepare for a new character */
   3758                 c=0;
   3759                 sourceIndex=nextSourceIndex;
   3760             } else { /* unassigned */
   3761 unassigned:
   3762                 /* try an extension mapping */
                        /*
                         * Remember the current position in pArgs->source so that the
                         * number of additional code units consumed by _extFromU()
                         * can be added to nextSourceIndex afterwards.
                         */
   3763                 pArgs->source=source;
   3764                 c=_extFromU(cnv, cnv->sharedData,
   3765                             c, &source, sourceLimit,
   3766                             &target, target+targetCapacity,
   3767                             &offsets, sourceIndex,
   3768                             pArgs->flush,
   3769                             pErrorCode);
   3770                 nextSourceIndex+=(int32_t)(source-pArgs->source);
   3771 
   3772                 if(U_FAILURE(*pErrorCode)) {
   3773                     /* not mappable or buffer overflow */
   3774                     break;
   3775                 } else {
   3776                     /* a mapping was written to the target, continue */
   3777 
   3778                     /* recalculate the targetCapacity after an extension mapping */
   3779                     targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3780 
   3781                     /* normal end of conversion: prepare for a new character */
   3782                     sourceIndex=nextSourceIndex;
   3783                 }
   3784             }
   3785         } else {
   3786             /* target is full */
   3787             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3788             break;
   3789         }
   3790     }
   3791 
   3792     /* set the converter state back into UConverter */
            /* c is 0 after a directly written character, otherwise the pending/offending code point or the _extFromU() result */
   3793     cnv->fromUChar32=c;
   3794 
   3795     /* write back the updated pointers */
   3796     pArgs->source=source;
   3797     pArgs->target=(char *)target;
   3798     pArgs->offsets=offsets;
   3799 }
   3800 
   3801 /*
   3802  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
   3803  * that map only to and from the BMP.
   3804  * In addition to single-byte/state optimizations, the offset calculations
   3805  * become much easier.
   3806  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
   3807  * but measurements have shown that this diminishes performance
   3808  * in more cases than it improves it.
   3809  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
   3810  * for various MBCS and SBCS optimizations.
   3811  */
        /*
         * pArgs      - in/out conversion arguments; source, target and offsets
         *              are written back with the updated positions on return.
         * pErrorCode - in/out error code. This function sets it to
         *              U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, to
         *              U_TRUNCATED_CHAR_FOUND for a lead surrogate at the end of
         *              the input when pArgs->flush is set, and to
         *              U_BUFFER_OVERFLOW_ERROR when the target is full while
         *              input remains; _extFromU() may set it as well.
         *
         * Offsets are not stored as each byte is written. Instead, lastSource
         * marks the source position up to which offsets have been accounted for,
         * and the offsets for everything since then are filled in as consecutive
         * sourceIndex values just before an extension lookup and once more at
         * the end of the function.
         */
   3812 static void
   3813 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3814                               UErrorCode *pErrorCode) {
   3815     UConverter *cnv;
   3816     const UChar *source, *sourceLimit, *lastSource;
   3817     uint8_t *target;
   3818     int32_t targetCapacity, length;
   3819     int32_t *offsets;
   3820 
   3821     const uint16_t *table;
   3822     const uint16_t *results;
   3823 
   3824     UChar32 c;
   3825 
   3826     int32_t sourceIndex;
   3827 
   3828     uint32_t asciiRoundtrips;
   3829     uint16_t value, minValue;
   3830 
   3831     /* set up the local pointers */
   3832     cnv=pArgs->converter;
   3833     source=pArgs->source;
   3834     sourceLimit=pArgs->sourceLimit;
   3835     target=(uint8_t *)pArgs->target;
   3836     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3837     offsets=pArgs->offsets;
   3838 
            /* pick the result table; a separate one is used when the LF/NL swap option is set */
   3839     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3840     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3841         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3842     } else {
   3843         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3844     }
            /* consulted via IS_ASCII_ROUNDTRIP() for the c<=0x7f shortcut in the loop */
   3845     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3846 
            /* a table result is written only if value>=minValue (tested in the loop below) */
   3847     if(cnv->useFallback) {
   3848         /* use all roundtrip and fallback results */
   3849         minValue=0x800;
   3850     } else {
   3851         /* use only roundtrips and fallbacks from private-use characters */
   3852         minValue=0xc00;
   3853     }
   3854 
   3855     /* get the converter state from UConverter */
   3856     c=cnv->fromUChar32;
   3857 
   3858     /* sourceIndex=-1 if the current character began in the previous buffer */
   3859     sourceIndex= c==0 ? 0 : -1;
            /* no offsets have been written yet for anything at or after lastSource */
   3860     lastSource=source;
   3861 
   3862     /*
   3863      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   3864      * for the minimum of the sourceLength and targetCapacity
   3865      */
   3866     length=(int32_t)(sourceLimit-source);
   3867     if(length<targetCapacity) {
   3868         targetCapacity=length;
   3869     }
   3870 
   3871     /* conversion loop */
            /*
             * c!=0: a code unit was saved in the converter by an earlier call.
             * Jump straight into the loop body to look for its trail surrogate.
             * Because targetCapacity was clamped above, the jump is also skipped
             * when there is no input in this buffer.
             */
   3872     if(c!=0 && targetCapacity>0) {
   3873         goto getTrail;
   3874     }
   3875 
   3876 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3877     /* unrolling makes it slower on Pentium III/Windows 2000?! */
   3878     /* unroll the loop with the most common case */
   3879 unrolled:
   3880     if(targetCapacity>=4) {
   3881         int32_t count, loops;
   3882         uint16_t andedValues;
   3883 
                /*
                 * Convert four code units per iteration and write their bytes
                 * unconditionally; the four table results are ANDed together and
                 * compared against minValue once per group.
                 */
   3884         loops=count=targetCapacity>>2;
   3885         do {
   3886             c=*source++;
   3887             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3888             *target++=(uint8_t)value;
   3889             c=*source++;
   3890             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3891             *target++=(uint8_t)value;
   3892             c=*source++;
   3893             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3894             *target++=(uint8_t)value;
   3895             c=*source++;
   3896             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3897             *target++=(uint8_t)value;
   3898 
   3899             /* were all 4 entries really valid? */
   3900             if(andedValues<minValue) {
   3901                 /* no, return to the first of these 4 */
                        /* the regular loop below then redoes this group one unit at a time */
   3902                 source-=4;
   3903                 target-=4;
   3904                 break;
   3905             }
   3906         } while(--count>0);
                /* count becomes the number of groups of 4 that were fully accepted */
   3907         count=loops-count;
   3908         targetCapacity-=4*count;
   3909 
   3910         if(offsets!=NULL) {
                    /* offsets for the accepted groups are written here, so move lastSource past them */
   3911             lastSource+=4*count;
   3912             while(count>0) {
   3913                 *offsets++=sourceIndex++;
   3914                 *offsets++=sourceIndex++;
   3915                 *offsets++=sourceIndex++;
   3916                 *offsets++=sourceIndex++;
   3917                 --count;
   3918             }
   3919         }
   3920 
   3921         c=0;
   3922     }
   3923 #endif
   3924 
            /* targetCapacity is min(target space, source length), so it also bounds the reads from source */
   3925     while(targetCapacity>0) {
   3926         /*
   3927          * Get a correct Unicode code point:
   3928          * a single UChar for a BMP code point or
   3929          * a matched surrogate pair for a "supplementary code point".
   3930          */
   3931         c=*source++;
   3932         /*
   3933          * Do not immediately check for single surrogates:
   3934          * Assume that they are unassigned and check for them in that case.
   3935          * This speeds up the conversion of assigned characters.
   3936          */
   3937         /* convert the Unicode code point in c into codepage bytes */
                /* shortcut: an ASCII character flagged as a roundtrip is output as its own byte value, without a table lookup */
   3938         if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3939             *target++=(uint8_t)c;
   3940             --targetCapacity;
   3941             c=0;
   3942             continue;
   3943         }
   3944         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3945         /* is this code point assigned, or do we use fallbacks? */
   3946         if(value>=minValue) {
   3947             /* assigned, write the output character bytes from value and length */
   3948             /* length==1 */
   3949             /* this is easy because we know that there is enough space */
   3950             *target++=(uint8_t)value;
   3951             --targetCapacity;
   3952 
   3953             /* normal end of conversion: prepare for a new character */
   3954             c=0;
   3955             continue;
   3956         } else if(!U16_IS_SURROGATE(c)) {
   3957             /* normal, unassigned BMP character */
   3958         } else if(U16_IS_SURROGATE_LEAD(c)) {
   3959 getTrail:
   3960             if(source<sourceLimit) {
   3961                 /* test the following code unit */
   3962                 UChar trail=*source;
   3963                 if(U16_IS_TRAIL(trail)) {
   3964                     ++source;
   3965                     c=U16_GET_SUPPLEMENTARY(c, trail);
   3966                     /* this codepage does not map supplementary code points */
   3967                     /* callback(unassigned) */
   3968                 } else {
   3969                     /* this is an unmatched lead code unit (1st surrogate) */
   3970                     /* callback(illegal) */
   3971                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3972                     break;
   3973                 }
   3974             } else {
   3975                 /* no more input */
                        /* c keeps the lead surrogate; it is saved in cnv->fromUChar32 after the loop */
   3976                 if (pArgs->flush) {
   3977                     *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3978                 }
   3979                 break;
   3980             }
   3981         } else {
   3982             /* this is an unmatched trail code unit (2nd surrogate) */
   3983             /* callback(illegal) */
   3984             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3985             break;
   3986         }
   3987 
   3988         /* c does not have a mapping */
   3989 
   3990         /* get the number of code units for c to correctly advance sourceIndex */
   3991         length=U16_LENGTH(c);
   3992 
   3993         /* set offsets since the start or the last extension */
   3994         if(offsets!=NULL) {
   3995             int32_t count=(int32_t)(source-lastSource);
   3996 
   3997             /* do not set the offset for this character */
   3998             count-=length;
   3999 
   4000             while(count>0) {
   4001                 *offsets++=sourceIndex++;
   4002                 --count;
   4003             }
   4004             /* offsets and sourceIndex are now set for the current character */
   4005         }
   4006 
   4007         /* try an extension mapping */
                /*
                 * lastSource is set to the position before the call so that the
                 * additional input consumed by _extFromU() can be measured.
                 * The real pArgs->targetLimit is passed rather than
                 * target+targetCapacity, since the local targetCapacity may have
                 * been clamped to the source length.
                 */
   4008         lastSource=source;
   4009         c=_extFromU(cnv, cnv->sharedData,
   4010                     c, &source, sourceLimit,
   4011                     &target, (const uint8_t *)(pArgs->targetLimit),
   4012                     &offsets, sourceIndex,
   4013                     pArgs->flush,
   4014                     pErrorCode);
                /* step past the code units of c plus whatever further input _extFromU() consumed */
   4015         sourceIndex+=length+(int32_t)(source-lastSource);
   4016         lastSource=source;
   4017 
   4018         if(U_FAILURE(*pErrorCode)) {
   4019             /* not mappable or buffer overflow */
   4020             break;
   4021         } else {
   4022             /* a mapping was written to the target, continue */
   4023 
   4024             /* recalculate the targetCapacity after an extension mapping */
   4025             targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
                    /* clamp to the remaining source length again, as at the start of the function */
   4026             length=(int32_t)(sourceLimit-source);
   4027             if(length<targetCapacity) {
   4028                 targetCapacity=length;
   4029             }
   4030         }
   4031 
   4032 #if MBCS_UNROLL_SINGLE_FROM_BMP
   4033         /* unrolling makes it slower on Pentium III/Windows 2000?! */
   4034         goto unrolled;
   4035 #endif
   4036     }
   4037 
            /*
             * The loop also ends when the clamped targetCapacity reaches 0, so
             * check the real target limit here to tell "input exhausted" from
             * "target full with input left over".
             */
   4038     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
   4039         /* target is full */
   4040         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4041     }
   4042 
   4043     /* set offsets since the start or the last callback */
   4044     if(offsets!=NULL) {
   4045         size_t count=source-lastSource;
   4046         if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
   4047             /*
   4048             Caller gave us a partial supplementary character,
   4049             which this function couldn't convert in any case.
   4050             The callback will handle the offset.
   4051             */
   4052             count--;
   4053         }
   4054         while(count>0) {
   4055             *offsets++=sourceIndex++;
   4056             --count;
   4057         }
   4058     }
   4059 
   4060     /* set the converter state back into UConverter */
            /* c is 0 after a directly written character, otherwise the pending/offending code point or the _extFromU() result */
   4061     cnv->fromUChar32=c;
   4062 
   4063     /* write back the updated pointers */
   4064     pArgs->source=source;
   4065     pArgs->target=(char *)target;
   4066     pArgs->offsets=offsets;
   4067 }
   4068 
   4069 U_CFUNC void
   4070 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   4071                             UErrorCode *pErrorCode) {
   4072     UConverter *cnv;
   4073     const UChar *source, *sourceLimit;
   4074     uint8_t *target;
   4075     int32_t targetCapacity;
   4076     int32_t *offsets;
   4077 
   4078     const uint16_t *table;
   4079     const uint16_t *mbcsIndex;
   4080     const uint8_t *p, *bytes;
   4081     uint8_t outputType;
   4082 
   4083     UChar32 c;
   4084 
   4085     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
   4086 
   4087     uint32_t stage2Entry;
   4088     uint32_t asciiRoundtrips;
   4089     uint32_t value;
   4090     /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
   4091     uint8_t siBytes[2] = {0, 0};
   4092     uint8_t soBytes[2] = {0, 0};
   4093     uint8_t siLength, soLength;
   4094     int32_t length = 0, prevLength;
   4095     uint8_t unicodeMask;
   4096 
   4097     cnv=pArgs->converter;
   4098 
   4099     if(cnv->preFromUFirstCP>=0) {
   4100         /*
   4101          * pass sourceIndex=-1 because we continue from an earlier buffer
   4102          * in the future, this may change with continuous offsets
   4103          */
   4104         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
   4105 
   4106         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
   4107             return;
   4108         }
   4109     }
   4110 
   4111     /* use optimized function if possible */
   4112     outputType=cnv->sharedData->mbcs.outputType;
   4113     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   4114     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4115         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4116             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
   4117         } else {
   4118             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
   4119         }
   4120         return;
   4121     } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
   4122         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
   4123         return;
   4124     }
   4125 
   4126     /* set up the local pointers */
   4127     source=pArgs->source;
   4128     sourceLimit=pArgs->sourceLimit;
   4129     target=(uint8_t *)pArgs->target;
   4130     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   4131     offsets=pArgs->offsets;
   4132 
   4133     table=cnv->sharedData->mbcs.fromUnicodeTable;
   4134     if(cnv->sharedData->mbcs.utf8Friendly) {
   4135         mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   4136     } else {
   4137         mbcsIndex=NULL;
   4138     }
   4139     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   4140         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   4141     } else {
   4142         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   4143     }
   4144     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   4145 
   4146     /* get the converter state from UConverter */
   4147     c=cnv->fromUChar32;
   4148 
   4149     if(outputType==MBCS_OUTPUT_2_SISO) {
   4150         prevLength=cnv->fromUnicodeStatus;
   4151         if(prevLength==0) {
   4152             /* set the real value */
   4153             prevLength=1;
   4154         }
   4155     } else {
   4156         /* prevent fromUnicodeStatus from being set to something non-0 */
   4157         prevLength=0;
   4158     }
   4159 
   4160     /* sourceIndex=-1 if the current character began in the previous buffer */
   4161     prevSourceIndex=-1;
   4162     sourceIndex= c==0 ? 0 : -1;
   4163     nextSourceIndex=0;
   4164 
   4165     /* Get the SI/SO character for the converter */
   4166     siLength = getSISOBytes(SI, cnv->options, siBytes);
   4167     soLength = getSISOBytes(SO, cnv->options, soBytes);
   4168 
   4169     /* conversion loop */
   4170     /*
   4171      * This is another piece of ugly code:
   4172      * A goto into the loop if the converter state contains a first surrogate
   4173      * from the previous function call.
   4174      * It saves me to check in each loop iteration a check of if(c==0)
   4175      * and duplicating the trail-surrogate-handling code in the else
   4176      * branch of that check.
   4177      * I could not find any other way to get around this other than
   4178      * using a function call for the conversion and callback, which would
   4179      * be even more inefficient.
   4180      *
   4181      * Markus Scherer 2000-jul-19
   4182      */
   4183     if(c!=0 && targetCapacity>0) {
   4184         goto getTrail;
   4185     }
   4186 
   4187     while(source<sourceLimit) {
   4188         /*
   4189          * This following test is to see if available input would overflow the output.
   4190          * It does not catch output of more than one byte that
   4191          * overflows as a result of a multi-byte character or callback output
   4192          * from the last source character.
   4193          * Therefore, those situations also test for overflows and will
   4194          * then break the loop, too.
   4195          */
   4196         if(targetCapacity>0) {
   4197             /*
   4198              * Get a correct Unicode code point:
   4199              * a single UChar for a BMP code point or
   4200              * a matched surrogate pair for a "supplementary code point".
   4201              */
   4202             c=*source++;
   4203             ++nextSourceIndex;
   4204             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   4205                 *target++=(uint8_t)c;
   4206                 if(offsets!=NULL) {
   4207                     *offsets++=sourceIndex;
   4208                     prevSourceIndex=sourceIndex;
   4209                     sourceIndex=nextSourceIndex;
   4210                 }
   4211                 --targetCapacity;
   4212                 c=0;
   4213                 continue;
   4214             }
   4215             /*
   4216              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   4217              * to avoid dealing with surrogates.
   4218              * MBCS_FAST_MAX must be >=0xd7ff.
   4219              */
   4220             if(c<=0xd7ff && mbcsIndex!=NULL) {
   4221                 value=mbcsIndex[c>>6];
   4222 
   4223                 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
   4224                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   4225                 switch(outputType) {
   4226                 case MBCS_OUTPUT_2:
   4227                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4228                     if(value<=0xff) {
   4229                         if(value==0) {
   4230                             goto unassigned;
   4231                         } else {
   4232                             length=1;
   4233                         }
   4234                     } else {
   4235                         length=2;
   4236                     }
   4237                     break;
   4238                 case MBCS_OUTPUT_2_SISO:
   4239                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4240                     /*
   4241                      * Save the old state in the converter object
   4242                      * right here, then change the local prevLength state variable if necessary.
   4243                      * Then, if this character turns out to be unassigned or a fallback that
   4244                      * is not taken, the callback code must not save the new state in the converter
   4245                      * because the new state is for a character that is not output.
   4246                      * However, the callback must still restore the state from the converter
   4247                      * in case the callback function changed it for its output.
   4248                      */
   4249                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4250                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4251                     if(value<=0xff) {
   4252                         if(value==0) {
   4253                             goto unassigned;
   4254                         } else if(prevLength<=1) {
   4255                             length=1;
   4256                         } else {
   4257                             /* change from double-byte mode to single-byte */
   4258                             if (siLength == 1) {
   4259                                 value|=(uint32_t)siBytes[0]<<8;
   4260                                 length = 2;
   4261                             } else if (siLength == 2) {
   4262                                 value|=(uint32_t)siBytes[1]<<8;
   4263                                 value|=(uint32_t)siBytes[0]<<16;
   4264                                 length = 3;
   4265                             }
   4266                             prevLength=1;
   4267                         }
   4268                     } else {
   4269                         if(prevLength==2) {
   4270                             length=2;
   4271                         } else {
   4272                             /* change from single-byte mode to double-byte */
   4273                             if (soLength == 1) {
   4274                                 value|=(uint32_t)soBytes[0]<<16;
   4275                                 length = 3;
   4276                             } else if (soLength == 2) {
   4277                                 value|=(uint32_t)soBytes[1]<<16;
   4278                                 value|=(uint32_t)soBytes[0]<<24;
   4279                                 length = 4;
   4280                             }
   4281                             prevLength=2;
   4282                         }
   4283                     }
   4284                     break;
   4285                 case MBCS_OUTPUT_DBCS_ONLY:
   4286                     /* table with single-byte results, but only DBCS mappings used */
   4287                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4288                     if(value<=0xff) {
   4289                         /* no mapping or SBCS result, not taken for DBCS-only */
   4290                         goto unassigned;
   4291                     } else {
   4292                         length=2;
   4293                     }
   4294                     break;
   4295                 case MBCS_OUTPUT_3:
   4296                     p=bytes+(value+(c&0x3f))*3;
   4297                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4298                     if(value<=0xff) {
   4299                         if(value==0) {
   4300                             goto unassigned;
   4301                         } else {
   4302                             length=1;
   4303                         }
   4304                     } else if(value<=0xffff) {
   4305                         length=2;
   4306                     } else {
   4307                         length=3;
   4308                     }
   4309                     break;
   4310                 case MBCS_OUTPUT_4:
   4311                     value=((const uint32_t *)bytes)[value +(c&0x3f)];
   4312                     if(value<=0xff) {
   4313                         if(value==0) {
   4314                             goto unassigned;
   4315                         } else {
   4316                             length=1;
   4317                         }
   4318                     } else if(value<=0xffff) {
   4319                         length=2;
   4320                     } else if(value<=0xffffff) {
   4321                         length=3;
   4322                     } else {
   4323                         length=4;
   4324                     }
   4325                     break;
   4326                 case MBCS_OUTPUT_3_EUC:
   4327                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4328                     /* EUC 16-bit fixed-length representation */
   4329                     if(value<=0xff) {
   4330                         if(value==0) {
   4331                             goto unassigned;
   4332                         } else {
   4333                             length=1;
   4334                         }
   4335                     } else if((value&0x8000)==0) {
   4336                         value|=0x8e8000;
   4337                         length=3;
   4338                     } else if((value&0x80)==0) {
   4339                         value|=0x8f0080;
   4340                         length=3;
   4341                     } else {
   4342                         length=2;
   4343                     }
   4344                     break;
   4345                 case MBCS_OUTPUT_4_EUC:
   4346                     p=bytes+(value+(c&0x3f))*3;
   4347                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4348                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4349                     if(value<=0xff) {
   4350                         if(value==0) {
   4351                             goto unassigned;
   4352                         } else {
   4353                             length=1;
   4354                         }
   4355                     } else if(value<=0xffff) {
   4356                         length=2;
   4357                     } else if((value&0x800000)==0) {
   4358                         value|=0x8e800000;
   4359                         length=4;
   4360                     } else if((value&0x8000)==0) {
   4361                         value|=0x8f008000;
   4362                         length=4;
   4363                     } else {
   4364                         length=3;
   4365                     }
   4366                     break;
   4367                 default:
   4368                     /* must not occur */
   4369                     /*
   4370                      * To avoid compiler warnings that value & length may be
   4371                      * used without having been initialized, we set them here.
   4372                      * In reality, this is unreachable code.
   4373                      * Not having a default branch also causes warnings with
   4374                      * some compilers.
   4375                      */
   4376                     value=0;
   4377                     length=0;
   4378                     break;
   4379                 }
   4380                 /* output the value */
   4381             } else {
   4382                 /*
   4383                  * This also tests if the codepage maps single surrogates.
   4384                  * If it does, then surrogates are not paired but mapped separately.
   4385                  * Note that in this case unmatched surrogates are not detected.
   4386                  */
   4387                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4388                     if(U16_IS_SURROGATE_LEAD(c)) {
   4389 getTrail:
   4390                         if(source<sourceLimit) {
   4391                             /* test the following code unit */
   4392                             UChar trail=*source;
   4393                             if(U16_IS_TRAIL(trail)) {
   4394                                 ++source;
   4395                                 ++nextSourceIndex;
   4396                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   4397                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4398                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4399                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4400                                     /* callback(unassigned) */
   4401                                     goto unassigned;
   4402                                 }
   4403                                 /* convert this supplementary code point */
   4404                                 /* exit this condition tree */
   4405                             } else {
   4406                                 /* this is an unmatched lead code unit (1st surrogate) */
   4407                                 /* callback(illegal) */
   4408                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4409                                 break;
   4410                             }
   4411                         } else {
   4412                             /* no more input */
   4413                             break;
   4414                         }
   4415                     } else {
   4416                         /* this is an unmatched trail code unit (2nd surrogate) */
   4417                         /* callback(illegal) */
   4418                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4419                         break;
   4420                     }
   4421                 }
   4422 
   4423                 /* convert the Unicode code point in c into codepage bytes */
   4424 
   4425                 /*
   4426                  * The basic lookup is a triple-stage compact array (trie) lookup.
   4427                  * For details see the beginning of this file.
   4428                  *
   4429                  * Single-byte codepages are handled with a different data structure
   4430                  * by _MBCSSingle... functions.
   4431                  *
   4432                  * The result consists of a 32-bit value from stage 2 and
   4433                  * a pointer to as many bytes as are stored per character.
   4434                  * The pointer points to the character's bytes in stage 3.
   4435                  * Bits 15..0 of the stage 2 entry contain the stage 3 index
   4436                  * for that pointer, while bits 31..16 are flags for which of
   4437                  * the 16 characters in the block are roundtrip-assigned.
   4438                  *
   4439                  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
   4440                  * and uint32_t, respectively, in the platform encoding.
   4441                  * For 3-byte codepages, the bytes are always stored in big-endian order.
   4442                  *
   4443                  * For EUC encodings that use only either 0x8e or 0x8f as the first
   4444                  * byte of their longest byte sequences, the first two bytes in
   4445                  * this third stage indicate with their 7th bits whether these bytes
   4446                  * are to be written directly or actually need to be preceded by
   4447                  * one of the two Single-Shift codes. With this, the third stage
   4448                  * stores one byte fewer per character than the actual maximum length of
   4449                  * EUC byte sequences.
   4450                  *
   4451                  * Other than that, leading zero bytes are removed and the other
   4452                  * bytes output. A single zero byte may be output if the "assigned"
   4453                  * bit in stage 2 was on.
   4454                  * The data structure does not support zero byte output as a fallback,
   4455                  * and also does not allow output of leading zeros.
   4456                  */
   4457                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4458 
   4459                 /* get the bytes and the length for the output */
   4460                 switch(outputType) {
   4461                 case MBCS_OUTPUT_2:
   4462                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4463                     if(value<=0xff) {
   4464                         length=1;
   4465                     } else {
   4466                         length=2;
   4467                     }
   4468                     break;
   4469                 case MBCS_OUTPUT_2_SISO:
   4470                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4471                     /*
   4472                      * Save the old state in the converter object
   4473                      * right here, then change the local prevLength state variable if necessary.
   4474                      * Then, if this character turns out to be unassigned or a fallback that
   4475                      * is not taken, the callback code must not save the new state in the converter
   4476                      * because the new state is for a character that is not output.
   4477                      * However, the callback must still restore the state from the converter
   4478                      * in case the callback function changed it for its output.
   4479                      */
   4480                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4481                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4482                     if(value<=0xff) {
   4483                         if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
   4484                             /* no mapping, leave value==0 */
   4485                             length=0;
   4486                         } else if(prevLength<=1) {
   4487                             length=1;
   4488                         } else {
   4489                             /* change from double-byte mode to single-byte */
   4490                             if (siLength == 1) {
   4491                                 value|=(uint32_t)siBytes[0]<<8;
   4492                                 length = 2;
   4493                             } else if (siLength == 2) {
   4494                                 value|=(uint32_t)siBytes[1]<<8;
   4495                                 value|=(uint32_t)siBytes[0]<<16;
   4496                                 length = 3;
   4497                             }
   4498                             prevLength=1;
   4499                         }
   4500                     } else {
   4501                         if(prevLength==2) {
   4502                             length=2;
   4503                         } else {
   4504                             /* change from single-byte mode to double-byte */
   4505                             if (soLength == 1) {
   4506                                 value|=(uint32_t)soBytes[0]<<16;
   4507                                 length = 3;
   4508                             } else if (soLength == 2) {
   4509                                 value|=(uint32_t)soBytes[1]<<16;
   4510                                 value|=(uint32_t)soBytes[0]<<24;
   4511                                 length = 4;
   4512                             }
   4513                             prevLength=2;
   4514                         }
   4515                     }
   4516                     break;
   4517                 case MBCS_OUTPUT_DBCS_ONLY:
   4518                     /* table with single-byte results, but only DBCS mappings used */
   4519                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4520                     if(value<=0xff) {
   4521                         /* no mapping or SBCS result, not taken for DBCS-only */
   4522                         value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4523                         length=0;
   4524                     } else {
   4525                         length=2;
   4526                     }
   4527                     break;
   4528                 case MBCS_OUTPUT_3:
   4529                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4530                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4531                     if(value<=0xff) {
   4532                         length=1;
   4533                     } else if(value<=0xffff) {
   4534                         length=2;
   4535                     } else {
   4536                         length=3;
   4537                     }
   4538                     break;
   4539                 case MBCS_OUTPUT_4:
   4540                     value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
   4541                     if(value<=0xff) {
   4542                         length=1;
   4543                     } else if(value<=0xffff) {
   4544                         length=2;
   4545                     } else if(value<=0xffffff) {
   4546                         length=3;
   4547                     } else {
   4548                         length=4;
   4549                     }
   4550                     break;
   4551                 case MBCS_OUTPUT_3_EUC:
   4552                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4553                     /* EUC 16-bit fixed-length representation */
   4554                     if(value<=0xff) {
   4555                         length=1;
   4556                     } else if((value&0x8000)==0) {
   4557                         value|=0x8e8000;
   4558                         length=3;
   4559                     } else if((value&0x80)==0) {
   4560                         value|=0x8f0080;
   4561                         length=3;
   4562                     } else {
   4563                         length=2;
   4564                     }
   4565                     break;
   4566                 case MBCS_OUTPUT_4_EUC:
   4567                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4568                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4569                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4570                     if(value<=0xff) {
   4571                         length=1;
   4572                     } else if(value<=0xffff) {
   4573                         length=2;
   4574                     } else if((value&0x800000)==0) {
   4575                         value|=0x8e800000;
   4576                         length=4;
   4577                     } else if((value&0x8000)==0) {
   4578                         value|=0x8f008000;
   4579                         length=4;
   4580                     } else {
   4581                         length=3;
   4582                     }
   4583                     break;
   4584                 default:
   4585                     /* must not occur */
   4586                     /*
   4587                      * To avoid compiler warnings that value & length may be
   4588                      * used without having been initialized, we set them here.
   4589                      * In reality, this is unreachable code.
   4590                      * Not having a default branch also causes warnings with
   4591                      * some compilers.
   4592                      */
   4593                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4594                     length=0;
   4595                     break;
   4596                 }
   4597 
   4598                 /* is this code point assigned, or do we use fallbacks? */
   4599                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
   4600                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   4601                 ) {
   4602                     /*
   4603                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4604                      * There is no way with this data structure for fallback output
   4605                      * to be a zero byte.
   4606                      */
   4607 
   4608 unassigned:
   4609                     /* try an extension mapping */
   4610                     pArgs->source=source;
   4611                     c=_extFromU(cnv, cnv->sharedData,
   4612                                 c, &source, sourceLimit,
   4613                                 &target, target+targetCapacity,
   4614                                 &offsets, sourceIndex,
   4615                                 pArgs->flush,
   4616                                 pErrorCode);
   4617                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   4618                     prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
   4619 
   4620                     if(U_FAILURE(*pErrorCode)) {
   4621                         /* not mappable or buffer overflow */
   4622                         break;
   4623                     } else {
   4624                         /* a mapping was written to the target, continue */
   4625 
   4626                         /* recalculate the targetCapacity after an extension mapping */
   4627                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   4628 
   4629                         /* normal end of conversion: prepare for a new character */
   4630                         if(offsets!=NULL) {
   4631                             prevSourceIndex=sourceIndex;
   4632                             sourceIndex=nextSourceIndex;
   4633                         }
   4634                         continue;
   4635                     }
   4636                 }
   4637             }
   4638 
   4639             /* write the output character bytes from value and length */
   4640             /* from the first if in the loop we know that targetCapacity>0 */
   4641             if(length<=targetCapacity) {
   4642                 if(offsets==NULL) {
   4643                     switch(length) {
   4644                         /* each branch falls through to the next one */
   4645                     case 4:
   4646                         *target++=(uint8_t)(value>>24);
   4647                         U_FALLTHROUGH;
   4648                     case 3:
   4649                         *target++=(uint8_t)(value>>16);
   4650                         U_FALLTHROUGH;
   4651                     case 2:
   4652                         *target++=(uint8_t)(value>>8);
   4653                         U_FALLTHROUGH;
   4654                     case 1:
   4655                         *target++=(uint8_t)value;
   4656                         U_FALLTHROUGH;
   4657                     default:
   4658                         /* will never occur */
   4659                         break;
   4660                     }
   4661                 } else {
   4662                     switch(length) {
   4663                         /* each branch falls through to the next one */
   4664                     case 4:
   4665                         *target++=(uint8_t)(value>>24);
   4666                         *offsets++=sourceIndex;
   4667                         U_FALLTHROUGH;
   4668                     case 3:
   4669                         *target++=(uint8_t)(value>>16);
   4670                         *offsets++=sourceIndex;
   4671                         U_FALLTHROUGH;
   4672                     case 2:
   4673                         *target++=(uint8_t)(value>>8);
   4674                         *offsets++=sourceIndex;
   4675                         U_FALLTHROUGH;
   4676                     case 1:
   4677                         *target++=(uint8_t)value;
   4678                         *offsets++=sourceIndex;
   4679                         U_FALLTHROUGH;
   4680                     default:
   4681                         /* will never occur */
   4682                         break;
   4683                     }
   4684                 }
   4685                 targetCapacity-=length;
   4686             } else {
   4687                 uint8_t *charErrorBuffer;
   4688 
   4689                 /*
   4690                  * We actually do this backwards here:
   4691                  * In order to save an intermediate variable, we output
   4692                  * first to the overflow buffer what does not fit into the
   4693                  * regular target.
   4694                  */
   4695                 /* we know that 1<=targetCapacity<length<=4 */
   4696                 length-=targetCapacity;
   4697                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   4698                 switch(length) {
   4699                     /* each branch falls through to the next one */
   4700                 case 3:
   4701                     *charErrorBuffer++=(uint8_t)(value>>16);
   4702                     U_FALLTHROUGH;
   4703                 case 2:
   4704                     *charErrorBuffer++=(uint8_t)(value>>8);
   4705                     U_FALLTHROUGH;
   4706                 case 1:
   4707                     *charErrorBuffer=(uint8_t)value;
   4708                     U_FALLTHROUGH;
   4709                 default:
   4710                     /* will never occur */
   4711                     break;
   4712                 }
   4713                 cnv->charErrorBufferLength=(int8_t)length;
   4714 
   4715                 /* now output what fits into the regular target */
   4716                 value>>=8*length; /* length was reduced by targetCapacity */
   4717                 switch(targetCapacity) {
   4718                     /* each branch falls through to the next one */
   4719                 case 3:
   4720                     *target++=(uint8_t)(value>>16);
   4721                     if(offsets!=NULL) {
   4722                         *offsets++=sourceIndex;
   4723                     }
   4724                     U_FALLTHROUGH;
   4725                 case 2:
   4726                     *target++=(uint8_t)(value>>8);
   4727                     if(offsets!=NULL) {
   4728                         *offsets++=sourceIndex;
   4729                     }
   4730                     U_FALLTHROUGH;
   4731                 case 1:
   4732                     *target++=(uint8_t)value;
   4733                     if(offsets!=NULL) {
   4734                         *offsets++=sourceIndex;
   4735                     }
   4736                     U_FALLTHROUGH;
   4737                 default:
   4738                     /* will never occur */
   4739                     break;
   4740                 }
   4741 
   4742                 /* target overflow */
   4743                 targetCapacity=0;
   4744                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4745                 c=0;
   4746                 break;
   4747             }
   4748 
   4749             /* normal end of conversion: prepare for a new character */
   4750             c=0;
   4751             if(offsets!=NULL) {
   4752                 prevSourceIndex=sourceIndex;
   4753                 sourceIndex=nextSourceIndex;
   4754             }
   4755             continue;
   4756         } else {
   4757             /* target is full */
   4758             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4759             break;
   4760         }
   4761     }
   4762 
   4763     /*
   4764      * the end of the input stream and detection of truncated input
   4765      * are handled by the framework, but for EBCDIC_STATEFUL conversion
   4766      * we need to emit an SI at the very end
   4767      *
   4768      * conditions:
   4769      *   successful
   4770      *   EBCDIC_STATEFUL in DBCS mode
   4771      *   end of input and no truncated input
   4772      */
   4773     if( U_SUCCESS(*pErrorCode) &&
   4774         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
   4775         pArgs->flush && source>=sourceLimit && c==0
   4776     ) {
   4777         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
   4778         if(targetCapacity>0) {
   4779             *target++=(uint8_t)siBytes[0];
   4780             if (siLength == 2) {
   4781                 if (targetCapacity<2) {
   4782                     cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
   4783                     cnv->charErrorBufferLength=1;
   4784                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4785                 } else {
   4786                     *target++=(uint8_t)siBytes[1];
   4787                 }
   4788             }
   4789             if(offsets!=NULL) {
   4790                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
   4791                 *offsets++=prevSourceIndex;
   4792             }
   4793         } else {
   4794             /* target is full */
   4795             cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
   4796             if (siLength == 2) {
   4797                 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
   4798             }
   4799             cnv->charErrorBufferLength=siLength;
   4800             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4801         }
   4802         prevLength=1; /* we switched into SBCS */
   4803     }
   4804 
   4805     /* set the converter state back into UConverter */
   4806     cnv->fromUChar32=c;
   4807     cnv->fromUnicodeStatus=prevLength;
   4808 
   4809     /* write back the updated pointers */
   4810     pArgs->source=source;
   4811     pArgs->target=(char *)target;
   4812     pArgs->offsets=offsets;
   4813 }
   4814 
   4815 /*
   4816  * This is another simple conversion function for internal use by other
   4817  * conversion implementations.
   4818  * It does not use the converter state nor call callbacks.
   4819  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4820  * It handles conversion extensions but not GB 18030.
   4821  *
   4822  * It converts one single Unicode code point into codepage bytes, encoded
   4823  * as one 32-bit value. The function returns the number of bytes in *pValue:
   4824  * 1..4 the number of bytes in *pValue
   4825  * 0    unassigned (*pValue undefined)
   4826  * -1   illegal (currently not used, *pValue undefined)
   4827  *
   4828  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
   4829  * the second to last byte in bits 15..8, etc.
   4830  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
   4831  */
   4832 U_CFUNC int32_t
   4833 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
   4834                  UChar32 c, uint32_t *pValue,
   4835                  UBool useFallback) {
   4836     const int32_t *cx;
   4837     const uint16_t *table;
   4838 #if 0
   4839 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4840     const uint8_t *p;
   4841 #endif
   4842     uint32_t stage2Entry;
   4843     uint32_t value;
   4844     int32_t length;
   4845 
   4846     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4847     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4848         table=sharedData->mbcs.fromUnicodeTable;
   4849 
   4850         /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4851         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
   4852             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4853             /* is this code point assigned, or do we use fallbacks? */
   4854             if(useFallback ? value>=0x800 : value>=0xc00) {
   4855                 *pValue=value&0xff;
   4856                 return 1;
   4857             }
   4858         } else /* outputType!=MBCS_OUTPUT_1 */ {
   4859             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4860 
   4861             /* get the bytes and the length for the output */
   4862             switch(sharedData->mbcs.outputType) {
   4863             case MBCS_OUTPUT_2:
   4864                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4865                 if(value<=0xff) {
   4866                     length=1;
   4867                 } else {
   4868                     length=2;
   4869                 }
   4870                 break;
   4871 #if 0
   4872 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4873             case MBCS_OUTPUT_DBCS_ONLY:
   4874                 /* table with single-byte results, but only DBCS mappings used */
   4875                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4876                 if(value<=0xff) {
   4877                     /* no mapping or SBCS result, not taken for DBCS-only */
   4878                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4879                     length=0;
   4880                 } else {
   4881                     length=2;
   4882                 }
   4883                 break;
   4884             case MBCS_OUTPUT_3:
   4885                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4886                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4887                 if(value<=0xff) {
   4888                     length=1;
   4889                 } else if(value<=0xffff) {
   4890                     length=2;
   4891                 } else {
   4892                     length=3;
   4893                 }
   4894                 break;
   4895             case MBCS_OUTPUT_4:
   4896                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4897                 if(value<=0xff) {
   4898                     length=1;
   4899                 } else if(value<=0xffff) {
   4900                     length=2;
   4901                 } else if(value<=0xffffff) {
   4902                     length=3;
   4903                 } else {
   4904                     length=4;
   4905                 }
   4906                 break;
   4907             case MBCS_OUTPUT_3_EUC:
   4908                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4909                 /* EUC 16-bit fixed-length representation */
   4910                 if(value<=0xff) {
   4911                     length=1;
   4912                 } else if((value&0x8000)==0) {
   4913                     value|=0x8e8000;
   4914                     length=3;
   4915                 } else if((value&0x80)==0) {
   4916                     value|=0x8f0080;
   4917                     length=3;
   4918                 } else {
   4919                     length=2;
   4920                 }
   4921                 break;
   4922             case MBCS_OUTPUT_4_EUC:
   4923                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4924                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4925                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4926                 if(value<=0xff) {
   4927                     length=1;
   4928                 } else if(value<=0xffff) {
   4929                     length=2;
   4930                 } else if((value&0x800000)==0) {
   4931                     value|=0x8e800000;
   4932                     length=4;
   4933                 } else if((value&0x8000)==0) {
   4934                     value|=0x8f008000;
   4935                     length=4;
   4936                 } else {
   4937                     length=3;
   4938                 }
   4939                 break;
   4940 #endif
   4941             default:
   4942                 /* must not occur */
   4943                 return -1;
   4944             }
   4945 
   4946             /* is this code point assigned, or do we use fallbacks? */
   4947             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   4948                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
   4949             ) {
   4950                 /*
   4951                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4952                  * There is no way with this data structure for fallback output
   4953                  * to be a zero byte.
   4954                  */
   4955                 /* assigned */
   4956                 *pValue=value;
   4957                 return length;
   4958             }
   4959         }
   4960     }
   4961 
   4962     cx=sharedData->mbcs.extIndexes;
   4963     if(cx!=NULL) {
   4964         length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
   4965         return length>=0 ? length : -length;  /* return abs(length); */
   4966     }
   4967 
   4968     /* unassigned */
   4969     return 0;
   4970 }
   4971 
   4972 
   4973 #if 0
   4974 /*
   4975  * This function has been moved to ucnv2022.c for inlining.
   4976  * This implementation is here only for documentation purposes
   4977  */
   4978 
   4979 /**
   4980  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
   4981  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4982  * It does not handle conversion extensions (_extFromU()).
   4983  *
   4984  * It returns the codepage byte for the code point, or -1 if it is unassigned.
         *
         * @param sharedData  converter data; this function reads only mbcs.unicodeMask,
         *                    mbcs.fromUnicodeTable and mbcs.fromUnicodeBytes
         * @param c           the Unicode code point to look up
         * @param useFallback if true, lookup results >=0x800 are accepted;
         *                    if false, only lookup results >=0xc00 are accepted
         * @return the low 8 bits of the lookup result, or -1 if the result is below
         *         the applicable threshold, or if c>=0x10000 and unicodeMask does not
         *         have UCNV_HAS_SUPPLEMENTARY set
   4985  */
   4986 U_CFUNC int32_t
   4987 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
   4988                        UChar32 c,
   4989                        UBool useFallback) {
   4990     const uint16_t *table;
   4991     int32_t value;
   4992 
   4993     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4994     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4995         return -1;
   4996     }
   4997 
   4998     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4999     table=sharedData->mbcs.fromUnicodeTable;
   5000 
   5001     /* get the byte for the output */
   5002     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   5003     /* is this code point assigned, or do we use fallbacks? */
            /*
             * These are the same two thresholds that ucnv_SBCSFromUTF8() stores in minValue:
             * 0x800 accepts roundtrip and fallback results, 0xc00 is the stricter limit
             * used when fallbacks are not requested.
             */
   5004     if(useFallback ? value>=0x800 : value>=0xc00) {
                /* only the low byte of the 16-bit result is the output byte */
   5005         return value&0xff;
   5006     } else {
   5007         return -1;
   5008     }
   5009 }
   5010 #endif
   5011 
   5012 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
   5013 
   5014 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
        /*
         * Indexed by the byte length of a UTF-8 sequence.
         * The from-UTF-8 functions below require that a code point decoded from an
         * n-byte sequence is >=utf8_minLegal[n]; a smaller value is treated there as
         * an illegal byte sequence.
         */
   5015 static const UChar32
   5016 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
   5017 
   5018 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
        /*
         * Indexed by the byte length of a UTF-8 sequence.
         * The from-UTF-8 functions below accumulate the raw bytes with c=(c<<6)+b and
         * then subtract utf8_offsets[n] to obtain the code point.
         * For example, 0x3080==(0xC0<<6)+0x80: the fixed high bits of a 2-byte lead
         * byte and of one trail byte.
         *
         * NOTE(review): the array is declared with 7 elements but has only 5
         * initializers, so elements 5 and 6 are implicitly zero. The code visible
         * here only uses indexes 2..4 - confirm there is no other user before
         * changing the declared size.
         */
   5019 static const UChar32
   5020 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
   5021 
   /*
    * Converts UTF-8 input directly to a single-byte (SBCS) codepage,
    * without pivoting through UTF-16.
    *
    * pToUArgs supplies the UTF-8 side: source/sourceLimit are read, and its
    * converter (utf8) holds the state of a byte sequence that is split across
    * buffers (toUBytes, toULength, toUnicodeStatus, mode).
    * pFromUArgs supplies the codepage side: target/targetLimit, the flush flag,
    * and its converter (cnv) whose shared data provides the lookup tables.
    *
    * On every return path pToUArgs->source and pFromUArgs->target are updated
    * to the positions that were reached. *pErrorCode is set as follows:
    * - U_BUFFER_OVERFLOW_ERROR: input remains but the target capacity is used up
    * - U_ILLEGAL_CHAR_FOUND: illegal UTF-8 byte sequence; its bytes are stored in
    *   utf8->toUBytes and their count in utf8->toULength
    * - U_USING_DEFAULT_WARNING: the extension lookup left a partial match
    *   (cnv->preFromUFirstCP>=0), so that the caller reverts to pivoting
    * - a failure code set by _extFromU() is left in place, and the code point
    *   that _extFromU() returned is saved in cnv->fromUChar32
    */
   5022 static void U_CALLCONV
   5023 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5024                   UConverterToUnicodeArgs *pToUArgs,
   5025                   UErrorCode *pErrorCode) {
   5026     UConverter *utf8, *cnv;
   5027     const uint8_t *source, *sourceLimit;
   5028     uint8_t *target;
   5029     int32_t targetCapacity;
   5030 
   5031     const uint16_t *table, *sbcsIndex;
   5032     const uint16_t *results;
   5033 
            /*
             * toULimit: total byte length of the sequence that is being collected
             * toULength: number of its bytes collected so far
             * oldToULength: number of those bytes that came from a previous buffer
             *               and are already stored in utf8->toUBytes
             */
   5034     int8_t oldToULength, toULength, toULimit;
   5035 
   5036     UChar32 c;
   5037     uint8_t b, t1, t2;
   5038 
   5039     uint32_t asciiRoundtrips;
   5040     uint16_t value, minValue;
   5041     UBool hasSupplementary;
   5042 
   5043     /* set up the local pointers */
   5044     utf8=pToUArgs->converter;
   5045     cnv=pFromUArgs->converter;
   5046     source=(uint8_t *)pToUArgs->source;
   5047     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5048     target=(uint8_t *)pFromUArgs->target;
   5049     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5050 
   5051     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5052     sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
            /* with the swap-LF/NL option, look up results in the alternate result array */
   5053     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5054         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5055     } else {
   5056         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5057     }
   5058     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5059 
            /* minValue is the smallest lookup result that is accepted as a mapping */
   5060     if(cnv->useFallback) {
   5061         /* use all roundtrip and fallback results */
   5062         minValue=0x800;
   5063     } else {
   5064         /* use only roundtrips and fallbacks from private-use characters */
   5065         minValue=0xc00;
   5066     }
   5067     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5068 
   5069     /* get the converter state from the UTF-8 UConverter */
            /* a non-zero toUnicodeStatus means that a partial sequence was stored by a previous call */
   5070     c=(UChar32)utf8->toUnicodeStatus;
   5071     if(c!=0) {
   5072         toULength=oldToULength=utf8->toULength;
   5073         toULimit=(int8_t)utf8->mode;
   5074     } else {
   5075         toULength=oldToULength=toULimit=0;
   5076     }
   5077 
   5078     /*
   5079      * Make sure that the last byte sequence before sourceLimit is complete
   5080      * or runs into a lead byte.
   5081      * Do not go back into the bytes that will be read for finishing a partial
   5082      * sequence from the previous buffer.
   5083      * In the conversion loop compare source with sourceLimit only once
   5084      * per multi-byte character.
   5085      */
   5086     {
   5087         int32_t i, length;
   5088 
                /* length excludes the bytes that are still needed to finish a stored partial sequence */
   5089         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
                /* look back over at most 3 bytes; i counts the trail bytes seen at the end */
   5090         for(i=0; i<3 && i<length;) {
   5091             b=*(sourceLimit-i-1);
   5092             if(U8_IS_TRAIL(b)) {
   5093                 ++i;
   5094             } else {
   5095                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
   5096                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   5097                     sourceLimit-=i+1;
   5098                 }
   5099                 break;
   5100             }
   5101         }
   5102     }
   5103 
            /* continue a partial sequence from the previous buffer, but only if there is room for output */
   5104     if(c!=0 && targetCapacity>0) {
   5105         utf8->toUnicodeStatus=0;
   5106         utf8->toULength=0;
   5107         goto moreBytes;
   5108         /*
   5109          * Note: We could avoid the goto by duplicating some of the moreBytes
   5110          * code, but only up to the point of collecting a complete UTF-8
   5111          * sequence; then recurse for the toUBytes[toULength]
   5112          * and then continue with normal conversion.
   5113          *
   5114          * If so, move this code to just after initializing the minimum
   5115          * set of local variables for reading the UTF-8 input
   5116          * (utf8, source, target, limits but not cnv, table, minValue, etc.).
   5117          *
   5118          * Potential advantages:
   5119          * - avoid the goto
   5120          * - oldToULength could become a local variable in just those code blocks
   5121          *   that deal with buffer boundaries
   5122          * - possibly faster if the goto prevents some compiler optimizations
   5123          *   (this would need measuring to confirm)
   5124          * Disadvantage:
   5125          * - code duplication
   5126          */
   5127     }
   5128 
   5129     /* conversion loop */
   5130     while(source<sourceLimit) {
   5131         if(targetCapacity>0) {
   5132             b=*source++;
                    /* bytes 0..0x7f are non-negative when viewed as int8_t */
   5133             if((int8_t)b>=0) {
   5134                 /* convert ASCII */
   5135                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   5136                     *target++=(uint8_t)b;
   5137                     --targetCapacity;
   5138                     continue;
   5139                 } else {
   5140                     c=b;
   5141                     value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
   5142                 }
   5143             } else {
   5144                 if(b<0xe0) {
   5145                     if( /* handle U+0080..U+07FF inline */
                                /* (0xc2&0x1f)<<6 is 0x80, the lowest code point of this range */
   5146                         b>=0xc2 &&
   5147                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5148                     ) {
   5149                         c=b&0x1f;
   5150                         ++source;
   5151                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
   5152                         if(value>=minValue) {
   5153                             *target++=(uint8_t)value;
   5154                             --targetCapacity;
   5155                             continue;
   5156                         } else {
                                    /* no direct result: assemble the code point for the extension lookup below */
   5157                             c=(c<<6)|t1;
   5158                         }
   5159                     } else {
                                /* not handled inline: c<0 selects the general path below */
   5160                         c=-1;
   5161                     }
   5162                 } else if(b==0xe0) {
   5163                     if( /* handle U+0800..U+0FFF inline */
                                /* t1>=0x20 keeps the result at U+0800 or above: 0x20<<6 is 0x800 */
   5164                         (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
   5165                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5166                     ) {
   5167                         c=t1;
   5168                         source+=2;
   5169                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
   5170                         if(value>=minValue) {
   5171                             *target++=(uint8_t)value;
   5172                             --targetCapacity;
   5173                             continue;
   5174                         } else {
   5175                             c=(c<<6)|t2;
   5176                         }
   5177                     } else {
   5178                         c=-1;
   5179                     }
   5180                 } else {
   5181                     c=-1;
   5182                 }
   5183 
   5184                 if(c<0) {
   5185                     /* handle "complicated" and error cases, and continuing partial characters */
   5186                     oldToULength=0;
   5187                     toULength=1;
   5188                     toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5189                     c=b;
                            /* also entered via goto from before the loop, with c and the lengths taken from the utf8 converter state */
   5190 moreBytes:
   5191                     while(toULength<toULimit) {
   5192                         /*
   5193                          * The sourceLimit may have been adjusted before the conversion loop
   5194                          * to stop before a truncated sequence.
   5195                          * Here we need to use the real limit in case we have two truncated
   5196                          * sequences at the end.
   5197                          * See ticket #7492.
   5198                          */
   5199                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5200                             b=*source;
   5201                             if(U8_IS_TRAIL(b)) {
   5202                                 ++source;
   5203                                 ++toULength;
                                        /* accumulate the raw byte; the marker bits are subtracted later via utf8_offsets[] */
   5204                                 c=(c<<6)+b;
   5205                             } else {
   5206                                 break; /* sequence too short, stop with toULength<toULimit */
   5207                             }
   5208                         } else {
   5209                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                                    /* go back only over the bytes that were read from this buffer */
   5210                             source-=(toULength-oldToULength);
   5211                             while(oldToULength<toULength) {
   5212                                 utf8->toUBytes[oldToULength++]=*source++;
   5213                             }
   5214                             utf8->toUnicodeStatus=c;
   5215                             utf8->toULength=toULength;
   5216                             utf8->mode=toULimit;
   5217                             pToUArgs->source=(char *)source;
   5218                             pFromUArgs->target=(char *)target;
   5219                             return;
   5220                         }
   5221                     }
   5222 
                            /*
                             * Subtracting utf8_offsets[] turns the accumulated raw bytes into the code point.
                             * A code point below utf8_minLegal[] for this length fails the test
                             * and ends up in the illegal-sequence branch below.
                             */
   5223                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5224                         (toULength==3 || toULength==2) &&             /* BMP */
   5225                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5226                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5227                     ) {
   5228                         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5229                     } else if(
   5230                         toULength==toULimit && toULength==4 &&
   5231                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5232                     ) {
   5233                         /* supplementary code point */
   5234                         if(!hasSupplementary) {
   5235                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                    /* 0 is below minValue, so this takes the unassigned path below */
   5236                             value=0;
   5237                         } else {
   5238                             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5239                         }
   5240                     } else {
   5241                         /* error handling: illegal UTF-8 byte sequence */
   5242                         source-=(toULength-oldToULength);
   5243                         while(oldToULength<toULength) {
   5244                             utf8->toUBytes[oldToULength++]=*source++;
   5245                         }
   5246                         utf8->toULength=toULength;
   5247                         pToUArgs->source=(char *)source;
   5248                         pFromUArgs->target=(char *)target;
   5249                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5250                         return;
   5251                     }
   5252                 }
   5253             }
   5254 
   5255             if(value>=minValue) {
   5256                 /* output the mapping for c */
   5257                 *target++=(uint8_t)value;
   5258                 --targetCapacity;
   5259             } else {
   5260                 /* value<minValue means c is unassigned (unmappable) */
   5261                 /*
   5262                  * Try an extension mapping.
   5263                  * Pass in no source because we don't have UTF-16 input.
   5264                  * If we have a partial match on c, we will return and revert
   5265                  * to UTF-8->UTF-16->charset conversion.
   5266                  */
   5267                 static const UChar nul=0;
   5268                 const UChar *noSource=&nul;
   5269                 c=_extFromU(cnv, cnv->sharedData,
   5270                             c, &noSource, noSource,
   5271                             &target, target+targetCapacity,
   5272                             NULL, -1,
   5273                             pFromUArgs->flush,
   5274                             pErrorCode);
   5275 
   5276                 if(U_FAILURE(*pErrorCode)) {
   5277                     /* not mappable or buffer overflow */
   5278                     cnv->fromUChar32=c;
   5279                     break;
   5280                 } else if(cnv->preFromUFirstCP>=0) {
   5281                     /*
   5282                      * Partial match, return and revert to pivoting.
   5283                      * In normal from-UTF-16 conversion, we would just continue
   5284                      * but then exit the loop because the extension match would
   5285                      * have consumed the source.
   5286                      */
   5287                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5288                     break;
   5289                 } else {
   5290                     /* a mapping was written to the target, continue */
   5291 
   5292                     /* recalculate the targetCapacity after an extension mapping */
   5293                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5294                 }
   5295             }
   5296         } else {
   5297             /* target is full */
   5298             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5299             break;
   5300         }
   5301     }
   5302 
   5303     /*
   5304      * The sourceLimit may have been adjusted before the conversion loop
   5305      * to stop before a truncated sequence.
   5306      * If so, then collect the truncated sequence now.
   5307      */
            /* this also restores sourceLimit to the real limit from pToUArgs */
   5308     if(U_SUCCESS(*pErrorCode) &&
   5309             cnv->preFromUFirstCP<0 &&
   5310             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5311         c=utf8->toUBytes[0]=b=*source++;
   5312         toULength=1;
   5313         toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                /* all remaining bytes up to the real limit are stored as part of this sequence */
   5314         while(source<sourceLimit) {
   5315             utf8->toUBytes[toULength++]=b=*source++;
   5316             c=(c<<6)+b;
   5317         }
   5318         utf8->toUnicodeStatus=c;
   5319         utf8->toULength=toULength;
   5320         utf8->mode=toULimit;
   5321     }
   5322 
   5323     /* write back the updated pointers */
   5324     pToUArgs->source=(char *)source;
   5325     pFromUArgs->target=(char *)target;
   5326 }
   5327 
   5328 static void U_CALLCONV
   5329 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5330                   UConverterToUnicodeArgs *pToUArgs,
   5331                   UErrorCode *pErrorCode) {
   5332     UConverter *utf8, *cnv;
   5333     const uint8_t *source, *sourceLimit;
   5334     uint8_t *target;
   5335     int32_t targetCapacity;
   5336 
   5337     const uint16_t *table, *mbcsIndex;
   5338     const uint16_t *results;
   5339 
   5340     int8_t oldToULength, toULength, toULimit;
   5341 
   5342     UChar32 c;
   5343     uint8_t b, t1, t2;
   5344 
   5345     uint32_t stage2Entry;
   5346     uint32_t asciiRoundtrips;
   5347     uint16_t value;
   5348     UBool hasSupplementary;
   5349 
   5350     /* set up the local pointers */
   5351     utf8=pToUArgs->converter;
   5352     cnv=pFromUArgs->converter;
   5353     source=(uint8_t *)pToUArgs->source;
   5354     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5355     target=(uint8_t *)pFromUArgs->target;
   5356     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5357 
   5358     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5359     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   5360     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5361         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5362     } else {
   5363         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5364     }
   5365     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5366 
   5367     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5368 
   5369     /* get the converter state from the UTF-8 UConverter */
   5370     c=(UChar32)utf8->toUnicodeStatus;
   5371     if(c!=0) {
   5372         toULength=oldToULength=utf8->toULength;
   5373         toULimit=(int8_t)utf8->mode;
   5374     } else {
   5375         toULength=oldToULength=toULimit=0;
   5376     }
   5377 
   5378     /*
   5379      * Make sure that the last byte sequence before sourceLimit is complete
   5380      * or runs into a lead byte.
   5381      * Do not go back into the bytes that will be read for finishing a partial
   5382      * sequence from the previous buffer.
   5383      * In the conversion loop compare source with sourceLimit only once
   5384      * per multi-byte character.
   5385      */
   5386     {
   5387         int32_t i, length;
   5388 
   5389         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   5390         for(i=0; i<3 && i<length;) {
   5391             b=*(sourceLimit-i-1);
   5392             if(U8_IS_TRAIL(b)) {
   5393                 ++i;
   5394             } else {
   5395                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
   5396                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   5397                     sourceLimit-=i+1;
   5398                 }
   5399                 break;
   5400             }
   5401         }
   5402     }
   5403 
   5404     if(c!=0 && targetCapacity>0) {
   5405         utf8->toUnicodeStatus=0;
   5406         utf8->toULength=0;
   5407         goto moreBytes;
   5408         /* See note in ucnv_SBCSFromUTF8() about this goto. */
   5409     }
   5410 
   5411     /* conversion loop */
   5412     while(source<sourceLimit) {
   5413         if(targetCapacity>0) {
   5414             b=*source++;
   5415             if((int8_t)b>=0) {
   5416                 /* convert ASCII */
   5417                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   5418                     *target++=b;
   5419                     --targetCapacity;
   5420                     continue;
   5421                 } else {
   5422                     value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
   5423                     if(value==0) {
   5424                         c=b;
   5425                         goto unassigned;
   5426                     }
   5427                 }
   5428             } else {
   5429                 if(b>0xe0) {
   5430                     if( /* handle U+1000..U+D7FF inline */
   5431                         (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
   5432                                                         (b==0xed && (t1 <= 0x1f))) &&
   5433                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5434                     ) {
   5435                         c=((b&0xf)<<6)|t1;
   5436                         source+=2;
   5437                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
   5438                         if(value==0) {
   5439                             c=(c<<6)|t2;
   5440                             goto unassigned;
   5441                         }
   5442                     } else {
   5443                         c=-1;
   5444                     }
   5445                 } else if(b<0xe0) {
   5446                     if( /* handle U+0080..U+07FF inline */
   5447                         b>=0xc2 &&
   5448                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5449                     ) {
   5450                         c=b&0x1f;
   5451                         ++source;
   5452                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
   5453                         if(value==0) {
   5454                             c=(c<<6)|t1;
   5455                             goto unassigned;
   5456                         }
   5457                     } else {
   5458                         c=-1;
   5459                     }
   5460                 } else {
   5461                     c=-1;
   5462                 }
   5463 
   5464                 if(c<0) {
   5465                     /* handle "complicated" and error cases, and continuing partial characters */
   5466                     oldToULength=0;
   5467                     toULength=1;
   5468                     toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5469                     c=b;
   5470 moreBytes:
   5471                     while(toULength<toULimit) {
   5472                         /*
   5473                          * The sourceLimit may have been adjusted before the conversion loop
   5474                          * to stop before a truncated sequence.
   5475                          * Here we need to use the real limit in case we have two truncated
   5476                          * sequences at the end.
   5477                          * See ticket #7492.
   5478                          */
   5479                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5480                             b=*source;
   5481                             if(U8_IS_TRAIL(b)) {
   5482                                 ++source;
   5483                                 ++toULength;
   5484                                 c=(c<<6)+b;
   5485                             } else {
   5486                                 break; /* sequence too short, stop with toULength<toULimit */
   5487                             }
   5488                         } else {
   5489                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
   5490                             source-=(toULength-oldToULength);
   5491                             while(oldToULength<toULength) {
   5492                                 utf8->toUBytes[oldToULength++]=*source++;
   5493                             }
   5494                             utf8->toUnicodeStatus=c;
   5495                             utf8->toULength=toULength;
   5496                             utf8->mode=toULimit;
   5497                             pToUArgs->source=(char *)source;
   5498                             pFromUArgs->target=(char *)target;
   5499                             return;
   5500                         }
   5501                     }
   5502 
   5503                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5504                         (toULength==3 || toULength==2) &&             /* BMP */
   5505                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5506                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5507                     ) {
   5508                         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5509                     } else if(
   5510                         toULength==toULimit && toULength==4 &&
   5511                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5512                     ) {
   5513                         /* supplementary code point */
   5514                         if(!hasSupplementary) {
   5515                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5516                             stage2Entry=0;
   5517                         } else {
   5518                             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5519                         }
   5520                     } else {
   5521                         /* error handling: illegal UTF-8 byte sequence */
   5522                         source-=(toULength-oldToULength);
   5523                         while(oldToULength<toULength) {
   5524                             utf8->toUBytes[oldToULength++]=*source++;
   5525                         }
   5526                         utf8->toULength=toULength;
   5527                         pToUArgs->source=(char *)source;
   5528                         pFromUArgs->target=(char *)target;
   5529                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5530                         return;
   5531                     }
   5532 
   5533                     /* get the bytes and the length for the output */
   5534                     /* MBCS_OUTPUT_2 */
   5535                     value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
   5536 
   5537                     /* is this code point assigned, or do we use fallbacks? */
   5538                     if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   5539                          (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   5540                     ) {
   5541                         goto unassigned;
   5542                     }
   5543                 }
   5544             }
   5545 
   5546             /* write the output character bytes from value and length */
   5547             /* from the first if in the loop we know that targetCapacity>0 */
   5548             if(value<=0xff) {
   5549                 /* this is easy because we know that there is enough space */
   5550                 *target++=(uint8_t)value;
   5551                 --targetCapacity;
   5552             } else /* length==2 */ {
   5553                 *target++=(uint8_t)(value>>8);
   5554                 if(2<=targetCapacity) {
   5555                     *target++=(uint8_t)value;
   5556                     targetCapacity-=2;
   5557                 } else {
   5558                     cnv->charErrorBuffer[0]=(char)value;
   5559                     cnv->charErrorBufferLength=1;
   5560 
   5561                     /* target overflow */
   5562                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5563                     break;
   5564                 }
   5565             }
   5566             continue;
   5567 
   5568 unassigned:
   5569             {
   5570                 /*
   5571                  * Try an extension mapping.
   5572                  * Pass in no source because we don't have UTF-16 input.
   5573                  * If we have a partial match on c, we will return and revert
   5574                  * to UTF-8->UTF-16->charset conversion.
   5575                  */
   5576                 static const UChar nul=0;
   5577                 const UChar *noSource=&nul;
   5578                 c=_extFromU(cnv, cnv->sharedData,
   5579                             c, &noSource, noSource,
   5580                             &target, target+targetCapacity,
   5581                             NULL, -1,
   5582                             pFromUArgs->flush,
   5583                             pErrorCode);
   5584 
   5585                 if(U_FAILURE(*pErrorCode)) {
   5586                     /* not mappable or buffer overflow */
   5587                     cnv->fromUChar32=c;
   5588                     break;
   5589                 } else if(cnv->preFromUFirstCP>=0) {
   5590                     /*
   5591                      * Partial match, return and revert to pivoting.
   5592                      * In normal from-UTF-16 conversion, we would just continue
   5593                      * but then exit the loop because the extension match would
   5594                      * have consumed the source.
   5595                      */
   5596                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5597                     break;
   5598                 } else {
   5599                     /* a mapping was written to the target, continue */
   5600 
   5601                     /* recalculate the targetCapacity after an extension mapping */
   5602                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5603                     continue;
   5604                 }
   5605             }
   5606         } else {
   5607             /* target is full */
   5608             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5609             break;
   5610         }
   5611     }
   5612 
   5613     /*
   5614      * The sourceLimit may have been adjusted before the conversion loop
   5615      * to stop before a truncated sequence.
   5616      * If so, then collect the truncated sequence now.
   5617      */
   5618     if(U_SUCCESS(*pErrorCode) &&
   5619             cnv->preFromUFirstCP<0 &&
   5620             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5621         c=utf8->toUBytes[0]=b=*source++;
   5622         toULength=1;
   5623         toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5624         while(source<sourceLimit) {
   5625             utf8->toUBytes[toULength++]=b=*source++;
   5626             c=(c<<6)+b;
   5627         }
   5628         utf8->toUnicodeStatus=c;
   5629         utf8->toULength=toULength;
   5630         utf8->mode=toULimit;
   5631     }
   5632 
   5633     /* write back the updated pointers */
   5634     pToUArgs->source=(char *)source;
   5635     pFromUArgs->target=(char *)target;
   5636 }
   5637 
   5638 /* miscellaneous ------------------------------------------------------------ */
   5639 
   5640 static void U_CALLCONV
   5641 ucnv_MBCSGetStarters(const UConverter* cnv,
   5642                  UBool starters[256],
   5643                  UErrorCode *) {
   5644     const int32_t *state0;
   5645     int i;
   5646 
   5647     state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
   5648     for(i=0; i<256; ++i) {
   5649         /* all bytes that cause a state transition from state 0 are lead bytes */
   5650         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
   5651     }
   5652 }
   5653 
   5654 /*
   5655  * This is an internal function that allows other converter implementations
   5656  * to check whether a byte is a lead byte.
   5657  */
   5658 U_CFUNC UBool
   5659 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
   5660     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
   5661 }
   5662 
   5663 static void U_CALLCONV
   5664 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
   5665               int32_t offsetIndex,
   5666               UErrorCode *pErrorCode) {
   5667     UConverter *cnv=pArgs->converter;
   5668     char *p, *subchar;
   5669     char buffer[4];
   5670     int32_t length;
   5671 
   5672     /* first, select between subChar and subChar1 */
   5673     if( cnv->subChar1!=0 &&
   5674         (cnv->sharedData->mbcs.extIndexes!=NULL ?
   5675             cnv->useSubChar1 :
   5676             (cnv->invalidUCharBuffer[0]<=0xff))
   5677     ) {
   5678         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
   5679         subchar=(char *)&cnv->subChar1;
   5680         length=1;
   5681     } else {
   5682         /* select subChar in all other cases */
   5683         subchar=(char *)cnv->subChars;
   5684         length=cnv->subCharLen;
   5685     }
   5686 
   5687     /* reset the selector for the next code point */
   5688     cnv->useSubChar1=FALSE;
   5689 
   5690     if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
   5691         p=buffer;
   5692 
   5693         /* fromUnicodeStatus contains prevLength */
   5694         switch(length) {
   5695         case 1:
   5696             if(cnv->fromUnicodeStatus==2) {
   5697                 /* DBCS mode and SBCS sub char: change to SBCS */
   5698                 cnv->fromUnicodeStatus=1;
   5699                 *p++=UCNV_SI;
   5700             }
   5701             *p++=subchar[0];
   5702             break;
   5703         case 2:
   5704             if(cnv->fromUnicodeStatus<=1) {
   5705                 /* SBCS mode and DBCS sub char: change to DBCS */
   5706                 cnv->fromUnicodeStatus=2;
   5707                 *p++=UCNV_SO;
   5708             }
   5709             *p++=subchar[0];
   5710             *p++=subchar[1];
   5711             break;
   5712         default:
   5713             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   5714             return;
   5715         }
   5716         subchar=buffer;
   5717         length=(int32_t)(p-buffer);
   5718     }
   5719 
   5720     ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
   5721 }
   5722 
   5723 U_CFUNC UConverterType
   5724 ucnv_MBCSGetType(const UConverter* converter) {
   5725     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
   5726     if(converter->sharedData->mbcs.countStates==1) {
   5727         return (UConverterType)UCNV_SBCS;
   5728     } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
   5729         return (UConverterType)UCNV_EBCDIC_STATEFUL;
   5730     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
   5731         return (UConverterType)UCNV_DBCS;
   5732     }
   5733     return (UConverterType)UCNV_MBCS;
   5734 }
   5735 
   5736 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   5737