Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvmbcs.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000jul03
     14 *   created by: Markus W. Scherer
     15 *
     16 *   The current code in this file replaces the previous implementation
     17 *   of conversion code from multi-byte codepages to Unicode and back.
     18 *   This implementation supports the following:
     19 *   - legacy variable-length codepages with up to 4 bytes per character
     20 *   - all Unicode code points (up to 0x10ffff)
     21 *   - efficient distinction of unassigned vs. illegal byte sequences
     22 *   - it is possible in fromUnicode() to directly deal with simple
     23 *     stateful encodings (used for EBCDIC_STATEFUL)
     24 *   - it is possible to convert Unicode code points
     25 *     to a single zero byte (but not as a fallback except for SBCS)
     26 *
     27 *   Remaining limitations in fromUnicode:
     28 *   - byte sequences must not have leading zero bytes
     29 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
     30 *   - limitation to up to 4 bytes per character
     31 *
     32 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
     33 *   limitations and adds m:n character mappings and other features.
     34 *   See ucnv_ext.h for details.
     35 *
     36 *   Change history:
     37 *
     38 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
     39 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
     40 *                             macros to ucnvmbcs.h file
     41 */
     42 
     43 #include "unicode/utypes.h"
     44 
     45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     46 
     47 #include "unicode/ucnv.h"
     48 #include "unicode/ucnv_cb.h"
     49 #include "unicode/udata.h"
     50 #include "unicode/uset.h"
     51 #include "unicode/utf8.h"
     52 #include "unicode/utf16.h"
     53 #include "ucnv_bld.h"
     54 #include "ucnvmbcs.h"
     55 #include "ucnv_ext.h"
     56 #include "ucnv_cnv.h"
     57 #include "cmemory.h"
     58 #include "cstring.h"
     59 #include "umutex.h"
     60 
     61 /* control optimizations according to the platform */
     62 #define MBCS_UNROLL_SINGLE_TO_BMP 1
     63 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
     64 
     65 /*
     66  * _MBCSHeader versions 5.3 & 4.3
     67  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
     68  *
     69  * This version is optional. Version 5 is used for incompatible data format changes.
     70  * makeconv will continue to generate version 4 files if possible.
     71  *
     72  * Changes from version 4:
     73  *
     74  * The main difference is an additional _MBCSHeader field with
     75  * - the length (number of uint32_t) of the _MBCSHeader
     76  * - flags for further incompatible data format changes
     77  * - flags for further, backward compatible data format changes
     78  *
 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
     80  * the file and needs to be reconstituted at load time.
     81  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
     82  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
     83  * (For details about these structures see below, and see ucnvmbcs.h.)
     84  *
     85  *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
     86  *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
     87  *   precision markers for all mappings.)
     88  *
     89  *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
     90  *   omitted data that can be reconstituted from the toUnicode data.
     91  *
     92  *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
     93  *   With only roundtrip mappings in the base fromUnicode data, this part is fully
     94  *   redundant with the mbcsIndex and will be reconstituted from that (also using the
     95  *   stage 1 table which contains the information about how stage 2 was compacted).
     96  *
     97  *   The rest of the stage 2 table, the part for code points above maxFastUChar,
     98  *   is stored in the file and will be appended to the reconstituted part.
     99  *
 *   The entire fromUBytes array is omitted from the file and will be reconstituted.
    101  *   This is done by enumerating all toUnicode roundtrip mappings, performing
    102  *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
    103  *   writing instead of reading the byte values.
    104  *
    105  * _MBCSHeader version 4.3
    106  *
    107  * Change from version 4.2:
    108  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
    109  *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
    110  *   files which can be used instead of stages 1 & 2.
    111  *   Faster lookups for roundtrips from most commonly used characters,
    112  *   and lookups from UTF-8 byte sequences with a natural bit distribution.
    113  *   See ucnvmbcs.h for more details.
    114  *
    115  * Change from version 4.1:
    116  * - Added an optional extension table structure at the end of the .cnv file.
    117  *   It is present if the upper bits of the header flags field contains a non-zero
    118  *   byte offset to it.
    119  *   Files that contain only a conversion table and no base table
    120  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
    121  *   These contain the base table name between the MBCS header and the extension
    122  *   data.
    123  *
    124  * Change from version 4.0:
    125  * - Replace header.reserved with header.fromUBytesLength so that all
    126  *   fields in the data have length.
    127  *
    128  * Changes from version 3 (for performance improvements):
    129  * - new bit distribution for state table entries
    130  * - reordered action codes
    131  * - new data structure for single-byte fromUnicode
    132  *   + stage 2 only contains indexes
    133  *   + stage 3 stores 16 bits per character with classification bits 15..8
    134  * - no multiplier for stage 1 entries
    135  * - stage 2 for non-single-byte codepages contains the index and the flags in
    136  *   one 32-bit value
    137  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
    138  *
    139  * For more details about old versions of the MBCS data structure, see
    140  * the corresponding versions of this file.
    141  *
    142  * Converting stateless codepage data ---------------------------------------***
    143  * (or codepage data with simple states) to Unicode.
    144  *
    145  * Data structure and algorithm for converting from complex legacy codepages
    146  * to Unicode. (Designed before 2000-may-22.)
    147  *
    148  * The basic idea is that the structure of legacy codepages can be described
    149  * with state tables.
    150  * When reading a byte stream, each input byte causes a state transition.
    151  * Some transitions result in the output of a code point, some result in
    152  * "unassigned" or "illegal" output.
    153  * This is used here for character conversion.
    154  *
    155  * The data structure begins with a state table consisting of a row
    156  * per state, with 256 entries (columns) per row for each possible input
    157  * byte value.
    158  * Each entry is 32 bits wide, with two formats distinguished by
    159  * the sign bit (bit 31):
    160  *
    161  * One format for transitional entries (bit 31 not set) for non-final bytes, and
    162  * one format for final entries (bit 31 set).
    163  * Both formats contain the number of the next state in the same bit
    164  * positions.
    165  * State 0 is the initial state.
    166  *
    167  * Most of the time, the offset values of subsequent states are added
    168  * up to a scalar value. This value will eventually be the index of
    169  * the Unicode code point in a table that follows the state table.
    170  * The effect is that the code points for final state table rows
    171  * are contiguous. The code points of final state rows follow each other
    172  * in the order of the references to those final states by previous
    173  * states, etc.
    174  *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point, or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
    178  * For others, the code point in the Unicode table is stored with either
    179  * one or two code units: one for BMP code points, two for a pair of
    180  * surrogates.
    181  * All code points for a final state entry take up the same number of code
    182  * units, regardless of whether they all actually _use_ the same number
    183  * of code units. This is necessary for simple array access.
    184  *
    185  * An additional feature comes in with what in ICU is called "fallback"
    186  * mappings:
    187  *
    188  * In addition to round-trippable, precise, 1:1 mappings, there are often
    189  * mappings defined between similar, though not the same, characters.
    190  * Typically, such mappings occur only in fromUnicode mapping tables because
    191  * Unicode has a superset repertoire of most other codepages. However, it
    192  * is possible to provide such mappings in the toUnicode tables, too.
    193  * In this case, the fallback mappings are partly integrated into the
    194  * general state tables because the structure of the encoding includes their
    195  * byte sequences.
    196  * For final entries in an initial state, fallback mappings are stored in
    197  * the entry itself like with roundtrip mappings.
    198  * For other final entries, they are stored in the code units table if
    199  * the entry is for a pair of code units.
    200  * For single-unit results in the code units table, there is no space to
    201  * alternatively hold a fallback mapping; in this case, the code unit
    202  * is stored as U+fffe (unassigned), and the fallback mapping needs to
    203  * be looked up by the scalar offset value in a separate table.
    204  *
    205  * "Unassigned" state entries really mean "structurally unassigned",
    206  * i.e., such a byte sequence will never have a mapping result.
    207  *
    208  * The interpretation of the bits in each entry is as follows:
    209  *
    210  * Bit 31 not set, not a terminal entry ("transitional"):
    211  * 30..24 next state
    212  * 23..0  offset delta, to be added up
    213  *
    214  * Bit 31 set, terminal ("final") entry:
    215  * 30..24 next state (regardless of action code)
    216  * 23..20 action code:
    217  *        action codes 0 and 1 result in precise-mapping Unicode code points
    218  *        0  valid byte sequence
    219  *           19..16 not used, 0
    220  *           15..0  16-bit Unicode BMP code point
    221  *                  never U+fffe or U+ffff
    222  *        1  valid byte sequence
    223  *           19..0  20-bit Unicode supplementary code point
    224  *                  never U+fffe or U+ffff
    225  *
    226  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
    227  *        2  valid byte sequence (fallback)
    228  *           19..16 not used, 0
    229  *           15..0  16-bit Unicode BMP code point as fallback result
    230  *        3  valid byte sequence (fallback)
    231  *           19..0  20-bit Unicode supplementary code point as fallback result
    232  *
    233  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
    234  *        depending on the code units they result in
    235  *        4  valid byte sequence
    236  *           19..9  not used, 0
    237  *            8..0  final offset delta
    238  *                  pointing to one 16-bit code unit which may be
    239  *                  fffe  unassigned -- look for a fallback for this offset
    240  *                  ffff  illegal
    241  *        5  valid byte sequence
    242  *           19..9  not used, 0
    243  *            8..0  final offset delta
    244  *                  pointing to two 16-bit code units
    245  *                  (typically UTF-16 surrogates)
    246  *                  the result depends on the first code unit as follows:
    247  *                  0000..d7ff  roundtrip BMP code point (1st alone)
    248  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
    249  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
    250  *                  e000        roundtrip BMP code point (2nd alone)
    251  *                  e001        fallback BMP code point (2nd alone)
    252  *                  fffe        unassigned
    253  *                  ffff        illegal
    254  *           (the final offset deltas are at most 255 * 2,
    255  *            times 2 because of storing code unit pairs)
    256  *
    257  *        6  unassigned byte sequence
    258  *           19..16 not used, 0
    259  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
    260  *                  this does not contain a final offset delta because the main
    261  *                  purpose of this action code is to save scalar offset values;
    262  *                  therefore, fallback values cannot be assigned to byte
    263  *                  sequences that result in this action code
    264  *        7  illegal byte sequence
    265  *           19..16 not used, 0
    266  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
    267  *        8  state change only
    268  *           19..0  not used, 0
    269  *           useful for state changes in simple stateful encodings,
    270  *           at Shift-In/Shift-Out codes
    271  *
    272  *
    273  *        9..15 reserved for future use
    274  *           current implementations will only perform a state change
    275  *           and ignore bits 19..0
    276  *
    277  * An encoding with contiguous ranges of unassigned byte sequences, like
    278  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
    279  * at least two states for the trail bytes:
    280  * One trail byte state that results in code points, and one that only
    281  * has "unassigned" and "illegal" terminal states.
    282  *
    283  * Note: partly by accident, this data structure supports simple stateful
    284  * encodings without any additional logic.
    285  * Currently, only simple Shift-In/Shift-Out schemes are handled with
    286  * appropriate state tables (especially EBCDIC_STATEFUL!).
    287  *
    288  * MBCS version 2 added:
    289  * unassigned and illegal action codes have U+fffe and U+ffff
    290  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
    291  *
    292  * Converting from Unicode to codepage bytes --------------------------------***
    293  *
    294  * The conversion data structure for fromUnicode is designed for the known
    295  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
    296  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
    297  * a roundtrip mapping.
    298  *
    299  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
    300  * like in the character properties table.
    301  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
    302  * with the resulting bytes is at offsetFromUBytes.
    303  *
    304  * Beginning with version 4, single-byte codepages have a significantly different
    305  * trie compared to other codepages.
    306  * In all cases, the entry in stage 1 is directly the index of the block of
    307  * 64 entries in stage 2.
    308  *
    309  * Single-byte lookup:
    310  *
    311  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
    312  * Stage 3 contains one 16-bit word per result:
    313  * Bits 15..8 indicate the kind of result:
    314  *    f  roundtrip result
    315  *    c  fallback result from private-use code point
    316  *    8  fallback result from other code points
    317  *    0  unassigned
    318  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
    319  *
    320  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
    321  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
    322  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    323  * ASCII code points can be looked up with a linear array access into stage 3.
    324  * See maxFastUChar and other details in ucnvmbcs.h.
    325  *
    326  * Multi-byte lookup:
    327  *
    328  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
    329  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
    330  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
    331  *             If this test is false, then a non-zero result will be interpreted as
    332  *             a fallback mapping.
    333  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
    334  *
    335  * Stage 3 contains 2, 3, or 4 bytes per result.
    336  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
    337  * while 3 bytes are stored as bytes in big-endian order.
    338  * Leading zero bytes are ignored, and the number of bytes is counted.
    339  * A zero byte mapping result is possible as a roundtrip result.
    340  * For some output types, the actual result is processed from this;
    341  * see ucnv_MBCSFromUnicodeWithOffsets().
    342  *
    343  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
    344  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
    345  *
    346  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
    347  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
    348  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    349  * ASCII code points can be looked up with a linear array access into stage 3.
    350  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
    351  *
    352  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
    353  * for compaction.
    354  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
    355  * may overlap by any number of entries.
    356  *
    357  * MBCS version 2 added:
    358  * the converter checks for known output types, which allows
    359  * adding new ones without crashing an unaware converter
    360  */
    361 
/*
 * Forward declarations of two converter implementation tables so that code
 * earlier in the file can refer to them.
 * NOTE(review): the initialized definitions are presumably further down in
 * this file; they are not visible from here -- confirm.
 */
static const UConverterImpl _SBCSUTF8Impl;
static const UConverterImpl _DBCSUTF8Impl;
    364 
    365 /* GB 18030 data ------------------------------------------------------------ */
    366 
    367 /* helper macros for linear values for GB 18030 four-byte sequences */
    368 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
    369 
    370 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
    371 
    372 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
    373 
    374 /*
    375  * Some ranges of GB 18030 where both the Unicode code points and the
    376  * GB four-byte sequences are contiguous and are handled algorithmically by
    377  * the special callback functions below.
    378  * The values are start & end of Unicode & GB codes.
    379  *
    380  * Note that single surrogates are not mapped by GB 18030
    381  * as of the re-released mapping tables from 2000-nov-30.
    382  */
    383 static const uint32_t
    384 gb18030Ranges[14][4]={
    385     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    386     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    387     {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
    388     {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
    389     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    390     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    391     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    392     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    393     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    394     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    395     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    396     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    397     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    398     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
    399 };
    400 
    401 /* bit flag for UConverter.options indicating GB 18030 special handling */
    402 #define _MBCS_OPTION_GB18030 0x8000
    403 
    404 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */
    405 #define _MBCS_OPTION_KEIS 0x01000
    406 #define _MBCS_OPTION_JEF  0x02000
    407 #define _MBCS_OPTION_JIPS 0x04000
    408 
    409 #define KEIS_SO_CHAR_1 0x0A
    410 #define KEIS_SO_CHAR_2 0x42
    411 #define KEIS_SI_CHAR_1 0x0A
    412 #define KEIS_SI_CHAR_2 0x41
    413 
    414 #define JEF_SO_CHAR 0x28
    415 #define JEF_SI_CHAR 0x29
    416 
    417 #define JIPS_SO_CHAR_1 0x1A
    418 #define JIPS_SO_CHAR_2 0x70
    419 #define JIPS_SI_CHAR_1 0x1A
    420 #define JIPS_SI_CHAR_2 0x71
    421 
    422 enum SISO_Option {
    423     SI,
    424     SO
    425 };
    426 typedef enum SISO_Option SISO_Option;
    427 
    428 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
    429     int32_t SISOLength = 0;
    430 
    431     switch (option) {
    432         case SI:
    433             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    434                 value[0] = KEIS_SI_CHAR_1;
    435                 value[1] = KEIS_SI_CHAR_2;
    436                 SISOLength = 2;
    437             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    438                 value[0] = JEF_SI_CHAR;
    439                 SISOLength = 1;
    440             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    441                 value[0] = JIPS_SI_CHAR_1;
    442                 value[1] = JIPS_SI_CHAR_2;
    443                 SISOLength = 2;
    444             } else {
    445                 value[0] = UCNV_SI;
    446                 SISOLength = 1;
    447             }
    448             break;
    449         case SO:
    450             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    451                 value[0] = KEIS_SO_CHAR_1;
    452                 value[1] = KEIS_SO_CHAR_2;
    453                 SISOLength = 2;
    454             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    455                 value[0] = JEF_SO_CHAR;
    456                 SISOLength = 1;
    457             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    458                 value[0] = JIPS_SO_CHAR_1;
    459                 value[1] = JIPS_SO_CHAR_2;
    460                 SISOLength = 2;
    461             } else {
    462                 value[0] = UCNV_SO;
    463                 SISOLength = 1;
    464             }
    465             break;
    466         default:
    467             /* Should never happen. */
    468             break;
    469     }
    470 
    471     return SISOLength;
    472 }
    473 
    474 /* Miscellaneous ------------------------------------------------------------ */
    475 
    476 /**
    477  * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
    478  * consecutive sequences of bytes, starting from the one encoded in value,
    479  * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
    480  * Does not currently support m:n mappings or reverse fallbacks.
    481  * This function will not be called for sequences of bytes with leading zeros.
    482  *
    483  * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
    484  * @param value contains 1..4 bytes of the first byte sequence, right-aligned
    485  * @param codePoints resulting Unicode code points, or negative if a byte sequence does
    486  *        not map to anything
    487  * @return TRUE to continue enumeration, FALSE to stop
    488  */
    489 typedef UBool U_CALLCONV
    490 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
    491 
    492 /* similar to ucnv_MBCSGetNextUChar() but recursive */
    493 static UBool
    494 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
    495         int32_t state, uint32_t offset,
    496         uint32_t value,
    497         UConverterEnumToUCallback *callback, const void *context,
    498         UErrorCode *pErrorCode) {
    499     UChar32 codePoints[32];
    500     const int32_t *row;
    501     const uint16_t *unicodeCodeUnits;
    502     UChar32 anyCodePoints;
    503     int32_t b, limit;
    504 
    505     row=mbcsTable->stateTable[state];
    506     unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
    507 
    508     value<<=8;
    509     anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
    510 
    511     b=(stateProps[state]&0x38)<<2;
    512     if(b==0 && stateProps[state]>=0x40) {
    513         /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
    514         codePoints[0]=U_SENTINEL;
    515         b=1;
    516     }
    517     limit=((stateProps[state]&7)+1)<<5;
    518     while(b<limit) {
    519         int32_t entry=row[b];
    520         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    521             int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
    522             if(stateProps[nextState]>=0) {
    523                 /* recurse to a state with non-ignorable actions */
    524                 if(!enumToU(
    525                         mbcsTable, stateProps, nextState,
    526                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
    527                         value|(uint32_t)b,
    528                         callback, context,
    529                         pErrorCode)) {
    530                     return FALSE;
    531                 }
    532             }
    533             codePoints[b&0x1f]=U_SENTINEL;
    534         } else {
    535             UChar32 c;
    536             int32_t action;
    537 
    538             /*
    539              * An if-else-if chain provides more reliable performance for
    540              * the most common cases compared to a switch.
    541              */
    542             action=MBCS_ENTRY_FINAL_ACTION(entry);
    543             if(action==MBCS_STATE_VALID_DIRECT_16) {
    544                 /* output BMP code point */
    545                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    546             } else if(action==MBCS_STATE_VALID_16) {
    547                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    548                 c=unicodeCodeUnits[finalOffset];
    549                 if(c<0xfffe) {
    550                     /* output BMP code point */
    551                 } else {
    552                     c=U_SENTINEL;
    553                 }
    554             } else if(action==MBCS_STATE_VALID_16_PAIR) {
    555                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    556                 c=unicodeCodeUnits[finalOffset++];
    557                 if(c<0xd800) {
    558                     /* output BMP code point below 0xd800 */
    559                 } else if(c<=0xdbff) {
    560                     /* output roundtrip or fallback supplementary code point */
    561                     c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
    562                 } else if(c==0xe000) {
    563                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
    564                     c=unicodeCodeUnits[finalOffset];
    565                 } else {
    566                     c=U_SENTINEL;
    567                 }
    568             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
    569                 /* output supplementary code point */
    570                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
    571             } else {
    572                 c=U_SENTINEL;
    573             }
    574 
    575             codePoints[b&0x1f]=c;
    576             anyCodePoints&=c;
    577         }
    578         if(((++b)&0x1f)==0) {
    579             if(anyCodePoints>=0) {
    580                 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
    581                     return FALSE;
    582                 }
    583                 anyCodePoints=-1;
    584             }
    585         }
    586     }
    587     return TRUE;
    588 }
    589 
    590 /*
    591  * Only called if stateProps[state]==-1.
    592  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
    593  * MBCS_STATE_CHANGE_ONLY.
    594  */
    595 static int8_t
    596 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
    597     const int32_t *row;
    598     int32_t min, max, entry, nextState;
    599 
    600     row=stateTable[state];
    601     stateProps[state]=0;
    602 
    603     /* find first non-ignorable state */
    604     for(min=0;; ++min) {
    605         entry=row[min];
    606         nextState=MBCS_ENTRY_STATE(entry);
    607         if(stateProps[nextState]==-1) {
    608             getStateProp(stateTable, stateProps, nextState);
    609         }
    610         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    611             if(stateProps[nextState]>=0) {
    612                 break;
    613             }
    614         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    615             break;
    616         }
    617         if(min==0xff) {
    618             stateProps[state]=-0x40;  /* (int8_t)0xc0 */
    619             return stateProps[state];
    620         }
    621     }
    622     stateProps[state]|=(int8_t)((min>>5)<<3);
    623 
    624     /* find last non-ignorable state */
    625     for(max=0xff; min<max; --max) {
    626         entry=row[max];
    627         nextState=MBCS_ENTRY_STATE(entry);
    628         if(stateProps[nextState]==-1) {
    629             getStateProp(stateTable, stateProps, nextState);
    630         }
    631         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    632             if(stateProps[nextState]>=0) {
    633                 break;
    634             }
    635         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    636             break;
    637         }
    638     }
    639     stateProps[state]|=(int8_t)(max>>5);
    640 
    641     /* recurse further and collect direct-state information */
    642     while(min<=max) {
    643         entry=row[min];
    644         nextState=MBCS_ENTRY_STATE(entry);
    645         if(stateProps[nextState]==-1) {
    646             getStateProp(stateTable, stateProps, nextState);
    647         }
    648         if(MBCS_ENTRY_IS_FINAL(entry)) {
    649             stateProps[nextState]|=0x40;
    650             if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
    651                 stateProps[state]|=0x40;
    652             }
    653         }
    654         ++min;
    655     }
    656     return stateProps[state];
    657 }
    658 
    659 /*
    660  * Internal function enumerating the toUnicode data of an MBCS converter.
    661  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
    662  * table, but could also be used for a future ucnv_getUnicodeSet() option
    663  * that includes reverse fallbacks (after updating this function's implementation).
    664  * Currently only handles roundtrip mappings.
    665  * Does not currently handle extensions.
    666  */
    667 static void
    668 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
    669                        UConverterEnumToUCallback *callback, const void *context,
    670                        UErrorCode *pErrorCode) {
    671     /*
    672      * Properties for each state, to speed up the enumeration.
    673      * Ignorable actions are unassigned/illegal/state-change-only:
    674      * They do not lead to mappings.
    675      *
    676      * Bits 7..6:
    677      * 1 direct/initial state (stateful converters have multiple)
    678      * 0 non-initial state with transitions or with non-ignorable result actions
    679      * -1 final state with only ignorable actions
    680      *
    681      * Bits 5..3:
    682      * The lowest byte value with non-ignorable actions is
    683      * value<<5 (rounded down).
    684      *
    685      * Bits 2..0:
    686      * The highest byte value with non-ignorable actions is
    687      * (value<<5)&0x1f (rounded up).
    688      */
    689     int8_t stateProps[MBCS_MAX_STATE_COUNT];
    690     int32_t state;
    691 
    692     uprv_memset(stateProps, -1, sizeof(stateProps));
    693 
    694     /* recurse from state 0 and set all stateProps */
    695     getStateProp(mbcsTable->stateTable, stateProps, 0);
    696 
    697     for(state=0; state<mbcsTable->countStates; ++state) {
    698         /*if(stateProps[state]==-1) {
    699             printf("unused/unreachable <icu:state> %d\n", state);
    700         }*/
    701         if(stateProps[state]>=0x40) {
    702             /* start from each direct state */
    703             enumToU(
    704                 mbcsTable, stateProps, state, 0, 0,
    705                 callback, context,
    706                 pErrorCode);
    707         }
    708     }
    709 }
    710 
/*
 * Adds to the set behind sa every code point that the from-Unicode trie
 * of sharedData->mbcs maps, then lets ucnv_extGetUnicodeSet() contribute
 * for the same arguments.
 *
 * which:  UCNV_ROUNDTRIP_SET uses only roundtrip mappings; any other value
 *         is treated as UCNV_ROUNDTRIP_AND_FALLBACK_SET.
 * filter: restricts the result bytes that qualify; it is only consulted for
 *         output types other than MBCS_OUTPUT_1. An unknown filter value
 *         sets U_INTERNAL_PROGRAM_ERROR and returns immediately, without
 *         calling ucnv_extGetUnicodeSet().
 *
 * The function walks stage 1 in order and keeps the current code point in c:
 * an empty stage 2 block skips 1024 code points, an empty stage 3 block
 * skips 16.
 */
U_CFUNC void
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
                                         const USetAdder *sa,
                                         UConverterUnicodeSet which,
                                         UConverterSetFilter filter,
                                         UErrorCode *pErrorCode) {
    const UConverterMBCSTable *mbcsTable;
    const uint16_t *table;

    uint32_t st3;
    uint16_t st1, maxStage1, st2;

    UChar32 c;

    /* enumerate the from-Unicode trie table */
    mbcsTable=&sharedData->mbcs;
    table=mbcsTable->fromUnicodeTable;
    /*
     * Each stage 1 entry covers 1024 code points:
     * 0x440 entries reach U+10FFFF, 0x40 entries cover the BMP only.
     */
    if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
        maxStage1=0x440;
    } else {
        maxStage1=0x40;
    }

    c=0; /* keep track of the current code point while enumerating */

    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        /* single-byte output: stage 2 and stage 3 hold 16-bit units */
        const uint16_t *stage2, *stage3, *results;
        uint16_t minValue;

        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

        /*
         * Set a threshold variable for selecting which mappings to use.
         * See ucnv_MBCSSingleFromBMPWithOffsets() and
         * MBCS_SINGLE_RESULT_FROM_U() for details.
         */
        if(which==UCNV_ROUNDTRIP_SET) {
            /* use only roundtrips */
            minValue=0xf00;
        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
            /* use all roundtrip and fallback results */
            minValue=0x800;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            /* values up to maxStage1 are treated as "no stage 2 block" */
            if(st2>maxStage1) {
                stage2=table+st2;
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        stage3=results+st3;

                        /* 16 results per block; c stops at the next multiple of 16 */
                        do {
                            if(*stage3++>=minValue) {
                                sa->add(sa->set, c);
                            }
                        } while((++c&0xf)!=0);
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    } else {
        /* multi-byte output: stage 2 is read as 32-bit entries, stage 3 as bytes */
        const uint32_t *stage2;
        const uint8_t *stage3, *bytes;
        uint32_t st3Multiplier;
        uint32_t value;
        UBool useFallback;

        bytes=mbcsTable->fromUnicodeBytes;

        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

        /* number of stage 3 bytes stored per code point */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            st3Multiplier=3;
            break;
        case MBCS_OUTPUT_4:
            st3Multiplier=4;
            break;
        default:
            st3Multiplier=2;
            break;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            /*
             * NOTE(review): the threshold is halved here, presumably because
             * st2 indexes 32-bit stage 2 units rather than 16-bit ones - confirm.
             */
            if(st2>(maxStage1>>1)) {
                stage2=(const uint32_t *)table+st2;
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        /* the low 16 bits of the stage 2 entry are the index of a 16-entry block */
                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;

                        /* get the roundtrip flags for the stage 3 block */
                        /* after the shift, bit 0 belongs to the current code point */
                        st3>>=16;

                        /*
                         * Add code points for which the roundtrip flag is set,
                         * or which map to non-zero bytes if we use fallbacks.
                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
                         */
                        switch(filter) {
                        case UCNV_SET_FILTER_NONE:
                            /*
                             * stage3 is only dereferenced on the fallback path,
                             * and it is advanced on every iteration whenever
                             * useFallback is set, so it stays in step where it matters.
                             */
                            do {
                                if(st3&1) {
                                    sa->add(sa->set, c);
                                    stage3+=st3Multiplier;
                                } else if(useFallback) {
                                    /* OR all result bytes together: non-zero means there is a mapping */
                                    uint8_t b=0;
                                    switch(st3Multiplier) {
                                    case 4:
                                        b|=*stage3++;
                                    case 3: /*fall through*/
                                        b|=*stage3++;
                                    case 2: /*fall through*/
                                        b|=stage3[0]|stage3[1];
                                        stage3+=2;
                                    default:
                                        break;
                                    }
                                    if(b!=0) {
                                        sa->add(sa->set, c);
                                    }
                                }
                                st3>>=1;
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_DBCS_ONLY:
                            /* Ignore single-byte results (<0x100). */
                            /* This loop steps by 2 bytes, i.e. it assumes st3Multiplier==2. */
                            do {
                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_2022_CN:
                            /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                            /* Tests the first result byte; steps by 3 bytes, i.e. assumes st3Multiplier==3. */
                            do {
                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=3;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_SJIS:
                            /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                            /* Accepts 16-bit results in the range 0x8140..0xeffc. */
                            do {
                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_GR94DBCS:
                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
                            /*
                             * The uint16_t comparison bounds the whole value to A1A1..FEFE,
                             * the uint8_t comparison bounds the low byte to A1..FE.
                             */
                            do {
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_HZ:
                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
                            /* Same test as for GR94DBCS, except that the value may not exceed FDFE. */
                            do {
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        default:
                            /* unknown filter: report it and stop without touching the extension data */
                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                            return;
                        }
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    }

    /* finally let the extension data add its code points, with the same selection */
    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
}
    916 
    917 U_CFUNC void
    918 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
    919                                  const USetAdder *sa,
    920                                  UConverterUnicodeSet which,
    921                                  UErrorCode *pErrorCode) {
    922     ucnv_MBCSGetFilteredUnicodeSetForUnicode(
    923         sharedData, sa, which,
    924         sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
    925             UCNV_SET_FILTER_DBCS_ONLY :
    926             UCNV_SET_FILTER_NONE,
    927         pErrorCode);
    928 }
    929 
    930 static void
    931 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
    932                    const USetAdder *sa,
    933                    UConverterUnicodeSet which,
    934                    UErrorCode *pErrorCode) {
    935     if(cnv->options&_MBCS_OPTION_GB18030) {
    936         sa->addRange(sa->set, 0, 0xd7ff);
    937         sa->addRange(sa->set, 0xe000, 0x10ffff);
    938     } else {
    939         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
    940     }
    941 }
    942 
    943 /* conversion extensions for input not in the main table -------------------- */
    944 
    945 /*
    946  * Hardcoded extension handling for GB 18030.
    947  * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
    948  *
    949  * In the future, conversion extensions may handle m:n mappings and delta tables,
    950  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
    951  *
    952  * If an input character cannot be mapped, then these functions set an error
    953  * code. The framework will then call the callback function.
    954  */
    955 
    956 /*
    957  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
    958  *         else return 0 after output has been written to the target
    959  */
    960 static UChar32
    961 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
    962           UChar32 cp,
    963           const UChar **source, const UChar *sourceLimit,
    964           uint8_t **target, const uint8_t *targetLimit,
    965           int32_t **offsets, int32_t sourceIndex,
    966           UBool flush,
    967           UErrorCode *pErrorCode) {
    968     const int32_t *cx;
    969 
    970     cnv->useSubChar1=FALSE;
    971 
    972     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
    973         ucnv_extInitialMatchFromU(
    974             cnv, cx,
    975             cp, source, sourceLimit,
    976             (char **)target, (char *)targetLimit,
    977             offsets, sourceIndex,
    978             flush,
    979             pErrorCode)
    980     ) {
    981         return 0; /* an extension mapping handled the input */
    982     }
    983 
    984     /* GB 18030 */
    985     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
    986         const uint32_t *range;
    987         int32_t i;
    988 
    989         range=gb18030Ranges[0];
    990         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
    991             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
    992                 /* found the Unicode code point, output the four-byte sequence for it */
    993                 uint32_t linear;
    994                 char bytes[4];
    995 
    996                 /* get the linear value of the first GB 18030 code in this range */
    997                 linear=range[2]-LINEAR_18030_BASE;
    998 
    999                 /* add the offset from the beginning of the range */
   1000                 linear+=((uint32_t)cp-range[0]);
   1001 
   1002                 /* turn this into a four-byte sequence */
   1003                 bytes[3]=(char)(0x30+linear%10); linear/=10;
   1004                 bytes[2]=(char)(0x81+linear%126); linear/=126;
   1005                 bytes[1]=(char)(0x30+linear%10); linear/=10;
   1006                 bytes[0]=(char)(0x81+linear);
   1007 
   1008                 /* output this sequence */
   1009                 ucnv_fromUWriteBytes(cnv,
   1010                                      bytes, 4, (char **)target, (char *)targetLimit,
   1011                                      offsets, sourceIndex, pErrorCode);
   1012                 return 0;
   1013             }
   1014         }
   1015     }
   1016 
   1017     /* no mapping */
   1018     *pErrorCode=U_INVALID_CHAR_FOUND;
   1019     return cp;
   1020 }
   1021 
   1022 /*
   1023  * Input sequence: cnv->toUBytes[0..length[
   1024  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
   1025  *         else return 0 after output has been written to the target
   1026  */
   1027 static int8_t
   1028 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
   1029         int8_t length,
   1030         const uint8_t **source, const uint8_t *sourceLimit,
   1031         UChar **target, const UChar *targetLimit,
   1032         int32_t **offsets, int32_t sourceIndex,
   1033         UBool flush,
   1034         UErrorCode *pErrorCode) {
   1035     const int32_t *cx;
   1036 
   1037     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
   1038         ucnv_extInitialMatchToU(
   1039             cnv, cx,
   1040             length, (const char **)source, (const char *)sourceLimit,
   1041             target, targetLimit,
   1042             offsets, sourceIndex,
   1043             flush,
   1044             pErrorCode)
   1045     ) {
   1046         return 0; /* an extension mapping handled the input */
   1047     }
   1048 
   1049     /* GB 18030 */
   1050     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
   1051         const uint32_t *range;
   1052         uint32_t linear;
   1053         int32_t i;
   1054 
   1055         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
   1056         range=gb18030Ranges[0];
   1057         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
   1058             if(range[2]<=linear && linear<=range[3]) {
   1059                 /* found the sequence, output the Unicode code point for it */
   1060                 *pErrorCode=U_ZERO_ERROR;
   1061 
   1062                 /* add the linear difference between the input and start sequences to the start code point */
   1063                 linear=range[0]+(linear-range[2]);
   1064 
   1065                 /* output this code point */
   1066                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
   1067 
   1068                 return 0;
   1069             }
   1070         }
   1071     }
   1072 
   1073     /* no mapping */
   1074     *pErrorCode=U_INVALID_CHAR_FOUND;
   1075     return length;
   1076 }
   1077 
   1078 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
   1079 
   1080 /*
   1081  * This code modifies a standard EBCDIC<->Unicode mapping table for
   1082  * OS/390 (z/OS) Unix System Services (Open Edition).
   1083  * The difference is in the mapping of Line Feed and New Line control codes:
   1084  * Standard EBCDIC maps
   1085  *
   1086  *   <U000A> \x25 |0
   1087  *   <U0085> \x15 |0
   1088  *
   1089  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
   1090  * mapping
   1091  *
   1092  *   <U000A> \x15 |0
   1093  *   <U0085> \x25 |0
   1094  *
   1095  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
   1096  * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
   1099  */
   1100 
/* standard (unswapped) EBCDIC byte values for Line Feed and New Line */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * The same byte values as stored in Unicode-to-single-byte result tables:
 * the byte value combined with 0xf00, which marks a roundtrip mapping there.
 */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points for Line Feed (U+000A) and New Line (U+0085) */
#define U_LF 0x0a
#define U_NL 0x85
   1112 
   1113 static UBool
   1114 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
   1115     UConverterMBCSTable *mbcsTable;
   1116 
   1117     const uint16_t *table, *results;
   1118     const uint8_t *bytes;
   1119 
   1120     int32_t (*newStateTable)[256];
   1121     uint16_t *newResults;
   1122     uint8_t *p;
   1123     char *name;
   1124 
   1125     uint32_t stage2Entry;
   1126     uint32_t size, sizeofFromUBytes;
   1127 
   1128     mbcsTable=&sharedData->mbcs;
   1129 
   1130     table=mbcsTable->fromUnicodeTable;
   1131     bytes=mbcsTable->fromUnicodeBytes;
   1132     results=(const uint16_t *)bytes;
   1133 
   1134     /*
   1135      * Check that this is an EBCDIC table with SBCS portion -
   1136      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
   1137      *
   1138      * If not, ignore the option. Options are always ignored if they do not apply.
   1139      */
   1140     if(!(
   1141          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
   1142          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
   1143          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
   1144     )) {
   1145         return FALSE;
   1146     }
   1147 
   1148     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1149         if(!(
   1150              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
   1151              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
   1152         )) {
   1153             return FALSE;
   1154         }
   1155     } else /* MBCS_OUTPUT_2_SISO */ {
   1156         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1157         if(!(
   1158              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
   1159              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
   1160         )) {
   1161             return FALSE;
   1162         }
   1163 
   1164         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1165         if(!(
   1166              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
   1167              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
   1168         )) {
   1169             return FALSE;
   1170         }
   1171     }
   1172 
   1173     if(mbcsTable->fromUBytesLength>0) {
   1174         /*
   1175          * We _know_ the number of bytes in the fromUnicodeBytes array
   1176          * starting with header.version 4.1.
   1177          */
   1178         sizeofFromUBytes=mbcsTable->fromUBytesLength;
   1179     } else {
   1180         /*
   1181          * Otherwise:
   1182          * There used to be code to enumerate the fromUnicode
   1183          * trie and find the highest entry, but it was removed in ICU 3.2
   1184          * because it was not tested and caused a low code coverage number.
   1185          * See Jitterbug 3674.
   1186          * This affects only some .cnv file formats with a header.version
   1187          * below 4.1, and only when swaplfnl is requested.
   1188          *
   1189          * ucnvmbcs.c revision 1.99 is the last one with the
   1190          * ucnv_MBCSSizeofFromUBytes() function.
   1191          */
   1192         *pErrorCode=U_INVALID_FORMAT_ERROR;
   1193         return FALSE;
   1194     }
   1195 
   1196     /*
   1197      * The table has an appropriate format.
   1198      * Allocate and build
   1199      * - a modified to-Unicode state table
   1200      * - a modified from-Unicode output array
   1201      * - a converter name string with the swap option appended
   1202      */
   1203     size=
   1204         mbcsTable->countStates*1024+
   1205         sizeofFromUBytes+
   1206         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
   1207     p=(uint8_t *)uprv_malloc(size);
   1208     if(p==NULL) {
   1209         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1210         return FALSE;
   1211     }
   1212 
   1213     /* copy and modify the to-Unicode state table */
   1214     newStateTable=(int32_t (*)[256])p;
   1215     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
   1216 
   1217     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
   1218     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
   1219 
   1220     /* copy and modify the from-Unicode result table */
   1221     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
   1222     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
   1223 
   1224     /* conveniently, the table access macros work on the left side of expressions */
   1225     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1226         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
   1227         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
   1228     } else /* MBCS_OUTPUT_2_SISO */ {
   1229         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1230         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
   1231 
   1232         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1233         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
   1234     }
   1235 
   1236     /* set the canonical converter name */
   1237     name=(char *)newResults+sizeofFromUBytes;
   1238     uprv_strcpy(name, sharedData->staticData->name);
   1239     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
   1240 
   1241     /* set the pointers */
   1242     umtx_lock(NULL);
   1243     if(mbcsTable->swapLFNLStateTable==NULL) {
   1244         mbcsTable->swapLFNLStateTable=newStateTable;
   1245         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
   1246         mbcsTable->swapLFNLName=name;
   1247 
   1248         newStateTable=NULL;
   1249     }
   1250     umtx_unlock(NULL);
   1251 
   1252     /* release the allocated memory if another thread beat us to it */
   1253     if(newStateTable!=NULL) {
   1254         uprv_free(newStateTable);
   1255     }
   1256     return TRUE;
   1257 }
   1258 
   1259 /* reconstitute omitted fromUnicode data ------------------------------------ */
   1260 
   1261 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
   1262 static UBool U_CALLCONV
   1263 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
   1264     UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
   1265     const uint16_t *table;
   1266     uint32_t *stage2;
   1267     uint8_t *bytes, *p;
   1268     UChar32 c;
   1269     int32_t i, st3;
   1270 
   1271     table=mbcsTable->fromUnicodeTable;
   1272     bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
   1273 
   1274     /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
   1275     switch(mbcsTable->outputType) {
   1276     case MBCS_OUTPUT_3_EUC:
   1277         if(value<=0xffff) {
   1278             /* short sequences are stored directly */
   1279             /* code set 0 or 1 */
   1280         } else if(value<=0x8effff) {
   1281             /* code set 2 */
   1282             value&=0x7fff;
   1283         } else /* first byte is 0x8f */ {
   1284             /* code set 3 */
   1285             value&=0xff7f;
   1286         }
   1287         break;
   1288     case MBCS_OUTPUT_4_EUC:
   1289         if(value<=0xffffff) {
   1290             /* short sequences are stored directly */
   1291             /* code set 0 or 1 */
   1292         } else if(value<=0x8effffff) {
   1293             /* code set 2 */
   1294             value&=0x7fffff;
   1295         } else /* first byte is 0x8f */ {
   1296             /* code set 3 */
   1297             value&=0xff7fff;
   1298         }
   1299         break;
   1300     default:
   1301         break;
   1302     }
   1303 
   1304     for(i=0; i<=0x1f; ++value, ++i) {
   1305         c=codePoints[i];
   1306         if(c<0) {
   1307             continue;
   1308         }
   1309 
   1310         /* locate the stage 2 & 3 data */
   1311         stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
   1312         p=bytes;
   1313         st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
   1314 
   1315         /* write the codepage bytes into stage 3 */
   1316         switch(mbcsTable->outputType) {
   1317         case MBCS_OUTPUT_3:
   1318         case MBCS_OUTPUT_4_EUC:
   1319             p+=st3*3;
   1320             p[0]=(uint8_t)(value>>16);
   1321             p[1]=(uint8_t)(value>>8);
   1322             p[2]=(uint8_t)value;
   1323             break;
   1324         case MBCS_OUTPUT_4:
   1325             ((uint32_t *)p)[st3]=value;
   1326             break;
   1327         default:
   1328             /* 2 bytes per character */
   1329             ((uint16_t *)p)[st3]=(uint16_t)value;
   1330             break;
   1331         }
   1332 
   1333         /* set the roundtrip flag */
   1334         *stage2|=(1UL<<(16+(c&0xf)));
   1335     }
   1336     return TRUE;
   1337  }
   1338 
   /*
    * Rebuild the fromUnicode data of mbcsTable in one newly allocated,
    * zero-filled buffer that is stored in mbcsTable->reconstitutedData and laid
    * out as
    *     [ stage 1 | full-size stage 2 | fromUnicodeBytes ]
    *
    * Steps, in order:
    *  - stage 1 (stage1Length 16-bit units) is copied from the current
    *    fromUnicodeTable;
    *  - the stage2Length 32-bit stage 2 entries that follow stage 1 in the current
    *    table are copied to the END of the full-size stage 2;
    *  - mbcsTable->fromUnicodeTable and mbcsTable->fromUnicodeBytes are redirected
    *    into the new buffer;
    *  - the initial part of stage 2 is filled in from mbcsTable->mbcsIndex;
    *  - ucnv_MBCSEnumToUnicode() is called with writeStage3Roundtrip to fill in
    *    fromUnicodeBytes from the toUnicode data.
    *
    * mbcsTable        table to modify; reads fromUnicodeTable, fromUBytesLength,
    *                  mbcsIndex and maxFastUChar
    * stage1Length     number of 16-bit units in stage 1
    * stage2Length     number of 32-bit stage 2 entries present in the current table
    * fullStage2Length number of 32-bit entries of the complete stage 2
    * pErrorCode       set to U_MEMORY_ALLOCATION_ERROR if the buffer cannot be
    *                  allocated (nothing else is changed in that case)
    */
   1339 static void
   1340 reconstituteData(UConverterMBCSTable *mbcsTable,
   1341                  uint32_t stage1Length, uint32_t stage2Length,
   1342                  uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
   1343                  UErrorCode *pErrorCode) {
   1344     uint16_t *stage1;
   1345     uint32_t *stage2;
            /* size in bytes: 2 per stage 1 unit, 4 per stage 2 entry, plus the fromUnicodeBytes */
   1346     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
   1347     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
   1348     if(mbcsTable->reconstitutedData==NULL) {
   1349         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1350         return;
   1351     }
   1352     uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
   1353 
   1354     /* copy existing data and reroute the pointers */
   1355     stage1=(uint16_t *)mbcsTable->reconstitutedData;
   1356     uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
   1357 
            /*
             * The existing stage 2 entries go to the end of the full-size stage 2;
             * the leading (fullStage2Length-stage2Length) entries stay zero here and
             * are filled in from the mbcsIndex further below.
             */
   1358     stage2=(uint32_t *)(stage1+stage1Length);
   1359     uprv_memcpy(stage2+(fullStage2Length-stage2Length),
   1360                 mbcsTable->fromUnicodeTable+stage1Length,
   1361                 stage2Length*4);
   1362 
   1363     mbcsTable->fromUnicodeTable=stage1;
   1364     mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
   1365 
   1366     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
   1367     stage2=(uint32_t *)stage1;
   1368 
   1369     /* reconstitute the initial part of stage 2 from the mbcsIndex */
   1370     {
                /* number of mbcsIndex entries: one per 64 code points up to maxFastUChar */
   1371         int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
   1372         int32_t stageUTF8Index=0;
   1373         int32_t st1, st2, st3, i;
   1374 
   1375         for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
   1376             st2=stage1[st1];
                    /*
                     * stage1Length/2 is the stage 1 size counted in 32-bit units, i.e. the
                     * index of the very first stage 2 block; the code treats a stage 1
                     * entry with this value as "no stage 2 block"
                     * (presumably the shared empty block -- TODO confirm against the file format).
                     */
   1377             if(st2!=stage1Length/2) {
   1378                 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
   1379                 for(i=0; i<16; ++i) {
   1380                     st3=mbcsTable->mbcsIndex[stageUTF8Index++];
   1381                     if(st3!=0) {
   1382                         /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
   1383                         st3>>=4;
   1384                         /*
   1385                          * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
   1386                          * allocated together as a single 64-block for access from the mbcsIndex
   1387                          */
   1388                         stage2[st2++]=st3++;
   1389                         stage2[st2++]=st3++;
   1390                         stage2[st2++]=st3++;
   1391                         stage2[st2++]=st3;
   1392                     } else {
   1393                         /* no stage 3 block, skip */
   1394                         st2+=4;
   1395                     }
   1396                 }
   1397             } else {
   1398                 /* no stage 2 block, skip */
   1399                 stageUTF8Index+=16;
   1400             }
   1401         }
   1402     }
   1403 
   1404     /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
            /* mbcsTable is passed twice: as the table to enumerate and as the third argument (presumably the callback context) */
   1405     ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
   1406 }
   1407 
   1408 /* MBCS setup functions ----------------------------------------------------- */
   1409 
   /*
    * Set up sharedData->mbcs from the raw MBCS data of a loaded conversion table.
    *
    * The data starts with an _MBCSHeader. Accepted are header version 4.x and
    * version 5.3 or higher without unknown incompatible option bits; everything
    * else fails with U_INVALID_TABLE_FORMAT.
    *
    * Two kinds of files are handled:
    *  - Extension-only files (outputType MBCS_OUTPUT_EXT_ONLY): the base table
    *    named after the header is loaded via ucnv_load() (nestedLoads=2), its
    *    UConverterMBCSTable is copied, and the fields that must not be shared
    *    (baseSharedData, extIndexes, swapLFNL data, reconstitutedData) are
    *    overwritten. If this converter is DBCS-only while the base also maps
    *    single bytes, outputType becomes the runtime-only MBCS_OUTPUT_DBCS_ONLY.
    *  - Files with a base table: the table pointers are set to point into raw,
    *    unicodeMask, the utf8Friendly index data and asciiRoundtrips are computed,
    *    and the fromUnicode data is reconstituted if the file omits it (noFromU).
    *
    * If pArgs->onlyTestIsLoadable is set, the function returns as soon as the
    * format has been validated.
    *
    * sharedData  converter data whose mbcs member is filled in; also supplies
    *             staticData and dataMemory
    * pArgs       load arguments (nestedLoads, onlyTestIsLoadable, options, pkg, ...)
    * raw         start of the _MBCSHeader
    * pErrorCode  receives U_INVALID_TABLE_FORMAT, U_INVALID_TABLE_FILE,
    *             U_MEMORY_ALLOCATION_ERROR, or an error from loading the base table
    */
   1410 static void
   1411 ucnv_MBCSLoad(UConverterSharedData *sharedData,
   1412           UConverterLoadArgs *pArgs,
   1413           const uint8_t *raw,
   1414           UErrorCode *pErrorCode) {
   1415     UDataInfo info;
   1416     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1417     _MBCSHeader *header=(_MBCSHeader *)raw;
   1418     uint32_t offset;
   1419     uint32_t headerLength;
   1420     UBool noFromU=FALSE;
   1421 
            /* determine the header length (in 32-bit units) from the header version */
   1422     if(header->version[0]==4) {
   1423         headerLength=MBCS_HEADER_V4_LENGTH;
   1424     } else if(header->version[0]==5 && header->version[1]>=3 &&
   1425               (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
   1426         headerLength=header->options&MBCS_OPT_LENGTH_MASK;
   1427         noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
   1428     } else {
   1429         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1430         return;
   1431     }
   1432 
            /* the low byte of header->flags is the output type */
   1433     mbcsTable->outputType=(uint8_t)header->flags;
            /* the noFromU option is rejected for MBCS_OUTPUT_1 tables */
   1434     if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
   1435         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1436         return;
   1437     }
   1438 
   1439     /* extension data, header version 4.2 and higher */
            /* the remaining bits of header->flags are the byte offset from raw to the extension indexes; 0 means none */
   1440     offset=header->flags>>8;
   1441     if(offset!=0) {
   1442         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
   1443     }
   1444 
   1445     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
   1446         UConverterLoadArgs args={ 0 };
   1447         UConverterSharedData *baseSharedData;
   1448         const int32_t *extIndexes;
   1449         const char *baseName;
   1450 
   1451         /* extension-only file, load the base table and set values appropriately */
   1452         if((extIndexes=mbcsTable->extIndexes)==NULL) {
   1453             /* extension-only file without extension */
   1454             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1455             return;
   1456         }
   1457 
   1458         if(pArgs->nestedLoads!=1) {
   1459             /* an extension table must not be loaded as a base table */
   1460             *pErrorCode=U_INVALID_TABLE_FILE;
   1461             return;
   1462         }
   1463 
   1464         /* load the base table */
                /* the base table name is read from right after the header (headerLength*4 bytes from its start) */
   1465         baseName=(const char *)header+headerLength*4;
   1466         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
   1467             /* forbid loading this same extension-only file */
   1468             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1469             return;
   1470         }
   1471 
   1472         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
   1473         args.size=sizeof(UConverterLoadArgs);
   1474         args.nestedLoads=2;
   1475         args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
   1476         args.reserved=pArgs->reserved;
   1477         args.options=pArgs->options;
   1478         args.pkg=pArgs->pkg;
   1479         args.name=baseName;
   1480         baseSharedData=ucnv_load(&args, pErrorCode);
   1481         if(U_FAILURE(*pErrorCode)) {
   1482             return;
   1483         }
                /* the base must be an MBCS converter that is not itself an extension-only converter */
   1484         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
   1485             baseSharedData->mbcs.baseSharedData!=NULL
   1486         ) {
   1487             ucnv_unload(baseSharedData);
   1488             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1489             return;
   1490         }
   1491         if(pArgs->onlyTestIsLoadable) {
   1492             /*
   1493              * Exit as soon as we know that we can load the converter
   1494              * and the format is valid and supported.
   1495              * The worst that can happen in the following code is a memory
   1496              * allocation error.
   1497              */
   1498             ucnv_unload(baseSharedData);
   1499             return;
   1500         }
   1501 
   1502         /* copy the base table data */
   1503         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
   1504 
   1505         /* overwrite values with relevant ones for the extension converter */
   1506         mbcsTable->baseSharedData=baseSharedData;
   1507         mbcsTable->extIndexes=extIndexes;
   1508 
   1509         /*
   1510          * It would be possible to share the swapLFNL data with a base converter,
   1511          * but the generated name would have to be different, and the memory
   1512          * would have to be free'd only once.
   1513          * It is easier to just create the data for the extension converter
   1514          * separately when it is requested.
   1515          */
   1516         mbcsTable->swapLFNLStateTable=NULL;
   1517         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
   1518         mbcsTable->swapLFNLName=NULL;
   1519 
   1520         /*
   1521          * The reconstitutedData must be deleted only when the base converter
   1522          * is unloaded.
   1523          */
   1524         mbcsTable->reconstitutedData=NULL;
   1525 
   1526         /*
   1527          * Set a special, runtime-only outputType if the extension converter
   1528          * is a DBCS version of a base converter that also maps single bytes.
   1529          */
   1530         if( sharedData->staticData->conversionType==UCNV_DBCS ||
   1531                 (sharedData->staticData->conversionType==UCNV_MBCS &&
   1532                  sharedData->staticData->minBytesPerChar>=2)
   1533         ) {
   1534             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
   1535                 /* the base converter is SI/SO-stateful */
   1536                 int32_t entry;
   1537 
   1538                 /* get the dbcs state from the state table entry for SO=0x0e */
   1539                 entry=mbcsTable->stateTable[0][0xe];
   1540                 if( MBCS_ENTRY_IS_FINAL(entry) &&
   1541                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
   1542                     MBCS_ENTRY_FINAL_STATE(entry)!=0
   1543                 ) {
   1544                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
   1545 
   1546                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1547                 }
   1548             } else if(
   1549                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
   1550                 baseSharedData->staticData->minBytesPerChar==1 &&
   1551                 baseSharedData->staticData->maxBytesPerChar==2 &&
   1552                 mbcsTable->countStates<=127
   1553             ) {
   1554                 /* non-stateful base converter, need to modify the state table */
   1555                 int32_t (*newStateTable)[256];
   1556                 int32_t *state;
   1557                 int32_t i, count;
   1558 
   1559                 /* allocate a new state table and copy the base state table contents */
                        /* each state row is 256 int32_t entries (1024 bytes); one extra row is added for the new state */
   1560                 count=mbcsTable->countStates;
   1561                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
   1562                 if(newStateTable==NULL) {
   1563                     ucnv_unload(baseSharedData);
   1564                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1565                     return;
   1566                 }
   1567 
   1568                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
   1569 
   1570                 /* change all final single-byte entries to go to a new all-illegal state */
   1571                 state=newStateTable[0];
   1572                 for(i=0; i<256; ++i) {
   1573                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
   1574                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
   1575                     }
   1576                 }
   1577 
   1578                 /* build the new all-illegal state */
   1579                 state=newStateTable[count];
   1580                 for(i=0; i<256; ++i) {
   1581                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
   1582                 }
                        /* stateTableOwned makes ucnv_MBCSUnload() free this copy */
   1583                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
   1584                 mbcsTable->countStates=(uint8_t)(count+1);
   1585                 mbcsTable->stateTableOwned=TRUE;
   1586 
   1587                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1588             }
   1589         }
   1590 
   1591         /*
   1592          * unlike below for files with base tables, do not get the unicodeMask
   1593          * from the sharedData; instead, use the base table's unicodeMask,
   1594          * which we copied in the memcpy above;
   1595          * this is necessary because the static data unicodeMask, especially
   1596          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
   1597          */
   1598     } else {
   1599         /* conversion file with a base table; an additional extension table is optional */
   1600         /* make sure that the output type is known */
   1601         switch(mbcsTable->outputType) {
   1602         case MBCS_OUTPUT_1:
   1603         case MBCS_OUTPUT_2:
   1604         case MBCS_OUTPUT_3:
   1605         case MBCS_OUTPUT_4:
   1606         case MBCS_OUTPUT_3_EUC:
   1607         case MBCS_OUTPUT_4_EUC:
   1608         case MBCS_OUTPUT_2_SISO:
   1609             /* OK */
   1610             break;
   1611         default:
   1612             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1613             return;
   1614         }
   1615         if(pArgs->onlyTestIsLoadable) {
   1616             /*
   1617              * Exit as soon as we know that we can load the converter
   1618              * and the format is valid and supported.
   1619              * The worst that can happen in the following code is a memory
   1620              * allocation error.
   1621              */
   1622             return;
   1623         }
   1624 
                /* all table pointers below point directly into the raw data; nothing is copied here */
   1625         mbcsTable->countStates=(uint8_t)header->countStates;
   1626         mbcsTable->countToUFallbacks=header->countToUFallbacks;
   1627         mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
   1628         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
   1629         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
   1630 
   1631         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
   1632         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
   1633         mbcsTable->fromUBytesLength=header->fromUBytesLength;
   1634 
   1635         /*
   1636          * converter versions 6.1 and up contain a unicodeMask that is
   1637          * used here to select the most efficient function implementations
   1638          */
   1639         info.size=sizeof(UDataInfo);
   1640         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
   1641         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
   1642             /* mask off possible future extensions to be safe */
   1643             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
   1644         } else {
   1645             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
   1646             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
   1647         }
   1648 
   1649         /*
   1650          * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
   1651          * Check for the header version, SBCS vs. MBCS, and for whether the
   1652          * data structures are optimized for code points as high as what the
   1653          * runtime code is designed for.
   1654          * The implementation does not handle mapping tables with entries for
   1655          * unpaired surrogates.
   1656          */
   1657         if( header->version[1]>=3 &&
   1658             (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
   1659             (mbcsTable->countStates==1 ?
   1660                 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
   1661                 (header->version[2]>=(MBCS_FAST_MAX>>8))
   1662             )
   1663         ) {
   1664             mbcsTable->utf8Friendly=TRUE;
   1665 
   1666             if(mbcsTable->countStates==1) {
   1667                 /*
   1668                  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
   1669                  * Build a table with indexes to each block, to be used instead of
   1670                  * the regular stage 1/2 table.
   1671                  */
   1672                 int32_t i;
                        /*
                         * i counts 64-code point blocks (i=c>>6):
                         * i>>4 (=c>>10) selects the stage 1 entry, and (i<<2)&0x3c (=(c>>4)&0x3c)
                         * is the offset, within the stage 2 block, of the first of the
                         * four stage 2 entries that cover this block.
                         */
   1673                 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
   1674                     mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
   1675                 }
   1676                 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
   1677                 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
   1678             } else {
   1679                 /*
   1680                  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
   1681                  * The .cnv file is prebuilt with an additional stage table with indexes
   1682                  * to each block.
   1683                  */
                        /*
                         * The index is located fromUBytesLength bytes after fromUnicodeBytes,
                         * or directly at fromUnicodeBytes when noFromU is set
                         * (presumably because such files do not store the fromUnicodeBytes -- TODO confirm).
                         */
   1684                 mbcsTable->mbcsIndex=(const uint16_t *)
   1685                     (mbcsTable->fromUnicodeBytes+
   1686                      (noFromU ? 0 : mbcsTable->fromUBytesLength));
   1687                 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
   1688             }
   1689         }
   1690 
   1691         /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
   1692         {
   1693             uint32_t asciiRoundtrips=0xffffffff;
   1694             int32_t i;
   1695 
                    /* clear the bit for a group of 4 as soon as one byte in it does not map directly to the same code point */
   1696             for(i=0; i<0x80; ++i) {
   1697                 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
   1698                     asciiRoundtrips&=~((uint32_t)1<<(i>>2));
   1699                 }
   1700             }
   1701             mbcsTable->asciiRoundtrips=asciiRoundtrips;
   1702         }
   1703 
   1704         if(noFromU) {
                    /* stage 1 has 0x440 entries if the table covers supplementary code points, otherwise 0x40 */
   1705             uint32_t stage1Length=
   1706                 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
   1707                     0x440 : 0x40;
                    /* stored stage 2 entries: 32-bit units between the two offsets, minus the stage 1 size in 32-bit units */
   1708             uint32_t stage2Length=
   1709                 (header->offsetFromUBytes-header->offsetFromUTable)/4-
   1710                 stage1Length/2;
   1711             reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
   1712         }
   1713     }
   1714 
   1715     /* Set the impl pointer here so that it is set for both extension-only and base tables. */
            /* for utf8Friendly multi-state tables with an outputType other than MBCS_OUTPUT_2, impl is left unchanged */
   1716     if(mbcsTable->utf8Friendly) {
   1717         if(mbcsTable->countStates==1) {
   1718             sharedData->impl=&_SBCSUTF8Impl;
   1719         } else {
   1720             if(mbcsTable->outputType==MBCS_OUTPUT_2) {
   1721                 sharedData->impl=&_DBCSUTF8Impl;
   1722             }
   1723         }
   1724     }
   1725 
   1726     if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
   1727         /*
   1728          * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
   1729          * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
   1730          */
   1731         mbcsTable->asciiRoundtrips=0;
   1732     }
   1733 }
   1734 
   1735 static void
   1736 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
   1737     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1738 
   1739     if(mbcsTable->swapLFNLStateTable!=NULL) {
   1740         uprv_free(mbcsTable->swapLFNLStateTable);
   1741     }
   1742     if(mbcsTable->stateTableOwned) {
   1743         uprv_free((void *)mbcsTable->stateTable);
   1744     }
   1745     if(mbcsTable->baseSharedData!=NULL) {
   1746         ucnv_unload(mbcsTable->baseSharedData);
   1747     }
   1748     if(mbcsTable->reconstitutedData!=NULL) {
   1749         uprv_free(mbcsTable->reconstitutedData);
   1750     }
   1751 }
   1752 
   1753 static void
   1754 ucnv_MBCSOpen(UConverter *cnv,
   1755               UConverterLoadArgs *pArgs,
   1756               UErrorCode *pErrorCode) {
   1757     UConverterMBCSTable *mbcsTable;
   1758     const int32_t *extIndexes;
   1759     uint8_t outputType;
   1760     int8_t maxBytesPerUChar;
   1761 
   1762     if(pArgs->onlyTestIsLoadable) {
   1763         return;
   1764     }
   1765 
   1766     mbcsTable=&cnv->sharedData->mbcs;
   1767     outputType=mbcsTable->outputType;
   1768 
   1769     if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
   1770         /* the swaplfnl option does not apply, remove it */
   1771         cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1772     }
   1773 
   1774     if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1775         /* do this because double-checked locking is broken */
   1776         UBool isCached;
   1777 
   1778         umtx_lock(NULL);
   1779         isCached=mbcsTable->swapLFNLStateTable!=NULL;
   1780         umtx_unlock(NULL);
   1781 
   1782         if(!isCached) {
   1783             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
   1784                 if(U_FAILURE(*pErrorCode)) {
   1785                     return; /* something went wrong */
   1786                 }
   1787 
   1788                 /* the option does not apply, remove it */
   1789                 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1790             }
   1791         }
   1792     }
   1793 
   1794     if(uprv_strstr(pArgs->name, "18030")!=NULL) {
   1795         if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
   1796             /* set a flag for GB 18030 mode, which changes the callback behavior */
   1797             cnv->options|=_MBCS_OPTION_GB18030;
   1798         }
   1799     } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
   1800         /* set a flag for KEIS converter, which changes the SI/SO character sequence */
   1801         cnv->options|=_MBCS_OPTION_KEIS;
   1802     } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
   1803         /* set a flag for JEF converter, which changes the SI/SO character sequence */
   1804         cnv->options|=_MBCS_OPTION_JEF;
   1805     } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
   1806         /* set a flag for JIPS converter, which changes the SI/SO character sequence */
   1807         cnv->options|=_MBCS_OPTION_JIPS;
   1808     }
   1809 
   1810     /* fix maxBytesPerUChar depending on outputType and options etc. */
   1811     if(outputType==MBCS_OUTPUT_2_SISO) {
   1812         cnv->maxBytesPerUChar=3; /* SO+DBCS */
   1813     }
   1814 
   1815     extIndexes=mbcsTable->extIndexes;
   1816     if(extIndexes!=NULL) {
   1817         maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
   1818         if(outputType==MBCS_OUTPUT_2_SISO) {
   1819             ++maxBytesPerUChar; /* SO + multiple DBCS */
   1820         }
   1821 
   1822         if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
   1823             cnv->maxBytesPerUChar=maxBytesPerUChar;
   1824         }
   1825     }
   1826 
   1827 #if 0
   1828     /*
   1829      * documentation of UConverter fields used for status
   1830      * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
   1831      */
   1832 
   1833     /* toUnicode */
   1834     cnv->toUnicodeStatus=0;     /* offset */
   1835     cnv->mode=0;                /* state */
   1836     cnv->toULength=0;           /* byteIndex */
   1837 
   1838     /* fromUnicode */
   1839     cnv->fromUChar32=0;
   1840     cnv->fromUnicodeStatus=1;   /* prevLength */
   1841 #endif
   1842 }
   1843 
   1844 static const char *
   1845 ucnv_MBCSGetName(const UConverter *cnv) {
   1846     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
   1847         return cnv->sharedData->mbcs.swapLFNLName;
   1848     } else {
   1849         return cnv->sharedData->staticData->name;
   1850     }
   1851 }
   1852 
   1853 /* MBCS-to-Unicode conversion functions ------------------------------------- */
   1854 
   1855 static UChar32
   1856 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
   1857     const _MBCSToUFallback *toUFallbacks;
   1858     uint32_t i, start, limit;
   1859 
   1860     limit=mbcsTable->countToUFallbacks;
   1861     if(limit>0) {
   1862         /* do a binary search for the fallback mapping */
   1863         toUFallbacks=mbcsTable->toUFallbacks;
   1864         start=0;
   1865         while(start<limit-1) {
   1866             i=(start+limit)/2;
   1867             if(offset<toUFallbacks[i].offset) {
   1868                 limit=i;
   1869             } else {
   1870                 start=i;
   1871             }
   1872         }
   1873 
   1874         /* did we really find it? */
   1875         if(offset==toUFallbacks[start].offset) {
   1876             return toUFallbacks[start].codePoint;
   1877         }
   1878     }
   1879 
   1880     return 0xfffe;
   1881 }
   1882 
   1883 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
   /*
    * Converts bytes from pArgs->source to UTF-16 in pArgs->target, looking up
    * each byte in row 0 of the state table (the swapLFNL state table if the
    * converter has UCNV_OPTION_SWAP_LFNL set). Every entry is treated as final.
    *
    * Per source byte:
    *  - valid direct 16-bit mapping: one UChar is written;
    *  - valid direct 20-bit mapping (or its fallback variant, if fallbacks are
    *    enabled): a surrogate pair is written; if only the lead surrogate fits,
    *    the trail surrogate goes into cnv->UCharErrorBuffer and
    *    U_BUFFER_OVERFLOW_ERROR is set;
    *  - 16-bit fallback mapping: written like a valid mapping if fallbacks are
    *    enabled, otherwise handled like an unassigned byte;
    *  - unassigned: the byte is stored in cnv->toUBytes and _extToU() is called
    *    to try an extension mapping;
    *  - illegal: U_ILLEGAL_CHAR_FOUND is set and the loop ends;
    *  - any other (reserved) action: the byte is skipped.
    *
    * U_BUFFER_OVERFLOW_ERROR is also set when the target is full while source
    * bytes remain. If pArgs->offsets is not NULL, the source index of the byte is
    * written for each UChar that this function outputs itself.
    * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
    */
   1884 static void
   1885 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1886                                 UErrorCode *pErrorCode) {
   1887     UConverter *cnv;
   1888     const uint8_t *source, *sourceLimit;
   1889     UChar *target;
   1890     const UChar *targetLimit;
   1891     int32_t *offsets;
   1892 
   1893     const int32_t (*stateTable)[256];
   1894 
   1895     int32_t sourceIndex;
   1896 
   1897     int32_t entry;
   1898     UChar c;
   1899     uint8_t action;
   1900 
   1901     /* set up the local pointers */
   1902     cnv=pArgs->converter;
   1903     source=(const uint8_t *)pArgs->source;
   1904     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1905     target=pArgs->target;
   1906     targetLimit=pArgs->targetLimit;
   1907     offsets=pArgs->offsets;
   1908 
   1909     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1910         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   1911     } else {
   1912         stateTable=cnv->sharedData->mbcs.stateTable;
   1913     }
   1914 
   1915     /* sourceIndex=-1 if the current character began in the previous buffer */
   1916     sourceIndex=0;
   1917 
   1918     /* conversion loop */
   1919     while(source<sourceLimit) {
   1920         /*
   1921          * This following test is to see if available input would overflow the output.
   1922          * It does not catch output of more than one code unit that
   1923          * overflows as a result of a surrogate pair or callback output
   1924          * from the last source byte.
   1925          * Therefore, those situations also test for overflows and will
   1926          * then break the loop, too.
   1927          */
   1928         if(target>=targetLimit) {
   1929             /* target is full */
   1930             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1931             break;
   1932         }
   1933 
   1934         entry=stateTable[0][*source++];
   1935         /* MBCS_ENTRY_IS_FINAL(entry) */
   1936 
   1937         /* test the most common case first */
   1938         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   1939             /* output BMP code point */
   1940             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   1941             if(offsets!=NULL) {
   1942                 *offsets++=sourceIndex;
   1943             }
   1944 
   1945             /* normal end of action codes: prepare for a new character */
   1946             ++sourceIndex;
   1947             continue;
   1948         }
   1949 
   1950         /*
   1951          * An if-else-if chain provides more reliable performance for
   1952          * the most common cases compared to a switch.
   1953          */
   1954         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   1955         if(action==MBCS_STATE_VALID_DIRECT_20 ||
   1956            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   1957         ) {
   1958             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   1959             /* output surrogate pair */
                    /* the lead surrogate always fits: target<targetLimit was checked at the top of the loop */
   1960             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   1961             if(offsets!=NULL) {
   1962                 *offsets++=sourceIndex;
   1963             }
   1964             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   1965             if(target<targetLimit) {
   1966                 *target++=c;
   1967                 if(offsets!=NULL) {
   1968                     *offsets++=sourceIndex;
   1969                 }
   1970             } else {
   1971                 /* target overflow */
                        /* keep the trail surrogate in the converter's overflow buffer */
   1972                 cnv->UCharErrorBuffer[0]=c;
   1973                 cnv->UCharErrorBufferLength=1;
   1974                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1975                 break;
   1976             }
   1977 
   1978             ++sourceIndex;
   1979             continue;
   1980         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   1981             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   1982                 /* output BMP code point */
   1983                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   1984                 if(offsets!=NULL) {
   1985                     *offsets++=sourceIndex;
   1986                 }
   1987 
   1988                 ++sourceIndex;
   1989                 continue;
   1990             }
                    /* fallbacks not enabled: continue below with the extension lookup, like an unassigned byte */
   1991         } else if(action==MBCS_STATE_UNASSIGNED) {
   1992             /* just fall through */
   1993         } else if(action==MBCS_STATE_ILLEGAL) {
   1994             /* callback(illegal) */
   1995             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1996         } else {
   1997             /* reserved, must never occur */
   1998             ++sourceIndex;
   1999             continue;
   2000         }
   2001 
   2002         if(U_FAILURE(*pErrorCode)) {
   2003             /* callback(illegal) */
   2004             break;
   2005         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2006             /* try an extension mapping */
                    /* pArgs->source remembers the position after the current byte so that the advance made by _extToU() can be measured */
   2007             pArgs->source=(const char *)source;
   2008             cnv->toUBytes[0]=*(source-1);
   2009             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2010                                     1, &source, sourceLimit,
   2011                                     &target, targetLimit,
   2012                                     &offsets, sourceIndex,
   2013                                     pArgs->flush,
   2014                                     pErrorCode);
                    /* 1 for the byte read above, plus however far _extToU() moved source */
   2015             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
   2016 
   2017             if(U_FAILURE(*pErrorCode)) {
   2018                 /* not mappable or buffer overflow */
   2019                 break;
   2020             }
   2021         }
   2022     }
   2023 
   2024     /* write back the updated pointers */
   2025     pArgs->source=(const char *)source;
   2026     pArgs->target=target;
   2027     pArgs->offsets=offsets;
   2028 }
   2029 
   2030 /*
   2031  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
   2032  * that only map to and from the BMP.
   2033  * In addition to single-byte optimizations, the offset calculations
   2034  * become much easier.
   2035  */
   2036 static void
   2037 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
   2038                             UErrorCode *pErrorCode) {
   2039     UConverter *cnv;
   2040     const uint8_t *source, *sourceLimit, *lastSource;
   2041     UChar *target;
   2042     int32_t targetCapacity, length;
   2043     int32_t *offsets;
   2044 
   2045     const int32_t (*stateTable)[256];
   2046 
   2047     int32_t sourceIndex;
   2048 
   2049     int32_t entry;
   2050     uint8_t action;
   2051 
   2052     /* set up the local pointers */
   2053     cnv=pArgs->converter;
   2054     source=(const uint8_t *)pArgs->source;
   2055     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2056     target=pArgs->target;
   2057     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   2058     offsets=pArgs->offsets;
   2059 
   2060     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2061         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2062     } else {
   2063         stateTable=cnv->sharedData->mbcs.stateTable;
   2064     }
   2065 
   2066     /* sourceIndex=-1 if the current character began in the previous buffer */
   2067     sourceIndex=0;
   2068     lastSource=source;
   2069 
   2070     /*
   2071      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   2072      * for the minimum of the sourceLength and targetCapacity
   2073      */
   2074     length=(int32_t)(sourceLimit-source);
   2075     if(length<targetCapacity) {
   2076         targetCapacity=length;
   2077     }
   2078 
   2079 #if MBCS_UNROLL_SINGLE_TO_BMP
   2080     /* unrolling makes it faster on Pentium III/Windows 2000 */
   2081     /* unroll the loop with the most common case */
   2082 unrolled:
   2083     if(targetCapacity>=16) {
   2084         int32_t count, loops, oredEntries;
   2085 
   2086         loops=count=targetCapacity>>4;
   2087         do {
   2088             oredEntries=entry=stateTable[0][*source++];
   2089             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2090             oredEntries|=entry=stateTable[0][*source++];
   2091             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2092             oredEntries|=entry=stateTable[0][*source++];
   2093             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2094             oredEntries|=entry=stateTable[0][*source++];
   2095             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2096             oredEntries|=entry=stateTable[0][*source++];
   2097             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2098             oredEntries|=entry=stateTable[0][*source++];
   2099             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2100             oredEntries|=entry=stateTable[0][*source++];
   2101             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2102             oredEntries|=entry=stateTable[0][*source++];
   2103             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2104             oredEntries|=entry=stateTable[0][*source++];
   2105             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2106             oredEntries|=entry=stateTable[0][*source++];
   2107             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2108             oredEntries|=entry=stateTable[0][*source++];
   2109             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2110             oredEntries|=entry=stateTable[0][*source++];
   2111             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2112             oredEntries|=entry=stateTable[0][*source++];
   2113             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2114             oredEntries|=entry=stateTable[0][*source++];
   2115             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2116             oredEntries|=entry=stateTable[0][*source++];
   2117             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2118             oredEntries|=entry=stateTable[0][*source++];
   2119             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2120 
   2121             /* were all 16 entries really valid? */
   2122             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
   2123                 /* no, return to the first of these 16 */
   2124                 source-=16;
   2125                 target-=16;
   2126                 break;
   2127             }
   2128         } while(--count>0);
   2129         count=loops-count;
   2130         targetCapacity-=16*count;
   2131 
   2132         if(offsets!=NULL) {
   2133             lastSource+=16*count;
   2134             while(count>0) {
   2135                 *offsets++=sourceIndex++;
   2136                 *offsets++=sourceIndex++;
   2137                 *offsets++=sourceIndex++;
   2138                 *offsets++=sourceIndex++;
   2139                 *offsets++=sourceIndex++;
   2140                 *offsets++=sourceIndex++;
   2141                 *offsets++=sourceIndex++;
   2142                 *offsets++=sourceIndex++;
   2143                 *offsets++=sourceIndex++;
   2144                 *offsets++=sourceIndex++;
   2145                 *offsets++=sourceIndex++;
   2146                 *offsets++=sourceIndex++;
   2147                 *offsets++=sourceIndex++;
   2148                 *offsets++=sourceIndex++;
   2149                 *offsets++=sourceIndex++;
   2150                 *offsets++=sourceIndex++;
   2151                 --count;
   2152             }
   2153         }
   2154     }
   2155 #endif
   2156 
   2157     /* conversion loop */
   2158     while(targetCapacity > 0 && source < sourceLimit) {
   2159         entry=stateTable[0][*source++];
   2160         /* MBCS_ENTRY_IS_FINAL(entry) */
   2161 
   2162         /* test the most common case first */
   2163         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2164             /* output BMP code point */
   2165             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2166             --targetCapacity;
   2167             continue;
   2168         }
   2169 
   2170         /*
   2171          * An if-else-if chain provides more reliable performance for
   2172          * the most common cases compared to a switch.
   2173          */
   2174         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2175         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2176             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2177                 /* output BMP code point */
   2178                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2179                 --targetCapacity;
   2180                 continue;
   2181             }
   2182         } else if(action==MBCS_STATE_UNASSIGNED) {
   2183             /* just fall through */
   2184         } else if(action==MBCS_STATE_ILLEGAL) {
   2185             /* callback(illegal) */
   2186             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2187         } else {
   2188             /* reserved, must never occur */
   2189             continue;
   2190         }
   2191 
   2192         /* set offsets since the start or the last extension */
   2193         if(offsets!=NULL) {
   2194             int32_t count=(int32_t)(source-lastSource);
   2195 
   2196             /* predecrement: do not set the offset for the callback-causing character */
   2197             while(--count>0) {
   2198                 *offsets++=sourceIndex++;
   2199             }
   2200             /* offset and sourceIndex are now set for the current character */
   2201         }
   2202 
   2203         if(U_FAILURE(*pErrorCode)) {
   2204             /* callback(illegal) */
   2205             break;
   2206         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2207             /* try an extension mapping */
   2208             lastSource=source;
   2209             cnv->toUBytes[0]=*(source-1);
   2210             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2211                                     1, &source, sourceLimit,
   2212                                     &target, pArgs->targetLimit,
   2213                                     &offsets, sourceIndex,
   2214                                     pArgs->flush,
   2215                                     pErrorCode);
   2216             sourceIndex+=1+(int32_t)(source-lastSource);
   2217 
   2218             if(U_FAILURE(*pErrorCode)) {
   2219                 /* not mappable or buffer overflow */
   2220                 break;
   2221             }
   2222 
   2223             /* recalculate the targetCapacity after an extension mapping */
   2224             targetCapacity=(int32_t)(pArgs->targetLimit-target);
   2225             length=(int32_t)(sourceLimit-source);
   2226             if(length<targetCapacity) {
   2227                 targetCapacity=length;
   2228             }
   2229         }
   2230 
   2231 #if MBCS_UNROLL_SINGLE_TO_BMP
   2232         /* unrolling makes it faster on Pentium III/Windows 2000 */
   2233         goto unrolled;
   2234 #endif
   2235     }
   2236 
   2237     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
   2238         /* target is full */
   2239         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2240     }
   2241 
   2242     /* set offsets since the start or the last callback */
   2243     if(offsets!=NULL) {
   2244         size_t count=source-lastSource;
   2245         while(count>0) {
   2246             *offsets++=sourceIndex++;
   2247             --count;
   2248         }
   2249     }
   2250 
   2251     /* write back the updated pointers */
   2252     pArgs->source=(const char *)source;
   2253     pArgs->target=target;
   2254     pArgs->offsets=offsets;
   2255 }
   2256 
   2257 static UBool
   2258 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
   2259     const int32_t *row=stateTable[state];
   2260     int32_t b, entry;
   2261     /* First test for final entries in this state for some commonly valid byte values. */
   2262     entry=row[0xa1];
   2263     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2264         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2265     ) {
   2266         return TRUE;
   2267     }
   2268     entry=row[0x41];
   2269     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2270         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2271     ) {
   2272         return TRUE;
   2273     }
   2274     /* Then test for final entries in this state. */
   2275     for(b=0; b<=0xff; ++b) {
   2276         entry=row[b];
   2277         if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2278             MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2279         ) {
   2280             return TRUE;
   2281         }
   2282     }
   2283     /* Then recurse for transition entries. */
   2284     for(b=0; b<=0xff; ++b) {
   2285         entry=row[b];
   2286         if( MBCS_ENTRY_IS_TRANSITION(entry) &&
   2287             hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
   2288         ) {
   2289             return TRUE;
   2290         }
   2291     }
   2292     return FALSE;
   2293 }
   2294 
   2295 /*
   2296  * Is byte b a single/lead byte in this state?
   2297  * Recurse for transition states, because here we don't want to say that
   2298  * b is a lead byte if all byte sequences that start with b are illegal.
   2299  */
   2300 static UBool
   2301 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
   2302     const int32_t *row=stateTable[state];
   2303     int32_t entry=row[b];
   2304     if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
   2305         return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
   2306     } else {
   2307         uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2308         if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
   2309             return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
   2310         } else {
   2311             return action!=MBCS_STATE_ILLEGAL;
   2312         }
   2313     }
   2314 }
   2315 
   2316 U_CFUNC void
   2317 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2318                           UErrorCode *pErrorCode) {
   2319     UConverter *cnv;
   2320     const uint8_t *source, *sourceLimit;
   2321     UChar *target;
   2322     const UChar *targetLimit;
   2323     int32_t *offsets;
   2324 
   2325     const int32_t (*stateTable)[256];
   2326     const uint16_t *unicodeCodeUnits;
   2327 
   2328     uint32_t offset;
   2329     uint8_t state;
   2330     int8_t byteIndex;
   2331     uint8_t *bytes;
   2332 
   2333     int32_t sourceIndex, nextSourceIndex;
   2334 
   2335     int32_t entry;
   2336     UChar c;
   2337     uint8_t action;
   2338 
   2339     /* use optimized function if possible */
   2340     cnv=pArgs->converter;
   2341 
   2342     if(cnv->preToULength>0) {
   2343         /*
   2344          * pass sourceIndex=-1 because we continue from an earlier buffer
   2345          * in the future, this may change with continuous offsets
   2346          */
   2347         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
   2348 
   2349         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
   2350             return;
   2351         }
   2352     }
   2353 
   2354     if(cnv->sharedData->mbcs.countStates==1) {
   2355         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   2356             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
   2357         } else {
   2358             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
   2359         }
   2360         return;
   2361     }
   2362 
   2363     /* set up the local pointers */
   2364     source=(const uint8_t *)pArgs->source;
   2365     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2366     target=pArgs->target;
   2367     targetLimit=pArgs->targetLimit;
   2368     offsets=pArgs->offsets;
   2369 
   2370     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2371         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2372     } else {
   2373         stateTable=cnv->sharedData->mbcs.stateTable;
   2374     }
   2375     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2376 
   2377     /* get the converter state from UConverter */
   2378     offset=cnv->toUnicodeStatus;
   2379     byteIndex=cnv->toULength;
   2380     bytes=cnv->toUBytes;
   2381 
   2382     /*
   2383      * if we are in the SBCS state for a DBCS-only converter,
   2384      * then load the DBCS state from the MBCS data
   2385      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2386      */
   2387     if((state=(uint8_t)(cnv->mode))==0) {
   2388         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2389     }
   2390 
   2391     /* sourceIndex=-1 if the current character began in the previous buffer */
   2392     sourceIndex=byteIndex==0 ? 0 : -1;
   2393     nextSourceIndex=0;
   2394 
   2395     /* conversion loop */
   2396     while(source<sourceLimit) {
   2397         /*
   2398          * This following test is to see if available input would overflow the output.
   2399          * It does not catch output of more than one code unit that
   2400          * overflows as a result of a surrogate pair or callback output
   2401          * from the last source byte.
   2402          * Therefore, those situations also test for overflows and will
   2403          * then break the loop, too.
   2404          */
   2405         if(target>=targetLimit) {
   2406             /* target is full */
   2407             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2408             break;
   2409         }
   2410 
   2411         if(byteIndex==0) {
   2412             /* optimized loop for 1/2-byte input and BMP output */
   2413             if(offsets==NULL) {
   2414                 do {
   2415                     entry=stateTable[state][*source];
   2416                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2417                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2418                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2419 
   2420                         ++source;
   2421                         if( source<sourceLimit &&
   2422                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2423                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2424                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2425                         ) {
   2426                             ++source;
   2427                             *target++=c;
   2428                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2429                             offset=0;
   2430                         } else {
   2431                             /* set the state and leave the optimized loop */
   2432                             bytes[0]=*(source-1);
   2433                             byteIndex=1;
   2434                             break;
   2435                         }
   2436                     } else {
   2437                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2438                             /* output BMP code point */
   2439                             ++source;
   2440                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2441                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2442                         } else {
   2443                             /* leave the optimized loop */
   2444                             break;
   2445                         }
   2446                     }
   2447                 } while(source<sourceLimit && target<targetLimit);
   2448             } else /* offsets!=NULL */ {
   2449                 do {
   2450                     entry=stateTable[state][*source];
   2451                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2452                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2453                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2454 
   2455                         ++source;
   2456                         if( source<sourceLimit &&
   2457                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2458                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2459                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2460                         ) {
   2461                             ++source;
   2462                             *target++=c;
   2463                             if(offsets!=NULL) {
   2464                                 *offsets++=sourceIndex;
   2465                                 sourceIndex=(nextSourceIndex+=2);
   2466                             }
   2467                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2468                             offset=0;
   2469                         } else {
   2470                             /* set the state and leave the optimized loop */
   2471                             ++nextSourceIndex;
   2472                             bytes[0]=*(source-1);
   2473                             byteIndex=1;
   2474                             break;
   2475                         }
   2476                     } else {
   2477                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2478                             /* output BMP code point */
   2479                             ++source;
   2480                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2481                             if(offsets!=NULL) {
   2482                                 *offsets++=sourceIndex;
   2483                                 sourceIndex=++nextSourceIndex;
   2484                             }
   2485                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2486                         } else {
   2487                             /* leave the optimized loop */
   2488                             break;
   2489                         }
   2490                     }
   2491                 } while(source<sourceLimit && target<targetLimit);
   2492             }
   2493 
   2494             /*
   2495              * these tests and break statements could be put inside the loop
   2496              * if C had "break outerLoop" like Java
   2497              */
   2498             if(source>=sourceLimit) {
   2499                 break;
   2500             }
   2501             if(target>=targetLimit) {
   2502                 /* target is full */
   2503                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2504                 break;
   2505             }
   2506 
   2507             ++nextSourceIndex;
   2508             bytes[byteIndex++]=*source++;
   2509         } else /* byteIndex>0 */ {
   2510             ++nextSourceIndex;
   2511             entry=stateTable[state][bytes[byteIndex++]=*source++];
   2512         }
   2513 
   2514         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2515             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2516             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2517             continue;
   2518         }
   2519 
   2520         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2521         cnv->mode=state;
   2522 
   2523         /* set the next state early so that we can reuse the entry variable */
   2524         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2525 
   2526         /*
   2527          * An if-else-if chain provides more reliable performance for
   2528          * the most common cases compared to a switch.
   2529          */
   2530         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2531         if(action==MBCS_STATE_VALID_16) {
   2532             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2533             c=unicodeCodeUnits[offset];
   2534             if(c<0xfffe) {
   2535                 /* output BMP code point */
   2536                 *target++=c;
   2537                 if(offsets!=NULL) {
   2538                     *offsets++=sourceIndex;
   2539                 }
   2540                 byteIndex=0;
   2541             } else if(c==0xfffe) {
   2542                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2543                     /* output fallback BMP code point */
   2544                     *target++=(UChar)entry;
   2545                     if(offsets!=NULL) {
   2546                         *offsets++=sourceIndex;
   2547                     }
   2548                     byteIndex=0;
   2549                 }
   2550             } else {
   2551                 /* callback(illegal) */
   2552                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2553             }
   2554         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   2555             /* output BMP code point */
   2556             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2557             if(offsets!=NULL) {
   2558                 *offsets++=sourceIndex;
   2559             }
   2560             byteIndex=0;
   2561         } else if(action==MBCS_STATE_VALID_16_PAIR) {
   2562             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2563             c=unicodeCodeUnits[offset++];
   2564             if(c<0xd800) {
   2565                 /* output BMP code point below 0xd800 */
   2566                 *target++=c;
   2567                 if(offsets!=NULL) {
   2568                     *offsets++=sourceIndex;
   2569                 }
   2570                 byteIndex=0;
   2571             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2572                 /* output roundtrip or fallback surrogate pair */
   2573                 *target++=(UChar)(c&0xdbff);
   2574                 if(offsets!=NULL) {
   2575                     *offsets++=sourceIndex;
   2576                 }
   2577                 byteIndex=0;
   2578                 if(target<targetLimit) {
   2579                     *target++=unicodeCodeUnits[offset];
   2580                     if(offsets!=NULL) {
   2581                         *offsets++=sourceIndex;
   2582                     }
   2583                 } else {
   2584                     /* target overflow */
   2585                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
   2586                     cnv->UCharErrorBufferLength=1;
   2587                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2588 
   2589                     offset=0;
   2590                     break;
   2591                 }
   2592             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2593                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2594                 *target++=unicodeCodeUnits[offset];
   2595                 if(offsets!=NULL) {
   2596                     *offsets++=sourceIndex;
   2597                 }
   2598                 byteIndex=0;
   2599             } else if(c==0xffff) {
   2600                 /* callback(illegal) */
   2601                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2602             }
   2603         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2604                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2605         ) {
   2606             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2607             /* output surrogate pair */
   2608             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2609             if(offsets!=NULL) {
   2610                 *offsets++=sourceIndex;
   2611             }
   2612             byteIndex=0;
   2613             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2614             if(target<targetLimit) {
   2615                 *target++=c;
   2616                 if(offsets!=NULL) {
   2617                     *offsets++=sourceIndex;
   2618                 }
   2619             } else {
   2620                 /* target overflow */
   2621                 cnv->UCharErrorBuffer[0]=c;
   2622                 cnv->UCharErrorBufferLength=1;
   2623                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2624 
   2625                 offset=0;
   2626                 break;
   2627             }
   2628         } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2629             /*
   2630              * This serves as a state change without any output.
   2631              * It is useful for reading simple stateful encodings,
   2632              * for example using just Shift-In/Shift-Out codes.
   2633              * The 21 unused bits may later be used for more sophisticated
   2634              * state transitions.
   2635              */
   2636             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
   2637                 byteIndex=0;
   2638             } else {
   2639                 /* SI/SO are illegal for DBCS-only conversion */
   2640                 state=(uint8_t)(cnv->mode); /* restore the previous state */
   2641 
   2642                 /* callback(illegal) */
   2643                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2644             }
   2645         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2646             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2647                 /* output BMP code point */
   2648                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2649                 if(offsets!=NULL) {
   2650                     *offsets++=sourceIndex;
   2651                 }
   2652                 byteIndex=0;
   2653             }
   2654         } else if(action==MBCS_STATE_UNASSIGNED) {
   2655             /* just fall through */
   2656         } else if(action==MBCS_STATE_ILLEGAL) {
   2657             /* callback(illegal) */
   2658             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2659         } else {
   2660             /* reserved, must never occur */
   2661             byteIndex=0;
   2662         }
   2663 
   2664         /* end of action codes: prepare for a new character */
   2665         offset=0;
   2666 
   2667         if(byteIndex==0) {
   2668             sourceIndex=nextSourceIndex;
   2669         } else if(U_FAILURE(*pErrorCode)) {
   2670             /* callback(illegal) */
   2671             if(byteIndex>1) {
   2672                 /*
   2673                  * Ticket 5691: consistent illegal sequences:
   2674                  * - We include at least the first byte in the illegal sequence.
   2675                  * - If any of the non-initial bytes could be the start of a character,
   2676                  *   we stop the illegal sequence before the first one of those.
   2677                  */
   2678                 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   2679                 int8_t i;
   2680                 for(i=1;
   2681                     i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
   2682                     ++i) {}
   2683                 if(i<byteIndex) {
   2684                     /* Back out some bytes. */
   2685                     int8_t backOutDistance=byteIndex-i;
   2686                     int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
   2687                     byteIndex=i;  /* length of reported illegal byte sequence */
   2688                     if(backOutDistance<=bytesFromThisBuffer) {
   2689                         source-=backOutDistance;
   2690                     } else {
   2691                         /* Back out bytes from the previous buffer: Need to replay them. */
   2692                         cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   2693                         /* preToULength is negative! */
   2694                         uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
   2695                         source=(const uint8_t *)pArgs->source;
   2696                     }
   2697                 }
   2698             }
   2699             break;
   2700         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2701             /* try an extension mapping */
   2702             pArgs->source=(const char *)source;
   2703             byteIndex=_extToU(cnv, cnv->sharedData,
   2704                               byteIndex, &source, sourceLimit,
   2705                               &target, targetLimit,
   2706                               &offsets, sourceIndex,
   2707                               pArgs->flush,
   2708                               pErrorCode);
   2709             sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
   2710 
   2711             if(U_FAILURE(*pErrorCode)) {
   2712                 /* not mappable or buffer overflow */
   2713                 break;
   2714             }
   2715         }
   2716     }
   2717 
   2718     /* set the converter state back into UConverter */
   2719     cnv->toUnicodeStatus=offset;
   2720     cnv->mode=state;
   2721     cnv->toULength=byteIndex;
   2722 
   2723     /* write back the updated pointers */
   2724     pArgs->source=(const char *)source;
   2725     pArgs->target=target;
   2726     pArgs->offsets=offsets;
   2727 }
   2728 
   2729 /*
   2730  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
   2731  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
   2732  */
   2733 static UChar32
   2734 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2735                         UErrorCode *pErrorCode) {
   2736     UConverter *cnv;
   2737     const int32_t (*stateTable)[256];
   2738     const uint8_t *source, *sourceLimit;
   2739 
   2740     int32_t entry;
   2741     uint8_t action;
   2742 
   2743     /* set up the local pointers */
   2744     cnv=pArgs->converter;
   2745     source=(const uint8_t *)pArgs->source;
   2746     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2747     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2748         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2749     } else {
   2750         stateTable=cnv->sharedData->mbcs.stateTable;
   2751     }
   2752 
   2753     /* conversion loop */
   2754     while(source<sourceLimit) {
   2755         entry=stateTable[0][*source++];
   2756         /* MBCS_ENTRY_IS_FINAL(entry) */
   2757 
   2758         /* write back the updated pointer early so that we can return directly */
   2759         pArgs->source=(const char *)source;
   2760 
   2761         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2762             /* output BMP code point */
   2763             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2764         }
   2765 
   2766         /*
   2767          * An if-else-if chain provides more reliable performance for
   2768          * the most common cases compared to a switch.
   2769          */
   2770         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2771         if( action==MBCS_STATE_VALID_DIRECT_20 ||
   2772             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2773         ) {
   2774             /* output supplementary code point */
   2775             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2776         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2777             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2778                 /* output BMP code point */
   2779                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2780             }
   2781         } else if(action==MBCS_STATE_UNASSIGNED) {
   2782             /* just fall through */
   2783         } else if(action==MBCS_STATE_ILLEGAL) {
   2784             /* callback(illegal) */
   2785             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2786         } else {
   2787             /* reserved, must never occur */
   2788             continue;
   2789         }
   2790 
   2791         if(U_FAILURE(*pErrorCode)) {
   2792             /* callback(illegal) */
   2793             break;
   2794         } else /* unassigned sequence */ {
   2795             /* defer to the generic implementation */
   2796             pArgs->source=(const char *)source-1;
   2797             return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2798         }
   2799     }
   2800 
   2801     /* no output because of empty input or only state changes */
   2802     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2803     return 0xffff;
   2804 }
   2805 
   2806 /*
   2807  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
   2808  * conversion without offset handling.
   2809  *
   2810  * When a character does not have a mapping to Unicode, then we return to the
   2811  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
   2812  * handling.
   2813  * We also defer to the generic code in other complicated cases and have them
   2814  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
   2815  *
   2816  * All normal mappings and errors are handled here.
   2817  */
   2818 static UChar32
   2819 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2820                   UErrorCode *pErrorCode) {
   2821     UConverter *cnv;
   2822     const uint8_t *source, *sourceLimit, *lastSource;
   2823 
   2824     const int32_t (*stateTable)[256];
   2825     const uint16_t *unicodeCodeUnits;
   2826 
   2827     uint32_t offset;
   2828     uint8_t state;
   2829 
   2830     int32_t entry;
   2831     UChar32 c;
   2832     uint8_t action;
   2833 
   2834     /* use optimized function if possible */
   2835     cnv=pArgs->converter;
   2836 
   2837     if(cnv->preToULength>0) {
   2838         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
   2839         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2840     }
   2841 
   2842     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
   2843         /*
   2844          * Using the generic ucnv_getNextUChar() code lets us deal correctly
   2845          * with the rare case of a codepage that maps single surrogates
   2846          * without adding the complexity to this already complicated function here.
   2847          */
   2848         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2849     } else if(cnv->sharedData->mbcs.countStates==1) {
   2850         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
   2851     }
   2852 
   2853     /* set up the local pointers */
   2854     source=lastSource=(const uint8_t *)pArgs->source;
   2855     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2856 
   2857     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2858         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2859     } else {
   2860         stateTable=cnv->sharedData->mbcs.stateTable;
   2861     }
   2862     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2863 
   2864     /* get the converter state from UConverter */
   2865     offset=cnv->toUnicodeStatus;
   2866 
   2867     /*
   2868      * if we are in the SBCS state for a DBCS-only converter,
   2869      * then load the DBCS state from the MBCS data
   2870      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2871      */
   2872     if((state=(uint8_t)(cnv->mode))==0) {
   2873         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2874     }
   2875 
   2876     /* conversion loop */
   2877     c=U_SENTINEL;
   2878     while(source<sourceLimit) {
   2879         entry=stateTable[state][*source++];
   2880         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2881             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2882             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2883 
   2884             /* optimization for 1/2-byte input and BMP output */
   2885             if( source<sourceLimit &&
   2886                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2887                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2888                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2889             ) {
   2890                 ++source;
   2891                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2892                 /* output BMP code point */
   2893                 break;
   2894             }
   2895         } else {
   2896             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2897             cnv->mode=state;
   2898 
   2899             /* set the next state early so that we can reuse the entry variable */
   2900             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2901 
   2902             /*
   2903              * An if-else-if chain provides more reliable performance for
   2904              * the most common cases compared to a switch.
   2905              */
   2906             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2907             if(action==MBCS_STATE_VALID_DIRECT_16) {
   2908                 /* output BMP code point */
   2909                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2910                 break;
   2911             } else if(action==MBCS_STATE_VALID_16) {
   2912                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2913                 c=unicodeCodeUnits[offset];
   2914                 if(c<0xfffe) {
   2915                     /* output BMP code point */
   2916                     break;
   2917                 } else if(c==0xfffe) {
   2918                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2919                         break;
   2920                     }
   2921                 } else {
   2922                     /* callback(illegal) */
   2923                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2924                 }
   2925             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   2926                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2927                 c=unicodeCodeUnits[offset++];
   2928                 if(c<0xd800) {
   2929                     /* output BMP code point below 0xd800 */
   2930                     break;
   2931                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2932                     /* output roundtrip or fallback supplementary code point */
   2933                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
   2934                     break;
   2935                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2936                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2937                     c=unicodeCodeUnits[offset];
   2938                     break;
   2939                 } else if(c==0xffff) {
   2940                     /* callback(illegal) */
   2941                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2942                 }
   2943             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2944                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2945             ) {
   2946                 /* output supplementary code point */
   2947                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2948                 break;
   2949             } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2950                 /*
   2951                  * This serves as a state change without any output.
   2952                  * It is useful for reading simple stateful encodings,
   2953                  * for example using just Shift-In/Shift-Out codes.
   2954                  * The 21 unused bits may later be used for more sophisticated
   2955                  * state transitions.
   2956                  */
   2957                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
   2958                     /* SI/SO are illegal for DBCS-only conversion */
   2959                     state=(uint8_t)(cnv->mode); /* restore the previous state */
   2960 
   2961                     /* callback(illegal) */
   2962                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2963                 }
   2964             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2965                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2966                     /* output BMP code point */
   2967                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2968                     break;
   2969                 }
   2970             } else if(action==MBCS_STATE_UNASSIGNED) {
   2971                 /* just fall through */
   2972             } else if(action==MBCS_STATE_ILLEGAL) {
   2973                 /* callback(illegal) */
   2974                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2975             } else {
   2976                 /* reserved (must never occur), or only state change */
   2977                 offset=0;
   2978                 lastSource=source;
   2979                 continue;
   2980             }
   2981 
   2982             /* end of action codes: prepare for a new character */
   2983             offset=0;
   2984 
   2985             if(U_FAILURE(*pErrorCode)) {
   2986                 /* callback(illegal) */
   2987                 break;
   2988             } else /* unassigned sequence */ {
   2989                 /* defer to the generic implementation */
   2990                 cnv->toUnicodeStatus=0;
   2991                 cnv->mode=state;
   2992                 pArgs->source=(const char *)lastSource;
   2993                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2994             }
   2995         }
   2996     }
   2997 
   2998     if(c<0) {
   2999         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
   3000             /* incomplete character byte sequence */
   3001             uint8_t *bytes=cnv->toUBytes;
   3002             cnv->toULength=(int8_t)(source-lastSource);
   3003             do {
   3004                 *bytes++=*lastSource++;
   3005             } while(lastSource<source);
   3006             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3007         } else if(U_FAILURE(*pErrorCode)) {
   3008             /* callback(illegal) */
   3009             /*
   3010              * Ticket 5691: consistent illegal sequences:
   3011              * - We include at least the first byte in the illegal sequence.
   3012              * - If any of the non-initial bytes could be the start of a character,
   3013              *   we stop the illegal sequence before the first one of those.
   3014              */
   3015             UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   3016             uint8_t *bytes=cnv->toUBytes;
   3017             *bytes++=*lastSource++;     /* first byte */
   3018             if(lastSource==source) {
   3019                 cnv->toULength=1;
   3020             } else /* lastSource<source: multi-byte character */ {
   3021                 int8_t i;
   3022                 for(i=1;
   3023                     lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
   3024                     ++i
   3025                 ) {
   3026                     *bytes++=*lastSource++;
   3027                 }
   3028                 cnv->toULength=i;
   3029                 source=lastSource;
   3030             }
   3031         } else {
   3032             /* no output because of empty input or only state changes */
   3033             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   3034         }
   3035         c=0xffff;
   3036     }
   3037 
   3038     /* set the converter state back into UConverter, ready for a new character */
   3039     cnv->toUnicodeStatus=0;
   3040     cnv->mode=state;
   3041 
   3042     /* write back the updated pointer */
   3043     pArgs->source=(const char *)source;
   3044     return c;
   3045 }
   3046 
   #if 0
   /*
    * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
    * Removal improves code coverage.
    */
   /**
    * Single-byte, single-state variant of ucnv_MBCSSimpleGetNextUChar():
    * looks up byte b in state 0 of the state table and returns its mapping.
    * It does not handle the EBCDIC swaplfnl option (set in UConverter).
    * It does not handle conversion extensions (_extToU()).
    *
    * Return value:
    * U+fffe   unassigned, or a fallback mapping while fallbacks are not used
    * U+ffff   illegal or reserved action code
    * otherwise the Unicode code point
    */
   U_CFUNC UChar32
   ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                                 uint8_t b, UBool useFallback) {
       const int32_t finalEntry=sharedData->mbcs.stateTable[0][b];
       uint8_t finalAction;
       /* every entry is final here: MBCS_ENTRY_IS_FINAL(finalEntry) */

       /* most common case first: roundtrip BMP code point */
       if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(finalEntry)) {
           return (UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry);
       }

       /* a chain of tests rather than a switch, for predictable performance */
       finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(finalEntry));

       if(finalAction==MBCS_STATE_VALID_DIRECT_20) {
           /* roundtrip supplementary code point */
           return 0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry);
       }

       if(finalAction==MBCS_STATE_FALLBACK_DIRECT_16) {
           if(TO_U_USE_FALLBACK(useFallback)) {
               /* fallback BMP code point */
               return (UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry);
           }
           return 0xfffe;
       }

       if(finalAction==MBCS_STATE_FALLBACK_DIRECT_20) {
           if(TO_U_USE_FALLBACK(useFallback)) {
               /* fallback supplementary code point */
               return 0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry);
           }
           return 0xfffe;
       }

       if(finalAction==MBCS_STATE_UNASSIGNED) {
           return 0xfffe;
       }

       /* MBCS_STATE_ILLEGAL, or a reserved action code which must never occur */
       return 0xffff;
   }
   #endif
   3101 
   3102 /*
   3103  * This is a simple version of _MBCSGetNextUChar() that is used
   3104  * by other converter implementations.
   3105  * It only returns an "assigned" result if it consumes the entire input.
   3106  * It does not use state from the converter, nor error codes.
   3107  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3108  * It handles conversion extensions but not GB 18030.
   3109  *
   3110  * Return value:
   3111  * U+fffe   unassigned
   3112  * U+ffff   illegal
   3113  * otherwise the Unicode code point
   3114  */
   3115 U_CFUNC UChar32
   3116 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
   3117                         const char *source, int32_t length,
   3118                         UBool useFallback) {
   3119     const int32_t (*stateTable)[256];
   3120     const uint16_t *unicodeCodeUnits;
   3121 
   3122     uint32_t offset;
   3123     uint8_t state, action;
   3124 
   3125     UChar32 c;
   3126     int32_t i, entry;
   3127 
   3128     if(length<=0) {
   3129         /* no input at all: "illegal" */
   3130         return 0xffff;
   3131     }
   3132 
   3133 #if 0
   3134 /*
   3135  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3136  * TODO In future releases, verify that this function is never called for SBCS
   3137  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
   3138  * Removal improves code coverage.
   3139  */
   3140     /* use optimized function if possible */
   3141     if(sharedData->mbcs.countStates==1) {
   3142         if(length==1) {
   3143             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
   3144         } else {
   3145             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
   3146         }
   3147     }
   3148 #endif
   3149 
   3150     /* set up the local pointers */
   3151     stateTable=sharedData->mbcs.stateTable;
   3152     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
   3153 
   3154     /* converter state */
   3155     offset=0;
   3156     state=sharedData->mbcs.dbcsOnlyState;
   3157 
   3158     /* conversion loop */
   3159     for(i=0;;) {
   3160         entry=stateTable[state][(uint8_t)source[i++]];
   3161         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3162             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3163             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3164 
   3165             if(i==length) {
   3166                 return 0xffff; /* truncated character */
   3167             }
   3168         } else {
   3169             /*
   3170              * An if-else-if chain provides more reliable performance for
   3171              * the most common cases compared to a switch.
   3172              */
   3173             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3174             if(action==MBCS_STATE_VALID_16) {
   3175                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3176                 c=unicodeCodeUnits[offset];
   3177                 if(c!=0xfffe) {
   3178                     /* done */
   3179                 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   3180                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
   3181                 /* else done with 0xfffe */
   3182                 }
   3183                 break;
   3184             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   3185                 /* output BMP code point */
   3186                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3187                 break;
   3188             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3189                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3190                 c=unicodeCodeUnits[offset++];
   3191                 if(c<0xd800) {
   3192                     /* output BMP code point below 0xd800 */
   3193                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   3194                     /* output roundtrip or fallback supplementary code point */
   3195                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
   3196                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3197                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3198                     c=unicodeCodeUnits[offset];
   3199                 } else if(c==0xffff) {
   3200                     return 0xffff;
   3201                 } else {
   3202                     c=0xfffe;
   3203                 }
   3204                 break;
   3205             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
   3206                 /* output supplementary code point */
   3207                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3208                 break;
   3209             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3210                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3211                     c=0xfffe;
   3212                     break;
   3213                 }
   3214                 /* output BMP code point */
   3215                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3216                 break;
   3217             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3218                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3219                     c=0xfffe;
   3220                     break;
   3221                 }
   3222                 /* output supplementary code point */
   3223                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3224                 break;
   3225             } else if(action==MBCS_STATE_UNASSIGNED) {
   3226                 c=0xfffe;
   3227                 break;
   3228             }
   3229 
   3230             /*
   3231              * forbid MBCS_STATE_CHANGE_ONLY for this function,
   3232              * and MBCS_STATE_ILLEGAL and reserved action codes
   3233              */
   3234             return 0xffff;
   3235         }
   3236     }
   3237 
   3238     if(i!=length) {
   3239         /* illegal for this function: not all input consumed */
   3240         return 0xffff;
   3241     }
   3242 
   3243     if(c==0xfffe) {
   3244         /* try an extension mapping */
   3245         const int32_t *cx=sharedData->mbcs.extIndexes;
   3246         if(cx!=NULL) {
   3247             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
   3248         }
   3249     }
   3250 
   3251     return c;
   3252 }
   3253 
   3254 /* MBCS-from-Unicode conversion functions ----------------------------------- */
   3255 
   3256 /*
         * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages.
         *
         * It converts UTF-16 input from pArgs->source into codepage bytes at pArgs->target,
         * writing one or two bytes per code point, and fills pArgs->offsets if it is not NULL.
         * The code point that is still being processed when the function returns
         * (for example a lead surrogate at the end of the input) is kept in
         * cnv->fromUChar32 and picked up again by the next call.
         * Code points without a usable mapping in the table are passed to _extFromU().
         *
         * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
         * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate,
         * to U_BUFFER_OVERFLOW_ERROR when the target is full,
         * or to whatever failure _extFromU() reports.
         */
   3257 static void
   3258 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3259                                   UErrorCode *pErrorCode) {
   3260     UConverter *cnv;
   3261     const UChar *source, *sourceLimit;
   3262     uint8_t *target;
   3263     int32_t targetCapacity;
   3264     int32_t *offsets;
   3265 
   3266     const uint16_t *table;
   3267     const uint16_t *mbcsIndex;
   3268     const uint8_t *bytes;
   3269 
   3270     UChar32 c;
   3271 
   3272     int32_t sourceIndex, nextSourceIndex;
   3273 
   3274     uint32_t stage2Entry;
   3275     uint32_t asciiRoundtrips;
   3276     uint32_t value;
   3277     uint8_t unicodeMask;
   3278 
   3279     /* get the converter and the flags that describe which code points its table covers */
   3280     cnv=pArgs->converter;
   3281     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3282 
   3283     /* set up the local pointers */
   3284     source=pArgs->source;
   3285     sourceLimit=pArgs->sourceLimit;
   3286     target=(uint8_t *)pArgs->target;
   3287     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3288     offsets=pArgs->offsets;
   3289 
   3290     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3291     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3292     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3293         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3294     } else {
   3295         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   3296     }
   3297     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3298 
   3299     /* get the converter state from UConverter */
   3300     c=cnv->fromUChar32;
   3301 
   3302     /* sourceIndex=-1 if the current character began in the previous buffer */
   3303     sourceIndex= c==0 ? 0 : -1;
   3304     nextSourceIndex=0;
   3305 
   3306     /* conversion loop */
            /*
             * c!=0: a code unit is pending from the previous call.
             * Jump into the loop body at the point where the trail surrogate is read.
             * Without target space, the loop below reports the overflow instead.
             */
   3307     if(c!=0 && targetCapacity>0) {
   3308         goto getTrail;
   3309     }
   3310 
   3311     while(source<sourceLimit) {
   3312         /*
   3313          * This following test is to see if available input would overflow the output.
   3314          * It does not catch output of more than one byte that
   3315          * overflows as a result of a multi-byte character or callback output
   3316          * from the last source character.
   3317          * Therefore, those situations also test for overflows and will
   3318          * then break the loop, too.
   3319          */
   3320         if(targetCapacity>0) {
   3321             /*
   3322              * Get a correct Unicode code point:
   3323              * a single UChar for a BMP code point or
   3324              * a matched surrogate pair for a "supplementary code point".
   3325              */
   3326             c=*source++;
   3327             ++nextSourceIndex;
                    /* single-byte output: c itself is written as the byte if IS_ASCII_ROUNDTRIP() accepts it */
   3328             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3329                 *target++=(uint8_t)c;
   3330                 if(offsets!=NULL) {
   3331                     *offsets++=sourceIndex;
   3332                     sourceIndex=nextSourceIndex;
   3333                 }
   3334                 --targetCapacity;
   3335                 c=0;
   3336                 continue;
   3337             }
   3338             /*
   3339              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   3340              * to avoid dealing with surrogates.
   3341              * MBCS_FAST_MAX must be >=0xd7ff.
   3342              */
   3343             if(c<=0xd7ff) {
   3344                 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
   3345                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   3346                 if(value==0) {
   3347                     goto unassigned;
   3348                 }
   3349                 /* output the value */
   3350             } else {
   3351                 /*
   3352                  * This also tests if the codepage maps single surrogates.
   3353                  * If it does, then surrogates are not paired but mapped separately.
   3354                  * Note that in this case unmatched surrogates are not detected.
   3355                  */
   3356                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3357                     if(U16_IS_SURROGATE_LEAD(c)) {
   3358 getTrail:
   3359                         if(source<sourceLimit) {
   3360                             /* test the following code unit */
   3361                             UChar trail=*source;
   3362                             if(U16_IS_TRAIL(trail)) {
   3363                                 ++source;
   3364                                 ++nextSourceIndex;
   3365                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   3366                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3367                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3368                                     /* callback(unassigned) */
   3369                                     goto unassigned;
   3370                                 }
   3371                                 /* convert this supplementary code point */
   3372                                 /* exit this condition tree */
   3373                             } else {
   3374                                 /* this is an unmatched lead code unit (1st surrogate) */
   3375                                 /* callback(illegal) */
   3376                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3377                                 break;
   3378                             }
   3379                         } else {
   3380                             /* no more input */
                                    /* c still holds the lead surrogate; it is stored in cnv->fromUChar32 after the loop */
   3381                             break;
   3382                         }
   3383                     } else {
   3384                         /* this is an unmatched trail code unit (2nd surrogate) */
   3385                         /* callback(illegal) */
   3386                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3387                         break;
   3388                     }
   3389                 }
   3390 
   3391                 /* convert the Unicode code point in c into codepage bytes */
   3392                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   3393 
   3394                 /* get the bytes and the length for the output */
   3395                 /* MBCS_OUTPUT_2 */
   3396                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   3397 
   3398                 /* is this code point assigned, or do we use fallbacks? */
   3399                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   3400                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   3401                 ) {
   3402                     /*
   3403                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   3404                      * There is no way with this data structure for fallback output
   3405                      * to be a zero byte.
   3406                      */
   3407 
   3408 unassigned:
   3409                     /* try an extension mapping */
                            /* pArgs->source temporarily remembers where _extFromU() started reading */
   3410                     pArgs->source=source;
   3411                     c=_extFromU(cnv, cnv->sharedData,
   3412                                 c, &source, sourceLimit,
   3413                                 &target, target+targetCapacity,
   3414                                 &offsets, sourceIndex,
   3415                                 pArgs->flush,
   3416                                 pErrorCode);
                            /* account for the code units that _extFromU() consumed */
   3417                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   3418 
   3419                     if(U_FAILURE(*pErrorCode)) {
   3420                         /* not mappable or buffer overflow */
   3421                         break;
   3422                     } else {
   3423                         /* a mapping was written to the target, continue */
   3424 
   3425                         /* recalculate the targetCapacity after an extension mapping */
   3426                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3427 
   3428                         /* normal end of conversion: prepare for a new character */
   3429                         sourceIndex=nextSourceIndex;
   3430                         continue;
   3431                     }
   3432                 }
   3433             }
   3434 
   3435             /* write the output character bytes from value and length */
   3436             /* from the first if in the loop we know that targetCapacity>0 */
   3437             if(value<=0xff) {
   3438                 /* this is easy because we know that there is enough space */
   3439                 *target++=(uint8_t)value;
   3440                 if(offsets!=NULL) {
   3441                     *offsets++=sourceIndex;
   3442                 }
   3443                 --targetCapacity;
   3444             } else /* length==2 */ {
                        /* the first byte always fits; targetCapacity is still the count from before this write */
   3445                 *target++=(uint8_t)(value>>8);
   3446                 if(2<=targetCapacity) {
   3447                     *target++=(uint8_t)value;
   3448                     if(offsets!=NULL) {
   3449                         *offsets++=sourceIndex;
   3450                         *offsets++=sourceIndex;
   3451                     }
   3452                     targetCapacity-=2;
   3453                 } else {
   3454                     if(offsets!=NULL) {
   3455                         *offsets++=sourceIndex;
   3456                     }
                            /* the second byte does not fit: keep it in the converter's charErrorBuffer */
   3457                     cnv->charErrorBuffer[0]=(char)value;
   3458                     cnv->charErrorBufferLength=1;
   3459 
   3460                     /* target overflow */
   3461                     targetCapacity=0;
   3462                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3463                     c=0;
   3464                     break;
   3465                 }
   3466             }
   3467 
   3468             /* normal end of conversion: prepare for a new character */
   3469             c=0;
   3470             sourceIndex=nextSourceIndex;
   3471             continue;
   3472         } else {
   3473             /* target is full */
   3474             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3475             break;
   3476         }
   3477     }
   3478 
   3479     /* set the converter state back into UConverter */
   3480     cnv->fromUChar32=c;
   3481 
   3482     /* write back the updated pointers */
   3483     pArgs->source=source;
   3484     pArgs->target=(char *)target;
   3485     pArgs->offsets=offsets;
   3486 }
   3487 
   3488 /*
         * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
         *
         * Each code point read from pArgs->source is looked up with
         * MBCS_SINGLE_RESULT_FROM_U(); a result >=minValue is written as one byte
         * (the low 8 bits of the result), anything else is passed to _extFromU().
         * Surrogate pairs are combined into a supplementary code point before the lookup.
         *
         * pArgs      in/out: supplies the converter, the source/target pointers and
         *            limits, the optional offsets array (may be NULL) and the flush
         *            flag; source, target and offsets are written back on return.
         * pErrorCode out: set here to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate
         *            and to U_BUFFER_OVERFLOW_ERROR when input remains but the target
         *            is full; _extFromU() may set it as well.
         *
         * cnv->fromUChar32 carries state between calls: a non-zero value on entry is
         * treated as a lead surrogate whose trail is expected at the start of this
         * buffer, and the final value of c is stored back into it on exit.
         */
   3489 static void
   3490 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3491                                   UErrorCode *pErrorCode) {
   3492     UConverter *cnv;
   3493     const UChar *source, *sourceLimit;
   3494     uint8_t *target;
   3495     int32_t targetCapacity;
   3496     int32_t *offsets;
   3497 
   3498     const uint16_t *table;
   3499     const uint16_t *results;
   3500 
   3501     UChar32 c;
   3502 
            /*
             * sourceIndex: index (relative to the start of this buffer) of the first
             * code unit of the current character; nextSourceIndex: index of the next
             * unread code unit.
             */
   3503     int32_t sourceIndex, nextSourceIndex;
   3504 
            /* value: lookup result for c; minValue: smallest result that counts as a usable mapping */
   3505     uint16_t value, minValue;
   3506     UBool hasSupplementary;
   3507 
   3508     /* set up the local pointers */
   3509     cnv=pArgs->converter;
   3510     source=pArgs->source;
   3511     sourceLimit=pArgs->sourceLimit;
   3512     target=(uint8_t *)pArgs->target;
   3513     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3514     offsets=pArgs->offsets;
   3515 
            /* select the result table: the LF/NL-swapped variant if that option is set */
   3516     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3517     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3518         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3519     } else {
   3520         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3521     }
   3522 
   3523     if(cnv->useFallback) {
   3524         /* use all roundtrip and fallback results */
   3525         minValue=0x800;
   3526     } else {
   3527         /* use only roundtrips and fallbacks from private-use characters */
   3528         minValue=0xc00;
   3529     }
            /* non-zero if the codepage's unicodeMask has the UCNV_HAS_SUPPLEMENTARY bit set */
   3530     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   3531 
   3532     /* get the converter state from UConverter */
   3533     c=cnv->fromUChar32;
   3534 
   3535     /* sourceIndex=-1 if the current character began in the previous buffer */
   3536     sourceIndex= c==0 ? 0 : -1;
   3537     nextSourceIndex=0;
   3538 
   3539     /* conversion loop */
            /*
             * A pending code unit from the previous call is handled by jumping
             * straight to the trail-surrogate check inside the loop.
             * If the target is already full, fall into the loop instead, which
             * reports the overflow as long as there is input left.
             */
   3540     if(c!=0 && targetCapacity>0) {
   3541         goto getTrail;
   3542     }
   3543 
   3544     while(source<sourceLimit) {
   3545         /*
   3546          * The following test is to see if available input would overflow the output.
   3547          * It does not catch output of more than one byte that
   3548          * overflows as a result of a multi-byte character or callback output
   3549          * from the last source character.
   3550          * Therefore, those situations also test for overflows and will
   3551          * then break the loop, too.
   3552          */
   3553         if(targetCapacity>0) {
   3554             /*
   3555              * Get a correct Unicode code point:
   3556              * a single UChar for a BMP code point or
   3557              * a matched surrogate pair for a "supplementary code point".
   3558              */
   3559             c=*source++;
   3560             ++nextSourceIndex;
   3561             if(U16_IS_SURROGATE(c)) {
   3562                 if(U16_IS_SURROGATE_LEAD(c)) {
   3563 getTrail:
   3564                     if(source<sourceLimit) {
   3565                         /* test the following code unit */
   3566                         UChar trail=*source;
   3567                         if(U16_IS_TRAIL(trail)) {
   3568                             ++source;
   3569                             ++nextSourceIndex;
   3570                             c=U16_GET_SUPPLEMENTARY(c, trail);
   3571                             if(!hasSupplementary) {
   3572                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3573                                 /* callback(unassigned) */
                                        /* skips the table lookup and goes directly to the extension lookup */
   3574                                 goto unassigned;
   3575                             }
   3576                             /* convert this supplementary code point */
   3577                             /* exit this condition tree */
   3578                         } else {
   3579                             /* this is an unmatched lead code unit (1st surrogate) */
   3580                             /* callback(illegal) */
                                    /* c still holds the lead surrogate and is saved in fromUChar32 below */
   3581                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3582                             break;
   3583                         }
   3584                     } else {
   3585                         /* no more input */
                                /* no error is set here; the lead surrogate in c is saved in fromUChar32 below */
   3586                         break;
   3587                     }
   3588                 } else {
   3589                     /* this is an unmatched trail code unit (2nd surrogate) */
   3590                     /* callback(illegal) */
   3591                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3592                     break;
   3593                 }
   3594             }
   3595 
   3596             /* convert the Unicode code point in c into codepage bytes */
   3597             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3598 
   3599             /* is this code point assigned, or do we use fallbacks? */
   3600             if(value>=minValue) {
   3601                 /* assigned, write the output character bytes from value and length */
   3602                 /* length==1 */
   3603                 /* this is easy because we know that there is enough space */
   3604                 *target++=(uint8_t)value;
   3605                 if(offsets!=NULL) {
   3606                     *offsets++=sourceIndex;
   3607                 }
   3608                 --targetCapacity;
   3609 
   3610                 /* normal end of conversion: prepare for a new character */
   3611                 c=0;
   3612                 sourceIndex=nextSourceIndex;
   3613             } else { /* unassigned */
   3614 unassigned:
   3615                 /* try an extension mapping */
                        /*
                         * pArgs->source is set to the current position so that the number of
                         * code units consumed by _extFromU() can be computed afterwards.
                         */
   3616                 pArgs->source=source;
   3617                 c=_extFromU(cnv, cnv->sharedData,
   3618                             c, &source, sourceLimit,
   3619                             &target, target+targetCapacity,
   3620                             &offsets, sourceIndex,
   3621                             pArgs->flush,
   3622                             pErrorCode);
                        /* account for any additional source units that _extFromU() consumed */
   3623                 nextSourceIndex+=(int32_t)(source-pArgs->source);
   3624 
   3625                 if(U_FAILURE(*pErrorCode)) {
   3626                     /* not mappable or buffer overflow */
   3627                     break;
   3628                 } else {
   3629                     /* a mapping was written to the target, continue */
                            /* NOTE(review): c is whatever _extFromU() returned, presumably 0 here -- confirm */
   3630 
   3631                     /* recalculate the targetCapacity after an extension mapping */
   3632                     targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3633 
   3634                     /* normal end of conversion: prepare for a new character */
   3635                     sourceIndex=nextSourceIndex;
   3636                 }
   3637             }
   3638         } else {
   3639             /* target is full */
   3640             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3641             break;
   3642         }
   3643     }
   3644 
   3645     /* set the converter state back into UConverter */
   3646     cnv->fromUChar32=c;
   3647 
   3648     /* write back the updated pointers */
   3649     pArgs->source=source;
   3650     pArgs->target=(char *)target;
   3651     pArgs->offsets=offsets;
   3652 }
   3653 
   3654 /*
   3655  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
   3656  * that map only to and from the BMP.
   3657  * In addition to single-byte/state optimizations, the offset calculations
   3658  * become much easier.
   3659  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
   3660  * but measurements have shown that this diminishes performance
   3661  * in more cases than it improves it.
   3662  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
   3663  * for various MBCS and SBCS optimizations.
         *
         * How the loop is organized:
         * - targetCapacity is clamped to min(remaining source units, remaining target
         *   bytes), so the main loop needs only this one counter and reads *source
         *   without a separate bounds check.
         * - Offsets are not written per character. lastSource marks the first source
         *   unit whose offset has not been written yet; the pending run of offsets is
         *   filled in just before each extension lookup and once more after the loop.
         * - A code point without a usable table result (value<minValue) is checked for
         *   surrogates and then handed to _extFromU().
         *
         * pArgs      in/out: supplies the converter, the source/target pointers and
         *            limits, the optional offsets array (may be NULL) and the flush
         *            flag; source, target and offsets are written back on return.
         * pErrorCode out: set here to U_ILLEGAL_CHAR_FOUND (unmatched surrogate),
         *            U_TRUNCATED_CHAR_FOUND (lead surrogate at the end of input while
         *            pArgs->flush is set) or U_BUFFER_OVERFLOW_ERROR (input remains
         *            but the target is full); _extFromU() may set it as well.
         *
         * cnv->fromUChar32 carries state between calls: a non-zero value on entry is
         * treated as a lead surrogate whose trail is expected at the start of this
         * buffer, and the final value of c is stored back into it on exit.
   3664  */
   3665 static void
   3666 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3667                               UErrorCode *pErrorCode) {
   3668     UConverter *cnv;
            /* lastSource: first source unit for which no offset has been written yet */
   3669     const UChar *source, *sourceLimit, *lastSource;
   3670     uint8_t *target;
            /* length: scratch value (remaining source units, or the code unit count of c) */
   3671     int32_t targetCapacity, length;
   3672     int32_t *offsets;
   3673 
   3674     const uint16_t *table;
   3675     const uint16_t *results;
   3676 
   3677     UChar32 c;
   3678 
   3679     int32_t sourceIndex;
   3680 
   3681     uint32_t asciiRoundtrips;
   3682     uint16_t value, minValue;
   3683 
   3684     /* set up the local pointers */
   3685     cnv=pArgs->converter;
   3686     source=pArgs->source;
   3687     sourceLimit=pArgs->sourceLimit;
   3688     target=(uint8_t *)pArgs->target;
   3689     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3690     offsets=pArgs->offsets;
   3691 
            /* select the result table: the LF/NL-swapped variant if that option is set */
   3692     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3693     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3694         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3695     } else {
   3696         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3697     }
   3698     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3699 
   3700     if(cnv->useFallback) {
   3701         /* use all roundtrip and fallback results */
   3702         minValue=0x800;
   3703     } else {
   3704         /* use only roundtrips and fallbacks from private-use characters */
   3705         minValue=0xc00;
   3706     }
   3707 
   3708     /* get the converter state from UConverter */
   3709     c=cnv->fromUChar32;
   3710 
   3711     /* sourceIndex=-1 if the current character began in the previous buffer */
   3712     sourceIndex= c==0 ? 0 : -1;
   3713     lastSource=source;
   3714 
   3715     /*
   3716      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   3717      * for the minimum of the sourceLength and targetCapacity
   3718      */
   3719     length=(int32_t)(sourceLimit-source);
   3720     if(length<targetCapacity) {
   3721         targetCapacity=length;
   3722     }
   3723 
   3724     /* conversion loop */
            /*
             * A pending code unit from the previous call is handled by jumping
             * straight to the trail-surrogate check inside the loop.
             * Because targetCapacity was clamped above, this jump is also skipped
             * when there is no input at all.
             */
   3725     if(c!=0 && targetCapacity>0) {
   3726         goto getTrail;
   3727     }
   3728 
   3729 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3730     /* unrolling makes it slower on Pentium III/Windows 2000?! */
   3731     /* unroll the loop with the most common case */
            /*
             * Converts four code units per iteration and writes their bytes
             * unconditionally. The four results are ANDed together and the combined
             * value is compared against minValue; if that test fails, source and
             * target are rewound by four and the characters are left to the
             * one-at-a-time loop below.
             */
   3732 unrolled:
   3733     if(targetCapacity>=4) {
   3734         int32_t count, loops;
   3735         uint16_t andedValues;
   3736 
   3737         loops=count=targetCapacity>>2;
   3738         do {
   3739             c=*source++;
   3740             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3741             *target++=(uint8_t)value;
   3742             c=*source++;
   3743             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3744             *target++=(uint8_t)value;
   3745             c=*source++;
   3746             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3747             *target++=(uint8_t)value;
   3748             c=*source++;
   3749             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3750             *target++=(uint8_t)value;
   3751 
   3752             /* were all 4 entries really valid? */
   3753             if(andedValues<minValue) {
   3754                 /* no, return to the first of these 4 */
   3755                 source-=4;
   3756                 target-=4;
   3757                 break;
   3758             }
   3759         } while(--count>0);
                /* count becomes the number of groups of four that were kept */
   3760         count=loops-count;
   3761         targetCapacity-=4*count;
   3762 
   3763         if(offsets!=NULL) {
                    /* write the offsets for the kept groups right away and move lastSource past them */
   3764             lastSource+=4*count;
   3765             while(count>0) {
   3766                 *offsets++=sourceIndex++;
   3767                 *offsets++=sourceIndex++;
   3768                 *offsets++=sourceIndex++;
   3769                 *offsets++=sourceIndex++;
   3770                 --count;
   3771             }
   3772         }
   3773 
   3774         c=0;
   3775     }
   3776 #endif
   3777 
            /*
             * One code unit per iteration. No source<sourceLimit test is needed for
             * the read below because targetCapacity never exceeds the number of
             * remaining source units (see the clamps before the loop and after each
             * extension mapping).
             */
   3778     while(targetCapacity>0) {
   3779         /*
   3780          * Get a correct Unicode code point:
   3781          * a single UChar for a BMP code point or
   3782          * a matched surrogate pair for a "supplementary code point".
   3783          */
   3784         c=*source++;
   3785         /*
   3786          * Do not immediately check for single surrogates:
   3787          * Assume that they are unassigned and check for them in that case.
   3788          * This speeds up the conversion of assigned characters.
   3789          */
   3790         /* convert the Unicode code point in c into codepage bytes */
                /* fast path: an ASCII character flagged as a roundtrip is copied as is, without a table lookup */
   3791         if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3792             *target++=(uint8_t)c;
   3793             --targetCapacity;
   3794             c=0;
   3795             continue;
   3796         }
   3797         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3798         /* is this code point assigned, or do we use fallbacks? */
   3799         if(value>=minValue) {
   3800             /* assigned, write the output character bytes from value and length */
   3801             /* length==1 */
   3802             /* this is easy because we know that there is enough space */
   3803             *target++=(uint8_t)value;
   3804             --targetCapacity;
   3805 
   3806             /* normal end of conversion: prepare for a new character */
   3807             c=0;
   3808             continue;
   3809         } else if(!U16_IS_SURROGATE(c)) {
   3810             /* normal, unassigned BMP character */
   3811         } else if(U16_IS_SURROGATE_LEAD(c)) {
   3812 getTrail:
   3813             if(source<sourceLimit) {
   3814                 /* test the following code unit */
   3815                 UChar trail=*source;
   3816                 if(U16_IS_TRAIL(trail)) {
   3817                     ++source;
   3818                     c=U16_GET_SUPPLEMENTARY(c, trail);
   3819                     /* this codepage does not map supplementary code points */
   3820                     /* callback(unassigned) */
   3821                 } else {
   3822                     /* this is an unmatched lead code unit (1st surrogate) */
   3823                     /* callback(illegal) */
   3824                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3825                     break;
   3826                 }
   3827             } else {
   3828                 /* no more input */
                        /* the lead surrogate stays in c and is saved in fromUChar32 after the loop */
   3829                 if (pArgs->flush) {
   3830                     *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3831                 }
   3832                 break;
   3833             }
   3834         } else {
   3835             /* this is an unmatched trail code unit (2nd surrogate) */
   3836             /* callback(illegal) */
   3837             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3838             break;
   3839         }
   3840 
   3841         /* c does not have a mapping */
   3842 
   3843         /* get the number of code units for c to correctly advance sourceIndex */
   3844         length=U16_LENGTH(c);
   3845 
   3846         /* set offsets since the start or the last extension */
   3847         if(offsets!=NULL) {
   3848             int32_t count=(int32_t)(source-lastSource);
   3849 
   3850             /* do not set the offset for this character */
   3851             count-=length;
   3852 
   3853             while(count>0) {
   3854                 *offsets++=sourceIndex++;
   3855                 --count;
   3856             }
   3857             /* offsets and sourceIndex are now set for the current character */
   3858         }
   3859 
   3860         /* try an extension mapping */
                /*
                 * lastSource is set to the current position so that the number of code
                 * units consumed by _extFromU() can be computed afterwards.
                 * The real target limit is passed, not target+targetCapacity, because
                 * targetCapacity may have been reduced to the remaining source length.
                 */
   3861         lastSource=source;
   3862         c=_extFromU(cnv, cnv->sharedData,
   3863                     c, &source, sourceLimit,
   3864                     &target, (const uint8_t *)(pArgs->targetLimit),
   3865                     &offsets, sourceIndex,
   3866                     pArgs->flush,
   3867                     pErrorCode);
                /* advance past c itself (length) plus any further units that _extFromU() consumed */
   3868         sourceIndex+=length+(int32_t)(source-lastSource);
   3869         lastSource=source;
   3870 
   3871         if(U_FAILURE(*pErrorCode)) {
   3872             /* not mappable or buffer overflow */
   3873             break;
   3874         } else {
   3875             /* a mapping was written to the target, continue */
   3876 
   3877             /* recalculate the targetCapacity after an extension mapping */
   3878             targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
                    /* clamp again to the remaining source length, as before the loop */
   3879             length=(int32_t)(sourceLimit-source);
   3880             if(length<targetCapacity) {
   3881                 targetCapacity=length;
   3882             }
   3883         }
   3884 
   3885 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3886         /* unrolling makes it slower on Pentium III/Windows 2000?! */
                /* after a successful extension mapping, go back to the four-at-a-time path */
   3887         goto unrolled;
   3888 #endif
   3889     }
   3890 
            /*
             * The loop also ends when the clamped targetCapacity reaches 0, so check
             * here whether input is left over while the real target buffer is full.
             */
   3891     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
   3892         /* target is full */
   3893         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3894     }
   3895 
   3896     /* set offsets since the start or the last callback */
   3897     if(offsets!=NULL) {
   3898         size_t count=source-lastSource;
   3899         if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
   3900             /*
   3901             Caller gave us a partial supplementary character,
   3902             which this function couldn't convert in any case.
   3903             The callback will handle the offset.
   3904             */
   3905             count--;
   3906         }
   3907         while(count>0) {
   3908             *offsets++=sourceIndex++;
   3909             --count;
   3910         }
   3911     }
   3912 
   3913     /* set the converter state back into UConverter */
   3914     cnv->fromUChar32=c;
   3915 
   3916     /* write back the updated pointers */
   3917     pArgs->source=source;
   3918     pArgs->target=(char *)target;
   3919     pArgs->offsets=offsets;
   3920 }
   3921 
   3922 /* Begin Android-added */
        /*
         * si_value and so_value are used as local array names in
         * ucnv_MBCSFromUnicodeWithOffsets() below. These #undefs presumably remove
         * same-named macros coming from platform headers so that those declarations
         * compile -- TODO confirm which header defines them.
         */
   3923 #undef si_value
   3924 #undef so_value
   3925 /* End Android-added */
   3926 
   3927 U_CFUNC void
   3928 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3929                             UErrorCode *pErrorCode) {
   3930     UConverter *cnv;
   3931     const UChar *source, *sourceLimit;
   3932     uint8_t *target;
   3933     int32_t targetCapacity;
   3934     int32_t *offsets;
   3935 
   3936     const uint16_t *table;
   3937     const uint16_t *mbcsIndex;
   3938     const uint8_t *p, *bytes;
   3939     uint8_t outputType;
   3940 
   3941     UChar32 c;
   3942 
   3943     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
   3944 
   3945     uint32_t stage2Entry;
   3946     uint32_t asciiRoundtrips;
   3947     uint32_t value;
   3948     uint8_t si_value[2] = {0, 0};
   3949     uint8_t so_value[2] = {0, 0};
   3950     uint8_t si_value_length, so_value_length;
   3951     int32_t length = 0, prevLength;
   3952     uint8_t unicodeMask;
   3953 
   3954     cnv=pArgs->converter;
   3955 
   3956     if(cnv->preFromUFirstCP>=0) {
   3957         /*
   3958          * pass sourceIndex=-1 because we continue from an earlier buffer
   3959          * in the future, this may change with continuous offsets
   3960          */
   3961         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
   3962 
   3963         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
   3964             return;
   3965         }
   3966     }
   3967 
   3968     /* use optimized function if possible */
   3969     outputType=cnv->sharedData->mbcs.outputType;
   3970     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3971     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3972         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3973             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
   3974         } else {
   3975             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
   3976         }
   3977         return;
   3978     } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
   3979         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
   3980         return;
   3981     }
   3982 
   3983     /* set up the local pointers */
   3984     source=pArgs->source;
   3985     sourceLimit=pArgs->sourceLimit;
   3986     target=(uint8_t *)pArgs->target;
   3987     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3988     offsets=pArgs->offsets;
   3989 
   3990     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3991     if(cnv->sharedData->mbcs.utf8Friendly) {
   3992         mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3993     } else {
   3994         mbcsIndex=NULL;
   3995     }
   3996     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3997         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3998     } else {
   3999         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   4000     }
   4001     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   4002 
   4003     /* get the converter state from UConverter */
   4004     c=cnv->fromUChar32;
   4005 
   4006     if(outputType==MBCS_OUTPUT_2_SISO) {
   4007         prevLength=cnv->fromUnicodeStatus;
   4008         if(prevLength==0) {
   4009             /* set the real value */
   4010             prevLength=1;
   4011         }
   4012     } else {
   4013         /* prevent fromUnicodeStatus from being set to something non-0 */
   4014         prevLength=0;
   4015     }
   4016 
   4017     /* sourceIndex=-1 if the current character began in the previous buffer */
   4018     prevSourceIndex=-1;
   4019     sourceIndex= c==0 ? 0 : -1;
   4020     nextSourceIndex=0;
   4021 
   4022     /* Get the SI/SO character for the converter */
   4023     si_value_length = getSISOBytes(SI, cnv->options, si_value);
   4024     so_value_length = getSISOBytes(SO, cnv->options, so_value);
   4025 
   4026     /* conversion loop */
   4027     /*
   4028      * This is another piece of ugly code:
   4029      * A goto into the loop if the converter state contains a first surrogate
   4030      * from the previous function call.
   4031      * It saves me to check in each loop iteration a check of if(c==0)
   4032      * and duplicating the trail-surrogate-handling code in the else
   4033      * branch of that check.
   4034      * I could not find any other way to get around this other than
   4035      * using a function call for the conversion and callback, which would
   4036      * be even more inefficient.
   4037      *
   4038      * Markus Scherer 2000-jul-19
   4039      */
   4040     if(c!=0 && targetCapacity>0) {
   4041         goto getTrail;
   4042     }
   4043 
   4044     while(source<sourceLimit) {
   4045         /*
   4046          * This following test is to see if available input would overflow the output.
   4047          * It does not catch output of more than one byte that
   4048          * overflows as a result of a multi-byte character or callback output
   4049          * from the last source character.
   4050          * Therefore, those situations also test for overflows and will
   4051          * then break the loop, too.
   4052          */
   4053         if(targetCapacity>0) {
   4054             /*
   4055              * Get a correct Unicode code point:
   4056              * a single UChar for a BMP code point or
   4057              * a matched surrogate pair for a "supplementary code point".
   4058              */
   4059             c=*source++;
   4060             ++nextSourceIndex;
   4061             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   4062                 *target++=(uint8_t)c;
   4063                 if(offsets!=NULL) {
   4064                     *offsets++=sourceIndex;
   4065                     prevSourceIndex=sourceIndex;
   4066                     sourceIndex=nextSourceIndex;
   4067                 }
   4068                 --targetCapacity;
   4069                 c=0;
   4070                 continue;
   4071             }
   4072             /*
   4073              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   4074              * to avoid dealing with surrogates.
   4075              * MBCS_FAST_MAX must be >=0xd7ff.
   4076              */
   4077             if(c<=0xd7ff && mbcsIndex!=NULL) {
   4078                 value=mbcsIndex[c>>6];
   4079 
   4080                 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
   4081                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   4082                 switch(outputType) {
   4083                 case MBCS_OUTPUT_2:
   4084                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4085                     if(value<=0xff) {
   4086                         if(value==0) {
   4087                             goto unassigned;
   4088                         } else {
   4089                             length=1;
   4090                         }
   4091                     } else {
   4092                         length=2;
   4093                     }
   4094                     break;
   4095                 case MBCS_OUTPUT_2_SISO:
   4096                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4097                     /*
   4098                      * Save the old state in the converter object
   4099                      * right here, then change the local prevLength state variable if necessary.
   4100                      * Then, if this character turns out to be unassigned or a fallback that
   4101                      * is not taken, the callback code must not save the new state in the converter
   4102                      * because the new state is for a character that is not output.
   4103                      * However, the callback must still restore the state from the converter
   4104                      * in case the callback function changed it for its output.
   4105                      */
   4106                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4107                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4108                     if(value<=0xff) {
   4109                         if(value==0) {
   4110                             goto unassigned;
   4111                         } else if(prevLength<=1) {
   4112                             length=1;
   4113                         } else {
   4114                             /* change from double-byte mode to single-byte */
   4115                             if (si_value_length == 1) {
   4116                                 value|=(uint32_t)si_value[0]<<8;
   4117                                 length = 2;
   4118                             } else if (si_value_length == 2) {
   4119                                 value|=(uint32_t)si_value[1]<<8;
   4120                                 value|=(uint32_t)si_value[0]<<16;
   4121                                 length = 3;
   4122                             }
   4123                             prevLength=1;
   4124                         }
   4125                     } else {
   4126                         if(prevLength==2) {
   4127                             length=2;
   4128                         } else {
   4129                             /* change from single-byte mode to double-byte */
   4130                             if (so_value_length == 1) {
   4131                                 value|=(uint32_t)so_value[0]<<16;
   4132                                 length = 3;
   4133                             } else if (so_value_length == 2) {
   4134                                 value|=(uint32_t)so_value[1]<<16;
   4135                                 value|=(uint32_t)so_value[0]<<24;
   4136                                 length = 4;
   4137                             }
   4138                             prevLength=2;
   4139                         }
   4140                     }
   4141                     break;
   4142                 case MBCS_OUTPUT_DBCS_ONLY:
   4143                     /* table with single-byte results, but only DBCS mappings used */
   4144                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4145                     if(value<=0xff) {
   4146                         /* no mapping or SBCS result, not taken for DBCS-only */
   4147                         goto unassigned;
   4148                     } else {
   4149                         length=2;
   4150                     }
   4151                     break;
   4152                 case MBCS_OUTPUT_3:
   4153                     p=bytes+(value+(c&0x3f))*3;
   4154                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4155                     if(value<=0xff) {
   4156                         if(value==0) {
   4157                             goto unassigned;
   4158                         } else {
   4159                             length=1;
   4160                         }
   4161                     } else if(value<=0xffff) {
   4162                         length=2;
   4163                     } else {
   4164                         length=3;
   4165                     }
   4166                     break;
   4167                 case MBCS_OUTPUT_4:
   4168                     value=((const uint32_t *)bytes)[value +(c&0x3f)];
   4169                     if(value<=0xff) {
   4170                         if(value==0) {
   4171                             goto unassigned;
   4172                         } else {
   4173                             length=1;
   4174                         }
   4175                     } else if(value<=0xffff) {
   4176                         length=2;
   4177                     } else if(value<=0xffffff) {
   4178                         length=3;
   4179                     } else {
   4180                         length=4;
   4181                     }
   4182                     break;
   4183                 case MBCS_OUTPUT_3_EUC:
   4184                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4185                     /* EUC 16-bit fixed-length representation */
   4186                     if(value<=0xff) {
   4187                         if(value==0) {
   4188                             goto unassigned;
   4189                         } else {
   4190                             length=1;
   4191                         }
   4192                     } else if((value&0x8000)==0) {
   4193                         value|=0x8e8000;
   4194                         length=3;
   4195                     } else if((value&0x80)==0) {
   4196                         value|=0x8f0080;
   4197                         length=3;
   4198                     } else {
   4199                         length=2;
   4200                     }
   4201                     break;
   4202                 case MBCS_OUTPUT_4_EUC:
   4203                     p=bytes+(value+(c&0x3f))*3;
   4204                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4205                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4206                     if(value<=0xff) {
   4207                         if(value==0) {
   4208                             goto unassigned;
   4209                         } else {
   4210                             length=1;
   4211                         }
   4212                     } else if(value<=0xffff) {
   4213                         length=2;
   4214                     } else if((value&0x800000)==0) {
   4215                         value|=0x8e800000;
   4216                         length=4;
   4217                     } else if((value&0x8000)==0) {
   4218                         value|=0x8f008000;
   4219                         length=4;
   4220                     } else {
   4221                         length=3;
   4222                     }
   4223                     break;
   4224                 default:
   4225                     /* must not occur */
   4226                     /*
   4227                      * To avoid compiler warnings that value & length may be
   4228                      * used without having been initialized, we set them here.
   4229                      * In reality, this is unreachable code.
   4230                      * Not having a default branch also causes warnings with
   4231                      * some compilers.
   4232                      */
   4233                     value=0;
   4234                     length=0;
   4235                     break;
   4236                 }
   4237                 /* output the value */
   4238             } else {
   4239                 /*
   4240                  * This also tests if the codepage maps single surrogates.
   4241                  * If it does, then surrogates are not paired but mapped separately.
   4242                  * Note that in this case unmatched surrogates are not detected.
   4243                  */
   4244                 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4245                     if(U16_IS_SURROGATE_LEAD(c)) {
   4246 getTrail:
   4247                         if(source<sourceLimit) {
   4248                             /* test the following code unit */
   4249                             UChar trail=*source;
   4250                             if(U16_IS_TRAIL(trail)) {
   4251                                 ++source;
   4252                                 ++nextSourceIndex;
   4253                                 c=U16_GET_SUPPLEMENTARY(c, trail);
   4254                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4255                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4256                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4257                                     /* callback(unassigned) */
   4258                                     goto unassigned;
   4259                                 }
   4260                                 /* convert this supplementary code point */
   4261                                 /* exit this condition tree */
   4262                             } else {
   4263                                 /* this is an unmatched lead code unit (1st surrogate) */
   4264                                 /* callback(illegal) */
   4265                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4266                                 break;
   4267                             }
   4268                         } else {
   4269                             /* no more input */
   4270                             break;
   4271                         }
   4272                     } else {
   4273                         /* this is an unmatched trail code unit (2nd surrogate) */
   4274                         /* callback(illegal) */
   4275                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4276                         break;
   4277                     }
   4278                 }
   4279 
   4280                 /* convert the Unicode code point in c into codepage bytes */
   4281 
   4282                 /*
   4283                  * The basic lookup is a triple-stage compact array (trie) lookup.
   4284                  * For details see the beginning of this file.
   4285                  *
   4286                  * Single-byte codepages are handled with a different data structure
   4287                  * by _MBCSSingle... functions.
   4288                  *
   4289                  * The result consists of a 32-bit value from stage 2 and
   4290                  * a pointer to as many bytes as are stored per character.
   4291                  * The pointer points to the character's bytes in stage 3.
   4292                  * Bits 15..0 of the stage 2 entry contain the stage 3 index
   4293                  * for that pointer, while bits 31..16 are flags for which of
   4294                  * the 16 characters in the block are roundtrip-assigned.
   4295                  *
   4296                  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
   4297                  * respectively as uint32_t, in the platform encoding.
   4298                  * For 3-byte codepages, the bytes are always stored in big-endian order.
   4299                  *
   4300                  * For EUC encodings that use only either 0x8e or 0x8f as the first
   4301                  * byte of their longest byte sequences, the first two bytes in
   4302                  * this third stage indicate with their 7th bits whether these bytes
   4303                  * are to be written directly or actually need to be preceded by
   4304                  * one of the two Single-Shift codes. With this, the third stage
   4305                  * stores one byte fewer per character than the actual maximum length of
   4306                  * EUC byte sequences.
   4307                  *
   4308                  * Other than that, leading zero bytes are removed and the other
   4309                  * bytes output. A single zero byte may be output if the "assigned"
   4310                  * bit in stage 2 was on.
   4311                  * The data structure does not support zero byte output as a fallback,
   4312                  * and also does not allow output of leading zeros.
   4313                  */
   4314                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4315 
   4316                 /* get the bytes and the length for the output */
   4317                 switch(outputType) {
   4318                 case MBCS_OUTPUT_2:
   4319                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4320                     if(value<=0xff) {
   4321                         length=1;
   4322                     } else {
   4323                         length=2;
   4324                     }
   4325                     break;
   4326                 case MBCS_OUTPUT_2_SISO:
   4327                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4328                     /*
   4329                      * Save the old state in the converter object
   4330                      * right here, then change the local prevLength state variable if necessary.
   4331                      * Then, if this character turns out to be unassigned or a fallback that
   4332                      * is not taken, the callback code must not save the new state in the converter
   4333                      * because the new state is for a character that is not output.
   4334                      * However, the callback must still restore the state from the converter
   4335                      * in case the callback function changed it for its output.
   4336                      */
   4337                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4338                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4339                     if(value<=0xff) {
   4340                         if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
   4341                             /* no mapping, leave value==0 */
   4342                             length=0;
   4343                         } else if(prevLength<=1) {
   4344                             length=1;
   4345                         } else {
   4346                             /* change from double-byte mode to single-byte */
   4347                             if (si_value_length == 1) {
   4348                                 value|=(uint32_t)si_value[0]<<8;
   4349                                 length = 2;
   4350                             } else if (si_value_length == 2) {
   4351                                 value|=(uint32_t)si_value[1]<<8;
   4352                                 value|=(uint32_t)si_value[0]<<16;
   4353                                 length = 3;
   4354                             }
   4355                             prevLength=1;
   4356                         }
   4357                     } else {
   4358                         if(prevLength==2) {
   4359                             length=2;
   4360                         } else {
   4361                             /* change from single-byte mode to double-byte */
   4362                             if (so_value_length == 1) {
   4363                                 value|=(uint32_t)so_value[0]<<16;
   4364                                 length = 3;
   4365                             } else if (so_value_length == 2) {
   4366                                 value|=(uint32_t)so_value[1]<<16;
   4367                                 value|=(uint32_t)so_value[0]<<24;
   4368                                 length = 4;
   4369                             }
   4370                             prevLength=2;
   4371                         }
   4372                     }
   4373                     break;
   4374                 case MBCS_OUTPUT_DBCS_ONLY:
   4375                     /* table with single-byte results, but only DBCS mappings used */
   4376                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4377                     if(value<=0xff) {
   4378                         /* no mapping or SBCS result, not taken for DBCS-only */
   4379                         value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4380                         length=0;
   4381                     } else {
   4382                         length=2;
   4383                     }
   4384                     break;
   4385                 case MBCS_OUTPUT_3:
   4386                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4387                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4388                     if(value<=0xff) {
   4389                         length=1;
   4390                     } else if(value<=0xffff) {
   4391                         length=2;
   4392                     } else {
   4393                         length=3;
   4394                     }
   4395                     break;
   4396                 case MBCS_OUTPUT_4:
   4397                     value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
   4398                     if(value<=0xff) {
   4399                         length=1;
   4400                     } else if(value<=0xffff) {
   4401                         length=2;
   4402                     } else if(value<=0xffffff) {
   4403                         length=3;
   4404                     } else {
   4405                         length=4;
   4406                     }
   4407                     break;
   4408                 case MBCS_OUTPUT_3_EUC:
   4409                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4410                     /* EUC 16-bit fixed-length representation */
   4411                     if(value<=0xff) {
   4412                         length=1;
   4413                     } else if((value&0x8000)==0) {
   4414                         value|=0x8e8000;
   4415                         length=3;
   4416                     } else if((value&0x80)==0) {
   4417                         value|=0x8f0080;
   4418                         length=3;
   4419                     } else {
   4420                         length=2;
   4421                     }
   4422                     break;
   4423                 case MBCS_OUTPUT_4_EUC:
   4424                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4425                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4426                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4427                     if(value<=0xff) {
   4428                         length=1;
   4429                     } else if(value<=0xffff) {
   4430                         length=2;
   4431                     } else if((value&0x800000)==0) {
   4432                         value|=0x8e800000;
   4433                         length=4;
   4434                     } else if((value&0x8000)==0) {
   4435                         value|=0x8f008000;
   4436                         length=4;
   4437                     } else {
   4438                         length=3;
   4439                     }
   4440                     break;
   4441                 default:
   4442                     /* must not occur */
   4443                     /*
   4444                      * To avoid compiler warnings that value & length may be
   4445                      * used without having been initialized, we set them here.
   4446                      * In reality, this is unreachable code.
   4447                      * Not having a default branch also causes warnings with
   4448                      * some compilers.
   4449                      */
   4450                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4451                     length=0;
   4452                     break;
   4453                 }
   4454 
   4455                 /* is this code point assigned, or do we use fallbacks? */
   4456                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
   4457                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   4458                 ) {
   4459                     /*
   4460                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4461                      * There is no way with this data structure for fallback output
   4462                      * to be a zero byte.
   4463                      */
   4464 
   4465 unassigned:
   4466                     /* try an extension mapping */
   4467                     pArgs->source=source;
   4468                     c=_extFromU(cnv, cnv->sharedData,
   4469                                 c, &source, sourceLimit,
   4470                                 &target, target+targetCapacity,
   4471                                 &offsets, sourceIndex,
   4472                                 pArgs->flush,
   4473                                 pErrorCode);
   4474                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   4475                     prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
   4476 
   4477                     if(U_FAILURE(*pErrorCode)) {
   4478                         /* not mappable or buffer overflow */
   4479                         break;
   4480                     } else {
   4481                         /* a mapping was written to the target, continue */
   4482 
   4483                         /* recalculate the targetCapacity after an extension mapping */
   4484                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   4485 
   4486                         /* normal end of conversion: prepare for a new character */
   4487                         if(offsets!=NULL) {
   4488                             prevSourceIndex=sourceIndex;
   4489                             sourceIndex=nextSourceIndex;
   4490                         }
   4491                         continue;
   4492                     }
   4493                 }
   4494             }
   4495 
   4496             /* write the output character bytes from value and length */
   4497             /* from the first if in the loop we know that targetCapacity>0 */
   4498             if(length<=targetCapacity) {
   4499                 if(offsets==NULL) {
   4500                     switch(length) {
   4501                         /* each branch falls through to the next one */
   4502                     case 4:
   4503                         *target++=(uint8_t)(value>>24);
   4504                     case 3: /*fall through*/
   4505                         *target++=(uint8_t)(value>>16);
   4506                     case 2: /*fall through*/
   4507                         *target++=(uint8_t)(value>>8);
   4508                     case 1: /*fall through*/
   4509                         *target++=(uint8_t)value;
   4510                     default:
   4511                         /* will never occur */
   4512                         break;
   4513                     }
   4514                 } else {
   4515                     switch(length) {
   4516                         /* each branch falls through to the next one */
   4517                     case 4:
   4518                         *target++=(uint8_t)(value>>24);
   4519                         *offsets++=sourceIndex;
   4520                     case 3: /*fall through*/
   4521                         *target++=(uint8_t)(value>>16);
   4522                         *offsets++=sourceIndex;
   4523                     case 2: /*fall through*/
   4524                         *target++=(uint8_t)(value>>8);
   4525                         *offsets++=sourceIndex;
   4526                     case 1: /*fall through*/
   4527                         *target++=(uint8_t)value;
   4528                         *offsets++=sourceIndex;
   4529                     default:
   4530                         /* will never occur */
   4531                         break;
   4532                     }
   4533                 }
   4534                 targetCapacity-=length;
   4535             } else {
   4536                 uint8_t *charErrorBuffer;
   4537 
   4538                 /*
   4539                  * We actually do this backwards here:
   4540                  * In order to save an intermediate variable, we output
   4541                  * first to the overflow buffer what does not fit into the
   4542                  * regular target.
   4543                  */
   4544                 /* we know that 1<=targetCapacity<length<=4 */
   4545                 length-=targetCapacity;
   4546                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   4547                 switch(length) {
   4548                     /* each branch falls through to the next one */
   4549                 case 3:
   4550                     *charErrorBuffer++=(uint8_t)(value>>16);
   4551                 case 2: /*fall through*/
   4552                     *charErrorBuffer++=(uint8_t)(value>>8);
   4553                 case 1: /*fall through*/
   4554                     *charErrorBuffer=(uint8_t)value;
   4555                 default:
   4556                     /* will never occur */
   4557                     break;
   4558                 }
   4559                 cnv->charErrorBufferLength=(int8_t)length;
   4560 
   4561                 /* now output what fits into the regular target */
   4562                 value>>=8*length; /* length was reduced by targetCapacity */
   4563                 switch(targetCapacity) {
   4564                     /* each branch falls through to the next one */
   4565                 case 3:
   4566                     *target++=(uint8_t)(value>>16);
   4567                     if(offsets!=NULL) {
   4568                         *offsets++=sourceIndex;
   4569                     }
   4570                 case 2: /*fall through*/
   4571                     *target++=(uint8_t)(value>>8);
   4572                     if(offsets!=NULL) {
   4573                         *offsets++=sourceIndex;
   4574                     }
   4575                 case 1: /*fall through*/
   4576                     *target++=(uint8_t)value;
   4577                     if(offsets!=NULL) {
   4578                         *offsets++=sourceIndex;
   4579                     }
   4580                 default:
   4581                     /* will never occur */
   4582                     break;
   4583                 }
   4584 
   4585                 /* target overflow */
   4586                 targetCapacity=0;
   4587                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4588                 c=0;
   4589                 break;
   4590             }
   4591 
   4592             /* normal end of conversion: prepare for a new character */
   4593             c=0;
   4594             if(offsets!=NULL) {
   4595                 prevSourceIndex=sourceIndex;
   4596                 sourceIndex=nextSourceIndex;
   4597             }
   4598             continue;
   4599         } else {
   4600             /* target is full */
   4601             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4602             break;
   4603         }
   4604     }
   4605 
   4606     /*
   4607      * the end of the input stream and detection of truncated input
   4608      * are handled by the framework, but for EBCDIC_STATEFUL conversion
   4609      * we need to emit an SI at the very end
   4610      *
   4611      * conditions:
   4612      *   successful
   4613      *   EBCDIC_STATEFUL in DBCS mode
   4614      *   end of input and no truncated input
   4615      */
   4616     if( U_SUCCESS(*pErrorCode) &&
   4617         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
   4618         pArgs->flush && source>=sourceLimit && c==0
   4619     ) {
   4620         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
   4621         if(targetCapacity>0) {
   4622             *target++=(uint8_t)si_value[0];
   4623             if (si_value_length == 2) {
   4624                 if (targetCapacity<2) {
   4625                     cnv->charErrorBuffer[0]=(uint8_t)si_value[1];
   4626                     cnv->charErrorBufferLength=1;
   4627                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4628                 } else {
   4629                     *target++=(uint8_t)si_value[1];
   4630                 }
   4631             }
   4632             if(offsets!=NULL) {
   4633                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
   4634                 *offsets++=prevSourceIndex;
   4635             }
   4636         } else {
   4637             /* target is full */
   4638             cnv->charErrorBuffer[0]=(uint8_t)si_value[0];
   4639             if (si_value_length == 2) {
   4640                 cnv->charErrorBuffer[1]=(uint8_t)si_value[1];
   4641             }
   4642             cnv->charErrorBufferLength=si_value_length;
   4643             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4644         }
   4645         prevLength=1; /* we switched into SBCS */
   4646     }
   4647 
   4648     /* set the converter state back into UConverter */
   4649     cnv->fromUChar32=c;
   4650     cnv->fromUnicodeStatus=prevLength;
   4651 
   4652     /* write back the updated pointers */
   4653     pArgs->source=source;
   4654     pArgs->target=(char *)target;
   4655     pArgs->offsets=offsets;
   4656 }
   4657 
   4658 /*
   4659  * This is another simple conversion function for internal use by other
   4660  * conversion implementations.
   4661  * It does not use the converter state nor call callbacks.
   4662  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4663  * It handles conversion extensions but not GB 18030.
   4664  *
   4665  * It converts one single Unicode code point into codepage bytes, encoded
   4666  * as one 32-bit value. The function returns the number of bytes in *pValue:
   4667  * 1..4 the number of bytes in *pValue
   4668  * 0    unassigned (*pValue undefined)
   4669  * -1   illegal (currently not used, *pValue undefined)
   4670  *
   4671  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
   4672  * the second to last byte in bits 15..8, etc.
   4673  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
   4674  */
   4675 U_CFUNC int32_t
   4676 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
   4677                  UChar32 c, uint32_t *pValue,
   4678                  UBool useFallback) {
   4679     const int32_t *cx;
   4680     const uint16_t *table;
   4681 #if 0
   4682 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4683     const uint8_t *p;
   4684 #endif
   4685     uint32_t stage2Entry;
   4686     uint32_t value;
   4687     int32_t length;
   4688 
   4689     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4690     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4691         table=sharedData->mbcs.fromUnicodeTable;
   4692 
   4693         /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4694         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
   4695             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4696             /* is this code point assigned, or do we use fallbacks? */
   4697             if(useFallback ? value>=0x800 : value>=0xc00) {
   4698                 *pValue=value&0xff;
   4699                 return 1;
   4700             }
   4701         } else /* outputType!=MBCS_OUTPUT_1 */ {
   4702             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4703 
   4704             /* get the bytes and the length for the output */
   4705             switch(sharedData->mbcs.outputType) {
   4706             case MBCS_OUTPUT_2:
   4707                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4708                 if(value<=0xff) {
   4709                     length=1;
   4710                 } else {
   4711                     length=2;
   4712                 }
   4713                 break;
   4714 #if 0
   4715 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4716             case MBCS_OUTPUT_DBCS_ONLY:
   4717                 /* table with single-byte results, but only DBCS mappings used */
   4718                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4719                 if(value<=0xff) {
   4720                     /* no mapping or SBCS result, not taken for DBCS-only */
   4721                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4722                     length=0;
   4723                 } else {
   4724                     length=2;
   4725                 }
   4726                 break;
   4727             case MBCS_OUTPUT_3:
   4728                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4729                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4730                 if(value<=0xff) {
   4731                     length=1;
   4732                 } else if(value<=0xffff) {
   4733                     length=2;
   4734                 } else {
   4735                     length=3;
   4736                 }
   4737                 break;
   4738             case MBCS_OUTPUT_4:
   4739                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4740                 if(value<=0xff) {
   4741                     length=1;
   4742                 } else if(value<=0xffff) {
   4743                     length=2;
   4744                 } else if(value<=0xffffff) {
   4745                     length=3;
   4746                 } else {
   4747                     length=4;
   4748                 }
   4749                 break;
   4750             case MBCS_OUTPUT_3_EUC:
   4751                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4752                 /* EUC 16-bit fixed-length representation */
   4753                 if(value<=0xff) {
   4754                     length=1;
   4755                 } else if((value&0x8000)==0) {
   4756                     value|=0x8e8000;
   4757                     length=3;
   4758                 } else if((value&0x80)==0) {
   4759                     value|=0x8f0080;
   4760                     length=3;
   4761                 } else {
   4762                     length=2;
   4763                 }
   4764                 break;
   4765             case MBCS_OUTPUT_4_EUC:
   4766                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4767                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4768                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4769                 if(value<=0xff) {
   4770                     length=1;
   4771                 } else if(value<=0xffff) {
   4772                     length=2;
   4773                 } else if((value&0x800000)==0) {
   4774                     value|=0x8e800000;
   4775                     length=4;
   4776                 } else if((value&0x8000)==0) {
   4777                     value|=0x8f008000;
   4778                     length=4;
   4779                 } else {
   4780                     length=3;
   4781                 }
   4782                 break;
   4783 #endif
   4784             default:
   4785                 /* must not occur */
   4786                 return -1;
   4787             }
   4788 
   4789             /* is this code point assigned, or do we use fallbacks? */
   4790             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   4791                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
   4792             ) {
   4793                 /*
   4794                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4795                  * There is no way with this data structure for fallback output
   4796                  * to be a zero byte.
   4797                  */
   4798                 /* assigned */
   4799                 *pValue=value;
   4800                 return length;
   4801             }
   4802         }
   4803     }
   4804 
   4805     cx=sharedData->mbcs.extIndexes;
   4806     if(cx!=NULL) {
   4807         length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
   4808         return length>=0 ? length : -length;  /* return abs(length); */
   4809     }
   4810 
   4811     /* unassigned */
   4812     return 0;
   4813 }
   4814 
   4815 
   4816 #if 0
   4817 /*
   4818  * This function has been moved to ucnv2022.c for inlining.
   4819  * This implementation is here only for documentation purposes
   4820  */
   4821 
   4822 /**
   4823  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
   4824  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4825  * It does not handle conversion extensions (_extFromU()).
   4826  *
   4827  * It returns the codepage byte for the code point, or -1 if it is unassigned.
   4828  */
   4829 U_CFUNC int32_t
   4830 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
   4831                        UChar32 c,
   4832                        UBool useFallback) {
   4833     const uint16_t *table;
   4834     int32_t value;
   4835 
   4836     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4837     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4838         return -1;
   4839     }
   4840 
   4841     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4842     table=sharedData->mbcs.fromUnicodeTable;
   4843 
   4844     /* get the byte for the output */
   4845     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4846     /* is this code point assigned, or do we use fallbacks? */
   4847     if(useFallback ? value>=0x800 : value>=0xc00) {
   4848         return value&0xff;
   4849     } else {
   4850         return -1;
   4851     }
   4852 }
   4853 #endif
   4854 
   4855 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
   4856 
   4857 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
   4858 static const UChar32
   4859 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
   4860 
   4861 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
   4862 static const UChar32
   4863 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
   4864 
   /*
    * Converts UTF-8 input directly into single-byte codepage output,
    * without going through UTF-16.
    *
    * pFromUArgs supplies the target converter (cnv) and the output buffer
    * (target..targetLimit); pToUArgs supplies the UTF-8 converter (utf8) and
    * the input buffer (source..sourceLimit). The updated source and target
    * pointers are written back into the argument structs on every exit path.
    *
    * State for a UTF-8 sequence that is split across buffers is kept in the
    * UTF-8 converter: toUnicodeStatus holds the value accumulated from the
    * bytes seen so far, toULength the number of bytes collected, mode the
    * expected total sequence length, and toUBytes[] the raw bytes.
    *
    * *pErrorCode is set to
    * - U_ILLEGAL_CHAR_FOUND for an ill-formed UTF-8 sequence
    *   (its bytes are stored in utf8->toUBytes[], with utf8->toULength set),
    * - U_BUFFER_OVERFLOW_ERROR when input remains but the target is full,
    * - U_USING_DEFAULT_WARNING when the extension lookup reports a partial
    *   match (cnv->preFromUFirstCP>=0),
    * - or to the failure reported by _extFromU(), in which case the code point
    *   that _extFromU() returns is stored in cnv->fromUChar32.
    */
   4865 static void
   4866 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   4867                   UConverterToUnicodeArgs *pToUArgs,
   4868                   UErrorCode *pErrorCode) {
   4869     UConverter *utf8, *cnv;
   4870     const uint8_t *source, *sourceLimit;
   4871     uint8_t *target;
   4872     int32_t targetCapacity;
   4873 
   4874     const uint16_t *table, *sbcsIndex;
   4875     const uint16_t *results;
   4876 
   4877     int8_t oldToULength, toULength, toULimit;
   4878 
   4879     UChar32 c;
   4880     uint8_t b, t1, t2;
   4881 
   4882     uint32_t asciiRoundtrips;
   4883     uint16_t value, minValue;
   4884     UBool hasSupplementary;
   4885 
   4886     /* set up the local pointers */
   4887     utf8=pToUArgs->converter;
   4888     cnv=pFromUArgs->converter;
   4889     source=(uint8_t *)pToUArgs->source;
   4890     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   4891     target=(uint8_t *)pFromUArgs->target;
   4892     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   4893 
   4894     table=cnv->sharedData->mbcs.fromUnicodeTable;
   4895     sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
            /* pick the result table that matches the converter's swap-LF/NL option */
   4896     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   4897         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   4898     } else {
   4899         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   4900     }
   4901     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   4902 
            /* a result word is usable only if it is >=minValue; anything lower is treated as unassigned */
   4903     if(cnv->useFallback) {
   4904         /* use all roundtrip and fallback results */
   4905         minValue=0x800;
   4906     } else {
   4907         /* use only roundtrips and fallbacks from private-use characters */
   4908         minValue=0xc00;
   4909     }
   4910     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   4911 
   4912     /* get the converter state from the UTF-8 UConverter */
            /* a nonzero status means that a partial sequence was stored at the end of an earlier buffer */
   4913     c=(UChar32)utf8->toUnicodeStatus;
   4914     if(c!=0) {
   4915         toULength=oldToULength=utf8->toULength;
   4916         toULimit=(int8_t)utf8->mode;
   4917     } else {
   4918         toULength=oldToULength=toULimit=0;
   4919     }
   4920 
   4921     /*
   4922      * Make sure that the last byte sequence before sourceLimit is complete
   4923      * or runs into a lead byte.
   4924      * Do not go back into the bytes that will be read for finishing a partial
   4925      * sequence from the previous buffer.
   4926      * In the conversion loop compare source with sourceLimit only once
   4927      * per multi-byte character.
   4928      */
   4929     {
   4930         int32_t i, length;
   4931 
                /* number of input bytes that are not needed to finish the pending partial sequence */
   4932         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
                /* walk backwards over at most 3 bytes, counting trail bytes until a non-trail byte is found */
   4933         for(i=0; i<3 && i<length;) {
   4934             b=*(sourceLimit-i-1);
   4935             if(U8_IS_TRAIL(b)) {
   4936                 ++i;
   4937             } else {
   4938                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
   4939                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   4940                     sourceLimit-=i+1;
   4941                 }
   4942                 break;
   4943             }
   4944         }
   4945     }
   4946 
            /* resume a partial sequence from the previous buffer by jumping into the loop body */
   4947     if(c!=0 && targetCapacity>0) {
   4948         utf8->toUnicodeStatus=0;
   4949         utf8->toULength=0;
   4950         goto moreBytes;
   4951         /*
   4952          * Note: We could avoid the goto by duplicating some of the moreBytes
   4953          * code, but only up to the point of collecting a complete UTF-8
   4954          * sequence; then recurse for the toUBytes[toULength]
   4955          * and then continue with normal conversion.
   4956          *
   4957          * If so, move this code to just after initializing the minimum
   4958          * set of local variables for reading the UTF-8 input
   4959          * (utf8, source, target, limits but not cnv, table, minValue, etc.).
   4960          *
   4961          * Potential advantages:
   4962          * - avoid the goto
   4963          * - oldToULength could become a local variable in just those code blocks
   4964          *   that deal with buffer boundaries
   4965          * - possibly faster if the goto prevents some compiler optimizations
   4966          *   (this would need measuring to confirm)
   4967          * Disadvantage:
   4968          * - code duplication
   4969          */
   4970     }
   4971 
   4972     /* conversion loop */
   4973     while(source<sourceLimit) {
   4974         if(targetCapacity>0) {
   4975             b=*source++;
                    /* the signed cast is non-negative exactly for b<=0x7f */
   4976             if((int8_t)b>=0) {
   4977                 /* convert ASCII */
   4978                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   4979                     *target++=(uint8_t)b;
   4980                     --targetCapacity;
   4981                     continue;
   4982                 } else {
   4983                     c=b;
   4984                     value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
   4985                 }
   4986             } else {
   4987                 if(b<0xe0) {
                            /* lead bytes below 0xc2 (including stray trail bytes) go to the generic path with c=-1 */
   4988                     if( /* handle U+0080..U+07FF inline */
   4989                         b>=0xc2 &&
   4990                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   4991                     ) {
   4992                         c=b&0x1f;
   4993                         ++source;
   4994                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
   4995                         if(value>=minValue) {
   4996                             *target++=(uint8_t)value;
   4997                             --targetCapacity;
   4998                             continue;
   4999                         } else {
                                    /* assemble the full code point for the unassigned handling after the if(c<0) block */
   5000                             c=(c<<6)|t1;
   5001                         }
   5002                     } else {
   5003                         c=-1;
   5004                     }
   5005                 } else if(b==0xe0) {
                            /* t1>=0x20 restricts the first trail byte to 0xa0..0xbf, i.e. to code points from U+0800 */
   5006                     if( /* handle U+0800..U+0FFF inline */
   5007                         (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
   5008                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5009                     ) {
   5010                         c=t1;
   5011                         source+=2;
   5012                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
   5013                         if(value>=minValue) {
   5014                             *target++=(uint8_t)value;
   5015                             --targetCapacity;
   5016                             continue;
   5017                         } else {
                                    /* assemble the full code point for the unassigned handling after the if(c<0) block */
   5018                             c=(c<<6)|t2;
   5019                         }
   5020                     } else {
   5021                         c=-1;
   5022                     }
   5023                 } else {
   5024                     c=-1;
   5025                 }
   5026 
   5027                 if(c<0) {
   5028                     /* handle "complicated" and error cases, and continuing partial characters */
   5029                     oldToULength=0;
   5030                     toULength=1;
   5031                     toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5032                     c=b;
                            /*
                             * From here on c accumulates the raw bytes ((c<<6)+b, nothing masked);
                             * utf8_offsets[toULength] is subtracted once the sequence is complete.
                             */
   5033 moreBytes:
   5034                     while(toULength<toULimit) {
   5035                         /*
   5036                          * The sourceLimit may have been adjusted before the conversion loop
   5037                          * to stop before a truncated sequence.
   5038                          * Here we need to use the real limit in case we have two truncated
   5039                          * sequences at the end.
   5040                          * See ticket #7492.
   5041                          */
   5042                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5043                             b=*source;
   5044                             if(U8_IS_TRAIL(b)) {
   5045                                 ++source;
   5046                                 ++toULength;
   5047                                 c=(c<<6)+b;
   5048                             } else {
   5049                                 break; /* sequence too short, stop with toULength<toULimit */
   5050                             }
   5051                         } else {
   5052                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                                    /* only the bytes read in this call are copied; toUBytes[0..oldToULength-1] are not rewritten */
   5053                             source-=(toULength-oldToULength);
   5054                             while(oldToULength<toULength) {
   5055                                 utf8->toUBytes[oldToULength++]=*source++;
   5056                             }
   5057                             utf8->toUnicodeStatus=c;
   5058                             utf8->toULength=toULength;
   5059                             utf8->mode=toULimit;
   5060                             pToUArgs->source=(char *)source;
   5061                             pFromUArgs->target=(char *)target;
   5062                             return;
   5063                         }
   5064                     }
   5065 
                            /*
                             * Validate the collected sequence. The conditions are order-dependent:
                             * c-=utf8_offsets[...] modifies c as part of the test.
                             */
   5066                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5067                         (toULength==3 || toULength==2) &&             /* BMP */
   5068                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5069                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5070                     ) {
   5071                         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5072                     } else if(
   5073                         toULength==toULimit && toULength==4 &&
   5074                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5075                     ) {
   5076                         /* supplementary code point */
   5077                         if(!hasSupplementary) {
   5078                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5079                             value=0;
   5080                         } else {
   5081                             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5082                         }
   5083                     } else {
   5084                         /* error handling: illegal UTF-8 byte sequence */
   5085                         source-=(toULength-oldToULength);
   5086                         while(oldToULength<toULength) {
   5087                             utf8->toUBytes[oldToULength++]=*source++;
   5088                         }
   5089                         utf8->toULength=toULength;
   5090                         pToUArgs->source=(char *)source;
   5091                         pFromUArgs->target=(char *)target;
   5092                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5093                         return;
   5094                     }
   5095                 }
   5096             }
   5097 
   5098             if(value>=minValue) {
   5099                 /* output the mapping for c */
   5100                 *target++=(uint8_t)value;
   5101                 --targetCapacity;
   5102             } else {
   5103                 /* value<minValue means c is unassigned (unmappable) */
   5104                 /*
   5105                  * Try an extension mapping.
   5106                  * Pass in no source because we don't have UTF-16 input.
   5107                  * If we have a partial match on c, we will return and revert
   5108                  * to UTF-8->UTF-16->charset conversion.
   5109                  */
   5110                 static const UChar nul=0;
   5111                 const UChar *noSource=&nul;
   5112                 c=_extFromU(cnv, cnv->sharedData,
   5113                             c, &noSource, noSource,
   5114                             &target, target+targetCapacity,
   5115                             NULL, -1,
   5116                             pFromUArgs->flush,
   5117                             pErrorCode);
   5118 
   5119                 if(U_FAILURE(*pErrorCode)) {
   5120                     /* not mappable or buffer overflow */
   5121                     cnv->fromUChar32=c;
   5122                     break;
   5123                 } else if(cnv->preFromUFirstCP>=0) {
   5124                     /*
   5125                      * Partial match, return and revert to pivoting.
   5126                      * In normal from-UTF-16 conversion, we would just continue
   5127                      * but then exit the loop because the extension match would
   5128                      * have consumed the source.
   5129                      */
   5130                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5131                     break;
   5132                 } else {
   5133                     /* a mapping was written to the target, continue */
   5134 
   5135                     /* recalculate the targetCapacity after an extension mapping */
   5136                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5137                 }
   5138             }
   5139         } else {
   5140             /* target is full */
   5141             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5142             break;
   5143         }
   5144     }
   5145 
   5146     /*
   5147      * The sourceLimit may have been adjusted before the conversion loop
   5148      * to stop before a truncated sequence.
   5149      * If so, then collect the truncated sequence now.
   5150      */
            /* note: the condition also resets sourceLimit to the real limit from pToUArgs */
   5151     if(U_SUCCESS(*pErrorCode) &&
   5152             cnv->preFromUFirstCP<0 &&
   5153             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5154         c=utf8->toUBytes[0]=b=*source++;
   5155         toULength=1;
   5156         toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5157         while(source<sourceLimit) {
   5158             utf8->toUBytes[toULength++]=b=*source++;
   5159             c=(c<<6)+b;
   5160         }
   5161         utf8->toUnicodeStatus=c;
   5162         utf8->toULength=toULength;
   5163         utf8->mode=toULimit;
   5164     }
   5165 
   5166     /* write back the updated pointers */
   5167     pToUArgs->source=(char *)source;
   5168     pFromUArgs->target=(char *)target;
   5169 }
   5170 
   5171 static void
   5172 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5173                   UConverterToUnicodeArgs *pToUArgs,
   5174                   UErrorCode *pErrorCode) {
   5175     UConverter *utf8, *cnv;
   5176     const uint8_t *source, *sourceLimit;
   5177     uint8_t *target;
   5178     int32_t targetCapacity;
   5179 
   5180     const uint16_t *table, *mbcsIndex;
   5181     const uint16_t *results;
   5182 
   5183     int8_t oldToULength, toULength, toULimit;
   5184 
   5185     UChar32 c;
   5186     uint8_t b, t1, t2;
   5187 
   5188     uint32_t stage2Entry;
   5189     uint32_t asciiRoundtrips;
   5190     uint16_t value;
   5191     UBool hasSupplementary;
   5192 
   5193     /* set up the local pointers */
   5194     utf8=pToUArgs->converter;
   5195     cnv=pFromUArgs->converter;
   5196     source=(uint8_t *)pToUArgs->source;
   5197     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5198     target=(uint8_t *)pFromUArgs->target;
   5199     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5200 
   5201     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5202     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   5203     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5204         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5205     } else {
   5206         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5207     }
   5208     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5209 
   5210     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5211 
   5212     /* get the converter state from the UTF-8 UConverter */
   5213     c=(UChar32)utf8->toUnicodeStatus;
   5214     if(c!=0) {
   5215         toULength=oldToULength=utf8->toULength;
   5216         toULimit=(int8_t)utf8->mode;
   5217     } else {
   5218         toULength=oldToULength=toULimit=0;
   5219     }
   5220 
   5221     /*
   5222      * Make sure that the last byte sequence before sourceLimit is complete
   5223      * or runs into a lead byte.
   5224      * Do not go back into the bytes that will be read for finishing a partial
   5225      * sequence from the previous buffer.
   5226      * In the conversion loop compare source with sourceLimit only once
   5227      * per multi-byte character.
   5228      */
   5229     {
   5230         int32_t i, length;
   5231 
   5232         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   5233         for(i=0; i<3 && i<length;) {
   5234             b=*(sourceLimit-i-1);
   5235             if(U8_IS_TRAIL(b)) {
   5236                 ++i;
   5237             } else {
   5238                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
   5239                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   5240                     sourceLimit-=i+1;
   5241                 }
   5242                 break;
   5243             }
   5244         }
   5245     }
   5246 
   5247     if(c!=0 && targetCapacity>0) {
   5248         utf8->toUnicodeStatus=0;
   5249         utf8->toULength=0;
   5250         goto moreBytes;
   5251         /* See note in ucnv_SBCSFromUTF8() about this goto. */
   5252     }
   5253 
   5254     /* conversion loop */
   5255     while(source<sourceLimit) {
   5256         if(targetCapacity>0) {
   5257             b=*source++;
   5258             if((int8_t)b>=0) {
   5259                 /* convert ASCII */
   5260                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   5261                     *target++=b;
   5262                     --targetCapacity;
   5263                     continue;
   5264                 } else {
   5265                     value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
   5266                     if(value==0) {
   5267                         c=b;
   5268                         goto unassigned;
   5269                     }
   5270                 }
   5271             } else {
   5272                 if(b>0xe0) {
   5273                     if( /* handle U+1000..U+D7FF inline */
   5274                         (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
   5275                                                         (b==0xed && (t1 <= 0x1f))) &&
   5276                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5277                     ) {
   5278                         c=((b&0xf)<<6)|t1;
   5279                         source+=2;
   5280                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
   5281                         if(value==0) {
   5282                             c=(c<<6)|t2;
   5283                             goto unassigned;
   5284                         }
   5285                     } else {
   5286                         c=-1;
   5287                     }
   5288                 } else if(b<0xe0) {
   5289                     if( /* handle U+0080..U+07FF inline */
   5290                         b>=0xc2 &&
   5291                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5292                     ) {
   5293                         c=b&0x1f;
   5294                         ++source;
   5295                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
   5296                         if(value==0) {
   5297                             c=(c<<6)|t1;
   5298                             goto unassigned;
   5299                         }
   5300                     } else {
   5301                         c=-1;
   5302                     }
   5303                 } else {
   5304                     c=-1;
   5305                 }
   5306 
   5307                 if(c<0) {
   5308                     /* handle "complicated" and error cases, and continuing partial characters */
   5309                     oldToULength=0;
   5310                     toULength=1;
   5311                     toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5312                     c=b;
   5313 moreBytes:
   5314                     while(toULength<toULimit) {
   5315                         /*
   5316                          * The sourceLimit may have been adjusted before the conversion loop
   5317                          * to stop before a truncated sequence.
   5318                          * Here we need to use the real limit in case we have two truncated
   5319                          * sequences at the end.
   5320                          * See ticket #7492.
   5321                          */
   5322                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5323                             b=*source;
   5324                             if(U8_IS_TRAIL(b)) {
   5325                                 ++source;
   5326                                 ++toULength;
   5327                                 c=(c<<6)+b;
   5328                             } else {
   5329                                 break; /* sequence too short, stop with toULength<toULimit */
   5330                             }
   5331                         } else {
   5332                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
   5333                             source-=(toULength-oldToULength);
   5334                             while(oldToULength<toULength) {
   5335                                 utf8->toUBytes[oldToULength++]=*source++;
   5336                             }
   5337                             utf8->toUnicodeStatus=c;
   5338                             utf8->toULength=toULength;
   5339                             utf8->mode=toULimit;
   5340                             pToUArgs->source=(char *)source;
   5341                             pFromUArgs->target=(char *)target;
   5342                             return;
   5343                         }
   5344                     }
   5345 
   5346                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5347                         (toULength==3 || toULength==2) &&             /* BMP */
   5348                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5349                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5350                     ) {
   5351                         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5352                     } else if(
   5353                         toULength==toULimit && toULength==4 &&
   5354                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5355                     ) {
   5356                         /* supplementary code point */
   5357                         if(!hasSupplementary) {
   5358                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5359                             stage2Entry=0;
   5360                         } else {
   5361                             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5362                         }
   5363                     } else {
   5364                         /* error handling: illegal UTF-8 byte sequence */
   5365                         source-=(toULength-oldToULength);
   5366                         while(oldToULength<toULength) {
   5367                             utf8->toUBytes[oldToULength++]=*source++;
   5368                         }
   5369                         utf8->toULength=toULength;
   5370                         pToUArgs->source=(char *)source;
   5371                         pFromUArgs->target=(char *)target;
   5372                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5373                         return;
   5374                     }
   5375 
   5376                     /* get the bytes and the length for the output */
   5377                     /* MBCS_OUTPUT_2 */
   5378                     value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
   5379 
   5380                     /* is this code point assigned, or do we use fallbacks? */
   5381                     if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   5382                          (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   5383                     ) {
   5384                         goto unassigned;
   5385                     }
   5386                 }
   5387             }
   5388 
   5389             /* write the output character bytes from value and length */
   5390             /* from the first if in the loop we know that targetCapacity>0 */
   5391             if(value<=0xff) {
   5392                 /* this is easy because we know that there is enough space */
   5393                 *target++=(uint8_t)value;
   5394                 --targetCapacity;
   5395             } else /* length==2 */ {
   5396                 *target++=(uint8_t)(value>>8);
   5397                 if(2<=targetCapacity) {
   5398                     *target++=(uint8_t)value;
   5399                     targetCapacity-=2;
   5400                 } else {
   5401                     cnv->charErrorBuffer[0]=(char)value;
   5402                     cnv->charErrorBufferLength=1;
   5403 
   5404                     /* target overflow */
   5405                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5406                     break;
   5407                 }
   5408             }
   5409             continue;
   5410 
   5411 unassigned:
   5412             {
   5413                 /*
   5414                  * Try an extension mapping.
   5415                  * Pass in no source because we don't have UTF-16 input.
   5416                  * If we have a partial match on c, we will return and revert
   5417                  * to UTF-8->UTF-16->charset conversion.
   5418                  */
   5419                 static const UChar nul=0;
   5420                 const UChar *noSource=&nul;
   5421                 c=_extFromU(cnv, cnv->sharedData,
   5422                             c, &noSource, noSource,
   5423                             &target, target+targetCapacity,
   5424                             NULL, -1,
   5425                             pFromUArgs->flush,
   5426                             pErrorCode);
   5427 
   5428                 if(U_FAILURE(*pErrorCode)) {
   5429                     /* not mappable or buffer overflow */
   5430                     cnv->fromUChar32=c;
   5431                     break;
   5432                 } else if(cnv->preFromUFirstCP>=0) {
   5433                     /*
   5434                      * Partial match, return and revert to pivoting.
   5435                      * In normal from-UTF-16 conversion, we would just continue
   5436                      * but then exit the loop because the extension match would
   5437                      * have consumed the source.
   5438                      */
   5439                     *pErrorCode=U_USING_DEFAULT_WARNING;
   5440                     break;
   5441                 } else {
   5442                     /* a mapping was written to the target, continue */
   5443 
   5444                     /* recalculate the targetCapacity after an extension mapping */
   5445                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5446                     continue;
   5447                 }
   5448             }
   5449         } else {
   5450             /* target is full */
   5451             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5452             break;
   5453         }
   5454     }
   5455 
   5456     /*
   5457      * The sourceLimit may have been adjusted before the conversion loop
   5458      * to stop before a truncated sequence.
   5459      * If so, then collect the truncated sequence now.
   5460      */
   5461     if(U_SUCCESS(*pErrorCode) &&
   5462             cnv->preFromUFirstCP<0 &&
   5463             source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5464         c=utf8->toUBytes[0]=b=*source++;
   5465         toULength=1;
   5466         toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
   5467         while(source<sourceLimit) {
   5468             utf8->toUBytes[toULength++]=b=*source++;
   5469             c=(c<<6)+b;
   5470         }
   5471         utf8->toUnicodeStatus=c;
   5472         utf8->toULength=toULength;
   5473         utf8->mode=toULimit;
   5474     }
   5475 
   5476     /* write back the updated pointers */
   5477     pToUArgs->source=(char *)source;
   5478     pFromUArgs->target=(char *)target;
   5479 }
   5480 
   5481 /* miscellaneous ------------------------------------------------------------ */
   5482 
   5483 static void
   5484 ucnv_MBCSGetStarters(const UConverter* cnv,
   5485                  UBool starters[256],
   5486                  UErrorCode *pErrorCode) {
   5487     const int32_t *state0;
   5488     int i;
   5489 
   5490     state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
   5491     for(i=0; i<256; ++i) {
   5492         /* all bytes that cause a state transition from state 0 are lead bytes */
   5493         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
   5494     }
   5495 }
   5496 
   5497 /*
   5498  * This is an internal function that allows other converter implementations
   5499  * to check whether a byte is a lead byte.
   5500  */
   5501 U_CFUNC UBool
   5502 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
   5503     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
   5504 }
   5505 
   5506 static void
   5507 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
   5508               int32_t offsetIndex,
   5509               UErrorCode *pErrorCode) {
   5510     UConverter *cnv=pArgs->converter;
   5511     char *p, *subchar;
   5512     char buffer[4];
   5513     int32_t length;
   5514 
   5515     /* first, select between subChar and subChar1 */
   5516     if( cnv->subChar1!=0 &&
   5517         (cnv->sharedData->mbcs.extIndexes!=NULL ?
   5518             cnv->useSubChar1 :
   5519             (cnv->invalidUCharBuffer[0]<=0xff))
   5520     ) {
   5521         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
   5522         subchar=(char *)&cnv->subChar1;
   5523         length=1;
   5524     } else {
   5525         /* select subChar in all other cases */
   5526         subchar=(char *)cnv->subChars;
   5527         length=cnv->subCharLen;
   5528     }
   5529 
   5530     /* reset the selector for the next code point */
   5531     cnv->useSubChar1=FALSE;
   5532 
   5533     if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
   5534         p=buffer;
   5535 
   5536         /* fromUnicodeStatus contains prevLength */
   5537         switch(length) {
   5538         case 1:
   5539             if(cnv->fromUnicodeStatus==2) {
   5540                 /* DBCS mode and SBCS sub char: change to SBCS */
   5541                 cnv->fromUnicodeStatus=1;
   5542                 *p++=UCNV_SI;
   5543             }
   5544             *p++=subchar[0];
   5545             break;
   5546         case 2:
   5547             if(cnv->fromUnicodeStatus<=1) {
   5548                 /* SBCS mode and DBCS sub char: change to DBCS */
   5549                 cnv->fromUnicodeStatus=2;
   5550                 *p++=UCNV_SO;
   5551             }
   5552             *p++=subchar[0];
   5553             *p++=subchar[1];
   5554             break;
   5555         default:
   5556             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   5557             return;
   5558         }
   5559         subchar=buffer;
   5560         length=(int32_t)(p-buffer);
   5561     }
   5562 
   5563     ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
   5564 }
   5565 
   5566 U_CFUNC UConverterType
   5567 ucnv_MBCSGetType(const UConverter* converter) {
   5568     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
   5569     if(converter->sharedData->mbcs.countStates==1) {
   5570         return (UConverterType)UCNV_SBCS;
   5571     } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
   5572         return (UConverterType)UCNV_EBCDIC_STATEFUL;
   5573     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
   5574         return (UConverterType)UCNV_DBCS;
   5575     }
   5576     return (UConverterType)UCNV_MBCS;
   5577 }
   5578 
   5579 static const UConverterImpl _SBCSUTF8Impl={
   5580     UCNV_MBCS,
   5581 
   5582     ucnv_MBCSLoad,
   5583     ucnv_MBCSUnload,
   5584 
   5585     ucnv_MBCSOpen,
   5586     NULL,
   5587     NULL,
   5588 
   5589     ucnv_MBCSToUnicodeWithOffsets,
   5590     ucnv_MBCSToUnicodeWithOffsets,
   5591     ucnv_MBCSFromUnicodeWithOffsets,
   5592     ucnv_MBCSFromUnicodeWithOffsets,
   5593     ucnv_MBCSGetNextUChar,
   5594 
   5595     ucnv_MBCSGetStarters,
   5596     ucnv_MBCSGetName,
   5597     ucnv_MBCSWriteSub,
   5598     NULL,
   5599     ucnv_MBCSGetUnicodeSet,
   5600 
   5601     NULL,
   5602     ucnv_SBCSFromUTF8
   5603 };
   5604 
   5605 static const UConverterImpl _DBCSUTF8Impl={
   5606     UCNV_MBCS,
   5607 
   5608     ucnv_MBCSLoad,
   5609     ucnv_MBCSUnload,
   5610 
   5611     ucnv_MBCSOpen,
   5612     NULL,
   5613     NULL,
   5614 
   5615     ucnv_MBCSToUnicodeWithOffsets,
   5616     ucnv_MBCSToUnicodeWithOffsets,
   5617     ucnv_MBCSFromUnicodeWithOffsets,
   5618     ucnv_MBCSFromUnicodeWithOffsets,
   5619     ucnv_MBCSGetNextUChar,
   5620 
   5621     ucnv_MBCSGetStarters,
   5622     ucnv_MBCSGetName,
   5623     ucnv_MBCSWriteSub,
   5624     NULL,
   5625     ucnv_MBCSGetUnicodeSet,
   5626 
   5627     NULL,
   5628     ucnv_DBCSFromUTF8
   5629 };
   5630 
   5631 static const UConverterImpl _MBCSImpl={
   5632     UCNV_MBCS,
   5633 
   5634     ucnv_MBCSLoad,
   5635     ucnv_MBCSUnload,
   5636 
   5637     ucnv_MBCSOpen,
   5638     NULL,
   5639     NULL,
   5640 
   5641     ucnv_MBCSToUnicodeWithOffsets,
   5642     ucnv_MBCSToUnicodeWithOffsets,
   5643     ucnv_MBCSFromUnicodeWithOffsets,
   5644     ucnv_MBCSFromUnicodeWithOffsets,
   5645     ucnv_MBCSGetNextUChar,
   5646 
   5647     ucnv_MBCSGetStarters,
   5648     ucnv_MBCSGetName,
   5649     ucnv_MBCSWriteSub,
   5650     NULL,
   5651     ucnv_MBCSGetUnicodeSet
   5652 };
   5653 
   5654 
   5655 /* Static data is in tools/makeconv/ucnvstat.c for data-based
   5656  * converters. Be sure to update it as well.
   5657  */
   5658 
   5659 const UConverterSharedData _MBCSData={
   5660     sizeof(UConverterSharedData), 1,
   5661     NULL, NULL, NULL, FALSE, &_MBCSImpl,
   5662     0
   5663 };
   5664 
   5665 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   5666