Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvmbcs.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000jul03
     14 *   created by: Markus W. Scherer
     15 *
     16 *   The current code in this file replaces the previous implementation
     17 *   of conversion code from multi-byte codepages to Unicode and back.
     18 *   This implementation supports the following:
     19 *   - legacy variable-length codepages with up to 4 bytes per character
     20 *   - all Unicode code points (up to 0x10ffff)
     21 *   - efficient distinction of unassigned vs. illegal byte sequences
     22 *   - it is possible in fromUnicode() to directly deal with simple
     23 *     stateful encodings (used for EBCDIC_STATEFUL)
     24 *   - it is possible to convert Unicode code points
     25 *     to a single zero byte (but not as a fallback except for SBCS)
     26 *
     27 *   Remaining limitations in fromUnicode:
     28 *   - byte sequences must not have leading zero bytes
     29 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
     30 *   - limitation to up to 4 bytes per character
     31 *
     32 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
     33 *   limitations and adds m:n character mappings and other features.
     34 *   See ucnv_ext.h for details.
     35 *
     36 *   Change history:
     37 *
     38 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
     39 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
     40 *                             macros to ucnvmbcs.h file
     41 */
     42 
     43 #include "unicode/utypes.h"
     44 
     45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     46 
     47 #include "unicode/ucnv.h"
     48 #include "unicode/ucnv_cb.h"
     49 #include "unicode/udata.h"
     50 #include "unicode/uset.h"
     51 #include "ucnv_bld.h"
     52 #include "ucnvmbcs.h"
     53 #include "ucnv_ext.h"
     54 #include "ucnv_cnv.h"
     55 #include "umutex.h"
     56 #include "cmemory.h"
     57 #include "cstring.h"
     58 
     59 /* control optimizations according to the platform */
     60 #define MBCS_UNROLL_SINGLE_TO_BMP 1
     61 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
     62 
     63 /*
     64  * _MBCSHeader versions 5.3 & 4.3
     65  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
     66  *
     67  * This version is optional. Version 5 is used for incompatible data format changes.
     68  * makeconv will continue to generate version 4 files if possible.
     69  *
     70  * Changes from version 4:
     71  *
     72  * The main difference is an additional _MBCSHeader field with
     73  * - the length (number of uint32_t) of the _MBCSHeader
     74  * - flags for further incompatible data format changes
     75  * - flags for further, backward compatible data format changes
     76  *
     77  * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
     78  * the file and needs to be reconstituted at load time.
     79  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
     80  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
     81  * (For details about these structures see below, and see ucnvmbcs.h.)
     82  *
     83  *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
     84  *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
     85  *   precision markers for all mappings.)
     86  *
     87  *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
     88  *   omitted data that can be reconstituted from the toUnicode data.
     89  *
     90  *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
     91  *   With only roundtrip mappings in the base fromUnicode data, this part is fully
     92  *   redundant with the mbcsIndex and will be reconstituted from that (also using the
     93  *   stage 1 table which contains the information about how stage 2 was compacted).
     94  *
     95  *   The rest of the stage 2 table, the part for code points above maxFastUChar,
     96  *   is stored in the file and will be appended to the reconstituted part.
     97  *
     98  *   The entire fromUBytes array is omitted from the file and will be reconstituted.
     99  *   This is done by enumerating all toUnicode roundtrip mappings, performing
    100  *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
    101  *   writing instead of reading the byte values.
    102  *
    103  * _MBCSHeader version 4.3
    104  *
    105  * Change from version 4.2:
    106  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
    107  *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
    108  *   files which can be used instead of stages 1 & 2.
    109  *   Faster lookups for roundtrips from most commonly used characters,
    110  *   and lookups from UTF-8 byte sequences with a natural bit distribution.
    111  *   See ucnvmbcs.h for more details.
    112  *
    113  * Change from version 4.1:
    114  * - Added an optional extension table structure at the end of the .cnv file.
    115  *   It is present if the upper bits of the header flags field contains a non-zero
    116  *   byte offset to it.
    117  *   Files that contain only a conversion table and no base table
    118  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
    119  *   These contain the base table name between the MBCS header and the extension
    120  *   data.
    121  *
    122  * Change from version 4.0:
    123  * - Replace header.reserved with header.fromUBytesLength so that all
    124  *   fields in the data have length.
    125  *
    126  * Changes from version 3 (for performance improvements):
    127  * - new bit distribution for state table entries
    128  * - reordered action codes
    129  * - new data structure for single-byte fromUnicode
    130  *   + stage 2 only contains indexes
    131  *   + stage 3 stores 16 bits per character with classification bits 15..8
    132  * - no multiplier for stage 1 entries
    133  * - stage 2 for non-single-byte codepages contains the index and the flags in
    134  *   one 32-bit value
    135  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
    136  *
    137  * For more details about old versions of the MBCS data structure, see
    138  * the corresponding versions of this file.
    139  *
    140  * Converting stateless codepage data ---------------------------------------***
    141  * (or codepage data with simple states) to Unicode.
    142  *
    143  * Data structure and algorithm for converting from complex legacy codepages
    144  * to Unicode. (Designed before 2000-may-22.)
    145  *
    146  * The basic idea is that the structure of legacy codepages can be described
    147  * with state tables.
    148  * When reading a byte stream, each input byte causes a state transition.
    149  * Some transitions result in the output of a code point, some result in
    150  * "unassigned" or "illegal" output.
    151  * This is used here for character conversion.
    152  *
    153  * The data structure begins with a state table consisting of a row
    154  * per state, with 256 entries (columns) per row for each possible input
    155  * byte value.
    156  * Each entry is 32 bits wide, with two formats distinguished by
    157  * the sign bit (bit 31):
    158  *
    159  * One format for transitional entries (bit 31 not set) for non-final bytes, and
    160  * one format for final entries (bit 31 set).
    161  * Both formats contain the number of the next state in the same bit
    162  * positions.
    163  * State 0 is the initial state.
    164  *
    165  * Most of the time, the offset values of subsequent states are added
    166  * up to a scalar value. This value will eventually be the index of
    167  * the Unicode code point in a table that follows the state table.
    168  * The effect is that the code points for final state table rows
    169  * are contiguous. The code points of final state rows follow each other
    170  * in the order of the references to those final states by previous
    171  * states, etc.
    172  *
    173  * For some terminal states, the offset is itself the output Unicode
    174  * code point (16 bits for a BMP code point or 20 bits for a supplementary
    175  * code point (stored as code point minus 0x10000 so that 20 bits are enough)).
    176  * For others, the code point in the Unicode table is stored with either
    177  * one or two code units: one for BMP code points, two for a pair of
    178  * surrogates.
    179  * All code points for a final state entry take up the same number of code
    180  * units, regardless of whether they all actually _use_ the same number
    181  * of code units. This is necessary for simple array access.
    182  *
    183  * An additional feature comes in with what in ICU is called "fallback"
    184  * mappings:
    185  *
    186  * In addition to round-trippable, precise, 1:1 mappings, there are often
    187  * mappings defined between similar, though not the same, characters.
    188  * Typically, such mappings occur only in fromUnicode mapping tables because
    189  * Unicode has a superset repertoire of most other codepages. However, it
    190  * is possible to provide such mappings in the toUnicode tables, too.
    191  * In this case, the fallback mappings are partly integrated into the
    192  * general state tables because the structure of the encoding includes their
    193  * byte sequences.
    194  * For final entries in an initial state, fallback mappings are stored in
    195  * the entry itself like with roundtrip mappings.
    196  * For other final entries, they are stored in the code units table if
    197  * the entry is for a pair of code units.
    198  * For single-unit results in the code units table, there is no space to
    199  * alternatively hold a fallback mapping; in this case, the code unit
    200  * is stored as U+fffe (unassigned), and the fallback mapping needs to
    201  * be looked up by the scalar offset value in a separate table.
    202  *
    203  * "Unassigned" state entries really mean "structurally unassigned",
    204  * i.e., such a byte sequence will never have a mapping result.
    205  *
    206  * The interpretation of the bits in each entry is as follows:
    207  *
    208  * Bit 31 not set, not a terminal entry ("transitional"):
    209  * 30..24 next state
    210  * 23..0  offset delta, to be added up
    211  *
    212  * Bit 31 set, terminal ("final") entry:
    213  * 30..24 next state (regardless of action code)
    214  * 23..20 action code:
    215  *        action codes 0 and 1 result in precise-mapping Unicode code points
    216  *        0  valid byte sequence
    217  *           19..16 not used, 0
    218  *           15..0  16-bit Unicode BMP code point
    219  *                  never U+fffe or U+ffff
    220  *        1  valid byte sequence
    221  *           19..0  20-bit Unicode supplementary code point
    222  *                  never U+fffe or U+ffff
    223  *
    224  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
    225  *        2  valid byte sequence (fallback)
    226  *           19..16 not used, 0
    227  *           15..0  16-bit Unicode BMP code point as fallback result
    228  *        3  valid byte sequence (fallback)
    229  *           19..0  20-bit Unicode supplementary code point as fallback result
    230  *
    231  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
    232  *        depending on the code units they result in
    233  *        4  valid byte sequence
    234  *           19..9  not used, 0
    235  *            8..0  final offset delta
    236  *                  pointing to one 16-bit code unit which may be
    237  *                  fffe  unassigned -- look for a fallback for this offset
    238  *                  ffff  illegal
    239  *        5  valid byte sequence
    240  *           19..9  not used, 0
    241  *            8..0  final offset delta
    242  *                  pointing to two 16-bit code units
    243  *                  (typically UTF-16 surrogates)
    244  *                  the result depends on the first code unit as follows:
    245  *                  0000..d7ff  roundtrip BMP code point (1st alone)
    246  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
    247  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
    248  *                  e000        roundtrip BMP code point (2nd alone)
    249  *                  e001        fallback BMP code point (2nd alone)
    250  *                  fffe        unassigned
    251  *                  ffff        illegal
    252  *           (the final offset deltas are at most 255 * 2,
    253  *            times 2 because of storing code unit pairs)
    254  *
    255  *        6  unassigned byte sequence
    256  *           19..16 not used, 0
    257  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
    258  *                  this does not contain a final offset delta because the main
    259  *                  purpose of this action code is to save scalar offset values;
    260  *                  therefore, fallback values cannot be assigned to byte
    261  *                  sequences that result in this action code
    262  *        7  illegal byte sequence
    263  *           19..16 not used, 0
    264  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
    265  *        8  state change only
    266  *           19..0  not used, 0
    267  *           useful for state changes in simple stateful encodings,
    268  *           at Shift-In/Shift-Out codes
    269  *
    270  *
    271  *        9..15 reserved for future use
    272  *           current implementations will only perform a state change
    273  *           and ignore bits 19..0
    274  *
    275  * An encoding with contiguous ranges of unassigned byte sequences, like
    276  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
    277  * at least two states for the trail bytes:
    278  * One trail byte state that results in code points, and one that only
    279  * has "unassigned" and "illegal" terminal states.
    280  *
    281  * Note: partly by accident, this data structure supports simple stateful
    282  * encodings without any additional logic.
    283  * Currently, only simple Shift-In/Shift-Out schemes are handled with
    284  * appropriate state tables (especially EBCDIC_STATEFUL!).
    285  *
    286  * MBCS version 2 added:
    287  * unassigned and illegal action codes have U+fffe and U+ffff
    288  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
    289  *
    290  * Converting from Unicode to codepage bytes --------------------------------***
    291  *
    292  * The conversion data structure for fromUnicode is designed for the known
    293  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
    294  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
    295  * a roundtrip mapping.
    296  *
    297  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
    298  * like in the character properties table.
    299  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
    300  * with the resulting bytes is at offsetFromUBytes.
    301  *
    302  * Beginning with version 4, single-byte codepages have a significantly different
    303  * trie compared to other codepages.
    304  * In all cases, the entry in stage 1 is directly the index of the block of
    305  * 64 entries in stage 2.
    306  *
    307  * Single-byte lookup:
    308  *
    309  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
    310  * Stage 3 contains one 16-bit word per result:
    311  * Bits 15..8 indicate the kind of result:
    312  *    f  roundtrip result
    313  *    c  fallback result from private-use code point
    314  *    8  fallback result from other code points
    315  *    0  unassigned
    316  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
    317  *
    318  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
    319  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
    320  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    321  * ASCII code points can be looked up with a linear array access into stage 3.
    322  * See maxFastUChar and other details in ucnvmbcs.h.
    323  *
    324  * Multi-byte lookup:
    325  *
    326  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
    327  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
    328  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
    329  *             If this test is false, then a non-zero result will be interpreted as
    330  *             a fallback mapping.
    331  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
    332  *
    333  * Stage 3 contains 2, 3, or 4 bytes per result.
    334  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
    335  * while 3 bytes are stored as bytes in big-endian order.
    336  * Leading zero bytes are ignored, and the number of bytes is counted.
    337  * A zero byte mapping result is possible as a roundtrip result.
    338  * For some output types, the actual result is processed from this;
    339  * see ucnv_MBCSFromUnicodeWithOffsets().
    340  *
    341  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
    342  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
    343  *
    344  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
    345  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
    346  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
    347  * ASCII code points can be looked up with a linear array access into stage 3.
    348  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
    349  *
    350  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
    351  * for compaction.
    352  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
    353  * may overlap by any number of entries.
    354  *
    355  * MBCS version 2 added:
    356  * the converter checks for known output types, which allows
    357  * adding new ones without crashing an unaware converter
    358  */
    359 
    360 static const UConverterImpl _SBCSUTF8Impl;
    361 static const UConverterImpl _DBCSUTF8Impl;
    362 
    363 /* GB 18030 data ------------------------------------------------------------ */
    364 
    365 /* helper macros for linear values for GB 18030 four-byte sequences */
    366 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
    367 
    368 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
    369 
    370 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
    371 
    372 /*
    373  * Some ranges of GB 18030 where both the Unicode code points and the
    374  * GB four-byte sequences are contiguous and are handled algorithmically by
    375  * the special callback functions below.
    376  * The values are start & end of Unicode & GB codes.
    377  *
    378  * Note that single surrogates are not mapped by GB 18030
    379  * as of the re-released mapping tables from 2000-nov-30.
    380  */
    381 static const uint32_t
    382 gb18030Ranges[13][4]={
    383     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    384     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    385     {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
    386     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    387     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    388     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    389     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    390     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    391     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    392     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    393     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    394     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    395     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
    396 };
    397 
    398 /* bit flag for UConverter.options indicating GB 18030 special handling */
    399 #define _MBCS_OPTION_GB18030 0x8000
    400 
    401 /* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
    402 #define _MBCS_OPTION_KEIS 0x01000
    403 #define _MBCS_OPTION_JEF  0x02000
    404 #define _MBCS_OPTION_JIPS 0x04000
    405 
    406 #define KEIS_SO_CHAR_1 0x0A
    407 #define KEIS_SO_CHAR_2 0x42
    408 #define KEIS_SI_CHAR_1 0x0A
    409 #define KEIS_SI_CHAR_2 0x41
    410 
    411 #define JEF_SO_CHAR 0x28
    412 #define JEF_SI_CHAR 0x29
    413 
    414 #define JIPS_SO_CHAR_1 0x1A
    415 #define JIPS_SO_CHAR_2 0x70
    416 #define JIPS_SI_CHAR_1 0x1A
    417 #define JIPS_SI_CHAR_2 0x71
    418 
    419 enum SISO_Option {
    420     SI,
    421     SO
    422 };
    423 typedef enum SISO_Option SISO_Option;
    424 
    425 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
    426     int32_t SISOLength = 0;
    427 
    428     switch (option) {
    429         case SI:
    430             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    431                 value[0] = KEIS_SI_CHAR_1;
    432                 value[1] = KEIS_SI_CHAR_2;
    433                 SISOLength = 2;
    434             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    435                 value[0] = JEF_SI_CHAR;
    436                 SISOLength = 1;
    437             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    438                 value[0] = JIPS_SI_CHAR_1;
    439                 value[1] = JIPS_SI_CHAR_2;
    440                 SISOLength = 2;
    441             } else {
    442                 value[0] = UCNV_SI;
    443                 SISOLength = 1;
    444             }
    445             break;
    446         case SO:
    447             if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
    448                 value[0] = KEIS_SO_CHAR_1;
    449                 value[1] = KEIS_SO_CHAR_2;
    450                 SISOLength = 2;
    451             } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
    452                 value[0] = JEF_SO_CHAR;
    453                 SISOLength = 1;
    454             } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
    455                 value[0] = JIPS_SO_CHAR_1;
    456                 value[1] = JIPS_SO_CHAR_2;
    457                 SISOLength = 2;
    458             } else {
    459                 value[0] = UCNV_SO;
    460                 SISOLength = 1;
    461             }
    462             break;
    463         default:
    464             /* Should never happen. */
    465             break;
    466     }
    467 
    468     return SISOLength;
    469 }
    470 
    471 /* Miscellaneous ------------------------------------------------------------ */
    472 
    473 /**
    474  * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
    475  * consecutive sequences of bytes, starting from the one encoded in value,
    476  * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
    477  * Does not currently support m:n mappings or reverse fallbacks.
    478  * This function will not be called for sequences of bytes with leading zeros.
    479  *
    480  * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
    481  * @param value contains 1..4 bytes of the first byte sequence, right-aligned
    482  * @param codePoints resulting Unicode code points, or negative if a byte sequence does
    483  *        not map to anything
    484  * @return TRUE to continue enumeration, FALSE to stop
    485  */
    486 typedef UBool U_CALLCONV
    487 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
    488 
    489 /* similar to ucnv_MBCSGetNextUChar() but recursive */
    490 static UBool
    491 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
    492         int32_t state, uint32_t offset,
    493         uint32_t value,
    494         UConverterEnumToUCallback *callback, const void *context,
    495         UErrorCode *pErrorCode) {
    496     UChar32 codePoints[32];
    497     const int32_t *row;
    498     const uint16_t *unicodeCodeUnits;
    499     UChar32 anyCodePoints;
    500     int32_t b, limit;
    501 
    502     row=mbcsTable->stateTable[state];
    503     unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
    504 
    505     value<<=8;
    506     anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
    507 
    508     b=(stateProps[state]&0x38)<<2;
    509     if(b==0 && stateProps[state]>=0x40) {
    510         /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
    511         codePoints[0]=U_SENTINEL;
    512         b=1;
    513     }
    514     limit=((stateProps[state]&7)+1)<<5;
    515     while(b<limit) {
    516         int32_t entry=row[b];
    517         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    518             int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
    519             if(stateProps[nextState]>=0) {
    520                 /* recurse to a state with non-ignorable actions */
    521                 if(!enumToU(
    522                         mbcsTable, stateProps, nextState,
    523                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
    524                         value|(uint32_t)b,
    525                         callback, context,
    526                         pErrorCode)) {
    527                     return FALSE;
    528                 }
    529             }
    530             codePoints[b&0x1f]=U_SENTINEL;
    531         } else {
    532             UChar32 c;
    533             int32_t action;
    534 
    535             /*
    536              * An if-else-if chain provides more reliable performance for
    537              * the most common cases compared to a switch.
    538              */
    539             action=MBCS_ENTRY_FINAL_ACTION(entry);
    540             if(action==MBCS_STATE_VALID_DIRECT_16) {
    541                 /* output BMP code point */
    542                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
    543             } else if(action==MBCS_STATE_VALID_16) {
    544                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    545                 c=unicodeCodeUnits[finalOffset];
    546                 if(c<0xfffe) {
    547                     /* output BMP code point */
    548                 } else {
    549                     c=U_SENTINEL;
    550                 }
    551             } else if(action==MBCS_STATE_VALID_16_PAIR) {
    552                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    553                 c=unicodeCodeUnits[finalOffset++];
    554                 if(c<0xd800) {
    555                     /* output BMP code point below 0xd800 */
    556                 } else if(c<=0xdbff) {
    557                     /* output roundtrip or fallback supplementary code point */
    558                     c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
    559                 } else if(c==0xe000) {
    560                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
    561                     c=unicodeCodeUnits[finalOffset];
    562                 } else {
    563                     c=U_SENTINEL;
    564                 }
    565             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
    566                 /* output supplementary code point */
    567                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
    568             } else {
    569                 c=U_SENTINEL;
    570             }
    571 
    572             codePoints[b&0x1f]=c;
    573             anyCodePoints&=c;
    574         }
    575         if(((++b)&0x1f)==0) {
    576             if(anyCodePoints>=0) {
    577                 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
    578                     return FALSE;
    579                 }
    580                 anyCodePoints=-1;
    581             }
    582         }
    583     }
    584     return TRUE;
    585 }
    586 
    587 /*
    588  * Only called if stateProps[state]==-1.
    589  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
    590  * MBCS_STATE_CHANGE_ONLY.
    591  */
    592 static int8_t
    593 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
    594     const int32_t *row;
    595     int32_t min, max, entry, nextState;
    596 
    597     row=stateTable[state];
    598     stateProps[state]=0;
    599 
    600     /* find first non-ignorable state */
    601     for(min=0;; ++min) {
    602         entry=row[min];
    603         nextState=MBCS_ENTRY_STATE(entry);
    604         if(stateProps[nextState]==-1) {
    605             getStateProp(stateTable, stateProps, nextState);
    606         }
    607         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    608             if(stateProps[nextState]>=0) {
    609                 break;
    610             }
    611         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    612             break;
    613         }
    614         if(min==0xff) {
    615             stateProps[state]=-0x40;  /* (int8_t)0xc0 */
    616             return stateProps[state];
    617         }
    618     }
    619     stateProps[state]|=(int8_t)((min>>5)<<3);
    620 
    621     /* find last non-ignorable state */
    622     for(max=0xff; min<max; --max) {
    623         entry=row[max];
    624         nextState=MBCS_ENTRY_STATE(entry);
    625         if(stateProps[nextState]==-1) {
    626             getStateProp(stateTable, stateProps, nextState);
    627         }
    628         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    629             if(stateProps[nextState]>=0) {
    630                 break;
    631             }
    632         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
    633             break;
    634         }
    635     }
    636     stateProps[state]|=(int8_t)(max>>5);
    637 
    638     /* recurse further and collect direct-state information */
    639     while(min<=max) {
    640         entry=row[min];
    641         nextState=MBCS_ENTRY_STATE(entry);
    642         if(stateProps[nextState]==-1) {
    643             getStateProp(stateTable, stateProps, nextState);
    644         }
    645         if(MBCS_ENTRY_IS_FINAL(entry)) {
    646             stateProps[nextState]|=0x40;
    647             if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
    648                 stateProps[state]|=0x40;
    649             }
    650         }
    651         ++min;
    652     }
    653     return stateProps[state];
    654 }
    655 
    656 /*
    657  * Internal function enumerating the toUnicode data of an MBCS converter.
    658  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
    659  * table, but could also be used for a future ucnv_getUnicodeSet() option
    660  * that includes reverse fallbacks (after updating this function's implementation).
    661  * Currently only handles roundtrip mappings.
    662  * Does not currently handle extensions.
    663  */
    664 static void
    665 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
    666                        UConverterEnumToUCallback *callback, const void *context,
    667                        UErrorCode *pErrorCode) {
    668     /*
    669      * Properties for each state, to speed up the enumeration.
    670      * Ignorable actions are unassigned/illegal/state-change-only:
    671      * They do not lead to mappings.
    672      *
    673      * Bits 7..6:
    674      * 1 direct/initial state (stateful converters have multiple)
    675      * 0 non-initial state with transitions or with non-ignorable result actions
    676      * -1 final state with only ignorable actions
    677      *
    678      * Bits 5..3:
    679      * The lowest byte value with non-ignorable actions is
    680      * value<<5 (rounded down).
    681      *
    682      * Bits 2..0:
    683      * The highest byte value with non-ignorable actions is
    684      * (value<<5)&0x1f (rounded up).
    685      */
    686     int8_t stateProps[MBCS_MAX_STATE_COUNT];
    687     int32_t state;
    688 
    689     uprv_memset(stateProps, -1, sizeof(stateProps));
    690 
    691     /* recurse from state 0 and set all stateProps */
    692     getStateProp(mbcsTable->stateTable, stateProps, 0);
    693 
    694     for(state=0; state<mbcsTable->countStates; ++state) {
    695         /*if(stateProps[state]==-1) {
    696             printf("unused/unreachable <icu:state> %d\n", state);
    697         }*/
    698         if(stateProps[state]>=0x40) {
    699             /* start from each direct state */
    700             enumToU(
    701                 mbcsTable, stateProps, state, 0, 0,
    702                 callback, context,
    703                 pErrorCode);
    704         }
    705     }
    706 }
    707 
    708 U_CFUNC void
    709 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
    710                                          const USetAdder *sa,
    711                                          UConverterUnicodeSet which,
    712                                          UConverterSetFilter filter,
    713                                          UErrorCode *pErrorCode) {
    714     const UConverterMBCSTable *mbcsTable;
    715     const uint16_t *table;
    716 
    717     uint32_t st3;
    718     uint16_t st1, maxStage1, st2;
    719 
    720     UChar32 c;
    721 
    722     /* enumerate the from-Unicode trie table */
    723     mbcsTable=&sharedData->mbcs;
    724     table=mbcsTable->fromUnicodeTable;
    725     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
    726         maxStage1=0x440;
    727     } else {
    728         maxStage1=0x40;
    729     }
    730 
    731     c=0; /* keep track of the current code point while enumerating */
    732 
    733     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
    734         const uint16_t *stage2, *stage3, *results;
    735         uint16_t minValue;
    736 
    737         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
    738 
    739         /*
    740          * Set a threshold variable for selecting which mappings to use.
    741          * See ucnv_MBCSSingleFromBMPWithOffsets() and
    742          * MBCS_SINGLE_RESULT_FROM_U() for details.
    743          */
    744         if(which==UCNV_ROUNDTRIP_SET) {
    745             /* use only roundtrips */
    746             minValue=0xf00;
    747         } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
    748             /* use all roundtrip and fallback results */
    749             minValue=0x800;
    750         }
    751 
    752         for(st1=0; st1<maxStage1; ++st1) {
    753             st2=table[st1];
    754             if(st2>maxStage1) {
    755                 stage2=table+st2;
    756                 for(st2=0; st2<64; ++st2) {
    757                     if((st3=stage2[st2])!=0) {
    758                         /* read the stage 3 block */
    759                         stage3=results+st3;
    760 
    761                         do {
    762                             if(*stage3++>=minValue) {
    763                                 sa->add(sa->set, c);
    764                             }
    765                         } while((++c&0xf)!=0);
    766                     } else {
    767                         c+=16; /* empty stage 3 block */
    768                     }
    769                 }
    770             } else {
    771                 c+=1024; /* empty stage 2 block */
    772             }
    773         }
    774     } else {
    775         const uint32_t *stage2;
    776         const uint8_t *stage3, *bytes;
    777         uint32_t st3Multiplier;
    778         uint32_t value;
    779         UBool useFallback;
    780 
    781         bytes=mbcsTable->fromUnicodeBytes;
    782 
    783         useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
    784 
    785         switch(mbcsTable->outputType) {
    786         case MBCS_OUTPUT_3:
    787         case MBCS_OUTPUT_4_EUC:
    788             st3Multiplier=3;
    789             break;
    790         case MBCS_OUTPUT_4:
    791             st3Multiplier=4;
    792             break;
    793         default:
    794             st3Multiplier=2;
    795             break;
    796         }
    797 
    798         for(st1=0; st1<maxStage1; ++st1) {
    799             st2=table[st1];
    800             if(st2>(maxStage1>>1)) {
    801                 stage2=(const uint32_t *)table+st2;
    802                 for(st2=0; st2<64; ++st2) {
    803                     if((st3=stage2[st2])!=0) {
    804                         /* read the stage 3 block */
    805                         stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
    806 
    807                         /* get the roundtrip flags for the stage 3 block */
    808                         st3>>=16;
    809 
    810                         /*
    811                          * Add code points for which the roundtrip flag is set,
    812                          * or which map to non-zero bytes if we use fallbacks.
    813                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
    814                          */
    815                         switch(filter) {
    816                         case UCNV_SET_FILTER_NONE:
    817                             do {
    818                                 if(st3&1) {
    819                                     sa->add(sa->set, c);
    820                                     stage3+=st3Multiplier;
    821                                 } else if(useFallback) {
    822                                     uint8_t b=0;
    823                                     switch(st3Multiplier) {
    824                                     case 4:
    825                                         b|=*stage3++;
    826                                     case 3:
    827                                         b|=*stage3++;
    828                                     case 2:
    829                                         b|=stage3[0]|stage3[1];
    830                                         stage3+=2;
    831                                     default:
    832                                         break;
    833                                     }
    834                                     if(b!=0) {
    835                                         sa->add(sa->set, c);
    836                                     }
    837                                 }
    838                                 st3>>=1;
    839                             } while((++c&0xf)!=0);
    840                             break;
    841                         case UCNV_SET_FILTER_DBCS_ONLY:
    842                              /* Ignore single-byte results (<0x100). */
    843                             do {
    844                                 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
    845                                     sa->add(sa->set, c);
    846                                 }
    847                                 st3>>=1;
    848                                 stage3+=2;  /* +=st3Multiplier */
    849                             } while((++c&0xf)!=0);
    850                             break;
    851                         case UCNV_SET_FILTER_2022_CN:
    852                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
    853                             do {
    854                                 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
    855                                     sa->add(sa->set, c);
    856                                 }
    857                                 st3>>=1;
    858                                 stage3+=3;  /* +=st3Multiplier */
    859                             } while((++c&0xf)!=0);
    860                             break;
    861                         case UCNV_SET_FILTER_SJIS:
    862                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
    863                             do {
    864                                 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
    865                                     sa->add(sa->set, c);
    866                                 }
    867                                 st3>>=1;
    868                                 stage3+=2;  /* +=st3Multiplier */
    869                             } while((++c&0xf)!=0);
    870                             break;
    871                         case UCNV_SET_FILTER_GR94DBCS:
    872                             /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
    873                             do {
    874                                 if( ((st3&1)!=0 || useFallback) &&
    875                                     (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
    876                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
    877                                 ) {
    878                                     sa->add(sa->set, c);
    879                                 }
    880                                 st3>>=1;
    881                                 stage3+=2;  /* +=st3Multiplier */
    882                             } while((++c&0xf)!=0);
    883                             break;
    884                         case UCNV_SET_FILTER_HZ:
    885                             /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
    886                             do {
    887                                 if( ((st3&1)!=0 || useFallback) &&
    888                                     (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
    889                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
    890                                 ) {
    891                                     sa->add(sa->set, c);
    892                                 }
    893                                 st3>>=1;
    894                                 stage3+=2;  /* +=st3Multiplier */
    895                             } while((++c&0xf)!=0);
    896                             break;
    897                         default:
    898                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    899                             return;
    900                         }
    901                     } else {
    902                         c+=16; /* empty stage 3 block */
    903                     }
    904                 }
    905             } else {
    906                 c+=1024; /* empty stage 2 block */
    907             }
    908         }
    909     }
    910 
    911     ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
    912 }
    913 
    914 U_CFUNC void
    915 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
    916                                  const USetAdder *sa,
    917                                  UConverterUnicodeSet which,
    918                                  UErrorCode *pErrorCode) {
    919     ucnv_MBCSGetFilteredUnicodeSetForUnicode(
    920         sharedData, sa, which,
    921         sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
    922             UCNV_SET_FILTER_DBCS_ONLY :
    923             UCNV_SET_FILTER_NONE,
    924         pErrorCode);
    925 }
    926 
    927 static void
    928 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
    929                    const USetAdder *sa,
    930                    UConverterUnicodeSet which,
    931                    UErrorCode *pErrorCode) {
    932     if(cnv->options&_MBCS_OPTION_GB18030) {
    933         sa->addRange(sa->set, 0, 0xd7ff);
    934         sa->addRange(sa->set, 0xe000, 0x10ffff);
    935     } else {
    936         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
    937     }
    938 }
    939 
    940 /* conversion extensions for input not in the main table -------------------- */
    941 
    942 /*
    943  * Hardcoded extension handling for GB 18030.
    944  * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
    945  *
    946  * In the future, conversion extensions may handle m:n mappings and delta tables,
    947  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
    948  *
    949  * If an input character cannot be mapped, then these functions set an error
    950  * code. The framework will then call the callback function.
    951  */
    952 
    953 /*
    954  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
    955  *         else return 0 after output has been written to the target
    956  */
    957 static UChar32
    958 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
    959           UChar32 cp,
    960           const UChar **source, const UChar *sourceLimit,
    961           uint8_t **target, const uint8_t *targetLimit,
    962           int32_t **offsets, int32_t sourceIndex,
    963           UBool flush,
    964           UErrorCode *pErrorCode) {
    965     const int32_t *cx;
    966 
    967     cnv->useSubChar1=FALSE;
    968 
    969     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
    970         ucnv_extInitialMatchFromU(
    971             cnv, cx,
    972             cp, source, sourceLimit,
    973             (char **)target, (char *)targetLimit,
    974             offsets, sourceIndex,
    975             flush,
    976             pErrorCode)
    977     ) {
    978         return 0; /* an extension mapping handled the input */
    979     }
    980 
    981     /* GB 18030 */
    982     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
    983         const uint32_t *range;
    984         int32_t i;
    985 
    986         range=gb18030Ranges[0];
    987         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
    988             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
    989                 /* found the Unicode code point, output the four-byte sequence for it */
    990                 uint32_t linear;
    991                 char bytes[4];
    992 
    993                 /* get the linear value of the first GB 18030 code in this range */
    994                 linear=range[2]-LINEAR_18030_BASE;
    995 
    996                 /* add the offset from the beginning of the range */
    997                 linear+=((uint32_t)cp-range[0]);
    998 
    999                 /* turn this into a four-byte sequence */
   1000                 bytes[3]=(char)(0x30+linear%10); linear/=10;
   1001                 bytes[2]=(char)(0x81+linear%126); linear/=126;
   1002                 bytes[1]=(char)(0x30+linear%10); linear/=10;
   1003                 bytes[0]=(char)(0x81+linear);
   1004 
   1005                 /* output this sequence */
   1006                 ucnv_fromUWriteBytes(cnv,
   1007                                      bytes, 4, (char **)target, (char *)targetLimit,
   1008                                      offsets, sourceIndex, pErrorCode);
   1009                 return 0;
   1010             }
   1011         }
   1012     }
   1013 
   1014     /* no mapping */
   1015     *pErrorCode=U_INVALID_CHAR_FOUND;
   1016     return cp;
   1017 }
   1018 
   1019 /*
   1020  * Input sequence: cnv->toUBytes[0..length[
   1021  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
   1022  *         else return 0 after output has been written to the target
   1023  */
   1024 static int8_t
   1025 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
   1026         int8_t length,
   1027         const uint8_t **source, const uint8_t *sourceLimit,
   1028         UChar **target, const UChar *targetLimit,
   1029         int32_t **offsets, int32_t sourceIndex,
   1030         UBool flush,
   1031         UErrorCode *pErrorCode) {
   1032     const int32_t *cx;
   1033 
   1034     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
   1035         ucnv_extInitialMatchToU(
   1036             cnv, cx,
   1037             length, (const char **)source, (const char *)sourceLimit,
   1038             target, targetLimit,
   1039             offsets, sourceIndex,
   1040             flush,
   1041             pErrorCode)
   1042     ) {
   1043         return 0; /* an extension mapping handled the input */
   1044     }
   1045 
   1046     /* GB 18030 */
   1047     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
   1048         const uint32_t *range;
   1049         uint32_t linear;
   1050         int32_t i;
   1051 
   1052         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
   1053         range=gb18030Ranges[0];
   1054         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
   1055             if(range[2]<=linear && linear<=range[3]) {
   1056                 /* found the sequence, output the Unicode code point for it */
   1057                 *pErrorCode=U_ZERO_ERROR;
   1058 
   1059                 /* add the linear difference between the input and start sequences to the start code point */
   1060                 linear=range[0]+(linear-range[2]);
   1061 
   1062                 /* output this code point */
   1063                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
   1064 
   1065                 return 0;
   1066             }
   1067         }
   1068     }
   1069 
   1070     /* no mapping */
   1071     *pErrorCode=U_INVALID_CHAR_FOUND;
   1072     return length;
   1073 }
   1074 
   1075 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
   1076 
   1077 /*
   1078  * This code modifies a standard EBCDIC<->Unicode mapping table for
   1079  * OS/390 (z/OS) Unix System Services (Open Edition).
   1080  * The difference is in the mapping of Line Feed and New Line control codes:
   1081  * Standard EBCDIC maps
   1082  *
   1083  *   <U000A> \x25 |0
   1084  *   <U0085> \x15 |0
   1085  *
   1086  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
   1087  * mapping
   1088  *
   1089  *   <U000A> \x15 |0
   1090  *   <U0085> \x25 |0
   1091  *
   1092  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
   1093  * by copying it into allocated memory and swapping the LF and NL values.
   1094  * It allows to support the same EBCDIC charset in both versions without
   1095  * duplicating the entire installed table.
   1096  */
   1097 
   1098 /* standard EBCDIC codes */
   1099 #define EBCDIC_LF 0x25
   1100 #define EBCDIC_NL 0x15
   1101 
   1102 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
   1103 #define EBCDIC_RT_LF 0xf25
   1104 #define EBCDIC_RT_NL 0xf15
   1105 
   1106 /* Unicode code points */
   1107 #define U_LF 0x0a
   1108 #define U_NL 0x85
   1109 
   1110 static UBool
   1111 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
   1112     UConverterMBCSTable *mbcsTable;
   1113 
   1114     const uint16_t *table, *results;
   1115     const uint8_t *bytes;
   1116 
   1117     int32_t (*newStateTable)[256];
   1118     uint16_t *newResults;
   1119     uint8_t *p;
   1120     char *name;
   1121 
   1122     uint32_t stage2Entry;
   1123     uint32_t size, sizeofFromUBytes;
   1124 
   1125     mbcsTable=&sharedData->mbcs;
   1126 
   1127     table=mbcsTable->fromUnicodeTable;
   1128     bytes=mbcsTable->fromUnicodeBytes;
   1129     results=(const uint16_t *)bytes;
   1130 
   1131     /*
   1132      * Check that this is an EBCDIC table with SBCS portion -
   1133      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
   1134      *
   1135      * If not, ignore the option. Options are always ignored if they do not apply.
   1136      */
   1137     if(!(
   1138          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
   1139          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
   1140          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
   1141     )) {
   1142         return FALSE;
   1143     }
   1144 
   1145     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1146         if(!(
   1147              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
   1148              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
   1149         )) {
   1150             return FALSE;
   1151         }
   1152     } else /* MBCS_OUTPUT_2_SISO */ {
   1153         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1154         if(!(
   1155              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
   1156              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
   1157         )) {
   1158             return FALSE;
   1159         }
   1160 
   1161         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1162         if(!(
   1163              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
   1164              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
   1165         )) {
   1166             return FALSE;
   1167         }
   1168     }
   1169 
   1170     if(mbcsTable->fromUBytesLength>0) {
   1171         /*
   1172          * We _know_ the number of bytes in the fromUnicodeBytes array
   1173          * starting with header.version 4.1.
   1174          */
   1175         sizeofFromUBytes=mbcsTable->fromUBytesLength;
   1176     } else {
   1177         /*
   1178          * Otherwise:
   1179          * There used to be code to enumerate the fromUnicode
   1180          * trie and find the highest entry, but it was removed in ICU 3.2
   1181          * because it was not tested and caused a low code coverage number.
   1182          * See Jitterbug 3674.
   1183          * This affects only some .cnv file formats with a header.version
   1184          * below 4.1, and only when swaplfnl is requested.
   1185          *
   1186          * ucnvmbcs.c revision 1.99 is the last one with the
   1187          * ucnv_MBCSSizeofFromUBytes() function.
   1188          */
   1189         *pErrorCode=U_INVALID_FORMAT_ERROR;
   1190         return FALSE;
   1191     }
   1192 
   1193     /*
   1194      * The table has an appropriate format.
   1195      * Allocate and build
   1196      * - a modified to-Unicode state table
   1197      * - a modified from-Unicode output array
   1198      * - a converter name string with the swap option appended
   1199      */
   1200     size=
   1201         mbcsTable->countStates*1024+
   1202         sizeofFromUBytes+
   1203         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
   1204     p=(uint8_t *)uprv_malloc(size);
   1205     if(p==NULL) {
   1206         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1207         return FALSE;
   1208     }
   1209 
   1210     /* copy and modify the to-Unicode state table */
   1211     newStateTable=(int32_t (*)[256])p;
   1212     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
   1213 
   1214     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
   1215     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
   1216 
   1217     /* copy and modify the from-Unicode result table */
   1218     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
   1219     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
   1220 
   1221     /* conveniently, the table access macros work on the left side of expressions */
   1222     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
   1223         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
   1224         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
   1225     } else /* MBCS_OUTPUT_2_SISO */ {
   1226         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
   1227         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
   1228 
   1229         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
   1230         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
   1231     }
   1232 
   1233     /* set the canonical converter name */
   1234     name=(char *)newResults+sizeofFromUBytes;
   1235     uprv_strcpy(name, sharedData->staticData->name);
   1236     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
   1237 
   1238     /* set the pointers */
   1239     umtx_lock(NULL);
   1240     if(mbcsTable->swapLFNLStateTable==NULL) {
   1241         mbcsTable->swapLFNLStateTable=newStateTable;
   1242         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
   1243         mbcsTable->swapLFNLName=name;
   1244 
   1245         newStateTable=NULL;
   1246     }
   1247     umtx_unlock(NULL);
   1248 
   1249     /* release the allocated memory if another thread beat us to it */
   1250     if(newStateTable!=NULL) {
   1251         uprv_free(newStateTable);
   1252     }
   1253     return TRUE;
   1254 }
   1255 
   1256 /* reconstitute omitted fromUnicode data ------------------------------------ */
   1257 
   1258 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
   1259 static UBool U_CALLCONV
   1260 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
   1261     UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
   1262     const uint16_t *table;
   1263     uint32_t *stage2;
   1264     uint8_t *bytes, *p;
   1265     UChar32 c;
   1266     int32_t i, st3;
   1267 
   1268     table=mbcsTable->fromUnicodeTable;
   1269     bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
   1270 
   1271     /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
   1272     switch(mbcsTable->outputType) {
   1273     case MBCS_OUTPUT_3_EUC:
   1274         if(value<=0xffff) {
   1275             /* short sequences are stored directly */
   1276             /* code set 0 or 1 */
   1277         } else if(value<=0x8effff) {
   1278             /* code set 2 */
   1279             value&=0x7fff;
   1280         } else /* first byte is 0x8f */ {
   1281             /* code set 3 */
   1282             value&=0xff7f;
   1283         }
   1284         break;
   1285     case MBCS_OUTPUT_4_EUC:
   1286         if(value<=0xffffff) {
   1287             /* short sequences are stored directly */
   1288             /* code set 0 or 1 */
   1289         } else if(value<=0x8effffff) {
   1290             /* code set 2 */
   1291             value&=0x7fffff;
   1292         } else /* first byte is 0x8f */ {
   1293             /* code set 3 */
   1294             value&=0xff7fff;
   1295         }
   1296         break;
   1297     default:
   1298         break;
   1299     }
   1300 
   1301     for(i=0; i<=0x1f; ++value, ++i) {
   1302         c=codePoints[i];
   1303         if(c<0) {
   1304             continue;
   1305         }
   1306 
   1307         /* locate the stage 2 & 3 data */
   1308         stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
   1309         p=bytes;
   1310         st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
   1311 
   1312         /* write the codepage bytes into stage 3 */
   1313         switch(mbcsTable->outputType) {
   1314         case MBCS_OUTPUT_3:
   1315         case MBCS_OUTPUT_4_EUC:
   1316             p+=st3*3;
   1317             p[0]=(uint8_t)(value>>16);
   1318             p[1]=(uint8_t)(value>>8);
   1319             p[2]=(uint8_t)value;
   1320             break;
   1321         case MBCS_OUTPUT_4:
   1322             ((uint32_t *)p)[st3]=value;
   1323             break;
   1324         default:
   1325             /* 2 bytes per character */
   1326             ((uint16_t *)p)[st3]=(uint16_t)value;
   1327             break;
   1328         }
   1329 
   1330         /* set the roundtrip flag */
   1331         *stage2|=(1UL<<(16+(c&0xf)));
   1332     }
   1333     return TRUE;
   1334  }
   1335 
        /*
         * Build a writable, complete set of fromUnicode tables in one heap buffer.
         * Called by ucnv_MBCSLoad() when the header has MBCS_OPT_NO_FROM_U set.
         *
         * Layout of the new buffer (mbcsTable->reconstitutedData):
         *   stage 1:  stage1Length 16-bit units, copied from the existing table
         *   stage 2:  fullStage2Length 32-bit units; the stage2Length entries that
         *             exist in the old table are copied to the END of this area,
         *             the beginning is rebuilt from mbcsTable->mbcsIndex
         *   bytes:    mbcsTable->fromUBytesLength bytes, zero-filled here and then
         *             filled in via writeStage3Roundtrip
         *
         * On success mbcsTable->fromUnicodeTable and mbcsTable->fromUnicodeBytes point
         * into the new buffer.
         * If the allocation fails, *pErrorCode is set to U_MEMORY_ALLOCATION_ERROR and
         * the existing table pointers are left as they were.
         */
   1336 static void
   1337 reconstituteData(UConverterMBCSTable *mbcsTable,
   1338                  uint32_t stage1Length, uint32_t stage2Length,
   1339                  uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
   1340                  UErrorCode *pErrorCode) {
   1341     uint16_t *stage1;
   1342     uint32_t *stage2;
   1343     uint8_t *bytes;
            /* total size in bytes: 2 per stage 1 unit, 4 per stage 2 unit, plus the stage 3 bytes */
   1344     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
   1345     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
   1346     if(mbcsTable->reconstitutedData==NULL) {
   1347         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1348         return;
   1349     }
   1350     uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
   1351 
   1352     /* copy existing data and reroute the pointers */
   1353     stage1=(uint16_t *)mbcsTable->reconstitutedData;
   1354     uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
   1355 
            /*
             * The old table's stage 2 entries start right after its stage 1
             * (fromUnicodeTable is a uint16_t pointer, so +stage1Length skips stage 1).
             * They go to the tail of the new, full-size stage 2 area.
             * This copy must happen before fromUnicodeTable is redirected below.
             */
   1356     stage2=(uint32_t *)(stage1+stage1Length);
   1357     uprv_memcpy(stage2+(fullStage2Length-stage2Length),
   1358                 mbcsTable->fromUnicodeTable+stage1Length,
   1359                 stage2Length*4);
   1360 
            /* NOTE(review): the local "bytes" is assigned here but not read again in this function */
   1361     mbcsTable->fromUnicodeTable=stage1;
   1362     mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
   1363 
   1364     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
   1365     stage2=(uint32_t *)stage1;
   1366 
   1367     /* reconstitute the initial part of stage 2 from the mbcsIndex */
   1368     {
                /* number of mbcsIndex entries: one per 64 code points up to maxFastUChar */
   1369         int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
   1370         int32_t stageUTF8Index=0;
   1371         int32_t st1, st2, st3, i;
   1372 
   1373         for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
   1374             st2=stage1[st1];
                    /*
                     * stage1Length/2 is the 32-bit index of the first entry after stage 1;
                     * presumably a stage 1 entry with this value refers to the shared
                     * empty stage 2 block -- TODO confirm against the table builder
                     */
   1375             if(st2!=stage1Length/2) {
   1376                 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
   1377                 for(i=0; i<16; ++i) {
   1378                     st3=mbcsTable->mbcsIndex[stageUTF8Index++];
   1379                     if(st3!=0) {
   1380                         /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
   1381                         st3>>=4;
   1382                         /*
   1383                          * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
   1384                          * allocated together as a single 64-block for access from the mbcsIndex
   1385                          */
   1386                         stage2[st2++]=st3++;
   1387                         stage2[st2++]=st3++;
   1388                         stage2[st2++]=st3++;
   1389                         stage2[st2++]=st3;
   1390                     } else {
   1391                         /* no stage 3 block, skip */
   1392                         st2+=4;
   1393                     }
   1394                 }
   1395             } else {
   1396                 /* no stage 2 block, skip */
   1397                 stageUTF8Index+=16;
   1398             }
   1399         }
   1400     }
   1401 
   1402     /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
   1403     ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
   1404 }
   1405 
   1406 /* MBCS setup functions ----------------------------------------------------- */
   1407 
        /*
         * Set up sharedData->mbcs from the raw MBCS part of a converter file.
         *
         * raw points at the _MBCSHeader. Supported headers are version 4.x and
         * version 5.3 or higher without unknown incompatible option bits;
         * anything else yields U_INVALID_TABLE_FORMAT.
         *
         * There are two main paths:
         * - extension-only file (MBCS_OUTPUT_EXT_ONLY): the base converter named
         *   after the header is loaded with ucnv_load(), its UConverterMBCSTable
         *   is copied, and a few fields are overridden for the extension converter
         * - file with a base table: the table pointers are set to point into raw,
         *   and the fromUnicode data is reconstituted if MBCS_OPT_NO_FROM_U is set
         *
         * If pArgs->onlyTestIsLoadable is set, the function returns as soon as the
         * format has been validated, without setting up the tables.
         *
         * Errors reported through pErrorCode in this function:
         * U_INVALID_TABLE_FORMAT, U_INVALID_TABLE_FILE, U_MEMORY_ALLOCATION_ERROR,
         * plus whatever ucnv_load() or reconstituteData() set.
         */
   1408 static void
   1409 ucnv_MBCSLoad(UConverterSharedData *sharedData,
   1410           UConverterLoadArgs *pArgs,
   1411           const uint8_t *raw,
   1412           UErrorCode *pErrorCode) {
   1413     UDataInfo info;
   1414     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
   1415     _MBCSHeader *header=(_MBCSHeader *)raw;
   1416     uint32_t offset;
   1417     uint32_t headerLength;
   1418     UBool noFromU=FALSE;
   1419 
            /* headerLength is used below as headerLength*4, i.e. it counts 4-byte units */
   1420     if(header->version[0]==4) {
   1421         headerLength=MBCS_HEADER_V4_LENGTH;
   1422     } else if(header->version[0]==5 && header->version[1]>=3 &&
   1423               (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
   1424         headerLength=header->options&MBCS_OPT_LENGTH_MASK;
   1425         noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
   1426     } else {
   1427         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1428         return;
   1429     }
   1430 
            /* the low 8 bits of header->flags are the output type; noFromU is rejected for MBCS_OUTPUT_1 */
   1431     mbcsTable->outputType=(uint8_t)header->flags;
   1432     if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
   1433         *pErrorCode=U_INVALID_TABLE_FORMAT;
   1434         return;
   1435     }
   1436 
   1437     /* extension data, header version 4.2 and higher */
            /* the bits above the low 8 of header->flags are the byte offset of the extension data from raw */
   1438     offset=header->flags>>8;
   1439     if(offset!=0) {
   1440         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
   1441     }
   1442 
   1443     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
   1444         UConverterLoadArgs args={ 0 };
   1445         UConverterSharedData *baseSharedData;
   1446         const int32_t *extIndexes;
   1447         const char *baseName;
   1448 
   1449         /* extension-only file, load the base table and set values appropriately */
                /*
                 * keep the extension pointer in a local: the memcpy of the base table
                 * further down overwrites mbcsTable->extIndexes, which is then restored
                 */
   1450         if((extIndexes=mbcsTable->extIndexes)==NULL) {
   1451             /* extension-only file without extension */
   1452             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1453             return;
   1454         }
   1455 
   1456         if(pArgs->nestedLoads!=1) {
   1457             /* an extension table must not be loaded as a base table */
   1458             *pErrorCode=U_INVALID_TABLE_FILE;
   1459             return;
   1460         }
   1461 
   1462         /* load the base table */
                /* the base converter name is read from the bytes immediately after the header */
   1463         baseName=(const char *)header+headerLength*4;
   1464         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
   1465             /* forbid loading this same extension-only file */
   1466             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1467             return;
   1468         }
   1469 
   1470         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
   1471         args.size=sizeof(UConverterLoadArgs);
   1472         args.nestedLoads=2;
   1473         args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
   1474         args.reserved=pArgs->reserved;
   1475         args.options=pArgs->options;
   1476         args.pkg=pArgs->pkg;
   1477         args.name=baseName;
   1478         baseSharedData=ucnv_load(&args, pErrorCode);
   1479         if(U_FAILURE(*pErrorCode)) {
   1480             return;
   1481         }
                /* the base must be an MBCS converter that is not itself an extension of another one */
   1482         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
   1483             baseSharedData->mbcs.baseSharedData!=NULL
   1484         ) {
   1485             ucnv_unload(baseSharedData);
   1486             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1487             return;
   1488         }
   1489         if(pArgs->onlyTestIsLoadable) {
   1490             /*
   1491              * Exit as soon as we know that we can load the converter
   1492              * and the format is valid and supported.
   1493              * The worst that can happen in the following code is a memory
   1494              * allocation error.
   1495              */
   1496             ucnv_unload(baseSharedData);
   1497             return;
   1498         }
   1499 
   1500         /* copy the base table data */
   1501         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
   1502 
   1503         /* overwrite values with relevant ones for the extension converter */
   1504         mbcsTable->baseSharedData=baseSharedData;
   1505         mbcsTable->extIndexes=extIndexes;
   1506 
   1507         /*
   1508          * It would be possible to share the swapLFNL data with a base converter,
   1509          * but the generated name would have to be different, and the memory
   1510          * would have to be free'd only once.
   1511          * It is easier to just create the data for the extension converter
   1512          * separately when it is requested.
   1513          */
   1514         mbcsTable->swapLFNLStateTable=NULL;
   1515         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
   1516         mbcsTable->swapLFNLName=NULL;
   1517 
   1518         /*
   1519          * The reconstitutedData must be deleted only when the base converter
   1520          * is unloaded.
   1521          */
   1522         mbcsTable->reconstitutedData=NULL;
   1523 
   1524         /*
   1525          * Set a special, runtime-only outputType if the extension converter
   1526          * is a DBCS version of a base converter that also maps single bytes.
   1527          */
   1528         if( sharedData->staticData->conversionType==UCNV_DBCS ||
   1529                 (sharedData->staticData->conversionType==UCNV_MBCS &&
   1530                  sharedData->staticData->minBytesPerChar>=2)
   1531         ) {
   1532             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
   1533                 /* the base converter is SI/SO-stateful */
   1534                 int32_t entry;
   1535 
   1536                 /* get the dbcs state from the state table entry for SO=0x0e */
   1537                 entry=mbcsTable->stateTable[0][0xe];
   1538                 if( MBCS_ENTRY_IS_FINAL(entry) &&
   1539                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
   1540                     MBCS_ENTRY_FINAL_STATE(entry)!=0
   1541                 ) {
   1542                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
   1543 
   1544                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1545                 }
   1546             } else if(
   1547                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
   1548                 baseSharedData->staticData->minBytesPerChar==1 &&
   1549                 baseSharedData->staticData->maxBytesPerChar==2 &&
                        /* one more state is appended below; NOTE(review): the limit of 127 presumably keeps the new state number encodable in a transition entry -- confirm */
   1550                 mbcsTable->countStates<=127
   1551             ) {
   1552                 /* non-stateful base converter, need to modify the state table */
   1553                 int32_t (*newStateTable)[256];
   1554                 int32_t *state;
   1555                 int32_t i, count;
   1556 
   1557                 /* allocate a new state table and copy the base state table contents */
                        /* 1024 bytes per state: one row of 256 int32_t entries */
   1558                 count=mbcsTable->countStates;
   1559                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
   1560                 if(newStateTable==NULL) {
   1561                     ucnv_unload(baseSharedData);
   1562                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1563                     return;
   1564                 }
   1565 
   1566                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
   1567 
   1568                 /* change all final single-byte entries to go to a new all-illegal state */
   1569                 state=newStateTable[0];
   1570                 for(i=0; i<256; ++i) {
   1571                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
   1572                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
   1573                     }
   1574                 }
   1575 
   1576                 /* build the new all-illegal state */
   1577                 state=newStateTable[count];
   1578                 for(i=0; i<256; ++i) {
   1579                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
   1580                 }
                        /* stateTableOwned marks the table as heap-allocated so that ucnv_MBCSUnload() frees it */
   1581                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
   1582                 mbcsTable->countStates=(uint8_t)(count+1);
   1583                 mbcsTable->stateTableOwned=TRUE;
   1584 
   1585                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
   1586             }
   1587         }
   1588 
   1589         /*
   1590          * unlike below for files with base tables, do not get the unicodeMask
   1591          * from the sharedData; instead, use the base table's unicodeMask,
   1592          * which we copied in the memcpy above;
   1593          * this is necessary because the static data unicodeMask, especially
   1594          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
   1595          */
   1596     } else {
   1597         /* conversion file with a base table; an additional extension table is optional */
   1598         /* make sure that the output type is known */
   1599         switch(mbcsTable->outputType) {
   1600         case MBCS_OUTPUT_1:
   1601         case MBCS_OUTPUT_2:
   1602         case MBCS_OUTPUT_3:
   1603         case MBCS_OUTPUT_4:
   1604         case MBCS_OUTPUT_3_EUC:
   1605         case MBCS_OUTPUT_4_EUC:
   1606         case MBCS_OUTPUT_2_SISO:
   1607             /* OK */
   1608             break;
   1609         default:
   1610             *pErrorCode=U_INVALID_TABLE_FORMAT;
   1611             return;
   1612         }
   1613         if(pArgs->onlyTestIsLoadable) {
   1614             /*
   1615              * Exit as soon as we know that we can load the converter
   1616              * and the format is valid and supported.
   1617              * The worst that can happen in the following code is a memory
   1618              * allocation error.
   1619              */
   1620             return;
   1621         }
   1622 
                /*
                 * All table pointers below point directly into raw:
                 * the state table follows the header, the toUnicode fallbacks follow
                 * the state table, and the other tables are located via header offsets.
                 */
   1623         mbcsTable->countStates=(uint8_t)header->countStates;
   1624         mbcsTable->countToUFallbacks=header->countToUFallbacks;
   1625         mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
   1626         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
   1627         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
   1628 
   1629         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
   1630         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
   1631         mbcsTable->fromUBytesLength=header->fromUBytesLength;
   1632 
   1633         /*
   1634          * converter versions 6.1 and up contain a unicodeMask that is
   1635          * used here to select the most efficient function implementations
   1636          */
   1637         info.size=sizeof(UDataInfo);
   1638         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
   1639         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
   1640             /* mask off possible future extensions to be safe */
   1641             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
   1642         } else {
   1643             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
   1644             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
   1645         }
   1646 
   1647         /*
   1648          * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
   1649          * Check for the header version, SBCS vs. MBCS, and for whether the
   1650          * data structures are optimized for code points as high as what the
   1651          * runtime code is designed for.
   1652          * The implementation does not handle mapping tables with entries for
   1653          * unpaired surrogates.
   1654          */
   1655         if( header->version[1]>=3 &&
   1656             (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
   1657             (mbcsTable->countStates==1 ?
   1658                 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
   1659                 (header->version[2]>=(MBCS_FAST_MAX>>8))
   1660             )
   1661         ) {
   1662             mbcsTable->utf8Friendly=TRUE;
   1663 
   1664             if(mbcsTable->countStates==1) {
   1665                 /*
   1666                  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
   1667                  * Build a table with indexes to each block, to be used instead of
   1668                  * the regular stage 1/2 table.
   1669                  */
   1670                 int32_t i;
                        /* i counts 64-code point blocks: i>>4 selects the stage 1 entry, (i<<2)&0x3c the first of 4 stage 2 entries */
   1671                 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
   1672                     mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
   1673                 }
   1674                 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
   1675                 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
   1676             } else {
   1677                 /*
   1678                  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
   1679                  * The .cnv file is prebuilt with an additional stage table with indexes
   1680                  * to each block.
   1681                  */
                        /* the index is located after fromUBytesLength bytes, or directly at fromUnicodeBytes when noFromU is set */
   1682                 mbcsTable->mbcsIndex=(const uint16_t *)
   1683                     (mbcsTable->fromUnicodeBytes+
   1684                      (noFromU ? 0 : mbcsTable->fromUBytesLength));
   1685                 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
   1686             }
   1687         }
   1688 
   1689         /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
   1690         {
   1691             uint32_t asciiRoundtrips=0xffffffff;
   1692             int32_t i;
   1693 
                    /* a bit stays set only if all 4 of its bytes map directly, in state 0, to the code point equal to the byte value */
   1694             for(i=0; i<0x80; ++i) {
   1695                 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
   1696                     asciiRoundtrips&=~((uint32_t)1<<(i>>2));
   1697                 }
   1698             }
   1699             mbcsTable->asciiRoundtrips=asciiRoundtrips;
   1700         }
   1701 
   1702         if(noFromU) {
                    /*
                     * stage1Length is in 16-bit units and depends on whether supplementary
                     * code points are covered.
                     * stage2Length is in 32-bit units: the size of the stored table between
                     * the two header offsets, minus the space taken by stage 1.
                     */
   1703             uint32_t stage1Length=
   1704                 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
   1705                     0x440 : 0x40;
   1706             uint32_t stage2Length=
   1707                 (header->offsetFromUBytes-header->offsetFromUTable)/4-
   1708                 stage1Length/2;
   1709             reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
   1710         }
   1711     }
   1712 
   1713     /* Set the impl pointer here so that it is set for both extension-only and base tables. */
            /* sharedData->impl is left unchanged here unless the table is utf8Friendly and either SBCS or MBCS_OUTPUT_2 */
   1714     if(mbcsTable->utf8Friendly) {
   1715         if(mbcsTable->countStates==1) {
   1716             sharedData->impl=&_SBCSUTF8Impl;
   1717         } else {
   1718             if(mbcsTable->outputType==MBCS_OUTPUT_2) {
   1719                 sharedData->impl=&_DBCSUTF8Impl;
   1720             }
   1721         }
   1722     }
   1723 
   1724     if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
   1725         /*
   1726          * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
   1727          * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
   1728          */
   1729         mbcsTable->asciiRoundtrips=0;
   1730     }
   1731 }
   1732 
        /*
         * Release the resources that the MBCS part of the shared data may hold:
         * the swap-LF/NL state table, an owned state table, the reference to a
         * base converter, and the reconstituted fromUnicode data.
         * Each one is released only if it is present.
         */
   1733 static void
   1734 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
   1735     UConverterMBCSTable *table=&sharedData->mbcs;
   1736 
   1737     /* swap-LF/NL state table, present only if one was created */
   1738     if(NULL!=table->swapLFNLStateTable) {
   1739         uprv_free(table->swapLFNLStateTable);
   1740     }
   1741     /* the state table is freed only when this table owns it */
   1742     if(table->stateTableOwned) {
   1743         uprv_free((void *)table->stateTable);
   1744     }
   1745     /* give up the reference to the base converter, then the reconstituted buffer */
   1746     if(NULL!=table->baseSharedData) { ucnv_unload(table->baseSharedData); }
   1747     if(NULL!=table->reconstitutedData) { uprv_free(table->reconstitutedData); }
   1748     return;
   1749 }
   1750 
        /*
         * Per-converter setup for an MBCS converter object.
         *
         * Does nothing if pArgs->onlyTestIsLoadable is set. Otherwise:
         * - removes UCNV_OPTION_SWAP_LFNL for MBCS_OUTPUT_DBCS_ONLY tables
         * - if UCNV_OPTION_SWAP_LFNL is (still) requested and no swapped state
         *   table is cached in the shared data, calls _EBCDICSwapLFNL(); if that
         *   returns FALSE without an error, the option is removed
         * - sets _MBCS_OPTION_GB18030/_KEIS/_JEF/_JIPS based on substrings of
         *   pArgs->name
         * - adjusts cnv->maxBytesPerUChar for SI/SO output and extension data
         *
         * Note that pArgs->options is modified together with cnv->options when
         * the swap option is removed.
         * Returns early, with *pErrorCode set by _EBCDICSwapLFNL(), if that fails.
         */
   1751 static void
   1752 ucnv_MBCSOpen(UConverter *cnv,
   1753               UConverterLoadArgs *pArgs,
   1754               UErrorCode *pErrorCode) {
   1755     UConverterMBCSTable *mbcsTable;
   1756     const int32_t *extIndexes;
   1757     uint8_t outputType;
   1758     int8_t maxBytesPerUChar;
   1759 
   1760     if(pArgs->onlyTestIsLoadable) {
   1761         return;
   1762     }
   1763 
   1764     mbcsTable=&cnv->sharedData->mbcs;
   1765     outputType=mbcsTable->outputType;
   1766 
   1767     if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
   1768         /* the swaplfnl option does not apply, remove it */
   1769         cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1770     }
   1771 
   1772     if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1773         /* do this because double-checked locking is broken */
   1774         UBool isCached;
   1775 
                /* read the cached pointer while holding the global mutex (umtx_lock(NULL)) */
   1776         umtx_lock(NULL);
   1777         isCached=mbcsTable->swapLFNLStateTable!=NULL;
   1778         umtx_unlock(NULL);
   1779 
   1780         if(!isCached) {
   1781             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
   1782                 if(U_FAILURE(*pErrorCode)) {
   1783                     return; /* something went wrong */
   1784                 }
   1785 
   1786                 /* the option does not apply, remove it */
   1787                 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
   1788             }
   1789         }
   1790     }
   1791 
            /*
             * Name-based options; this is one if/else-if chain, so at most one applies.
             * A name that contains "18030" but neither "gb18030" nor "GB18030"
             * gets no option and is not tested for KEIS/JEF/JIPS.
             */
   1792     if(uprv_strstr(pArgs->name, "18030")!=NULL) {
   1793         if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
   1794             /* set a flag for GB 18030 mode, which changes the callback behavior */
   1795             cnv->options|=_MBCS_OPTION_GB18030;
   1796         }
   1797     } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
   1798         /* set a flag for KEIS converter, which changes the SI/SO character sequence */
   1799         cnv->options|=_MBCS_OPTION_KEIS;
   1800     } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
   1801         /* set a flag for JEF converter, which changes the SI/SO character sequence */
   1802         cnv->options|=_MBCS_OPTION_JEF;
   1803     } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
   1804         /* set a flag for JIPS converter, which changes the SI/SO character sequence */
   1805         cnv->options|=_MBCS_OPTION_JIPS;
   1806     }
   1807 
   1808     /* fix maxBytesPerUChar depending on outputType and options etc. */
   1809     if(outputType==MBCS_OUTPUT_2_SISO) {
   1810         cnv->maxBytesPerUChar=3; /* SO+DBCS */
   1811     }
   1812 
            /* extension data can raise the maximum further; the value is only ever increased here */
   1813     extIndexes=mbcsTable->extIndexes;
   1814     if(extIndexes!=NULL) {
   1815         maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
   1816         if(outputType==MBCS_OUTPUT_2_SISO) {
   1817             ++maxBytesPerUChar; /* SO + multiple DBCS */
   1818         }
   1819 
   1820         if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
   1821             cnv->maxBytesPerUChar=maxBytesPerUChar;
   1822         }
   1823     }
   1824 
            /* the following block is compiled out and serves as documentation only */
   1825 #if 0
   1826     /*
   1827      * documentation of UConverter fields used for status
   1828      * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
   1829      */
   1830 
   1831     /* toUnicode */
   1832     cnv->toUnicodeStatus=0;     /* offset */
   1833     cnv->mode=0;                /* state */
   1834     cnv->toULength=0;           /* byteIndex */
   1835 
   1836     /* fromUnicode */
   1837     cnv->fromUChar32=0;
   1838     cnv->fromUnicodeStatus=1;   /* prevLength */
   1839 #endif
   1840 }
   1841 
        /*
         * Return the converter name: the generated swap-LF/NL name if the
         * converter has UCNV_OPTION_SWAP_LFNL set and such a name exists,
         * otherwise the name from the static data.
         */
   1842 static const char *
   1843 ucnv_MBCSGetName(const UConverter *cnv) {
   1844     const UConverterSharedData *shared=cnv->sharedData;
   1845     UBool swapped=(UBool)((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0);
   1846 
   1847     /* the swap name is looked at only when the option is set */
   1848     return (swapped && shared->mbcs.swapLFNLName!=NULL) ? shared->mbcs.swapLFNLName : shared->staticData->name;
   1849 }
   1850 
   1851 /* MBCS-to-Unicode conversion functions ------------------------------------- */
   1852 
        /*
         * Look up the toUnicode fallback for the given offset.
         * Binary search over mbcsTable->toUFallbacks, which must be sorted by
         * ascending offset for the search to work.
         * Returns the fallback code point, or 0xfffe if there is none.
         */
   1853 static UChar32
   1854 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
   1855     const _MBCSToUFallback *table;
   1856     uint32_t lo=0, hi=mbcsTable->countToUFallbacks;
   1857 
   1858     if(hi==0) {
   1859         /* this converter has no toUnicode fallbacks at all */
   1860         return 0xfffe;
   1861     }
   1862 
   1863     /* bisect [lo, hi) until lo is the only remaining candidate */
   1864     table=mbcsTable->toUFallbacks;
   1865     while(hi-lo>1) {
   1866         uint32_t mid=(lo+hi)/2;
   1867         if(offset>=table[mid].offset) {
   1868             lo=mid;
   1869         } else {
   1870             hi=mid;
   1871         }
   1872     }
   1873 
   1874     /* the candidate is a hit only if its offset matches exactly */
   1875     if(table[lo].offset!=offset) {
   1876         return 0xfffe;
   1877     }
   1878     return table[lo].codePoint;
   1879 }
   1880 
   1881 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
        /*
         * Each source byte is looked up in state 0 of the state table only; every
         * entry is treated as a final entry.
         *
         * Reads bytes from pArgs->source up to pArgs->sourceLimit and writes UChars
         * to pArgs->target up to pArgs->targetLimit; if pArgs->offsets is not NULL,
         * one source index is written per output UChar.
         * The source, target and offsets pointers are written back to pArgs at the end.
         *
         * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and
         * to U_ILLEGAL_CHAR_FOUND for a byte with action MBCS_STATE_ILLEGAL;
         * unassigned bytes are handed to _extToU(), which may set other errors.
         */
   1882 static void
   1883 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1884                                 UErrorCode *pErrorCode) {
   1885     UConverter *cnv;
   1886     const uint8_t *source, *sourceLimit;
   1887     UChar *target;
   1888     const UChar *targetLimit;
   1889     int32_t *offsets;
   1890 
   1891     const int32_t (*stateTable)[256];
   1892 
   1893     int32_t sourceIndex;
   1894 
   1895     int32_t entry;
   1896     UChar c;
   1897     uint8_t action;
   1898 
   1899     /* set up the local pointers */
   1900     cnv=pArgs->converter;
   1901     source=(const uint8_t *)pArgs->source;
   1902     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1903     target=pArgs->target;
   1904     targetLimit=pArgs->targetLimit;
   1905     offsets=pArgs->offsets;
   1906 
            /* with the swap-LF/NL option, use the alternate state table stored in the shared data */
   1907     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   1908         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   1909     } else {
   1910         stateTable=cnv->sharedData->mbcs.stateTable;
   1911     }
   1912 
   1913     /* sourceIndex=-1 if the current character began in the previous buffer */
            /* (in this function it starts at 0 and is only ever advanced by the number of bytes consumed) */
   1914     sourceIndex=0;
   1915 
   1916     /* conversion loop */
   1917     while(source<sourceLimit) {
   1918         /*
   1919          * This following test is to see if available input would overflow the output.
   1920          * It does not catch output of more than one code unit that
   1921          * overflows as a result of a surrogate pair or callback output
   1922          * from the last source byte.
   1923          * Therefore, those situations also test for overflows and will
   1924          * then break the loop, too.
   1925          */
   1926         if(target>=targetLimit) {
   1927             /* target is full */
   1928             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1929             break;
   1930         }
   1931 
   1932         entry=stateTable[0][*source++];
   1933         /* MBCS_ENTRY_IS_FINAL(entry) */
   1934 
   1935         /* test the most common case first */
   1936         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   1937             /* output BMP code point */
   1938             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   1939             if(offsets!=NULL) {
   1940                 *offsets++=sourceIndex;
   1941             }
   1942 
   1943             /* normal end of action codes: prepare for a new character */
   1944             ++sourceIndex;
   1945             continue;
   1946         }
   1947 
   1948         /*
   1949          * An if-else-if chain provides more reliable performance for
   1950          * the most common cases compared to a switch.
   1951          */
   1952         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   1953         if(action==MBCS_STATE_VALID_DIRECT_20 ||
   1954            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   1955         ) {
   1956             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   1957             /* output surrogate pair */
   1958             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   1959             if(offsets!=NULL) {
   1960                 *offsets++=sourceIndex;
   1961             }
   1962             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   1963             if(target<targetLimit) {
   1964                 *target++=c;
   1965                 if(offsets!=NULL) {
   1966                     *offsets++=sourceIndex;
   1967                 }
   1968             } else {
   1969                 /* target overflow */
                        /* the trail surrogate does not fit: store it in the converter's UCharErrorBuffer */
   1970                 cnv->UCharErrorBuffer[0]=c;
   1971                 cnv->UCharErrorBufferLength=1;
   1972                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1973                 break;
   1974             }
   1975 
   1976             ++sourceIndex;
   1977             continue;
   1978         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
                    /* if fallbacks are not in use, fall out of the chain and treat the byte like an unassigned one */
   1979             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   1980                 /* output BMP code point */
   1981                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   1982                 if(offsets!=NULL) {
   1983                     *offsets++=sourceIndex;
   1984                 }
   1985 
   1986                 ++sourceIndex;
   1987                 continue;
   1988             }
   1989         } else if(action==MBCS_STATE_UNASSIGNED) {
   1990             /* just fall through */
   1991         } else if(action==MBCS_STATE_ILLEGAL) {
   1992             /* callback(illegal) */
   1993             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1994         } else {
   1995             /* reserved, must never occur */
                    /* the byte is consumed without producing output */
   1996             ++sourceIndex;
   1997             continue;
   1998         }
   1999 
   2000         if(U_FAILURE(*pErrorCode)) {
   2001             /* callback(illegal) */
   2002             break;
   2003         } else /* unassigned byte, or a fallback mapping that is not in use */ {
   2004             /* try an extension mapping */
                    /*
                     * pArgs->source is set to the position after the current byte so that
                     * the difference computed below is the number of additional bytes
                     * by which _extToU() advanced source
                     */
   2005             pArgs->source=(const char *)source;
   2006             cnv->toUBytes[0]=*(source-1);
   2007             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2008                                     1, &source, sourceLimit,
   2009                                     &target, targetLimit,
   2010                                     &offsets, sourceIndex,
   2011                                     pArgs->flush,
   2012                                     pErrorCode);
   2013             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
   2014 
   2015             if(U_FAILURE(*pErrorCode)) {
   2016                 /* not mappable or buffer overflow */
   2017                 break;
   2018             }
   2019         }
   2020     }
   2021 
   2022     /* write back the updated pointers */
   2023     pArgs->source=(const char *)source;
   2024     pArgs->target=target;
   2025     pArgs->offsets=offsets;
   2026 }
   2027 
   2028 /*
   2029  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
   2030  * that only map to and from the BMP.
   2031  * In addition to single-byte optimizations, the offset calculations
   2032  * become much easier.
   2033  */
   2034 static void
   2035 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
   2036                             UErrorCode *pErrorCode) {
   2037     UConverter *cnv;
   2038     const uint8_t *source, *sourceLimit, *lastSource;
   2039     UChar *target;
   2040     int32_t targetCapacity, length;
   2041     int32_t *offsets;
   2042 
   2043     const int32_t (*stateTable)[256];
   2044 
   2045     int32_t sourceIndex;
   2046 
   2047     int32_t entry;
   2048     uint8_t action;
   2049 
   2050     /* set up the local pointers */
   2051     cnv=pArgs->converter;
   2052     source=(const uint8_t *)pArgs->source;
   2053     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2054     target=pArgs->target;
   2055     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   2056     offsets=pArgs->offsets;
   2057 
   2058     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2059         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2060     } else {
   2061         stateTable=cnv->sharedData->mbcs.stateTable;
   2062     }
   2063 
   2064     /* sourceIndex=-1 if the current character began in the previous buffer */
   2065     sourceIndex=0;
   2066     lastSource=source;
   2067 
   2068     /*
   2069      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   2070      * for the minimum of the sourceLength and targetCapacity
   2071      */
   2072     length=(int32_t)(sourceLimit-source);
   2073     if(length<targetCapacity) {
   2074         targetCapacity=length;
   2075     }
   2076 
   2077 #if MBCS_UNROLL_SINGLE_TO_BMP
   2078     /* unrolling makes it faster on Pentium III/Windows 2000 */
   2079     /* unroll the loop with the most common case */
   2080 unrolled:
   2081     if(targetCapacity>=16) {
   2082         int32_t count, loops, oredEntries;
   2083 
   2084         loops=count=targetCapacity>>4;
   2085         do {
   2086             oredEntries=entry=stateTable[0][*source++];
   2087             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2088             oredEntries|=entry=stateTable[0][*source++];
   2089             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2090             oredEntries|=entry=stateTable[0][*source++];
   2091             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2092             oredEntries|=entry=stateTable[0][*source++];
   2093             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2094             oredEntries|=entry=stateTable[0][*source++];
   2095             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2096             oredEntries|=entry=stateTable[0][*source++];
   2097             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2098             oredEntries|=entry=stateTable[0][*source++];
   2099             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2100             oredEntries|=entry=stateTable[0][*source++];
   2101             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2102             oredEntries|=entry=stateTable[0][*source++];
   2103             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2104             oredEntries|=entry=stateTable[0][*source++];
   2105             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2106             oredEntries|=entry=stateTable[0][*source++];
   2107             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2108             oredEntries|=entry=stateTable[0][*source++];
   2109             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2110             oredEntries|=entry=stateTable[0][*source++];
   2111             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2112             oredEntries|=entry=stateTable[0][*source++];
   2113             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2114             oredEntries|=entry=stateTable[0][*source++];
   2115             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2116             oredEntries|=entry=stateTable[0][*source++];
   2117             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2118 
   2119             /* were all 16 entries really valid? */
   2120             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
   2121                 /* no, return to the first of these 16 */
   2122                 source-=16;
   2123                 target-=16;
   2124                 break;
   2125             }
   2126         } while(--count>0);
   2127         count=loops-count;
   2128         targetCapacity-=16*count;
   2129 
   2130         if(offsets!=NULL) {
   2131             lastSource+=16*count;
   2132             while(count>0) {
   2133                 *offsets++=sourceIndex++;
   2134                 *offsets++=sourceIndex++;
   2135                 *offsets++=sourceIndex++;
   2136                 *offsets++=sourceIndex++;
   2137                 *offsets++=sourceIndex++;
   2138                 *offsets++=sourceIndex++;
   2139                 *offsets++=sourceIndex++;
   2140                 *offsets++=sourceIndex++;
   2141                 *offsets++=sourceIndex++;
   2142                 *offsets++=sourceIndex++;
   2143                 *offsets++=sourceIndex++;
   2144                 *offsets++=sourceIndex++;
   2145                 *offsets++=sourceIndex++;
   2146                 *offsets++=sourceIndex++;
   2147                 *offsets++=sourceIndex++;
   2148                 *offsets++=sourceIndex++;
   2149                 --count;
   2150             }
   2151         }
   2152     }
   2153 #endif
   2154 
   2155     /* conversion loop */
   2156     while(targetCapacity > 0 && source < sourceLimit) {
   2157         entry=stateTable[0][*source++];
   2158         /* MBCS_ENTRY_IS_FINAL(entry) */
   2159 
   2160         /* test the most common case first */
   2161         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2162             /* output BMP code point */
   2163             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2164             --targetCapacity;
   2165             continue;
   2166         }
   2167 
   2168         /*
   2169          * An if-else-if chain provides more reliable performance for
   2170          * the most common cases compared to a switch.
   2171          */
   2172         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2173         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2174             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2175                 /* output BMP code point */
   2176                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2177                 --targetCapacity;
   2178                 continue;
   2179             }
   2180         } else if(action==MBCS_STATE_UNASSIGNED) {
   2181             /* just fall through */
   2182         } else if(action==MBCS_STATE_ILLEGAL) {
   2183             /* callback(illegal) */
   2184             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2185         } else {
   2186             /* reserved, must never occur */
   2187             continue;
   2188         }
   2189 
   2190         /* set offsets since the start or the last extension */
   2191         if(offsets!=NULL) {
   2192             int32_t count=(int32_t)(source-lastSource);
   2193 
   2194             /* predecrement: do not set the offset for the callback-causing character */
   2195             while(--count>0) {
   2196                 *offsets++=sourceIndex++;
   2197             }
   2198             /* offset and sourceIndex are now set for the current character */
   2199         }
   2200 
   2201         if(U_FAILURE(*pErrorCode)) {
   2202             /* callback(illegal) */
   2203             break;
   2204         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2205             /* try an extension mapping */
   2206             lastSource=source;
   2207             cnv->toUBytes[0]=*(source-1);
   2208             cnv->toULength=_extToU(cnv, cnv->sharedData,
   2209                                     1, &source, sourceLimit,
   2210                                     &target, pArgs->targetLimit,
   2211                                     &offsets, sourceIndex,
   2212                                     pArgs->flush,
   2213                                     pErrorCode);
   2214             sourceIndex+=1+(int32_t)(source-lastSource);
   2215 
   2216             if(U_FAILURE(*pErrorCode)) {
   2217                 /* not mappable or buffer overflow */
   2218                 break;
   2219             }
   2220 
   2221             /* recalculate the targetCapacity after an extension mapping */
   2222             targetCapacity=(int32_t)(pArgs->targetLimit-target);
   2223             length=(int32_t)(sourceLimit-source);
   2224             if(length<targetCapacity) {
   2225                 targetCapacity=length;
   2226             }
   2227         }
   2228 
   2229 #if MBCS_UNROLL_SINGLE_TO_BMP
   2230         /* unrolling makes it faster on Pentium III/Windows 2000 */
   2231         goto unrolled;
   2232 #endif
   2233     }
   2234 
   2235     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
   2236         /* target is full */
   2237         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2238     }
   2239 
   2240     /* set offsets since the start or the last callback */
   2241     if(offsets!=NULL) {
   2242         size_t count=source-lastSource;
   2243         while(count>0) {
   2244             *offsets++=sourceIndex++;
   2245             --count;
   2246         }
   2247     }
   2248 
   2249     /* write back the updated pointers */
   2250     pArgs->source=(const char *)source;
   2251     pArgs->target=target;
   2252     pArgs->offsets=offsets;
   2253 }
   2254 
   2255 static UBool
   2256 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
   2257     const int32_t *row=stateTable[state];
   2258     int32_t b, entry;
   2259     /* First test for final entries in this state for some commonly valid byte values. */
   2260     entry=row[0xa1];
   2261     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2262         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2263     ) {
   2264         return TRUE;
   2265     }
   2266     entry=row[0x41];
   2267     if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2268         MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2269     ) {
   2270         return TRUE;
   2271     }
   2272     /* Then test for final entries in this state. */
   2273     for(b=0; b<=0xff; ++b) {
   2274         entry=row[b];
   2275         if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
   2276             MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
   2277         ) {
   2278             return TRUE;
   2279         }
   2280     }
   2281     /* Then recurse for transition entries. */
   2282     for(b=0; b<=0xff; ++b) {
   2283         entry=row[b];
   2284         if( MBCS_ENTRY_IS_TRANSITION(entry) &&
   2285             hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
   2286         ) {
   2287             return TRUE;
   2288         }
   2289     }
   2290     return FALSE;
   2291 }
   2292 
   2293 /*
   2294  * Is byte b a single/lead byte in this state?
   2295  * Recurse for transition states, because here we don't want to say that
   2296  * b is a lead byte if all byte sequences that start with b are illegal.
   2297  */
   2298 static UBool
   2299 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
   2300     const int32_t *row=stateTable[state];
   2301     int32_t entry=row[b];
   2302     if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
   2303         return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
   2304     } else {
   2305         uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2306         if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
   2307             return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
   2308         } else {
   2309             return action!=MBCS_STATE_ILLEGAL;
   2310         }
   2311     }
   2312 }
   2313 
        /*
         * Generic MBCS-to-Unicode conversion loop.
         *
         * Reads bytes from pArgs->source up to pArgs->sourceLimit, walks the
         * converter's state table and writes UTF-16 code units to pArgs->target
         * (bounded by pArgs->targetLimit). When pArgs->offsets is not NULL, one
         * source index is stored for each code unit written directly by this loop.
         *
         * Before the main loop:
         * - if cnv->preToULength>0, ucnv_extContinueMatchToU() is called first and
         *   the function returns early on failure or when preToULength is negative
         * - if the table has exactly one state, the work is handed to one of the
         *   two single-state functions and this function returns
         *
         * Converter state carried across calls (loaded at the top, stored at the
         * bottom of the main path): cnv->toUnicodeStatus (offset), cnv->mode (state),
         * cnv->toULength and cnv->toUBytes (bytes of the character in progress).
         *
         * Error codes set directly here: U_BUFFER_OVERFLOW_ERROR when the target
         * is full, U_ILLEGAL_CHAR_FOUND for illegal sequences. _extToU() receives
         * pErrorCode as well; any failure ends the loop.
         *
         * At the end of the main path pArgs->source, pArgs->target and
         * pArgs->offsets are written back.
         */
   2314 U_CFUNC void
   2315 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   2316                           UErrorCode *pErrorCode) {
   2317     UConverter *cnv;
   2318     const uint8_t *source, *sourceLimit;
   2319     UChar *target;
   2320     const UChar *targetLimit;
   2321     int32_t *offsets;
   2322 
   2323     const int32_t (*stateTable)[256];
   2324     const uint16_t *unicodeCodeUnits;
   2325 
   2326     uint32_t offset;
   2327     uint8_t state;
   2328     int8_t byteIndex;
   2329     uint8_t *bytes;
   2330 
   2331     int32_t sourceIndex, nextSourceIndex;
   2332 
   2333     int32_t entry;
   2334     UChar c;
   2335     uint8_t action;
   2336 
   2337     /* use optimized function if possible */
   2338     cnv=pArgs->converter;
   2339 
   2340     if(cnv->preToULength>0) {
   2341         /*
   2342          * pass sourceIndex=-1 because we continue from an earlier buffer
   2343          * in the future, this may change with continuous offsets
   2344          */
   2345         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
   2346 
   2347         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
   2348             return;
   2349         }
   2350     }
   2351 
   2352     if(cnv->sharedData->mbcs.countStates==1) {
   2353         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   2354             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
   2355         } else {
   2356             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
   2357         }
   2358         return;
   2359     }
   2360 
   2361     /* set up the local pointers */
   2362     source=(const uint8_t *)pArgs->source;
   2363     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2364     target=pArgs->target;
   2365     targetLimit=pArgs->targetLimit;
   2366     offsets=pArgs->offsets;
   2367 
   2368     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2369         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2370     } else {
   2371         stateTable=cnv->sharedData->mbcs.stateTable;
   2372     }
   2373     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2374 
   2375     /* get the converter state from UConverter */
   2376     offset=cnv->toUnicodeStatus;
   2377     byteIndex=cnv->toULength;
   2378     bytes=cnv->toUBytes;
   2379 
   2380     /*
   2381      * if we are in the SBCS state for a DBCS-only converter,
   2382      * then load the DBCS state from the MBCS data
   2383      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2384      */
   2385     if((state=(uint8_t)(cnv->mode))==0) {
   2386         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2387     }
   2388 
   2389     /* sourceIndex=-1 if the current character began in the previous buffer */
   2390     sourceIndex=byteIndex==0 ? 0 : -1;
   2391     nextSourceIndex=0;
   2392 
   2393     /* conversion loop */
   2394     while(source<sourceLimit) {
   2395         /*
   2396          * This following test is to see if available input would overflow the output.
   2397          * It does not catch output of more than one code unit that
   2398          * overflows as a result of a surrogate pair or callback output
   2399          * from the last source byte.
   2400          * Therefore, those situations also test for overflows and will
   2401          * then break the loop, too.
   2402          */
   2403         if(target>=targetLimit) {
   2404             /* target is full */
   2405             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2406             break;
   2407         }
   2408 
   2409         if(byteIndex==0) {
   2410             /* optimized loop for 1/2-byte input and BMP output */
                    /*
                     * The two inner loops below are the same except that the second
                     * one also maintains offsets, sourceIndex and nextSourceIndex.
                     * Either loop stops without consuming *source as soon as it meets
                     * anything other than a direct 16-bit mapping or a lead byte
                     * followed by a MBCS_STATE_VALID_16 trail byte with a value
                     * below 0xfffe.
                     */
   2411             if(offsets==NULL) {
   2412                 do {
   2413                     entry=stateTable[state][*source];
   2414                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2415                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2416                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2417 
   2418                         ++source;
   2419                         if( source<sourceLimit &&
   2420                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2421                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2422                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2423                         ) {
   2424                             ++source;
   2425                             *target++=c;
   2426                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2427                             offset=0;
   2428                         } else {
   2429                             /* set the state and leave the optimized loop */
   2430                             bytes[0]=*(source-1);
   2431                             byteIndex=1;
   2432                             break;
   2433                         }
   2434                     } else {
   2435                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2436                             /* output BMP code point */
   2437                             ++source;
   2438                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2439                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2440                         } else {
   2441                             /* leave the optimized loop */
   2442                             break;
   2443                         }
   2444                     }
   2445                 } while(source<sourceLimit && target<targetLimit);
   2446             } else /* offsets!=NULL */ {
   2447                 do {
   2448                     entry=stateTable[state][*source];
   2449                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2450                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2451                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2452 
   2453                         ++source;
   2454                         if( source<sourceLimit &&
   2455                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2456                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2457                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2458                         ) {
   2459                             ++source;
   2460                             *target++=c;
                                    /* this branch is only entered with offsets!=NULL, so the test is redundant here */
   2461                             if(offsets!=NULL) {
   2462                                 *offsets++=sourceIndex;
   2463                                 sourceIndex=(nextSourceIndex+=2);
   2464                             }
   2465                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2466                             offset=0;
   2467                         } else {
   2468                             /* set the state and leave the optimized loop */
   2469                             ++nextSourceIndex;
   2470                             bytes[0]=*(source-1);
   2471                             byteIndex=1;
   2472                             break;
   2473                         }
   2474                     } else {
   2475                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2476                             /* output BMP code point */
   2477                             ++source;
   2478                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
                                    /* this branch is only entered with offsets!=NULL, so the test is redundant here */
   2479                             if(offsets!=NULL) {
   2480                                 *offsets++=sourceIndex;
   2481                                 sourceIndex=++nextSourceIndex;
   2482                             }
   2483                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2484                         } else {
   2485                             /* leave the optimized loop */
   2486                             break;
   2487                         }
   2488                     }
   2489                 } while(source<sourceLimit && target<targetLimit);
   2490             }
   2491 
   2492             /*
   2493              * these tests and break statements could be put inside the loop
   2494              * if C had "break outerLoop" like Java
   2495              */
   2496             if(source>=sourceLimit) {
   2497                 break;
   2498             }
   2499             if(target>=targetLimit) {
   2500                 /* target is full */
   2501                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2502                 break;
   2503             }
   2504 
                    /*
                     * entry already holds the state-table entry for *source (it was
                     * looked up inside the optimized loop); only the byte itself
                     * still has to be consumed and recorded.
                     */
   2505             ++nextSourceIndex;
   2506             bytes[byteIndex++]=*source++;
   2507         } else /* byteIndex>0 */ {
   2508             ++nextSourceIndex;
   2509             entry=stateTable[state][bytes[byteIndex++]=*source++];
   2510         }
   2511 
   2512         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2513             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2514             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2515             continue;
   2516         }
   2517 
   2518         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2519         cnv->mode=state;
   2520 
   2521         /* set the next state early so that we can reuse the entry variable */
   2522         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2523 
   2524         /*
   2525          * An if-else-if chain provides more reliable performance for
   2526          * the most common cases compared to a switch.
   2527          */
                /*
                 * Convention for the chain below: a branch that produces output (or
                 * deliberately produces none) resets byteIndex to 0. A branch that
                 * leaves byteIndex>0 hands the bytes to the code after the chain,
                 * which treats them as illegal if *pErrorCode is a failure and as
                 * unassigned otherwise.
                 */
   2528         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2529         if(action==MBCS_STATE_VALID_16) {
   2530             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2531             c=unicodeCodeUnits[offset];
   2532             if(c<0xfffe) {
   2533                 /* output BMP code point */
   2534                 *target++=c;
   2535                 if(offsets!=NULL) {
   2536                     *offsets++=sourceIndex;
   2537                 }
   2538                 byteIndex=0;
   2539             } else if(c==0xfffe) {
                        /* entry is reused as scratch for the fallback lookup result; 0xfffe means none was found */
   2540                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2541                     /* output fallback BMP code point */
   2542                     *target++=(UChar)entry;
   2543                     if(offsets!=NULL) {
   2544                         *offsets++=sourceIndex;
   2545                     }
   2546                     byteIndex=0;
   2547                 }
   2548             } else {
   2549                 /* callback(illegal) */
   2550                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2551             }
   2552         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   2553             /* output BMP code point */
   2554             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2555             if(offsets!=NULL) {
   2556                 *offsets++=sourceIndex;
   2557             }
   2558             byteIndex=0;
   2559         } else if(action==MBCS_STATE_VALID_16_PAIR) {
                    /* two table units are read here: c is the first, unicodeCodeUnits[offset] (after ++) the second */
   2560             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2561             c=unicodeCodeUnits[offset++];
   2562             if(c<0xd800) {
   2563                 /* output BMP code point below 0xd800 */
   2564                 *target++=c;
   2565                 if(offsets!=NULL) {
   2566                     *offsets++=sourceIndex;
   2567                 }
   2568                 byteIndex=0;
   2569             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2570                 /* output roundtrip or fallback surrogate pair */
                        /* c&0xdbff maps 0xdc00..0xdfff onto 0xd800..0xdbff and leaves 0xd800..0xdbff unchanged */
   2571                 *target++=(UChar)(c&0xdbff);
   2572                 if(offsets!=NULL) {
   2573                     *offsets++=sourceIndex;
   2574                 }
   2575                 byteIndex=0;
   2576                 if(target<targetLimit) {
   2577                     *target++=unicodeCodeUnits[offset];
   2578                     if(offsets!=NULL) {
   2579                         *offsets++=sourceIndex;
   2580                     }
   2581                 } else {
   2582                     /* target overflow */
                            /* the second unit of the pair is parked in the converter's UChar error buffer */
   2583                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
   2584                     cnv->UCharErrorBufferLength=1;
   2585                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2586 
   2587                     offset=0;
   2588                     break;
   2589                 }
   2590             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2591                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2592                 *target++=unicodeCodeUnits[offset];
   2593                 if(offsets!=NULL) {
   2594                     *offsets++=sourceIndex;
   2595                 }
   2596                 byteIndex=0;
   2597             } else if(c==0xffff) {
   2598                 /* callback(illegal) */
   2599                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2600             }
                    /* any other value of c leaves byteIndex>0 without an error: handled as unassigned below */
   2601         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2602                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2603         ) {
                    /* entry is reduced to its value; bits above bit 9 go into the 0xd800 unit, the low 10 bits into the 0xdc00 unit */
   2604             entry=MBCS_ENTRY_FINAL_VALUE(entry);
   2605             /* output surrogate pair */
   2606             *target++=(UChar)(0xd800|(UChar)(entry>>10));
   2607             if(offsets!=NULL) {
   2608                 *offsets++=sourceIndex;
   2609             }
   2610             byteIndex=0;
   2611             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
   2612             if(target<targetLimit) {
   2613                 *target++=c;
   2614                 if(offsets!=NULL) {
   2615                     *offsets++=sourceIndex;
   2616                 }
   2617             } else {
   2618                 /* target overflow */
   2619                 cnv->UCharErrorBuffer[0]=c;
   2620                 cnv->UCharErrorBufferLength=1;
   2621                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   2622 
   2623                 offset=0;
   2624                 break;
   2625             }
   2626         } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2627             /*
   2628              * This serves as a state change without any output.
   2629              * It is useful for reading simple stateful encodings,
   2630              * for example using just Shift-In/Shift-Out codes.
   2631              * The 21 unused bits may later be used for more sophisticated
   2632              * state transitions.
   2633              */
   2634             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
   2635                 byteIndex=0;
   2636             } else {
   2637                 /* SI/SO are illegal for DBCS-only conversion */
   2638                 state=(uint8_t)(cnv->mode); /* restore the previous state */
   2639 
   2640                 /* callback(illegal) */
   2641                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2642             }
   2643         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2644             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2645                 /* output BMP code point */
   2646                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2647                 if(offsets!=NULL) {
   2648                     *offsets++=sourceIndex;
   2649                 }
   2650                 byteIndex=0;
   2651             }
   2652         } else if(action==MBCS_STATE_UNASSIGNED) {
   2653             /* just fall through */
   2654         } else if(action==MBCS_STATE_ILLEGAL) {
   2655             /* callback(illegal) */
   2656             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2657         } else {
   2658             /* reserved, must never occur */
   2659             byteIndex=0;
   2660         }
   2661 
   2662         /* end of action codes: prepare for a new character */
   2663         offset=0;
   2664 
   2665         if(byteIndex==0) {
   2666             sourceIndex=nextSourceIndex;
   2667         } else if(U_FAILURE(*pErrorCode)) {
   2668             /* callback(illegal) */
   2669             if(byteIndex>1) {
   2670                 /*
   2671                  * Ticket 5691: consistent illegal sequences:
   2672                  * - We include at least the first byte in the illegal sequence.
   2673                  * - If any of the non-initial bytes could be the start of a character,
   2674                  *   we stop the illegal sequence before the first one of those.
   2675                  */
   2676                 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   2677                 int8_t i;
   2678                 for(i=1;
   2679                     i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
   2680                     ++i) {}
   2681                 if(i<byteIndex) {
   2682                     /* Back out some bytes. */
   2683                     int8_t backOutDistance=byteIndex-i;
   2684                     int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
   2685                     byteIndex=i;  /* length of reported illegal byte sequence */
   2686                     if(backOutDistance<=bytesFromThisBuffer) {
   2687                         source-=backOutDistance;
   2688                     } else {
   2689                         /* Back out bytes from the previous buffer: Need to replay them. */
   2690                         cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
   2691                         /* preToULength is negative! */
   2692                         uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
   2693                         source=(const uint8_t *)pArgs->source;
   2694                     }
   2695                 }
   2696             }
   2697             break;
   2698         } else /* unassigned sequences indicated with byteIndex>0 */ {
   2699             /* try an extension mapping */
                    /* publish source first so the number of bytes _extToU() consumes can be measured afterwards */
   2700             pArgs->source=(const char *)source;
   2701             byteIndex=_extToU(cnv, cnv->sharedData,
   2702                               byteIndex, &source, sourceLimit,
   2703                               &target, targetLimit,
   2704                               &offsets, sourceIndex,
   2705                               pArgs->flush,
   2706                               pErrorCode);
   2707             sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
   2708 
   2709             if(U_FAILURE(*pErrorCode)) {
   2710                 /* not mappable or buffer overflow */
   2711                 break;
   2712             }
   2713         }
   2714     }
   2715 
   2716     /* set the converter state back into UConverter */
   2717     cnv->toUnicodeStatus=offset;
   2718     cnv->mode=state;
   2719     cnv->toULength=byteIndex;
   2720 
   2721     /* write back the updated pointers */
   2722     pArgs->source=(const char *)source;
   2723     pArgs->target=target;
   2724     pArgs->offsets=offsets;
   2725 }
   2726 
   2727 /*
   2728  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
   2729  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
   2730  */
   2731 static UChar32
   2732 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2733                         UErrorCode *pErrorCode) {
            /*
             * Single-character to-Unicode conversion where every input byte is looked up
             * in state 0 of the state table and treated as a final entry.
             *
             * pArgs       source/sourceLimit delimit the input; pArgs->source is advanced
             *             past each byte that is consumed here.
             * pErrorCode  receives U_ILLEGAL_CHAR_FOUND or U_INDEX_OUTOFBOUNDS_ERROR.
             *
             * Returns:
             * - the code point for a direct (or accepted fallback) mapping
             * - UCNV_GET_NEXT_UCHAR_USE_TO_U for a byte without a usable mapping;
             *   pArgs->source is moved back onto that byte so that the generic code
             *   can convert it again
             * - 0xffff with U_ILLEGAL_CHAR_FOUND for an illegal byte; the byte is stored
             *   in cnv->toUBytes and cnv->toULength is set to 1, like
             *   ucnv_MBCSGetNextUChar() does for illegal sequences
             * - 0xffff with U_INDEX_OUTOFBOUNDS_ERROR when the input is exhausted
             *   without producing output
             */
   2734     UConverter *cnv;
   2735     const int32_t (*stateTable)[256];
   2736     const uint8_t *source, *sourceLimit;
   2737
   2738     int32_t entry;
   2739     uint8_t action;
   2740
   2741     /* set up the local pointers */
   2742     cnv=pArgs->converter;
   2743     source=(const uint8_t *)pArgs->source;
   2744     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2745     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2746         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2747     } else {
   2748         stateTable=cnv->sharedData->mbcs.stateTable;
   2749     }
   2750
   2751     /* conversion loop */
   2752     while(source<sourceLimit) {
   2753         entry=stateTable[0][*source++];
   2754         /* MBCS_ENTRY_IS_FINAL(entry) */
   2755
   2756         /* write back the updated pointer early so that we can return directly */
   2757         pArgs->source=(const char *)source;
   2758
   2759         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   2760             /* output BMP code point */
   2761             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2762         }
   2763
   2764         /*
   2765          * An if-else-if chain provides more reliable performance for
   2766          * the most common cases compared to a switch.
   2767          */
   2768         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2769         if( action==MBCS_STATE_VALID_DIRECT_20 ||
   2770             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2771         ) {
   2772             /* output supplementary code point */
   2773             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2774         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2775             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2776                 /* output BMP code point */
   2777                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2778             }
   2779         } else if(action==MBCS_STATE_UNASSIGNED) {
   2780             /* just fall through */
   2781         } else if(action==MBCS_STATE_ILLEGAL) {
   2782             /* callback(illegal) */
   2783             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2784         } else {
   2785             /* reserved, must never occur */
   2786             continue;
   2787         }
   2788
   2789         if(U_FAILURE(*pErrorCode)) {
   2790             /* callback(illegal) */
                    /*
                     * Return right here: leaving the loop would reach the end-of-input
                     * code below, which overwrites the error with
                     * U_INDEX_OUTOFBOUNDS_ERROR.
                     * Record the offending byte for the caller's error handling.
                     */
                    cnv->toUBytes[0]=*(source-1);
                    cnv->toULength=1;
                    return 0xffff;
   2792         } else /* unassigned sequence */ {
   2793             /* defer to the generic implementation */
   2794             pArgs->source=(const char *)source-1;
   2795             return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2796         }
   2797     }
   2798
   2799     /* no output because of empty input or only reserved (skipped) entries */
   2800     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2801     return 0xffff;
   2802 }
   2803 
   2804 /*
   2805  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
   2806  * conversion without offset handling.
   2807  *
   2808  * When a character does not have a mapping to Unicode, then we return to the
   2809  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
   2810  * handling.
   2811  * We also defer to the generic code in other complicated cases and have them
   2812  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
   2813  *
   2814  * All normal mappings and errors are handled here.
         *
         * pArgs       supplies the converter and the source/sourceLimit byte range;
         *             pArgs->source is written back before returning.
         * pErrorCode  is set to U_TRUNCATED_CHAR_FOUND, U_ILLEGAL_CHAR_FOUND or
         *             U_INDEX_OUTOFBOUNDS_ERROR when no code point is produced here.
         *
         * Return value:
         * - the converted code point
         * - UCNV_GET_NEXT_UCHAR_USE_TO_U when the work is deferred to the generic code
         *   (pending preToU input, a codepage with UCNV_HAS_SURROGATES, or a byte
         *   sequence without a mapping; in the last case pArgs->source is reset to
         *   the start of that sequence)
         * - 0xffff together with one of the error codes above; for truncated and
         *   illegal input the bytes are copied to cnv->toUBytes and their count
         *   is stored in cnv->toULength
         *
         * Converters with a single state are handed to ucnv_MBCSSingleGetNextUChar().
   2815  */
   2816 static UChar32
   2817 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
   2818                   UErrorCode *pErrorCode) {
   2819     UConverter *cnv;
   2820     const uint8_t *source, *sourceLimit, *lastSource;
   2821
   2822     const int32_t (*stateTable)[256];
   2823     const uint16_t *unicodeCodeUnits;
   2824
   2825     uint32_t offset;
   2826     uint8_t state;
   2827
   2828     int32_t entry;
   2829     UChar32 c;
   2830     uint8_t action;
   2831
   2832     /* use optimized function if possible */
   2833     cnv=pArgs->converter;
   2834
   2835     if(cnv->preToULength>0) {
   2836         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
   2837         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2838     }
   2839
   2840     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
   2841         /*
   2842          * Using the generic ucnv_getNextUChar() code lets us deal correctly
   2843          * with the rare case of a codepage that maps single surrogates
   2844          * without adding the complexity to this already complicated function here.
   2845          */
   2846         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2847     } else if(cnv->sharedData->mbcs.countStates==1) {
   2848         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
   2849     }
   2850
   2851     /* set up the local pointers */
            /* lastSource stays on the first byte of the character that is currently being read */
   2852     source=lastSource=(const uint8_t *)pArgs->source;
   2853     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   2854
   2855     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   2856         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
   2857     } else {
   2858         stateTable=cnv->sharedData->mbcs.stateTable;
   2859     }
   2860     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
   2861
   2862     /* get the converter state from UConverter */
   2863     offset=cnv->toUnicodeStatus;
   2864
   2865     /*
   2866      * if we are in the SBCS state for a DBCS-only converter,
   2867      * then load the DBCS state from the MBCS data
   2868      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
   2869      */
   2870     if((state=(uint8_t)(cnv->mode))==0) {
   2871         state=cnv->sharedData->mbcs.dbcsOnlyState;
   2872     }
   2873
   2874     /* conversion loop */
            /* c keeps the U_SENTINEL value until a code point has been produced */
   2875     c=U_SENTINEL;
   2876     while(source<sourceLimit) {
   2877         entry=stateTable[state][*source++];
   2878         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   2879             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   2880             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   2881
   2882             /* optimization for 1/2-byte input and BMP output */
                    /*
                     * The next byte is only peeked at here; it is consumed (++source)
                     * only if it completes a BMP mapping. Otherwise the main loop reads
                     * the same byte again, from the state that was just set.
                     */
   2883             if( source<sourceLimit &&
   2884                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
   2885                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
   2886                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
   2887             ) {
   2888                 ++source;
   2889                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2890                 /* output BMP code point */
   2891                 break;
   2892             }
   2893         } else {
   2894             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
   2895             cnv->mode=state;
   2896
   2897             /* set the next state early so that we can reuse the entry variable */
   2898             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
   2899
   2900             /*
   2901              * An if-else-if chain provides more reliable performance for
   2902              * the most common cases compared to a switch.
   2903              */
   2904             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   2905             if(action==MBCS_STATE_VALID_DIRECT_16) {
   2906                 /* output BMP code point */
   2907                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2908                 break;
   2909             } else if(action==MBCS_STATE_VALID_16) {
   2910                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2911                 c=unicodeCodeUnits[offset];
   2912                 if(c<0xfffe) {
   2913                     /* output BMP code point */
   2914                     break;
   2915                 } else if(c==0xfffe) {
                            /* 0xfffe in the table: no roundtrip mapping, look for a fallback */
   2916                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
   2917                         break;
   2918                     }
   2919                 } else {
   2920                     /* callback(illegal) */
   2921                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2922                 }
   2923             } else if(action==MBCS_STATE_VALID_16_PAIR) {
                        /* the first unit selects how the second unit (at the incremented offset) is used */
   2924                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   2925                 c=unicodeCodeUnits[offset++];
   2926                 if(c<0xd800) {
   2927                     /* output BMP code point below 0xd800 */
   2928                     break;
   2929                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
   2930                     /* output roundtrip or fallback supplementary code point */
   2931                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
   2932                     break;
   2933                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
   2934                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   2935                     c=unicodeCodeUnits[offset];
   2936                     break;
   2937                 } else if(c==0xffff) {
   2938                     /* callback(illegal) */
   2939                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2940                 }
   2941             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
   2942                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
   2943             ) {
   2944                 /* output supplementary code point */
   2945                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
   2946                 break;
   2947             } else if(action==MBCS_STATE_CHANGE_ONLY) {
   2948                 /*
   2949                  * This serves as a state change without any output.
   2950                  * It is useful for reading simple stateful encodings,
   2951                  * for example using just Shift-In/Shift-Out codes.
   2952                  * The 21 unused bits may later be used for more sophisticated
   2953                  * state transitions.
   2954                  */
   2955                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
   2956                     /* SI/SO are illegal for DBCS-only conversion */
   2957                     state=(uint8_t)(cnv->mode); /* restore the previous state */
   2958
   2959                     /* callback(illegal) */
   2960                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2961                 }
   2962             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   2963                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
   2964                     /* output BMP code point */
   2965                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   2966                     break;
   2967                 }
   2968             } else if(action==MBCS_STATE_UNASSIGNED) {
   2969                 /* just fall through */
   2970             } else if(action==MBCS_STATE_ILLEGAL) {
   2971                 /* callback(illegal) */
   2972                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   2973             } else {
   2974                 /* reserved (must never occur), or only state change */
                        /* skip the bytes read so far and start a new character at the current position */
   2975                 offset=0;
   2976                 lastSource=source;
   2977                 continue;
   2978             }
   2979
   2980             /* end of action codes: prepare for a new character */
   2981             offset=0;
   2982
   2983             if(U_FAILURE(*pErrorCode)) {
   2984                 /* callback(illegal) */
   2985                 break;
   2986             } else /* unassigned sequence */ {
   2987                 /* defer to the generic implementation */
                        /* rewind to the first byte of this sequence so that it is converted again there */
   2988                 cnv->toUnicodeStatus=0;
   2989                 cnv->mode=state;
   2990                 pArgs->source=(const char *)lastSource;
   2991                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   2992             }
   2993         }
   2994     }
   2995
            /* c<0: no code point was produced (c was initialized to U_SENTINEL before the loop) */
   2996     if(c<0) {
   2997         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
   2998             /* incomplete character byte sequence */
   2999             uint8_t *bytes=cnv->toUBytes;
   3000             cnv->toULength=(int8_t)(source-lastSource);
   3001             do {
   3002                 *bytes++=*lastSource++;
   3003             } while(lastSource<source);
   3004             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3005         } else if(U_FAILURE(*pErrorCode)) {
   3006             /* callback(illegal) */
   3007             /*
   3008              * Ticket 5691: consistent illegal sequences:
   3009              * - We include at least the first byte in the illegal sequence.
   3010              * - If any of the non-initial bytes could be the start of a character,
   3011              *   we stop the illegal sequence before the first one of those.
   3012              */
   3013             UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
   3014             uint8_t *bytes=cnv->toUBytes;
   3015             *bytes++=*lastSource++;     /* first byte */
   3016             if(lastSource==source) {
   3017                 cnv->toULength=1;
   3018             } else /* lastSource<source: multi-byte character */ {
   3019                 int8_t i;
   3020                 for(i=1;
   3021                     lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
   3022                     ++i
   3023                 ) {
   3024                     *bytes++=*lastSource++;
   3025                 }
   3026                 cnv->toULength=i;
                        /* bytes after the reported illegal sequence are left unconsumed */
   3027                 source=lastSource;
   3028             }
   3029         } else {
   3030             /* no output because of empty input or only state changes */
   3031             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   3032         }
   3033         c=0xffff;
   3034     }
   3035
   3036     /* set the converter state back into UConverter, ready for a new character */
   3037     cnv->toUnicodeStatus=0;
   3038     cnv->mode=state;
   3039
   3040     /* write back the updated pointer */
   3041     pArgs->source=(const char *)source;
   3042     return c;
   3043 }
   3044 
   3045 #if 0
   3046 /*
   3047  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3048  * Removal improves code coverage.
   3049  */
   3050 /**
   3051  * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
   3052  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3053  * It does not handle conversion extensions (_extToU()).
         *
         * The byte b is looked up in state 0 of sharedData->mbcs.stateTable and the
         * entry is treated as final.
         *
         * Return value:
         * 0xfffe   unassigned, or a fallback mapping while fallbacks are not used
         * 0xffff   illegal or reserved action code
         * otherwise the Unicode code point
   3054  */
   3055 U_CFUNC UChar32
   3056 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
   3057                               uint8_t b, UBool useFallback) {
   3058     int32_t entry;
   3059     uint8_t action;
   3060
   3061     entry=sharedData->mbcs.stateTable[0][b];
   3062     /* MBCS_ENTRY_IS_FINAL(entry) */
   3063
   3064     if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
   3065         /* output BMP code point */
   3066         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3067     }
   3068
   3069     /*
   3070      * An if-else-if chain provides more reliable performance for
   3071      * the most common cases compared to a switch.
   3072      */
   3073     action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3074     if(action==MBCS_STATE_VALID_DIRECT_20) {
   3075         /* output supplementary code point */
   3076         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3077     } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3078         if(!TO_U_USE_FALLBACK(useFallback)) {
                    /* fallback not wanted: report as unassigned */
   3079             return 0xfffe;
   3080         }
   3081         /* output BMP code point */
   3082         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3083     } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3084         if(!TO_U_USE_FALLBACK(useFallback)) {
                    /* fallback not wanted: report as unassigned */
   3085             return 0xfffe;
   3086         }
   3087         /* output supplementary code point */
   3088         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3089     } else if(action==MBCS_STATE_UNASSIGNED) {
   3090         return 0xfffe;
   3091     } else if(action==MBCS_STATE_ILLEGAL) {
   3092         return 0xffff;
   3093     } else {
   3094         /* reserved, must never occur */
   3095         return 0xffff;
   3096     }
   3097 }
   3098 #endif
   3099 
   3100 /*
   3101  * This is a simple version of _MBCSGetNextUChar() that is used
   3102  * by other converter implementations.
   3103  * It only returns an "assigned" result if it consumes the entire input.
   3104  * It does not use state from the converter, nor error codes.
   3105  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   3106  * It handles conversion extensions but not GB 18030.
   3107  *
         * Parameters:
         * sharedData   supplies the state table, the Unicode code unit table, the
         *              initial (dbcsOnlyState) state and the extension indexes
         * source       the bytes of exactly one character
         * length       number of bytes at source; length<=0 yields U+ffff
         * useFallback  whether fallback mappings may be returned; used for the table
         *              lookups via TO_U_USE_FALLBACK() and passed on to
         *              ucnv_extSimpleMatchToU()
         *
   3108  * Return value:
   3109  * U+fffe   unassigned
   3110  * U+ffff   illegal
   3111  * otherwise the Unicode code point
         *
         * Truncated input, leftover input, state-change-only entries and reserved
         * action codes are all reported as U+ffff.
   3112  */
   3113 U_CFUNC UChar32
   3114 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
   3115                         const char *source, int32_t length,
   3116                         UBool useFallback) {
   3117     const int32_t (*stateTable)[256];
   3118     const uint16_t *unicodeCodeUnits;
   3119
   3120     uint32_t offset;
   3121     uint8_t state, action;
   3122
   3123     UChar32 c;
   3124     int32_t i, entry;
   3125
   3126     if(length<=0) {
   3127         /* no input at all: "illegal" */
   3128         return 0xffff;
   3129     }
   3130
   3131 #if 0
   3132 /*
   3133  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
   3134  * TODO In future releases, verify that this function is never called for SBCS
   3135  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
   3136  * Removal improves code coverage.
   3137  */
   3138     /* use optimized function if possible */
   3139     if(sharedData->mbcs.countStates==1) {
   3140         if(length==1) {
   3141             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
   3142         } else {
   3143             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
   3144         }
   3145     }
   3146 #endif
   3147
   3148     /* set up the local pointers */
   3149     stateTable=sharedData->mbcs.stateTable;
   3150     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
   3151
   3152     /* converter state */
   3153     offset=0;
   3154     state=sharedData->mbcs.dbcsOnlyState;
   3155
   3156     /* conversion loop */
            /*
             * The loop needs no explicit bound: every transition entry checks for the
             * end of the input, and every final entry leaves the loop.
             */
   3157     for(i=0;;) {
   3158         entry=stateTable[state][(uint8_t)source[i++]];
   3159         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   3160             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   3161             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   3162
   3163             if(i==length) {
   3164                 return 0xffff; /* truncated character */
   3165             }
   3166         } else {
   3167             /*
   3168              * An if-else-if chain provides more reliable performance for
   3169              * the most common cases compared to a switch.
   3170              */
   3171             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
   3172             if(action==MBCS_STATE_VALID_16) {
   3173                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3174                 c=unicodeCodeUnits[offset];
   3175                 if(c!=0xfffe) {
   3176                     /* done */
                        /*
                         * There is no converter object in this function: the fallback
                         * decision must come from the useFallback parameter, as in the
                         * FALLBACK_DIRECT branches below.
                         */
   3177                 } else if(TO_U_USE_FALLBACK(useFallback)) {
   3178                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
   3179                 /* else done with 0xfffe */
   3180                 }
   3181                 break;
   3182             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
   3183                 /* output BMP code point */
   3184                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3185                 break;
   3186             } else if(action==MBCS_STATE_VALID_16_PAIR) {
   3187                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
   3188                 c=unicodeCodeUnits[offset++];
   3189                 if(c<0xd800) {
   3190                     /* output BMP code point below 0xd800 */
   3191                 } else if(TO_U_USE_FALLBACK(useFallback) ? c<=0xdfff : c<=0xdbff) {
   3192                     /* output roundtrip or fallback supplementary code point */
   3193                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
   3194                 } else if(TO_U_USE_FALLBACK(useFallback) ? (c&0xfffe)==0xe000 : c==0xe000) {
   3195                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
   3196                     c=unicodeCodeUnits[offset];
   3197                 } else if(c==0xffff) {
   3198                     return 0xffff;
   3199                 } else {
   3200                     c=0xfffe;
   3201                 }
   3202                 break;
   3203             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
   3204                 /* output supplementary code point */
   3205                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3206                 break;
   3207             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
   3208                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3209                     c=0xfffe;
   3210                     break;
   3211                 }
   3212                 /* output BMP code point */
   3213                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
   3214                 break;
   3215             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
   3216                 if(!TO_U_USE_FALLBACK(useFallback)) {
   3217                     c=0xfffe;
   3218                     break;
   3219                 }
   3220                 /* output supplementary code point */
   3221                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
   3222                 break;
   3223             } else if(action==MBCS_STATE_UNASSIGNED) {
   3224                 c=0xfffe;
   3225                 break;
   3226             }
   3227
   3228             /*
   3229              * forbid MBCS_STATE_CHANGE_ONLY for this function,
   3230              * and MBCS_STATE_ILLEGAL and reserved action codes
   3231              */
   3232             return 0xffff;
   3233         }
   3234     }
   3235
   3236     if(i!=length) {
   3237         /* illegal for this function: not all input consumed */
   3238         return 0xffff;
   3239     }
   3240
   3241     if(c==0xfffe) {
   3242         /* try an extension mapping */
   3243         const int32_t *cx=sharedData->mbcs.extIndexes;
   3244         if(cx!=NULL) {
   3245             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
   3246         }
   3247     }
   3248
   3249     return c;
   3250 }
   3251 
   3252 /* MBCS-from-Unicode conversion functions ----------------------------------- */
   3253 
   3254 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
        /*
         * Converts UTF-16 from pArgs->source (up to pArgs->sourceLimit) into one or two
         * bytes per code point at pArgs->target (up to pArgs->targetLimit).
         * If pArgs->offsets is not NULL, one source index is written per output byte.
         *
         * A lead surrogate that is still waiting for its trail unit is carried across
         * calls in cnv->fromUChar32.
         * Code points without a usable table mapping are passed to _extFromU().
         *
         * On return, pArgs->source, pArgs->target and pArgs->offsets are updated and
         * *pErrorCode may have been set to U_ILLEGAL_CHAR_FOUND (unmatched surrogate),
         * U_BUFFER_OVERFLOW_ERROR (target full) or to a failure from _extFromU().
         */
   3255 static void
   3256 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3257                                   UErrorCode *pErrorCode) {
   3258     UConverter *cnv;
   3259     const UChar *source, *sourceLimit;
   3260     uint8_t *target;
   3261     int32_t targetCapacity;
   3262     int32_t *offsets;
   3263
   3264     const uint16_t *table;
   3265     const uint16_t *mbcsIndex;
   3266     const uint8_t *bytes;
   3267
   3268     UChar32 c;
   3269
   3270     int32_t sourceIndex, nextSourceIndex;
   3271
   3272     uint32_t stage2Entry;
   3273     uint32_t asciiRoundtrips;
   3274     uint32_t value;
   3275     uint8_t unicodeMask;
   3276
   3277     /* get the converter and the Unicode coverage flags of its codepage */
   3278     cnv=pArgs->converter;
   3279     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3280
   3281     /* set up the local pointers */
   3282     source=pArgs->source;
   3283     sourceLimit=pArgs->sourceLimit;
   3284     target=(uint8_t *)pArgs->target;
   3285     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3286     offsets=pArgs->offsets;
   3287
   3288     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3289     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3290     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3291         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3292     } else {
   3293         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   3294     }
   3295     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3296
   3297     /* get the converter state from UConverter */
   3298     c=cnv->fromUChar32;
   3299
   3300     /* sourceIndex=-1 if the current character began in the previous buffer */
   3301     sourceIndex= c==0 ? 0 : -1;
   3302     nextSourceIndex=0;
   3303
   3304     /* conversion loop */
            /* a pending lead surrogate from the previous call resumes inside the loop at getTrail */
   3305     if(c!=0 && targetCapacity>0) {
   3306         goto getTrail;
   3307     }
   3308
   3309     while(source<sourceLimit) {
   3310         /*
   3311          * This following test is to see if available input would overflow the output.
   3312          * It does not catch output of more than one byte that
   3313          * overflows as a result of a multi-byte character or callback output
   3314          * from the last source character.
   3315          * Therefore, those situations also test for overflows and will
   3316          * then break the loop, too.
   3317          */
   3318         if(targetCapacity>0) {
   3319             /*
   3320              * Get a correct Unicode code point:
   3321              * a single UChar for a BMP code point or
   3322              * a matched surrogate pair for a "supplementary code point".
   3323              */
   3324             c=*source++;
   3325             ++nextSourceIndex;
                    /* fast path: an ASCII roundtrip character is copied as a single byte */
   3326             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   3327                 *target++=(uint8_t)c;
   3328                 if(offsets!=NULL) {
   3329                     *offsets++=sourceIndex;
   3330                     sourceIndex=nextSourceIndex;
   3331                 }
   3332                 --targetCapacity;
   3333                 c=0;
   3334                 continue;
   3335             }
   3336             /*
   3337              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   3338              * to avoid dealing with surrogates.
   3339              * MBCS_FAST_MAX must be >=0xd7ff.
   3340              */
   3341             if(c<=0xd7ff) {
   3342                 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
   3343                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   3344                 if(value==0) {
   3345                     goto unassigned;
   3346                 }
   3347                 /* output the value */
   3348             } else {
   3349                 /*
   3350                  * This also tests if the codepage maps single surrogates.
   3351                  * If it does, then surrogates are not paired but mapped separately.
   3352                  * Note that in this case unmatched surrogates are not detected.
   3353                  */
   3354                 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3355                     if(UTF_IS_SURROGATE_FIRST(c)) {
   3356 getTrail:
   3357                         if(source<sourceLimit) {
   3358                             /* test the following code unit */
   3359                             UChar trail=*source;
   3360                             if(UTF_IS_SECOND_SURROGATE(trail)) {
   3361                                 ++source;
   3362                                 ++nextSourceIndex;
   3363                                 c=UTF16_GET_PAIR_VALUE(c, trail);
   3364                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3365                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3366                                     /* callback(unassigned) */
   3367                                     goto unassigned;
   3368                                 }
   3369                                 /* convert this supplementary code point */
   3370                                 /* exit this condition tree */
   3371                             } else {
   3372                                 /* this is an unmatched lead code unit (1st surrogate) */
   3373                                 /* callback(illegal) */
   3374                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3375                                 break;
   3376                             }
   3377                         } else {
   3378                             /* no more input */
                                    /* c keeps the lead surrogate; it is stored in cnv->fromUChar32 after the loop */
   3379                             break;
   3380                         }
   3381                     } else {
   3382                         /* this is an unmatched trail code unit (2nd surrogate) */
   3383                         /* callback(illegal) */
   3384                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3385                         break;
   3386                     }
   3387                 }
   3388
   3389                 /* convert the Unicode code point in c into codepage bytes */
   3390                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   3391
   3392                 /* get the bytes and the length for the output */
   3393                 /* MBCS_OUTPUT_2 */
   3394                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   3395
   3396                 /* is this code point assigned, or do we use fallbacks? */
   3397                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   3398                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   3399                 ) {
   3400                     /*
   3401                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   3402                      * There is no way with this data structure for fallback output
   3403                      * to be a zero byte.
   3404                      */
   3405
   3406 unassigned:
   3407                     /* try an extension mapping */
                            /* pArgs->source remembers the position before the call so that the consumed units can be counted */
   3408                     pArgs->source=source;
   3409                     c=_extFromU(cnv, cnv->sharedData,
   3410                                 c, &source, sourceLimit,
   3411                                 &target, target+targetCapacity,
   3412                                 &offsets, sourceIndex,
   3413                                 pArgs->flush,
   3414                                 pErrorCode);
   3415                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   3416
   3417                     if(U_FAILURE(*pErrorCode)) {
   3418                         /* not mappable or buffer overflow */
   3419                         break;
   3420                     } else {
   3421                         /* a mapping was written to the target, continue */
   3422
   3423                         /* recalculate the targetCapacity after an extension mapping */
   3424                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3425
   3426                         /* normal end of conversion: prepare for a new character */
   3427                         sourceIndex=nextSourceIndex;
   3428                         continue;
   3429                     }
   3430                 }
   3431             }
   3432
   3433             /* write the output character bytes from value and length */
   3434             /* from the first if in the loop we know that targetCapacity>0 */
   3435             if(value<=0xff) {
   3436                 /* this is easy because we know that there is enough space */
   3437                 *target++=(uint8_t)value;
   3438                 if(offsets!=NULL) {
   3439                     *offsets++=sourceIndex;
   3440                 }
   3441                 --targetCapacity;
   3442             } else /* length==2 */ {
   3443                 *target++=(uint8_t)(value>>8);
   3444                 if(2<=targetCapacity) {
   3445                     *target++=(uint8_t)value;
   3446                     if(offsets!=NULL) {
   3447                         *offsets++=sourceIndex;
   3448                         *offsets++=sourceIndex;
   3449                     }
   3450                     targetCapacity-=2;
   3451                 } else {
                            /* only the lead byte fit: keep the second byte in the converter's charErrorBuffer */
   3452                     if(offsets!=NULL) {
   3453                         *offsets++=sourceIndex;
   3454                     }
   3455                     cnv->charErrorBuffer[0]=(char)value;
   3456                     cnv->charErrorBufferLength=1;
   3457
   3458                     /* target overflow */
   3459                     targetCapacity=0;
   3460                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3461                     c=0;
   3462                     break;
   3463                 }
   3464             }
   3465
   3466             /* normal end of conversion: prepare for a new character */
   3467             c=0;
   3468             sourceIndex=nextSourceIndex;
   3469             continue;
   3470         } else {
   3471             /* target is full */
   3472             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3473             break;
   3474         }
   3475     }
   3476
   3477     /* set the converter state back into UConverter */
   3478     cnv->fromUChar32=c;
   3479
   3480     /* write back the updated pointers */
   3481     pArgs->source=source;
   3482     pArgs->target=(char *)target;
   3483     pArgs->offsets=offsets;
   3484 }
   3485 
   3486 /*
         * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
         *
         * Converts the UTF-16 text between pArgs->source and pArgs->sourceLimit and
         * writes the result to pArgs->target, never past pArgs->targetLimit.
         * A code point whose table result is at least minValue yields exactly one
         * output byte; every other code point is handed to _extFromU().
         * If pArgs->offsets is not NULL, the source index of the code point is stored
         * for each byte written directly by this function.
         *
         * On return:
         * - cnv->fromUChar32 holds the final value of c, for example a lead surrogate
         *   that ended the input; the next call picks it up again from there.
         * - pArgs->source, pArgs->target and pArgs->offsets are advanced.
         * - *pErrorCode is set to U_ILLEGAL_CHAR_FOUND after an unmatched surrogate,
         *   to U_BUFFER_OVERFLOW_ERROR if input remains while the target is full,
         *   or to the failure code reported by _extFromU().
         */
   3487 static void
   3488 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3489                                   UErrorCode *pErrorCode) {
   3490     UConverter *cnv;
   3491     const UChar *source, *sourceLimit;
   3492     uint8_t *target;
   3493     int32_t targetCapacity;
   3494     int32_t *offsets;
   3495 
   3496     const uint16_t *table;
   3497     const uint16_t *results;
   3498 
   3499     UChar32 c;
   3500 
   3501     int32_t sourceIndex, nextSourceIndex; /* index of the current code point's first unit / of the next unread unit */
   3502 
   3503     uint16_t value, minValue; /* table result for c / smallest result that is accepted as a mapping */
   3504     UBool hasSupplementary;
   3505 
   3506     /* set up the local pointers */
   3507     cnv=pArgs->converter;
   3508     source=pArgs->source;
   3509     sourceLimit=pArgs->sourceLimit;
   3510     target=(uint8_t *)pArgs->target;
   3511     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3512     offsets=pArgs->offsets;
   3513 
   3514     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3515     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { /* the swap-LF/NL option selects the alternate result array */
   3516         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3517     } else {
   3518         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3519     }
   3520 
   3521     if(cnv->useFallback) {
   3522         /* use all roundtrip and fallback results */
   3523         minValue=0x800;
   3524     } else {
   3525         /* use only roundtrips and fallbacks from private-use characters */
   3526         minValue=0xc00;
   3527     }
   3528     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); /* nonzero if the mask has the supplementary bit set */
   3529 
   3530     /* get the converter state from UConverter */
   3531     c=cnv->fromUChar32;
   3532 
   3533     /* sourceIndex=-1 if the current character began in the previous buffer */
   3534     sourceIndex= c==0 ? 0 : -1;
   3535     nextSourceIndex=0;
   3536 
   3537     /*
             * conversion loop
             *
             * A nonzero c saved by the previous call is treated as a pending lead
             * surrogate: if there is room for output, jump into the loop body at
             * getTrail to look for its trail surrogate at the start of this buffer.
             */
   3538     if(c!=0 && targetCapacity>0) {
   3539         goto getTrail;
   3540     }
   3541 
   3542     while(source<sourceLimit) {
   3543         /*
   3544          * This following test is to see if available input would overflow the output.
   3545          * It does not catch output of more than one byte that
   3546          * overflows as a result of a multi-byte character or callback output
   3547          * from the last source character.
   3548          * Therefore, those situations also test for overflows and will
   3549          * then break the loop, too.
   3550          */
   3551         if(targetCapacity>0) {
   3552             /*
   3553              * Get a correct Unicode code point:
   3554              * a single UChar for a BMP code point or
   3555              * a matched surrogate pair for a "supplementary code point".
   3556              */
   3557             c=*source++;
   3558             ++nextSourceIndex;
   3559             if(UTF_IS_SURROGATE(c)) {
   3560                 if(UTF_IS_SURROGATE_FIRST(c)) {
   3561 getTrail:
   3562                     if(source<sourceLimit) {
   3563                         /* test the following code unit */
   3564                         UChar trail=*source;
   3565                         if(UTF_IS_SECOND_SURROGATE(trail)) {
   3566                             ++source;
   3567                             ++nextSourceIndex;
   3568                             c=UTF16_GET_PAIR_VALUE(c, trail);
   3569                             if(!hasSupplementary) {
   3570                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   3571                                 /* callback(unassigned) */
   3572                                 goto unassigned;
   3573                             }
   3574                             /* convert this supplementary code point */
   3575                             /* exit this condition tree */
   3576                         } else {
   3577                             /* this is an unmatched lead code unit (1st surrogate) */
   3578                             /* callback(illegal) */
   3579                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3580                             break;
   3581                         }
   3582                     } else {
   3583                         /* no more input: c still holds the lead surrogate and is saved as converter state after the loop */
   3584                         break;
   3585                     }
   3586                 } else {
   3587                     /* this is an unmatched trail code unit (2nd surrogate) */
   3588                     /* callback(illegal) */
   3589                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3590                     break;
   3591                 }
   3592             }
   3593 
   3594             /* convert the Unicode code point in c into codepage bytes */
   3595             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3596 
   3597             /* is this code point assigned, or do we use fallbacks? */
   3598             if(value>=minValue) {
   3599                 /* assigned, write the output character bytes from value and length */
   3600                 /* length==1 */
   3601                 /* this is easy because we know that there is enough space */
   3602                 *target++=(uint8_t)value; /* only the low byte of the table result is output */
   3603                 if(offsets!=NULL) {
   3604                     *offsets++=sourceIndex;
   3605                 }
   3606                 --targetCapacity;
   3607 
   3608                 /* normal end of conversion: prepare for a new character */
   3609                 c=0;
   3610                 sourceIndex=nextSourceIndex;
   3611             } else { /* unassigned */
   3612 unassigned:
   3613                 /*
                         * try an extension mapping
                         * (also reached by goto, with a supplementary code point in c,
                         * when hasSupplementary is false)
                         */
   3614                 pArgs->source=source; /* remember this position so that the units consumed by _extFromU() can be counted below */
   3615                 c=_extFromU(cnv, cnv->sharedData,
   3616                             c, &source, sourceLimit,
   3617                             &target, target+targetCapacity,
   3618                             &offsets, sourceIndex,
   3619                             pArgs->flush,
   3620                             pErrorCode);
   3621                 nextSourceIndex+=(int32_t)(source-pArgs->source); /* account for any additional source units that _extFromU() consumed */
   3622 
   3623                 if(U_FAILURE(*pErrorCode)) {
   3624                     /* not mappable or buffer overflow */
   3625                     break;
   3626                 } else {
   3627                     /* a mapping was written to the target, continue */
   3628 
   3629                     /* recalculate the targetCapacity after an extension mapping */
   3630                     targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3631 
   3632                     /* normal end of conversion: prepare for a new character */
   3633                     sourceIndex=nextSourceIndex;
   3634                 }
   3635             }
   3636         } else {
   3637             /* target is full */
   3638             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3639             break;
   3640         }
   3641     }
   3642 
   3643     /* set the converter state back into UConverter */
   3644     cnv->fromUChar32=c;
   3645 
   3646     /* write back the updated pointers */
   3647     pArgs->source=source;
   3648     pArgs->target=(char *)target;
   3649     pArgs->offsets=offsets;
   3650 }
   3651 
   3652 /*
   3653  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
   3654  * that map only to and from the BMP.
   3655  * In addition to single-byte/state optimizations, the offset calculations
   3656  * become much easier.
   3657  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
   3658  * but measurements have shown that this diminishes performance
   3659  * in more cases than it improves it.
   3660  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
   3661  * for various MBCS and SBCS optimizations.
         *
         * Overview of the implementation:
         * - The loop counter is the minimum of the remaining source length and the
         *   remaining target capacity, because each directly mapped code unit
         *   produces exactly one byte.
         * - Offsets are not written per character. They are written in runs, counted
         *   from lastSource, just before each _extFromU() call and once more after
         *   the loop.
         * - A code unit with no usable table result is passed to _extFromU();
         *   a matched surrogate pair is combined first and passed the same way.
         * - An unmatched surrogate sets U_ILLEGAL_CHAR_FOUND. A lead surrogate at the
         *   end of the input sets U_TRUNCATED_CHAR_FOUND only if pArgs->flush is set;
         *   otherwise it is kept in cnv->fromUChar32 for the next call.
         * - U_BUFFER_OVERFLOW_ERROR is set after the loop if input remains and the
         *   target has reached pArgs->targetLimit.
         *
         * On return, cnv->fromUChar32 and pArgs->source, pArgs->target and
         * pArgs->offsets are updated.
   3662  */
   3663 static void
   3664 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3665                               UErrorCode *pErrorCode) {
   3666     UConverter *cnv;
   3667     const UChar *source, *sourceLimit, *lastSource; /* lastSource: first source unit whose offset has not been written yet */
   3668     uint8_t *target;
   3669     int32_t targetCapacity, length; /* length: first the number of source units, later the number of UTF-16 units of c */
   3670     int32_t *offsets;
   3671 
   3672     const uint16_t *table;
   3673     const uint16_t *results;
   3674 
   3675     UChar32 c;
   3676 
   3677     int32_t sourceIndex;
   3678 
   3679     uint32_t asciiRoundtrips;
   3680     uint16_t value, minValue; /* table result for c / smallest result that is accepted as a mapping */
   3681 
   3682     /* set up the local pointers */
   3683     cnv=pArgs->converter;
   3684     source=pArgs->source;
   3685     sourceLimit=pArgs->sourceLimit;
   3686     target=(uint8_t *)pArgs->target;
   3687     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3688     offsets=pArgs->offsets;
   3689 
   3690     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3691     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { /* the swap-LF/NL option selects the alternate result array */
   3692         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3693     } else {
   3694         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   3695     }
   3696     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3697 
   3698     if(cnv->useFallback) {
   3699         /* use all roundtrip and fallback results */
   3700         minValue=0x800;
   3701     } else {
   3702         /* use only roundtrips and fallbacks from private-use characters */
   3703         minValue=0xc00;
   3704     }
   3705 
   3706     /* get the converter state from UConverter */
   3707     c=cnv->fromUChar32;
   3708 
   3709     /* sourceIndex=-1 if the current character began in the previous buffer */
   3710     sourceIndex= c==0 ? 0 : -1;
   3711     lastSource=source;
   3712 
   3713     /*
   3714      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
   3715      * for the minimum of the sourceLength and targetCapacity
             * (from here on targetCapacity is that minimum, not the free space in the target)
   3716      */
   3717     length=(int32_t)(sourceLimit-source);
   3718     if(length<targetCapacity) {
   3719         targetCapacity=length;
   3720     }
   3721 
   3722     /*
             * conversion loop
             *
             * A nonzero c saved by the previous call is treated as a pending lead
             * surrogate: if the counter allows, jump into the loop body at getTrail
             * to look for its trail surrogate at the start of this buffer.
             */
   3723     if(c!=0 && targetCapacity>0) {
   3724         goto getTrail;
   3725     }
   3726 
   3727 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3728     /* unrolling makes it slower on Pentium III/Windows 2000?! */
   3729     /*
             * unroll the loop with the most common case:
             * convert four code units per iteration and write their bytes right away;
             * the four table results are ANDed together, and if the combination is
             * below minValue the group is undone and left to the loop below.
             * NOTE(review): this presumably relies on the bit layout of the table
             * results so that the AND stays >= minValue only if all four are
             * usable; confirm against the table format.
             */
   3730 unrolled:
   3731     if(targetCapacity>=4) {
   3732         int32_t count, loops;
   3733         uint16_t andedValues;
   3734 
   3735         loops=count=targetCapacity>>2;
   3736         do {
   3737             c=*source++;
   3738             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3739             *target++=(uint8_t)value;
   3740             c=*source++;
   3741             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3742             *target++=(uint8_t)value;
   3743             c=*source++;
   3744             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3745             *target++=(uint8_t)value;
   3746             c=*source++;
   3747             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3748             *target++=(uint8_t)value;
   3749 
   3750             /* were all 4 entries really valid? */
   3751             if(andedValues<minValue) {
   3752                 /* no, return to the first of these 4 */
   3753                 source-=4;
   3754                 target-=4;
   3755                 break;
   3756             }
   3757         } while(--count>0);
   3758         count=loops-count; /* number of 4-unit groups that were fully converted */
   3759         targetCapacity-=4*count;
   3760 
   3761         if(offsets!=NULL) {
   3762             lastSource+=4*count; /* the offsets for these groups are written right here */
   3763             while(count>0) {
   3764                 *offsets++=sourceIndex++;
   3765                 *offsets++=sourceIndex++;
   3766                 *offsets++=sourceIndex++;
   3767                 *offsets++=sourceIndex++;
   3768                 --count;
   3769             }
   3770         }
   3771 
   3772         c=0;
   3773     }
   3774 #endif
   3775 
   3776     while(targetCapacity>0) {
   3777         /*
   3778          * Get a correct Unicode code point:
   3779          * a single UChar for a BMP code point or
   3780          * a matched surrogate pair for a "supplementary code point".
   3781          */
   3782         c=*source++;
   3783         /*
   3784          * Do not immediately check for single surrogates:
   3785          * Assume that they are unassigned and check for them in that case.
   3786          * This speeds up the conversion of assigned characters.
   3787          */
   3788         /* convert the Unicode code point in c into codepage bytes */
   3789         if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { /* fast path: the output byte is the code unit itself; its offset is written later */
   3790             *target++=(uint8_t)c;
   3791             --targetCapacity;
   3792             c=0;
   3793             continue;
   3794         }
   3795         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   3796         /* is this code point assigned, or do we use fallbacks? */
   3797         if(value>=minValue) {
   3798             /* assigned, write the output character bytes from value and length */
   3799             /* length==1 */
   3800             /* this is easy because we know that there is enough space */
   3801             *target++=(uint8_t)value;
   3802             --targetCapacity;
   3803 
   3804             /* normal end of conversion: prepare for a new character */
   3805             c=0;
   3806             continue;
   3807         } else if(!UTF_IS_SURROGATE(c)) {
   3808             /* normal, unassigned BMP character */
   3809         } else if(UTF_IS_SURROGATE_FIRST(c)) {
   3810 getTrail:
   3811             if(source<sourceLimit) {
   3812                 /* test the following code unit */
   3813                 UChar trail=*source;
   3814                 if(UTF_IS_SECOND_SURROGATE(trail)) {
   3815                     ++source;
   3816                     c=UTF16_GET_PAIR_VALUE(c, trail);
   3817                     /* this codepage does not map supplementary code points */
   3818                     /* callback(unassigned) */
   3819                 } else {
   3820                     /* this is an unmatched lead code unit (1st surrogate) */
   3821                     /* callback(illegal) */
   3822                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3823                     break;
   3824                 }
   3825             } else {
   3826                 /*
                         * no more input:
                         * with flush set, the lone lead surrogate is reported as a truncated character;
                         * otherwise c is kept and saved as converter state after the loop
                         */
   3827                 if (pArgs->flush) {
   3828                     *pErrorCode=U_TRUNCATED_CHAR_FOUND;
   3829                 }
   3830                 break;
   3831             }
   3832         } else {
   3833             /* this is an unmatched trail code unit (2nd surrogate) */
   3834             /* callback(illegal) */
   3835             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   3836             break;
   3837         }
   3838 
   3839         /* c does not have a mapping */
   3840 
   3841         /* get the number of code units for c to correctly advance sourceIndex */
   3842         length=U16_LENGTH(c);
   3843 
   3844         /* set offsets since the start or the last extension */
   3845         if(offsets!=NULL) {
   3846             int32_t count=(int32_t)(source-lastSource);
   3847 
   3848             /* do not set the offset for this character */
   3849             count-=length;
   3850 
   3851             while(count>0) {
   3852                 *offsets++=sourceIndex++;
   3853                 --count;
   3854             }
   3855             /* offsets and sourceIndex are now set for the current character */
   3856         }
   3857 
   3858         /* try an extension mapping */
   3859         lastSource=source; /* remember this position so that the units consumed by _extFromU() can be counted below */
   3860         c=_extFromU(cnv, cnv->sharedData,
   3861                     c, &source, sourceLimit,
   3862                     &target, (const uint8_t *)(pArgs->targetLimit),
   3863                     &offsets, sourceIndex,
   3864                     pArgs->flush,
   3865                     pErrorCode);
   3866         sourceIndex+=length+(int32_t)(source-lastSource); /* skip the current character plus any additional units that _extFromU() consumed */
   3867         lastSource=source;
   3868 
   3869         if(U_FAILURE(*pErrorCode)) {
   3870             /* not mappable or buffer overflow */
   3871             break;
   3872         } else {
   3873             /* a mapping was written to the target, continue */
   3874 
   3875             /* recalculate the targetCapacity after an extension mapping */
   3876             targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   3877             length=(int32_t)(sourceLimit-source); /* clamp the counter to the remaining source length again */
   3878             if(length<targetCapacity) {
   3879                 targetCapacity=length;
   3880             }
   3881         }
   3882 
   3883 #if MBCS_UNROLL_SINGLE_FROM_BMP
   3884         /* unrolling makes it slower on Pentium III/Windows 2000?! */
   3885         goto unrolled;
   3886 #endif
   3887     }
   3888 
   3889     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { /* the loop counter was clamped to the source length, so a full target is detected separately here */
   3890         /* target is full */
   3891         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   3892     }
   3893 
   3894     /* set offsets since the start or the last callback */
   3895     if(offsets!=NULL) {
   3896         size_t count=source-lastSource;
   3897         if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
   3898             /*
   3899             Caller gave us a partial supplementary character,
   3900             which this function couldn't convert in any case.
   3901             The callback will handle the offset.
   3902             */
   3903             count--;
   3904         }
   3905         while(count>0) {
   3906             *offsets++=sourceIndex++;
   3907             --count;
   3908         }
   3909     }
   3910 
   3911     /* set the converter state back into UConverter */
   3912     cnv->fromUChar32=c;
   3913 
   3914     /* write back the updated pointers */
   3915     pArgs->source=source;
   3916     pArgs->target=(char *)target;
   3917     pArgs->offsets=offsets;
   3918 }
   3919 
   3920 U_CFUNC void
   3921 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   3922                             UErrorCode *pErrorCode) {
   3923     UConverter *cnv;
   3924     const UChar *source, *sourceLimit;
   3925     uint8_t *target;
   3926     int32_t targetCapacity;
   3927     int32_t *offsets;
   3928 
   3929     const uint16_t *table;
   3930     const uint16_t *mbcsIndex;
   3931     const uint8_t *p, *bytes;
   3932     uint8_t outputType;
   3933 
   3934     UChar32 c;
   3935 
   3936     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
   3937 
   3938     uint32_t stage2Entry;
   3939     uint32_t asciiRoundtrips;
   3940     uint32_t value;
   3941     uint8_t si_value[2], so_value[2], si_value_length, so_value_length;
   3942     int32_t length, prevLength;
   3943     uint8_t unicodeMask;
   3944 
   3945     cnv=pArgs->converter;
   3946 
   3947     if(cnv->preFromUFirstCP>=0) {
   3948         /*
   3949          * pass sourceIndex=-1 because we continue from an earlier buffer
   3950          * in the future, this may change with continuous offsets
   3951          */
   3952         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
   3953 
   3954         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
   3955             return;
   3956         }
   3957     }
   3958 
   3959     /* use optimized function if possible */
   3960     outputType=cnv->sharedData->mbcs.outputType;
   3961     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
   3962     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   3963         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   3964             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
   3965         } else {
   3966             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
   3967         }
   3968         return;
   3969     } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
   3970         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
   3971         return;
   3972     }
   3973 
   3974     /* set up the local pointers */
   3975     source=pArgs->source;
   3976     sourceLimit=pArgs->sourceLimit;
   3977     target=(uint8_t *)pArgs->target;
   3978     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
   3979     offsets=pArgs->offsets;
   3980 
   3981     table=cnv->sharedData->mbcs.fromUnicodeTable;
   3982     if(cnv->sharedData->mbcs.utf8Friendly) {
   3983         mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   3984     } else {
   3985         mbcsIndex=NULL;
   3986     }
   3987     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   3988         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   3989     } else {
   3990         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
   3991     }
   3992     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   3993 
   3994     /* get the converter state from UConverter */
   3995     c=cnv->fromUChar32;
   3996 
   3997     if(outputType==MBCS_OUTPUT_2_SISO) {
   3998         prevLength=cnv->fromUnicodeStatus;
   3999         if(prevLength==0) {
   4000             /* set the real value */
   4001             prevLength=1;
   4002         }
   4003     } else {
   4004         /* prevent fromUnicodeStatus from being set to something non-0 */
   4005         prevLength=0;
   4006     }
   4007 
   4008     /* sourceIndex=-1 if the current character began in the previous buffer */
   4009     prevSourceIndex=-1;
   4010     sourceIndex= c==0 ? 0 : -1;
   4011     nextSourceIndex=0;
   4012 
   4013     /* Get the SI/SO character for the converter */
   4014     si_value_length = getSISOBytes(SI, cnv->options, si_value);
   4015     so_value_length = getSISOBytes(SO, cnv->options, so_value);
   4016 
   4017     /* conversion loop */
   4018     /*
   4019      * This is another piece of ugly code:
   4020      * A goto into the loop if the converter state contains a first surrogate
   4021      * from the previous function call.
   4022      * It saves me to check in each loop iteration a check of if(c==0)
   4023      * and duplicating the trail-surrogate-handling code in the else
   4024      * branch of that check.
   4025      * I could not find any other way to get around this other than
   4026      * using a function call for the conversion and callback, which would
   4027      * be even more inefficient.
   4028      *
   4029      * Markus Scherer 2000-jul-19
   4030      */
   4031     if(c!=0 && targetCapacity>0) {
   4032         goto getTrail;
   4033     }
   4034 
   4035     while(source<sourceLimit) {
   4036         /*
   4037          * This following test is to see if available input would overflow the output.
   4038          * It does not catch output of more than one byte that
   4039          * overflows as a result of a multi-byte character or callback output
   4040          * from the last source character.
   4041          * Therefore, those situations also test for overflows and will
   4042          * then break the loop, too.
   4043          */
   4044         if(targetCapacity>0) {
   4045             /*
   4046              * Get a correct Unicode code point:
   4047              * a single UChar for a BMP code point or
   4048              * a matched surrogate pair for a "supplementary code point".
   4049              */
   4050             c=*source++;
   4051             ++nextSourceIndex;
   4052             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
   4053                 *target++=(uint8_t)c;
   4054                 if(offsets!=NULL) {
   4055                     *offsets++=sourceIndex;
   4056                     prevSourceIndex=sourceIndex;
   4057                     sourceIndex=nextSourceIndex;
   4058                 }
   4059                 --targetCapacity;
   4060                 c=0;
   4061                 continue;
   4062             }
   4063             /*
   4064              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
   4065              * to avoid dealing with surrogates.
   4066              * MBCS_FAST_MAX must be >=0xd7ff.
   4067              */
   4068             if(c<=0xd7ff && mbcsIndex!=NULL) {
   4069                 value=mbcsIndex[c>>6];
   4070 
   4071                 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
   4072                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
   4073                 switch(outputType) {
   4074                 case MBCS_OUTPUT_2:
   4075                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4076                     if(value<=0xff) {
   4077                         if(value==0) {
   4078                             goto unassigned;
   4079                         } else {
   4080                             length=1;
   4081                         }
   4082                     } else {
   4083                         length=2;
   4084                     }
   4085                     break;
   4086                 case MBCS_OUTPUT_2_SISO:
   4087                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4088                     /*
   4089                      * Save the old state in the converter object
   4090                      * right here, then change the local prevLength state variable if necessary.
   4091                      * Then, if this character turns out to be unassigned or a fallback that
   4092                      * is not taken, the callback code must not save the new state in the converter
   4093                      * because the new state is for a character that is not output.
   4094                      * However, the callback must still restore the state from the converter
   4095                      * in case the callback function changed it for its output.
   4096                      */
   4097                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4098                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4099                     if(value<=0xff) {
   4100                         if(value==0) {
   4101                             goto unassigned;
   4102                         } else if(prevLength<=1) {
   4103                             length=1;
   4104                         } else {
   4105                             /* change from double-byte mode to single-byte */
   4106                             if (si_value_length == 1) {
   4107                                 value|=(uint32_t)si_value[0]<<8;
   4108                                 length = 2;
   4109                             } else if (si_value_length == 2) {
   4110                                 value|=(uint32_t)si_value[1]<<8;
   4111                                 value|=(uint32_t)si_value[0]<<16;
   4112                                 length = 3;
   4113                             }
   4114                             prevLength=1;
   4115                         }
   4116                     } else {
   4117                         if(prevLength==2) {
   4118                             length=2;
   4119                         } else {
   4120                             /* change from single-byte mode to double-byte */
   4121                             if (so_value_length == 1) {
   4122                                 value|=(uint32_t)so_value[0]<<16;
   4123                                 length = 3;
   4124                             } else if (so_value_length == 2) {
   4125                                 value|=(uint32_t)so_value[1]<<16;
   4126                                 value|=(uint32_t)so_value[0]<<24;
   4127                                 length = 4;
   4128                             }
   4129                             prevLength=2;
   4130                         }
   4131                     }
   4132                     break;
   4133                 case MBCS_OUTPUT_DBCS_ONLY:
   4134                     /* table with single-byte results, but only DBCS mappings used */
   4135                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4136                     if(value<=0xff) {
   4137                         /* no mapping or SBCS result, not taken for DBCS-only */
   4138                         goto unassigned;
   4139                     } else {
   4140                         length=2;
   4141                     }
   4142                     break;
   4143                 case MBCS_OUTPUT_3:
   4144                     p=bytes+(value+(c&0x3f))*3;
   4145                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4146                     if(value<=0xff) {
   4147                         if(value==0) {
   4148                             goto unassigned;
   4149                         } else {
   4150                             length=1;
   4151                         }
   4152                     } else if(value<=0xffff) {
   4153                         length=2;
   4154                     } else {
   4155                         length=3;
   4156                     }
   4157                     break;
   4158                 case MBCS_OUTPUT_4:
   4159                     value=((const uint32_t *)bytes)[value +(c&0x3f)];
   4160                     if(value<=0xff) {
   4161                         if(value==0) {
   4162                             goto unassigned;
   4163                         } else {
   4164                             length=1;
   4165                         }
   4166                     } else if(value<=0xffff) {
   4167                         length=2;
   4168                     } else if(value<=0xffffff) {
   4169                         length=3;
   4170                     } else {
   4171                         length=4;
   4172                     }
   4173                     break;
   4174                 case MBCS_OUTPUT_3_EUC:
   4175                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
   4176                     /* EUC 16-bit fixed-length representation */
   4177                     if(value<=0xff) {
   4178                         if(value==0) {
   4179                             goto unassigned;
   4180                         } else {
   4181                             length=1;
   4182                         }
   4183                     } else if((value&0x8000)==0) {
   4184                         value|=0x8e8000;
   4185                         length=3;
   4186                     } else if((value&0x80)==0) {
   4187                         value|=0x8f0080;
   4188                         length=3;
   4189                     } else {
   4190                         length=2;
   4191                     }
   4192                     break;
   4193                 case MBCS_OUTPUT_4_EUC:
   4194                     p=bytes+(value+(c&0x3f))*3;
   4195                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4196                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4197                     if(value<=0xff) {
   4198                         if(value==0) {
   4199                             goto unassigned;
   4200                         } else {
   4201                             length=1;
   4202                         }
   4203                     } else if(value<=0xffff) {
   4204                         length=2;
   4205                     } else if((value&0x800000)==0) {
   4206                         value|=0x8e800000;
   4207                         length=4;
   4208                     } else if((value&0x8000)==0) {
   4209                         value|=0x8f008000;
   4210                         length=4;
   4211                     } else {
   4212                         length=3;
   4213                     }
   4214                     break;
   4215                 default:
   4216                     /* must not occur */
   4217                     /*
   4218                      * To avoid compiler warnings that value & length may be
   4219                      * used without having been initialized, we set them here.
   4220                      * In reality, this is unreachable code.
   4221                      * Not having a default branch also causes warnings with
   4222                      * some compilers.
   4223                      */
   4224                     value=0;
   4225                     length=0;
   4226                     break;
   4227                 }
   4228                 /* output the value */
   4229             } else {
   4230                 /*
   4231                  * This also tests if the codepage maps single surrogates.
   4232                  * If it does, then surrogates are not paired but mapped separately.
   4233                  * Note that in this case unmatched surrogates are not detected.
   4234                  */
   4235                 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
   4236                     if(UTF_IS_SURROGATE_FIRST(c)) {
   4237 getTrail:
   4238                         if(source<sourceLimit) {
   4239                             /* test the following code unit */
   4240                             UChar trail=*source;
   4241                             if(UTF_IS_SECOND_SURROGATE(trail)) {
   4242                                 ++source;
   4243                                 ++nextSourceIndex;
   4244                                 c=UTF16_GET_PAIR_VALUE(c, trail);
   4245                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4246                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4247                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4248                                     /* callback(unassigned) */
   4249                                     goto unassigned;
   4250                                 }
   4251                                 /* convert this supplementary code point */
   4252                                 /* exit this condition tree */
   4253                             } else {
   4254                                 /* this is an unmatched lead code unit (1st surrogate) */
   4255                                 /* callback(illegal) */
   4256                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4257                                 break;
   4258                             }
   4259                         } else {
   4260                             /* no more input */
   4261                             break;
   4262                         }
   4263                     } else {
   4264                         /* this is an unmatched trail code unit (2nd surrogate) */
   4265                         /* callback(illegal) */
   4266                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   4267                         break;
   4268                     }
   4269                 }
   4270 
   4271                 /* convert the Unicode code point in c into codepage bytes */
   4272 
   4273                 /*
   4274                  * The basic lookup is a triple-stage compact array (trie) lookup.
   4275                  * For details see the beginning of this file.
   4276                  *
   4277                  * Single-byte codepages are handled with a different data structure
   4278                  * by _MBCSSingle... functions.
   4279                  *
   4280                  * The result consists of a 32-bit value from stage 2 and
   4281                  * a pointer to as many bytes as are stored per character.
   4282                  * The pointer points to the character's bytes in stage 3.
   4283                  * Bits 15..0 of the stage 2 entry contain the stage 3 index
   4284                  * for that pointer, while bits 31..16 are flags for which of
   4285                  * the 16 characters in the block are roundtrip-assigned.
   4286                  *
   4287                  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
   4288                  * respectively as uint32_t, in the platform encoding.
   4289                  * For 3-byte codepages, the bytes are always stored in big-endian order.
   4290                  *
   4291                  * For EUC encodings that use only either 0x8e or 0x8f as the first
   4292                  * byte of their longest byte sequences, the first two bytes in
   4293                  * this third stage indicate with their 7th bits whether these bytes
   4294                  * are to be written directly or actually need to be preceded by
   4295                  * one of the two Single-Shift codes. With this, the third stage
   4296                  * stores one byte fewer per character than the actual maximum length of
   4297                  * EUC byte sequences.
   4298                  *
   4299                  * Other than that, leading zero bytes are removed and the other
   4300                  * bytes output. A single zero byte may be output if the "assigned"
   4301                  * bit in stage 2 was on.
   4302                  * The data structure does not support zero byte output as a fallback,
   4303                  * and also does not allow output of leading zeros.
   4304                  */
   4305                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4306 
   4307                 /* get the bytes and the length for the output */
   4308                 switch(outputType) {
   4309                 case MBCS_OUTPUT_2:
   4310                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4311                     if(value<=0xff) {
   4312                         length=1;
   4313                     } else {
   4314                         length=2;
   4315                     }
   4316                     break;
   4317                 case MBCS_OUTPUT_2_SISO:
   4318                     /* 1/2-byte stateful with Shift-In/Shift-Out */
   4319                     /*
   4320                      * Save the old state in the converter object
   4321                      * right here, then change the local prevLength state variable if necessary.
   4322                      * Then, if this character turns out to be unassigned or a fallback that
   4323                      * is not taken, the callback code must not save the new state in the converter
   4324                      * because the new state is for a character that is not output.
   4325                      * However, the callback must still restore the state from the converter
   4326                      * in case the callback function changed it for its output.
   4327                      */
   4328                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
   4329                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4330                     if(value<=0xff) {
   4331                         if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
   4332                             /* no mapping, leave value==0 */
   4333                             length=0;
   4334                         } else if(prevLength<=1) {
   4335                             length=1;
   4336                         } else {
   4337                             /* change from double-byte mode to single-byte */
   4338                             if (si_value_length == 1) {
   4339                                 value|=(uint32_t)si_value[0]<<8;
   4340                                 length = 2;
   4341                             } else if (si_value_length == 2) {
   4342                                 value|=(uint32_t)si_value[1]<<8;
   4343                                 value|=(uint32_t)si_value[0]<<16;
   4344                                 length = 3;
   4345                             }
   4346                             prevLength=1;
   4347                         }
   4348                     } else {
   4349                         if(prevLength==2) {
   4350                             length=2;
   4351                         } else {
   4352                             /* change from single-byte mode to double-byte */
   4353                             if (so_value_length == 1) {
   4354                                 value|=(uint32_t)so_value[0]<<16;
   4355                                 length = 3;
   4356                             } else if (so_value_length == 2) {
   4357                                 value|=(uint32_t)so_value[1]<<16;
   4358                                 value|=(uint32_t)so_value[0]<<24;
   4359                                 length = 4;
   4360                             }
   4361                             prevLength=2;
   4362                         }
   4363                     }
   4364                     break;
   4365                 case MBCS_OUTPUT_DBCS_ONLY:
   4366                     /* table with single-byte results, but only DBCS mappings used */
   4367                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4368                     if(value<=0xff) {
   4369                         /* no mapping or SBCS result, not taken for DBCS-only */
   4370                         value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4371                         length=0;
   4372                     } else {
   4373                         length=2;
   4374                     }
   4375                     break;
   4376                 case MBCS_OUTPUT_3:
   4377                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4378                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4379                     if(value<=0xff) {
   4380                         length=1;
   4381                     } else if(value<=0xffff) {
   4382                         length=2;
   4383                     } else {
   4384                         length=3;
   4385                     }
   4386                     break;
   4387                 case MBCS_OUTPUT_4:
   4388                     value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
   4389                     if(value<=0xff) {
   4390                         length=1;
   4391                     } else if(value<=0xffff) {
   4392                         length=2;
   4393                     } else if(value<=0xffffff) {
   4394                         length=3;
   4395                     } else {
   4396                         length=4;
   4397                     }
   4398                     break;
   4399                 case MBCS_OUTPUT_3_EUC:
   4400                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
   4401                     /* EUC 16-bit fixed-length representation */
   4402                     if(value<=0xff) {
   4403                         length=1;
   4404                     } else if((value&0x8000)==0) {
   4405                         value|=0x8e8000;
   4406                         length=3;
   4407                     } else if((value&0x80)==0) {
   4408                         value|=0x8f0080;
   4409                         length=3;
   4410                     } else {
   4411                         length=2;
   4412                     }
   4413                     break;
   4414                 case MBCS_OUTPUT_4_EUC:
   4415                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
   4416                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4417                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4418                     if(value<=0xff) {
   4419                         length=1;
   4420                     } else if(value<=0xffff) {
   4421                         length=2;
   4422                     } else if((value&0x800000)==0) {
   4423                         value|=0x8e800000;
   4424                         length=4;
   4425                     } else if((value&0x8000)==0) {
   4426                         value|=0x8f008000;
   4427                         length=4;
   4428                     } else {
   4429                         length=3;
   4430                     }
   4431                     break;
   4432                 default:
   4433                     /* must not occur */
   4434                     /*
   4435                      * To avoid compiler warnings that value & length may be
   4436                      * used without having been initialized, we set them here.
   4437                      * In reality, this is unreachable code.
   4438                      * Not having a default branch also causes warnings with
   4439                      * some compilers.
   4440                      */
   4441                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4442                     length=0;
   4443                     break;
   4444                 }
   4445 
   4446                 /* is this code point assigned, or do we use fallbacks? */
   4447                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
   4448                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   4449                 ) {
   4450                     /*
   4451                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4452                      * There is no way with this data structure for fallback output
   4453                      * to be a zero byte.
   4454                      */
   4455 
   4456 unassigned:
   4457                     /* try an extension mapping */
   4458                     pArgs->source=source;
   4459                     c=_extFromU(cnv, cnv->sharedData,
   4460                                 c, &source, sourceLimit,
   4461                                 &target, target+targetCapacity,
   4462                                 &offsets, sourceIndex,
   4463                                 pArgs->flush,
   4464                                 pErrorCode);
   4465                     nextSourceIndex+=(int32_t)(source-pArgs->source);
   4466                     prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
   4467 
   4468                     if(U_FAILURE(*pErrorCode)) {
   4469                         /* not mappable or buffer overflow */
   4470                         break;
   4471                     } else {
   4472                         /* a mapping was written to the target, continue */
   4473 
   4474                         /* recalculate the targetCapacity after an extension mapping */
   4475                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
   4476 
   4477                         /* normal end of conversion: prepare for a new character */
   4478                         if(offsets!=NULL) {
   4479                             prevSourceIndex=sourceIndex;
   4480                             sourceIndex=nextSourceIndex;
   4481                         }
   4482                         continue;
   4483                     }
   4484                 }
   4485             }
   4486 
   4487             /* write the output character bytes from value and length */
   4488             /* from the first if in the loop we know that targetCapacity>0 */
   4489             if(length<=targetCapacity) {
   4490                 if(offsets==NULL) {
   4491                     switch(length) {
   4492                         /* each branch falls through to the next one */
   4493                     case 4:
   4494                         *target++=(uint8_t)(value>>24);
   4495                     case 3:
   4496                         *target++=(uint8_t)(value>>16);
   4497                     case 2:
   4498                         *target++=(uint8_t)(value>>8);
   4499                     case 1:
   4500                         *target++=(uint8_t)value;
   4501                     default:
   4502                         /* will never occur */
   4503                         break;
   4504                     }
   4505                 } else {
   4506                     switch(length) {
   4507                         /* each branch falls through to the next one */
   4508                     case 4:
   4509                         *target++=(uint8_t)(value>>24);
   4510                         *offsets++=sourceIndex;
   4511                     case 3:
   4512                         *target++=(uint8_t)(value>>16);
   4513                         *offsets++=sourceIndex;
   4514                     case 2:
   4515                         *target++=(uint8_t)(value>>8);
   4516                         *offsets++=sourceIndex;
   4517                     case 1:
   4518                         *target++=(uint8_t)value;
   4519                         *offsets++=sourceIndex;
   4520                     default:
   4521                         /* will never occur */
   4522                         break;
   4523                     }
   4524                 }
   4525                 targetCapacity-=length;
   4526             } else {
   4527                 uint8_t *charErrorBuffer;
   4528 
   4529                 /*
   4530                  * We actually do this backwards here:
   4531                  * In order to save an intermediate variable, we output
   4532                  * first to the overflow buffer what does not fit into the
   4533                  * regular target.
   4534                  */
   4535                 /* we know that 1<=targetCapacity<length<=4 */
   4536                 length-=targetCapacity;
   4537                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
   4538                 switch(length) {
   4539                     /* each branch falls through to the next one */
   4540                 case 3:
   4541                     *charErrorBuffer++=(uint8_t)(value>>16);
   4542                 case 2:
   4543                     *charErrorBuffer++=(uint8_t)(value>>8);
   4544                 case 1:
   4545                     *charErrorBuffer=(uint8_t)value;
   4546                 default:
   4547                     /* will never occur */
   4548                     break;
   4549                 }
   4550                 cnv->charErrorBufferLength=(int8_t)length;
   4551 
   4552                 /* now output what fits into the regular target */
   4553                 value>>=8*length; /* length was reduced by targetCapacity */
   4554                 switch(targetCapacity) {
   4555                     /* each branch falls through to the next one */
   4556                 case 3:
   4557                     *target++=(uint8_t)(value>>16);
   4558                     if(offsets!=NULL) {
   4559                         *offsets++=sourceIndex;
   4560                     }
   4561                 case 2:
   4562                     *target++=(uint8_t)(value>>8);
   4563                     if(offsets!=NULL) {
   4564                         *offsets++=sourceIndex;
   4565                     }
   4566                 case 1:
   4567                     *target++=(uint8_t)value;
   4568                     if(offsets!=NULL) {
   4569                         *offsets++=sourceIndex;
   4570                     }
   4571                 default:
   4572                     /* will never occur */
   4573                     break;
   4574                 }
   4575 
   4576                 /* target overflow */
   4577                 targetCapacity=0;
   4578                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4579                 c=0;
   4580                 break;
   4581             }
   4582 
   4583             /* normal end of conversion: prepare for a new character */
   4584             c=0;
   4585             if(offsets!=NULL) {
   4586                 prevSourceIndex=sourceIndex;
   4587                 sourceIndex=nextSourceIndex;
   4588             }
   4589             continue;
   4590         } else {
   4591             /* target is full */
   4592             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4593             break;
   4594         }
   4595     }
   4596 
   4597     /*
   4598      * the end of the input stream and detection of truncated input
   4599      * are handled by the framework, but for EBCDIC_STATEFUL conversion
   4600      * we need to emit an SI at the very end
   4601      *
   4602      * conditions:
   4603      *   successful
   4604      *   EBCDIC_STATEFUL in DBCS mode
   4605      *   end of input and no truncated input
   4606      */
   4607     if( U_SUCCESS(*pErrorCode) &&
   4608         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
   4609         pArgs->flush && source>=sourceLimit && c==0
   4610     ) {
   4611         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
   4612         if(targetCapacity>0) {
   4613             *target++=(uint8_t)si_value[0];
   4614             if (si_value_length == 2) {
   4615                 if (targetCapacity<2) {
   4616                     cnv->charErrorBuffer[0]=(uint8_t)si_value[1];
   4617                     cnv->charErrorBufferLength=1;
   4618                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4619                 } else {
   4620                     *target++=(uint8_t)si_value[1];
   4621                 }
   4622             }
   4623             if(offsets!=NULL) {
   4624                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
   4625                 *offsets++=prevSourceIndex;
   4626             }
   4627         } else {
   4628             /* target is full */
   4629             cnv->charErrorBuffer[0]=(uint8_t)si_value[0];
   4630             if (si_value_length == 2) {
   4631                 cnv->charErrorBuffer[1]=(uint8_t)si_value[1];
   4632             }
   4633             cnv->charErrorBufferLength=si_value_length;
   4634             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   4635         }
   4636         prevLength=1; /* we switched into SBCS */
   4637     }
   4638 
   4639     /* set the converter state back into UConverter */
   4640     cnv->fromUChar32=c;
   4641     cnv->fromUnicodeStatus=prevLength;
   4642 
   4643     /* write back the updated pointers */
   4644     pArgs->source=source;
   4645     pArgs->target=(char *)target;
   4646     pArgs->offsets=offsets;
   4647 }
   4648 
   4649 /*
   4650  * ucnv_MBCSFromUChar32() is another simple conversion function for internal
   4651  * use by other conversion implementations.
   4652  * It does not use the converter state nor call callbacks.
   4653  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4654  * It handles conversion extensions but not GB 18030.
   4655  *
   4656  * It converts one single Unicode code point into codepage bytes, encoded
   4657  * as one 32-bit value. The function returns the number of bytes in *pValue:
   4658  * 1..4 the number of bytes in *pValue
   4659  * 0    unassigned (*pValue undefined)
   4660  * -1   illegal (*pValue undefined); in this function it is only returned by the
   4661  *      default branch of the switch, i.e. for an output type without a compiled-in case
   4662  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
   4663  * the second to last byte in bits 15..8, etc.
   4664  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
   4665  */
   4666 U_CFUNC int32_t
   4667 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
   4668                  UChar32 c, uint32_t *pValue,
   4669                  UBool useFallback) {
   4670     const int32_t *cx; /* sharedData->mbcs.extIndexes; NULL means there is no extension table to try */
   4671     const uint16_t *table; /* sharedData->mbcs.fromUnicodeTable, handed to the lookup macros */
   4672 #if 0
   4673 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4674     const uint8_t *p;
   4675 #endif
   4676     uint32_t stage2Entry; /* result of MBCS_STAGE_2_FROM_U(); also carries the roundtrip flags tested below */
   4677     uint32_t value; /* the output bytes, last byte in bits 7..0 */
   4678     int32_t length; /* number of bytes in value, or the extension lookup result */
   4679 
   4680     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4681     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4682         table=sharedData->mbcs.fromUnicodeTable;
   4683 
   4684         /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4685         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
   4686             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4687             /* is this code point assigned, or do we use fallbacks? (useFallback lowers the accepted threshold from 0xc00 to 0x800) */
   4688             if(useFallback ? value>=0x800 : value>=0xc00) {
   4689                 *pValue=value&0xff; /* only the low 8 bits of the result are the output byte */
   4690                 return 1;
   4691             }
   4692         } else /* outputType!=MBCS_OUTPUT_1 */ {
   4693             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   4694 
   4695             /* get the bytes and the length for the output */
   4696             switch(sharedData->mbcs.outputType) {
   4697             case MBCS_OUTPUT_2:
   4698                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4699                 if(value<=0xff) {
   4700                     length=1;
   4701                 } else {
   4702                     length=2;
   4703                 }
   4704                 break;
   4705 #if 0
   4706 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
   4707             case MBCS_OUTPUT_DBCS_ONLY:
   4708                 /* table with single-byte results, but only DBCS mappings used */
   4709                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4710                 if(value<=0xff) {
   4711                     /* no mapping or SBCS result, not taken for DBCS-only */
   4712                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
   4713                     length=0;
   4714                 } else {
   4715                     length=2;
   4716                 }
   4717                 break;
   4718             case MBCS_OUTPUT_3:
   4719                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4720                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4721                 if(value<=0xff) {
   4722                     length=1;
   4723                 } else if(value<=0xffff) {
   4724                     length=2;
   4725                 } else {
   4726                     length=3;
   4727                 }
   4728                 break;
   4729             case MBCS_OUTPUT_4:
   4730                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4731                 if(value<=0xff) {
   4732                     length=1;
   4733                 } else if(value<=0xffff) {
   4734                     length=2;
   4735                 } else if(value<=0xffffff) {
   4736                     length=3;
   4737                 } else {
   4738                     length=4;
   4739                 }
   4740                 break;
   4741             case MBCS_OUTPUT_3_EUC:
   4742                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4743                 /* EUC 16-bit fixed-length representation */
   4744                 if(value<=0xff) {
   4745                     length=1;
   4746                 } else if((value&0x8000)==0) {
   4747                     value|=0x8e8000;
   4748                     length=3;
   4749                 } else if((value&0x80)==0) {
   4750                     value|=0x8f0080;
   4751                     length=3;
   4752                 } else {
   4753                     length=2;
   4754                 }
   4755                 break;
   4756             case MBCS_OUTPUT_4_EUC:
   4757                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
   4758                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
   4759                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
   4760                 if(value<=0xff) {
   4761                     length=1;
   4762                 } else if(value<=0xffff) {
   4763                     length=2;
   4764                 } else if((value&0x800000)==0) {
   4765                     value|=0x8e800000;
   4766                     length=4;
   4767                 } else if((value&0x8000)==0) {
   4768                     value|=0x8f008000;
   4769                     length=4;
   4770                 } else {
   4771                     length=3;
   4772                 }
   4773                 break;
   4774 #endif
   4775             default:
   4776                 /* must not occur; with the cases above compiled out, every output type other than MBCS_OUTPUT_2 ends up here */
   4777                 return -1; /* returns before the extension table is consulted and without writing *pValue */
   4778             }
   4779 
   4780             /* is this code point assigned, or do we use fallbacks? */
   4781             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   4782                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
   4783             ) {
   4784                 /*
   4785                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
   4786                  * There is no way with this data structure for fallback output
   4787                  * to be a zero byte.
   4788                  */
   4789                 /* assigned */
   4790                 *pValue=value;
   4791                 return length;
   4792             }
   4793         }
   4794     }
   4795     /* reached when the base table gave no usable mapping, or was skipped for a supplementary c in a BMP-only table */
   4796     cx=sharedData->mbcs.extIndexes;
   4797     if(cx!=NULL) {
   4798         length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
   4799         return length>=0 ? length : -length;  /* return abs(length); NOTE(review): the sign presumably distinguishes roundtrip from fallback matches - confirm in ucnv_ext */
   4800     }
   4801 
   4802     /* unassigned */
   4803     return 0;
   4804 }
   4805 
   4806 
   4807 #if 0
   4808 /*
   4809  * This function has been moved to ucnv2022.c for inlining.
   4810  * This implementation is here only for documentation purposes; it is never compiled (#if 0).
   4811  */
   4812 
   4813 /**
   4814  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
   4815  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
   4816  * It does not handle conversion extensions (_extFromU()).
   4817  * With useFallback, lookup results >=0x800 are accepted; without it, only results >=0xc00.
   4818  * It returns the codepage byte for the code point, or -1 if it is unassigned
   4819  * (which includes any supplementary code point when the codepage is BMP-only). */
   4820 U_CFUNC int32_t
   4821 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
   4822                        UChar32 c,
   4823                        UBool useFallback) {
   4824     const uint16_t *table; /* sharedData->mbcs.fromUnicodeTable */
   4825     int32_t value; /* raw lookup result; the output byte is in its low 8 bits */
   4826 
   4827     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   4828     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
   4829         return -1; /* do not index the table with a code point it has no entries for */
   4830     }
   4831 
   4832     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
   4833     table=sharedData->mbcs.fromUnicodeTable;
   4834 
   4835     /* get the byte for the output */
   4836     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
   4837     /* is this code point assigned, or do we use fallbacks? (same thresholds as the MBCS_OUTPUT_1 branch of ucnv_MBCSFromUChar32()) */
   4838     if(useFallback ? value>=0x800 : value>=0xc00) {
   4839         return value&0xff; /* strip the flag bits, keep the byte */
   4840     } else {
   4841         return -1; /* unassigned, or a fallback that was not requested */
   4842     }
   4843 }
   4844 #endif
   4845 
   4846 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
   4847 
   4848 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
        /*
         * The converters below compare the decoded value of a 2- or 3-byte sequence
         * against utf8_minLegal[n]; a smaller value is handled as an illegal sequence.
         */
   4849 static const UChar32
   4850 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
   4851 
   4852 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
        /*
         * The converters below accumulate raw bytes as c=(c<<6)+b and then subtract
         * utf8_offsets[n], which removes the contribution of the lead/trail marker bits
         * (e.g. n=2: (0xc0<<6)+0x80 == 0x3080) and leaves the code point.
         * Only indexes 0..4 have initializers; the array is declared with 7 elements,
         * so elements 5 and 6 are implicitly 0.
         */
   4853 static const UChar32
   4854 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
   4855 
   4856 static void
   4857 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   4858                   UConverterToUnicodeArgs *pToUArgs,
   4859                   UErrorCode *pErrorCode) {
            /*
             * Converts UTF-8 input directly into single-byte codepage output,
             * without producing intermediate UTF-16.
             *
             * pFromUArgs: supplies the target converter (->converter), the output
             *             buffer (->target, ->targetLimit) and ->flush.
             * pToUArgs:   supplies the UTF-8 converter (->converter), whose
             *             toUnicodeStatus/toULength/mode/toUBytes fields carry a
             *             partial sequence from one call to the next, and the input
             *             buffer (->source, ->sourceLimit).
             * pErrorCode: set here to U_ILLEGAL_CHAR_FOUND for an illegal UTF-8
             *             sequence and to U_BUFFER_OVERFLOW_ERROR when the target is
             *             full; _extFromU() may also set it.
             *
             * Every return path writes the positions reached back to
             * pToUArgs->source and pFromUArgs->target.
             */
   4860     UConverter *utf8, *cnv;
   4861     const uint8_t *source, *sourceLimit;
   4862     uint8_t *target;
   4863     int32_t targetCapacity;
   4864 
   4865     const uint16_t *table, *sbcsIndex;
   4866     const uint16_t *results;
   4867 
   4868     int8_t oldToULength, toULength, toULimit;
   4869 
   4870     UChar32 c;
   4871     uint8_t b, t1, t2;
   4872 
   4873     uint32_t asciiRoundtrips;
   4874     uint16_t value, minValue;
   4875     UBool hasSupplementary;
   4876 
   4877     /* set up the local pointers */
   4878     utf8=pToUArgs->converter;
   4879     cnv=pFromUArgs->converter;
   4880     source=(uint8_t *)pToUArgs->source;
   4881     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   4882     target=(uint8_t *)pFromUArgs->target;
   4883     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   4884 
   4885     table=cnv->sharedData->mbcs.fromUnicodeTable;
   4886     sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
   4887     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   4888         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   4889     } else {
   4890         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   4891     }
   4892     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   4893 
   4894     if(cnv->useFallback) {
   4895         /* use all roundtrip and fallback results */
   4896         minValue=0x800;
   4897     } else {
   4898         /* use only roundtrips and fallbacks from private-use characters */
   4899         minValue=0xc00;
   4900     }
   4901     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   4902 
   4903     /* get the converter state from the UTF-8 UConverter */
   4904     c=(UChar32)utf8->toUnicodeStatus;
   4905     if(c!=0) {
   4906         toULength=oldToULength=utf8->toULength; /* bytes of the pending sequence already stored */
   4907         toULimit=(int8_t)utf8->mode; /* total length expected for the pending sequence */
   4908     } else {
   4909         toULength=oldToULength=toULimit=0;
   4910     }
   4911 
   4912     /*
   4913      * Make sure that the last byte sequence before sourceLimit is complete
   4914      * or runs into a lead byte.
   4915      * Do not go back into the bytes that will be read for finishing a partial
   4916      * sequence from the previous buffer.
   4917      * In the conversion loop compare source with sourceLimit only once
   4918      * per multi-byte character.
   4919      */
   4920     {
   4921         int32_t i, length;
   4922 
                /* toULimit-oldToULength is the number of bytes still missing from the pending sequence (0 if none) */
   4923         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   4924         for(i=0; i<3 && i<length;) {
   4925             b=*(sourceLimit-i-1);
   4926             if(U8_IS_TRAIL(b)) {
   4927                 ++i;
   4928             } else {
   4929                 if(i<utf8_countTrailBytes[b]) {
   4930                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   4931                     sourceLimit-=i+1;
   4932                 }
   4933                 break;
   4934             }
   4935         }
   4936     }
   4937 
   4938     if(c!=0 && targetCapacity>0) {
   4939         utf8->toUnicodeStatus=0;
   4940         utf8->toULength=0;
   4941         goto moreBytes;
   4942         /*
   4943          * Note: We could avoid the goto by duplicating some of the moreBytes
   4944          * code, but only up to the point of collecting a complete UTF-8
   4945          * sequence; then recurse for the toUBytes[toULength]
   4946          * and then continue with normal conversion.
   4947          *
   4948          * If so, move this code to just after initializing the minimum
   4949          * set of local variables for reading the UTF-8 input
   4950          * (utf8, source, target, limits but not cnv, table, minValue, etc.).
   4951          *
   4952          * Potential advantages:
   4953          * - avoid the goto
   4954          * - oldToULength could become a local variable in just those code blocks
   4955          *   that deal with buffer boundaries
   4956          * - possibly faster if the goto prevents some compiler optimizations
   4957          *   (this would need measuring to confirm)
   4958          * Disadvantage:
   4959          * - code duplication
   4960          */
   4961     }
   4962 
   4963     /* conversion loop */
   4964     while(source<sourceLimit) {
   4965         if(targetCapacity>0) {
   4966             b=*source++;
   4967             if((int8_t)b>=0) { /* b<=0x7f */
   4968                 /* convert ASCII */
   4969                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   4970                     *target++=(uint8_t)b;
   4971                     --targetCapacity;
   4972                     continue;
   4973                 } else {
                            /* no direct roundtrip: look up the result; the value>=minValue check after this if/else decides */
   4974                     c=b;
   4975                     value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
   4976                 }
   4977             } else {
   4978                 if(b<0xe0) {
   4979                     if( /* handle U+0080..U+07FF inline */
   4980                         b>=0xc2 &&
   4981                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   4982                     ) {
   4983                         c=b&0x1f;
   4984                         ++source;
   4985                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
   4986                         if(value>=minValue) {
   4987                             *target++=(uint8_t)value;
   4988                             --targetCapacity;
   4989                             continue;
   4990                         } else {
   4991                             c=(c<<6)|t1; /* rebuild the full code point for the unassigned handling below */
   4992                         }
   4993                     } else {
   4994                         c=-1; /* defer to the generic path below */
   4995                     }
   4996                 } else if(b==0xe0) {
   4997                     if( /* handle U+0800..U+0FFF inline */
   4998                         (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
   4999                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5000                     ) {
   5001                         c=t1;
   5002                         source+=2;
   5003                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
   5004                         if(value>=minValue) {
   5005                             *target++=(uint8_t)value;
   5006                             --targetCapacity;
   5007                             continue;
   5008                         } else {
   5009                             c=(c<<6)|t2; /* rebuild the full code point for the unassigned handling below */
   5010                         }
   5011                     } else {
   5012                         c=-1; /* defer to the generic path below */
   5013                     }
   5014                 } else {
   5015                     c=-1; /* lead bytes above 0xe0 always take the generic path below */
   5016                 }
   5017 
   5018                 if(c<0) {
   5019                     /* handle "complicated" and error cases, and continuing partial characters */
   5020                     oldToULength=0;
   5021                     toULength=1;
   5022                     toULimit=utf8_countTrailBytes[b]+1;
   5023                     c=b;
                            /* also entered via goto from before the loop, with c/toULength/toULimit restored from the UTF-8 converter */
   5024 moreBytes:
   5025                     while(toULength<toULimit) {
   5026                         /*
   5027                          * The sourceLimit may have been adjusted before the conversion loop
   5028                          * to stop before a truncated sequence.
   5029                          * Here we need to use the real limit in case we have two truncated
   5030                          * sequences at the end.
   5031                          * See ticket #7492.
   5032                          */
   5033                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5034                             b=*source;
   5035                             if(U8_IS_TRAIL(b)) {
   5036                                 ++source;
   5037                                 ++toULength;
   5038                                 c=(c<<6)+b;
   5039                             } else {
   5040                                 break; /* sequence too short, stop with toULength<toULimit */
   5041                             }
   5042                         } else {
   5043                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                                    /* rewind to the first byte taken from this buffer, then copy those bytes behind the ones already stored */
   5044                             source-=(toULength-oldToULength);
   5045                             while(oldToULength<toULength) {
   5046                                 utf8->toUBytes[oldToULength++]=*source++;
   5047                             }
   5048                             utf8->toUnicodeStatus=c;
   5049                             utf8->toULength=toULength;
   5050                             utf8->mode=toULimit;
   5051                             pToUArgs->source=(char *)source;
   5052                             pFromUArgs->target=(char *)target;
   5053                             return;
   5054                         }
   5055                     }
   5056 
   5057                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5058                         (toULength==3 || toULength==2) &&             /* BMP */
   5059                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5060                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5061                     ) {
   5062                         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5063                     } else if(
   5064                         toULength==toULimit && toULength==4 &&
   5065                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5066                     ) {
   5067                         /* supplementary code point */
   5068                         if(!hasSupplementary) {
   5069                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5070                             value=0; /* below minValue, so it is handled as unassigned below */
   5071                         } else {
   5072                             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
   5073                         }
   5074                     } else {
   5075                         /* error handling: illegal UTF-8 byte sequence */
                                /* hand the bytes consumed from this buffer to the UTF-8 converter's toUBytes[] */
   5076                         source-=(toULength-oldToULength);
   5077                         while(oldToULength<toULength) {
   5078                             utf8->toUBytes[oldToULength++]=*source++;
   5079                         }
   5080                         utf8->toULength=toULength;
   5081                         pToUArgs->source=(char *)source;
   5082                         pFromUArgs->target=(char *)target;
   5083                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5084                         return;
   5085                     }
   5086                 }
   5087             }
   5088 
   5089             if(value>=minValue) {
   5090                 /* output the mapping for c */
   5091                 *target++=(uint8_t)value;
   5092                 --targetCapacity;
   5093             } else {
   5094                 /* value<minValue means c is unassigned (unmappable) */
   5095                 /*
   5096                  * Try an extension mapping.
   5097                  * Pass in no source because we don't have UTF-16 input.
   5098                  * If we have a partial match on c, we will return and revert
   5099                  * to UTF-8->UTF-16->charset conversion.
   5100                  */
   5101                 static const UChar nul=0;
   5102                 const UChar *noSource=&nul;
   5103                 c=_extFromU(cnv, cnv->sharedData,
   5104                             c, &noSource, noSource,
   5105                             &target, target+targetCapacity,
   5106                             NULL, -1,
   5107                             pFromUArgs->flush,
   5108                             pErrorCode);
   5109 
   5110                 if(U_FAILURE(*pErrorCode)) {
   5111                     /* not mappable or buffer overflow */
   5112                     cnv->fromUChar32=c;
   5113                     break;
   5114                 } else if(cnv->preFromUFirstCP>=0) {
   5115                     /*
   5116                      * Partial match, return and revert to pivoting.
   5117                      * In normal from-UTF-16 conversion, we would just continue
   5118                      * but then exit the loop because the extension match would
   5119                      * have consumed the source.
   5120                      */
   5121                     break;
   5122                 } else {
   5123                     /* a mapping was written to the target, continue */
   5124 
   5125                     /* recalculate the targetCapacity after an extension mapping */
   5126                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5127                 }
   5128             }
   5129         } else {
   5130             /* target is full */
   5131             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5132             break;
   5133         }
   5134     }
   5135 
   5136     /*
   5137      * The sourceLimit may have been adjusted before the conversion loop
   5138      * to stop before a truncated sequence.
   5139      * If so, then collect the truncated sequence now.
   5140      */
   5141     if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5142         c=utf8->toUBytes[0]=b=*source++;
   5143         toULength=1;
   5144         toULimit=utf8_countTrailBytes[b]+1;
   5145         while(source<sourceLimit) {
   5146             utf8->toUBytes[toULength++]=b=*source++;
   5147             c=(c<<6)+b;
   5148         }
                /* same three state fields that are read back at the top of this function on the next call */
   5149         utf8->toUnicodeStatus=c;
   5150         utf8->toULength=toULength;
   5151         utf8->mode=toULimit;
   5152     }
   5153 
   5154     /* write back the updated pointers */
   5155     pToUArgs->source=(char *)source;
   5156     pFromUArgs->target=(char *)target;
   5157 }
   5158 
   5159 static void
   5160 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
   5161                   UConverterToUnicodeArgs *pToUArgs,
   5162                   UErrorCode *pErrorCode) {
            /*
             * Converts UTF-8 input directly into codepage output of one or two
             * bytes per character, without producing intermediate UTF-16.
             *
             * pFromUArgs: supplies the target converter (->converter), the output
             *             buffer (->target, ->targetLimit) and ->flush.
             * pToUArgs:   supplies the UTF-8 converter (->converter), whose
             *             toUnicodeStatus/toULength/mode/toUBytes fields carry a
             *             partial sequence from one call to the next, and the input
             *             buffer (->source, ->sourceLimit).
             * pErrorCode: set here to U_ILLEGAL_CHAR_FOUND for an illegal UTF-8
             *             sequence and to U_BUFFER_OVERFLOW_ERROR when the target is
             *             full; _extFromU() may also set it.
             *
             * Every return path writes the positions reached back to
             * pToUArgs->source and pFromUArgs->target.
             */
   5163     UConverter *utf8, *cnv;
   5164     const uint8_t *source, *sourceLimit;
   5165     uint8_t *target;
   5166     int32_t targetCapacity;
   5167 
   5168     const uint16_t *table, *mbcsIndex;
   5169     const uint16_t *results;
   5170 
   5171     int8_t oldToULength, toULength, toULimit;
   5172 
   5173     UChar32 c;
   5174     uint8_t b, t1, t2;
   5175 
   5176     uint32_t stage2Entry;
   5177     uint32_t asciiRoundtrips;
   5178     uint16_t value, minValue;
   5179     UBool hasSupplementary;
   5180 
   5181     /* set up the local pointers */
   5182     utf8=pToUArgs->converter;
   5183     cnv=pFromUArgs->converter;
   5184     source=(uint8_t *)pToUArgs->source;
   5185     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
   5186     target=(uint8_t *)pFromUArgs->target;
   5187     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
   5188 
   5189     table=cnv->sharedData->mbcs.fromUnicodeTable;
   5190     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
   5191     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
   5192         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
   5193     } else {
   5194         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
   5195     }
   5196     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
   5197 
            /* NOTE(review): minValue is assigned here but not read anywhere else in this function */
   5198     if(cnv->useFallback) {
   5199         /* use all roundtrip and fallback results */
   5200         minValue=0x800;
   5201     } else {
   5202         /* use only roundtrips and fallbacks from private-use characters */
   5203         minValue=0xc00;
   5204     }
   5205     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
   5206 
   5207     /* get the converter state from the UTF-8 UConverter */
   5208     c=(UChar32)utf8->toUnicodeStatus;
   5209     if(c!=0) {
   5210         toULength=oldToULength=utf8->toULength; /* bytes of the pending sequence already stored */
   5211         toULimit=(int8_t)utf8->mode; /* total length expected for the pending sequence */
   5212     } else {
   5213         toULength=oldToULength=toULimit=0;
   5214     }
   5215 
   5216     /*
   5217      * Make sure that the last byte sequence before sourceLimit is complete
   5218      * or runs into a lead byte.
   5219      * Do not go back into the bytes that will be read for finishing a partial
   5220      * sequence from the previous buffer.
   5221      * In the conversion loop compare source with sourceLimit only once
   5222      * per multi-byte character.
   5223      */
   5224     {
   5225         int32_t i, length;
   5226 
                /* toULimit-oldToULength is the number of bytes still missing from the pending sequence (0 if none) */
   5227         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
   5228         for(i=0; i<3 && i<length;) {
   5229             b=*(sourceLimit-i-1);
   5230             if(U8_IS_TRAIL(b)) {
   5231                 ++i;
   5232             } else {
   5233                 if(i<utf8_countTrailBytes[b]) {
   5234                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
   5235                     sourceLimit-=i+1;
   5236                 }
   5237                 break;
   5238             }
   5239         }
   5240     }
   5241 
   5242     if(c!=0 && targetCapacity>0) {
   5243         utf8->toUnicodeStatus=0;
   5244         utf8->toULength=0;
   5245         goto moreBytes;
   5246         /* See note in ucnv_SBCSFromUTF8() about this goto. */
   5247     }
   5248 
   5249     /* conversion loop */
   5250     while(source<sourceLimit) {
   5251         if(targetCapacity>0) {
   5252             b=*source++;
   5253             if((int8_t)b>=0) { /* b<=0x7f */
   5254                 /* convert ASCII */
   5255                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
   5256                     *target++=b;
   5257                     --targetCapacity;
   5258                     continue;
   5259                 } else {
   5260                     value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
   5261                     if(value==0) {
   5262                         c=b;
   5263                         goto unassigned;
   5264                     }
   5265                 }
   5266             } else {
   5267                 if(b>0xe0) {
                            /*
                             * t1 is assigned first through the comma operator, so it is
                             * also valid for the b==0xed test. Lead bytes below 0xed accept
                             * any trail byte; 0xed accepts only t1<=0x1f; lead bytes of
                             * 0xee and above match neither test and take the generic path.
                             */
   5268                     if( /* handle U+1000..U+D7FF inline */
   5269                         (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
   5270                                                         (b==0xed && (t1 <= 0x1f))) &&
   5271                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
   5272                     ) {
   5273                         c=((b&0xf)<<6)|t1;
   5274                         source+=2;
   5275                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
   5276                         if(value==0) {
   5277                             c=(c<<6)|t2; /* rebuild the full code point before handing it to the extension lookup */
   5278                             goto unassigned;
   5279                         }
   5280                     } else {
   5281                         c=-1; /* defer to the generic path below */
   5282                     }
   5283                 } else if(b<0xe0) {
   5284                     if( /* handle U+0080..U+07FF inline */
   5285                         b>=0xc2 &&
   5286                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
   5287                     ) {
   5288                         c=b&0x1f;
   5289                         ++source;
   5290                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
   5291                         if(value==0) {
   5292                             c=(c<<6)|t1; /* rebuild the full code point before handing it to the extension lookup */
   5293                             goto unassigned;
   5294                         }
   5295                     } else {
   5296                         c=-1; /* defer to the generic path below */
   5297                     }
   5298                 } else {
   5299                     c=-1; /* b==0xe0 always takes the generic path below */
   5300                 }
   5301 
   5302                 if(c<0) {
   5303                     /* handle "complicated" and error cases, and continuing partial characters */
   5304                     oldToULength=0;
   5305                     toULength=1;
   5306                     toULimit=utf8_countTrailBytes[b]+1;
   5307                     c=b;
                            /* also entered via goto from before the loop, with c/toULength/toULimit restored from the UTF-8 converter */
   5308 moreBytes:
   5309                     while(toULength<toULimit) {
   5310                         /*
   5311                          * The sourceLimit may have been adjusted before the conversion loop
   5312                          * to stop before a truncated sequence.
   5313                          * Here we need to use the real limit in case we have two truncated
   5314                          * sequences at the end.
   5315                          * See ticket #7492.
   5316                          */
   5317                         if(source<(uint8_t *)pToUArgs->sourceLimit) {
   5318                             b=*source;
   5319                             if(U8_IS_TRAIL(b)) {
   5320                                 ++source;
   5321                                 ++toULength;
   5322                                 c=(c<<6)+b;
   5323                             } else {
   5324                                 break; /* sequence too short, stop with toULength<toULimit */
   5325                             }
   5326                         } else {
   5327                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                                    /* rewind to the first byte taken from this buffer, then copy those bytes behind the ones already stored */
   5328                             source-=(toULength-oldToULength);
   5329                             while(oldToULength<toULength) {
   5330                                 utf8->toUBytes[oldToULength++]=*source++;
   5331                             }
   5332                             utf8->toUnicodeStatus=c;
   5333                             utf8->toULength=toULength;
   5334                             utf8->mode=toULimit;
   5335                             pToUArgs->source=(char *)source;
   5336                             pFromUArgs->target=(char *)target;
   5337                             return;
   5338                         }
   5339                     }
   5340 
   5341                     if( toULength==toULimit &&      /* consumed all trail bytes */
   5342                         (toULength==3 || toULength==2) &&             /* BMP */
   5343                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
   5344                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
   5345                     ) {
   5346                         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5347                     } else if(
   5348                         toULength==toULimit && toULength==4 &&
   5349                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
   5350                     ) {
   5351                         /* supplementary code point */
   5352                         if(!hasSupplementary) {
   5353                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
   5354                             stage2Entry=0;
   5355                         } else {
   5356                             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
   5357                         }
   5358                     } else {
   5359                         /* error handling: illegal UTF-8 byte sequence */
                                /* hand the bytes consumed from this buffer to the UTF-8 converter's toUBytes[] */
   5360                         source-=(toULength-oldToULength);
   5361                         while(oldToULength<toULength) {
   5362                             utf8->toUBytes[oldToULength++]=*source++;
   5363                         }
   5364                         utf8->toULength=toULength;
   5365                         pToUArgs->source=(char *)source;
   5366                         pFromUArgs->target=(char *)target;
   5367                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   5368                         return;
   5369                     }
   5370 
   5371                     /* get the bytes and the length for the output */
   5372                     /* MBCS_OUTPUT_2 */
   5373                     value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
   5374 
   5375                     /* is this code point assigned, or do we use fallbacks? */
   5376                     if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
   5377                          (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
   5378                     ) {
   5379                         goto unassigned;
   5380                     }
   5381                 }
   5382             }
   5383 
   5384             /* write the output character bytes from value and length */
   5385             /* from the first if in the loop we know that targetCapacity>0 */
   5386             if(value<=0xff) {
   5387                 /* this is easy because we know that there is enough space */
   5388                 *target++=(uint8_t)value;
   5389                 --targetCapacity;
   5390             } else /* length==2 */ {
   5391                 *target++=(uint8_t)(value>>8); /* the first byte always fits because targetCapacity>0 */
   5392                 if(2<=targetCapacity) {
   5393                     *target++=(uint8_t)value;
   5394                     targetCapacity-=2;
   5395                 } else {
                            /* only one byte of room: the second byte goes to cnv->charErrorBuffer instead of the target */
   5396                     cnv->charErrorBuffer[0]=(char)value;
   5397                     cnv->charErrorBufferLength=1;
   5398 
   5399                     /* target overflow */
   5400                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5401                     break;
   5402                 }
   5403             }
   5404             continue; /* normal output done; the unassigned: code below is reached only via goto */
   5405 
   5406 unassigned:
   5407             {
   5408                 /*
   5409                  * Try an extension mapping.
   5410                  * Pass in no source because we don't have UTF-16 input.
   5411                  * If we have a partial match on c, we will return and revert
   5412                  * to UTF-8->UTF-16->charset conversion.
   5413                  */
   5414                 static const UChar nul=0;
   5415                 const UChar *noSource=&nul;
   5416                 c=_extFromU(cnv, cnv->sharedData,
   5417                             c, &noSource, noSource,
   5418                             &target, target+targetCapacity,
   5419                             NULL, -1,
   5420                             pFromUArgs->flush,
   5421                             pErrorCode);
   5422 
   5423                 if(U_FAILURE(*pErrorCode)) {
   5424                     /* not mappable or buffer overflow */
   5425                     cnv->fromUChar32=c;
   5426                     break;
   5427                 } else if(cnv->preFromUFirstCP>=0) {
   5428                     /*
   5429                      * Partial match, return and revert to pivoting.
   5430                      * In normal from-UTF-16 conversion, we would just continue
   5431                      * but then exit the loop because the extension match would
   5432                      * have consumed the source.
   5433                      */
   5434                     break;
   5435                 } else {
   5436                     /* a mapping was written to the target, continue */
   5437 
   5438                     /* recalculate the targetCapacity after an extension mapping */
   5439                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
   5440                     continue;
   5441                 }
   5442             }
   5443         } else {
   5444             /* target is full */
   5445             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   5446             break;
   5447         }
   5448     }
   5449 
   5450     /*
   5451      * The sourceLimit may have been adjusted before the conversion loop
   5452      * to stop before a truncated sequence.
   5453      * If so, then collect the truncated sequence now.
   5454      */
   5455     if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
   5456         c=utf8->toUBytes[0]=b=*source++;
   5457         toULength=1;
   5458         toULimit=utf8_countTrailBytes[b]+1;
   5459         while(source<sourceLimit) {
   5460             utf8->toUBytes[toULength++]=b=*source++;
   5461             c=(c<<6)+b;
   5462         }
                /* same three state fields that are read back at the top of this function on the next call */
   5463         utf8->toUnicodeStatus=c;
   5464         utf8->toULength=toULength;
   5465         utf8->mode=toULimit;
   5466     }
   5467 
   5468     /* write back the updated pointers */
   5469     pToUArgs->source=(char *)source;
   5470     pFromUArgs->target=(char *)target;
   5471 }
   5472 
   5473 /* miscellaneous ------------------------------------------------------------ */
   5474 
   5475 static void
   5476 ucnv_MBCSGetStarters(const UConverter* cnv,
   5477                  UBool starters[256],
   5478                  UErrorCode *pErrorCode) {
            /*
             * Fills starters[i], for every byte value i in 0..255, with whether the
             * state-table entry for that byte is a transition entry
             * (MBCS_ENTRY_IS_TRANSITION).
             * The state-table row consulted is the one indexed by mbcs.dbcsOnlyState.
             * pErrorCode is neither read nor written in this body.
             */
   5479     const int32_t *state0;
   5480     int i;
   5481 
   5482     state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
   5483     for(i=0; i<256; ++i) {
   5484         /* all bytes that cause a state transition from state 0 are lead bytes */
   5485         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
   5486     }
   5487 }
   5488 
   5489 /*
   5490  * This is an internal function that allows other converter implementations
   5491  * to check whether a byte is a lead byte.
         *
         * It returns whether the entry for the byte in row 0 of
         * sharedData->mbcs.stateTable is a transition entry (MBCS_ENTRY_IS_TRANSITION).
         * The byte is cast to uint8_t before it is used as an index, so a char with
         * a negative value still selects an entry in 0..255.
         *
         * NOTE(review): this always consults state-table row 0, whereas
         * ucnv_MBCSGetStarters() uses the row given by mbcs.dbcsOnlyState;
         * confirm that the difference is intended.
   5492  */
   5493 U_CFUNC UBool
   5494 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
   5495     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
   5496 }
   5497 
   5498 static void
   5499 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
   5500               int32_t offsetIndex,
   5501               UErrorCode *pErrorCode) {
            /*
             * Writes a substitution character for the converter in pArgs->converter.
             *
             * Chooses between cnv->subChar1 (one byte) and cnv->subChars
             * (cnv->subCharLen bytes). For MBCS_OUTPUT_2_SISO converters it also
             * prepends UCNV_SI or UCNV_SO when the length of the chosen substitution
             * does not fit the mode recorded in cnv->fromUnicodeStatus, and updates
             * that field.
             *
             * The bytes are emitted through ucnv_cbFromUWriteBytes(), which receives
             * pArgs, offsetIndex and pErrorCode unchanged.
             * For a SISO converter with a substitution length other than 1 or 2,
             * *pErrorCode is set to U_ILLEGAL_ARGUMENT_ERROR and nothing is written.
             * cnv->useSubChar1 is reset to FALSE in every case.
             */
   5502     UConverter *cnv=pArgs->converter;
   5503     char *p, *subchar;
   5504     char buffer[4]; /* used only for SISO: at most one shift byte plus two subchar bytes are stored */
   5505     int32_t length;
   5506 
   5507     /* first, select between subChar and subChar1 */
            /* with extension data the decision comes from cnv->useSubChar1, otherwise from the first unit in invalidUCharBuffer */
   5508     if( cnv->subChar1!=0 &&
   5509         (cnv->sharedData->mbcs.extIndexes!=NULL ?
   5510             cnv->useSubChar1 :
   5511             (cnv->invalidUCharBuffer[0]<=0xff))
   5512     ) {
   5513         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
   5514         subchar=(char *)&cnv->subChar1;
   5515         length=1;
   5516     } else {
   5517         /* select subChar in all other cases */
   5518         subchar=(char *)cnv->subChars;
   5519         length=cnv->subCharLen;
   5520     }
   5521 
   5522     /* reset the selector for the next code point */
   5523     cnv->useSubChar1=FALSE;
   5524 
   5525     if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
   5526         p=buffer;
   5527 
   5528         /* fromUnicodeStatus contains prevLength */
   5529         switch(length) {
   5530         case 1:
   5531             if(cnv->fromUnicodeStatus==2) {
   5532                 /* DBCS mode and SBCS sub char: change to SBCS */
   5533                 cnv->fromUnicodeStatus=1;
   5534                 *p++=UCNV_SI;
   5535             }
   5536             *p++=subchar[0];
   5537             break;
   5538         case 2:
   5539             if(cnv->fromUnicodeStatus<=1) {
   5540                 /* SBCS mode and DBCS sub char: change to DBCS */
   5541                 cnv->fromUnicodeStatus=2;
   5542                 *p++=UCNV_SO;
   5543             }
   5544             *p++=subchar[0];
   5545             *p++=subchar[1];
   5546             break;
   5547         default:
   5548             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   5549             return;
   5550         }
                /* from here on, write the locally assembled bytes instead of the raw subchar */
   5551         subchar=buffer;
   5552         length=(int32_t)(p-buffer);
   5553     }
   5554 
   5555     ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
   5556 }
   5557 
   5558 U_CFUNC UConverterType
   5559 ucnv_MBCSGetType(const UConverter* converter) {
   5560     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
   5561     if(converter->sharedData->mbcs.countStates==1) {
   5562         return (UConverterType)UCNV_SBCS;
   5563     } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
   5564         return (UConverterType)UCNV_EBCDIC_STATEFUL;
   5565     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
   5566         return (UConverterType)UCNV_DBCS;
   5567     }
   5568     return (UConverterType)UCNV_MBCS;
   5569 }
   5570 
   5571 static const UConverterImpl _SBCSUTF8Impl={
   5572     UCNV_MBCS,
   5573 
   5574     ucnv_MBCSLoad,
   5575     ucnv_MBCSUnload,
   5576 
   5577     ucnv_MBCSOpen,
   5578     NULL,
   5579     NULL,
   5580 
   5581     ucnv_MBCSToUnicodeWithOffsets,
   5582     ucnv_MBCSToUnicodeWithOffsets,
   5583     ucnv_MBCSFromUnicodeWithOffsets,
   5584     ucnv_MBCSFromUnicodeWithOffsets,
   5585     ucnv_MBCSGetNextUChar,
   5586 
   5587     ucnv_MBCSGetStarters,
   5588     ucnv_MBCSGetName,
   5589     ucnv_MBCSWriteSub,
   5590     NULL,
   5591     ucnv_MBCSGetUnicodeSet,
   5592 
   5593     NULL,
   5594     ucnv_SBCSFromUTF8
   5595 };
   5596 
   5597 static const UConverterImpl _DBCSUTF8Impl={
   5598     UCNV_MBCS,
   5599 
   5600     ucnv_MBCSLoad,
   5601     ucnv_MBCSUnload,
   5602 
   5603     ucnv_MBCSOpen,
   5604     NULL,
   5605     NULL,
   5606 
   5607     ucnv_MBCSToUnicodeWithOffsets,
   5608     ucnv_MBCSToUnicodeWithOffsets,
   5609     ucnv_MBCSFromUnicodeWithOffsets,
   5610     ucnv_MBCSFromUnicodeWithOffsets,
   5611     ucnv_MBCSGetNextUChar,
   5612 
   5613     ucnv_MBCSGetStarters,
   5614     ucnv_MBCSGetName,
   5615     ucnv_MBCSWriteSub,
   5616     NULL,
   5617     ucnv_MBCSGetUnicodeSet,
   5618 
   5619     NULL,
   5620     ucnv_DBCSFromUTF8
   5621 };
   5622 
   5623 static const UConverterImpl _MBCSImpl={
   5624     UCNV_MBCS,
   5625 
   5626     ucnv_MBCSLoad,
   5627     ucnv_MBCSUnload,
   5628 
   5629     ucnv_MBCSOpen,
   5630     NULL,
   5631     NULL,
   5632 
   5633     ucnv_MBCSToUnicodeWithOffsets,
   5634     ucnv_MBCSToUnicodeWithOffsets,
   5635     ucnv_MBCSFromUnicodeWithOffsets,
   5636     ucnv_MBCSFromUnicodeWithOffsets,
   5637     ucnv_MBCSGetNextUChar,
   5638 
   5639     ucnv_MBCSGetStarters,
   5640     ucnv_MBCSGetName,
   5641     ucnv_MBCSWriteSub,
   5642     NULL,
   5643     ucnv_MBCSGetUnicodeSet
   5644 };
   5645 
   5646 
   5647 /* Static data is in tools/makeconv/ucnvstat.c for data-based
   5648  * converters. Be sure to update it as well.
   5649  */
   5650 
   5651 const UConverterSharedData _MBCSData={
   5652     sizeof(UConverterSharedData), 1,
   5653     NULL, NULL, NULL, FALSE, &_MBCSImpl,
   5654     0
   5655 };
   5656 
   5657 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
   5658