Home | History | Annotate | Download | only in charset
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2006-2016, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 
     10 package com.ibm.icu.charset;
     11 
     12 import java.lang.reflect.Constructor;
     13 import java.lang.reflect.InvocationTargetException;
     14 import java.nio.charset.Charset;
     15 import java.nio.charset.IllegalCharsetNameException;
     16 import java.nio.charset.UnsupportedCharsetException;
     17 import java.util.HashMap;
     18 
     19 import com.ibm.icu.text.UnicodeSet;
     20 
     21 /**
     22  * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
     23  * This API is used to convert codepage or character encoded data to and
     24  * from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
     25  * converter, you can get its properties, set options, convert your data.
     26  *
     27  * <p>Since many software programs recognize different converter names for
     28  * different types of converters, there are other functions in this API to
     29  * iterate over the converter aliases.
     30  *
     31  * <p>Note that {@link #name()} cannot always return a unique charset name.
     32  * {@link Charset} documents that,
     33  * for charsets listed in the IANA Charset Registry,
     34  * the {@link #name()} must be listed there,
     35  * and it must be the MIME-preferred name if there are multiple names.
     36  *
     37  * <p>However, there are different implementations of many if not most charsets,
     38  * ICU provides multiple variants for some of them,
     39  * ICU provides variants of some java.nio-system-supported charsets,
     40  * and ICU users are free to add more variants.
     41  * This is so that applications can be compatible with multiple implementations at the same time.
     42  *
     43  * <p>This is in conflict with the {@link Charset#name()} requirements.
     44  * It is not possible to offer variants of an IANA charset and
     45  * always use the MIME-preferred name and also have those names be unique.
     46  *
     47  * <p>{@link #name()} returns the MIME-preferred name, or IANA name,
     48  * so that it can always be used for the charset field in internet protocols.
     49  *
     50  * <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
     51  * by using unique aliases (e.g., the ICU-canonical names).
     52  *
     53  * <p>{@link Charset} also documents that
     54  * Two charsets are equal if, and only if, they have the same canonical names.
     55  * This is not possible.
     56  *
     57  * <p>Unfortunately, {@link Charset#equals} is final, and
     58  * {@link Charset#availableCharsets} returns
     59  * a sorted map from canonical charset names to charset objects.
     60  * Since {@link #name()} cannot be unique,
     61  * {@link #equals} cannot work properly in such cases, and
     62  * {@link Charset#availableCharsets} can only include one variant for a name.
     63  *
     64  * @stable ICU 3.6
     65  */
     66 public abstract class CharsetICU extends Charset{
     67 
     68      String icuCanonicalName;
     69      int options;
     70 
     71      float  maxCharsPerByte;
     72 
     73      String name; /* +4: 60  internal name of the converter- invariant chars */
     74 
     75      int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */
     76 
     77      byte platform;                /* +68: 1 platform of the converter (only IBM now) */
     78      byte conversionType;          /* +69: 1 conversion type */
     79 
     80      int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
     81      int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */
     82 
     83      byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
     84      byte subCharLen;              /* +76: 1 */
     85 
     86      byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
     87      byte hasFromUnicodeFallback; /* +78: 1 */
     88      short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
     89      byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
     90      //byte reserved[/*19*/];           /* +81: 19 to round out the structure */
     91 
     92 
     93     // typedef enum UConverterUnicodeSet {
     94      /**
     95       * Parameter that select the set of roundtrippable Unicode code points.
     96       * @stable ICU 4.0
     97       */
     98       public static final int ROUNDTRIP_SET=0;
     99       /**
    100        * Select the set of Unicode code points with roundtrip or fallback mappings.
    101        * Not supported at this point.
    102        * @internal
    103        * @deprecated This API is ICU internal only.
    104        */
    105       @Deprecated
    106       public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
    107 
    108     //} UConverterUnicodeSet;
    109 
    110     /**
    111      *
    112      * @param icuCanonicalName
    113      * @param canonicalName
    114      * @param aliases
    115      * @stable ICU 3.6
    116      */
    117     protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
    118         super(canonicalName,aliases);
    119         if(canonicalName.length() == 0){
    120             throw new IllegalCharsetNameException(canonicalName);
    121         }
    122         this.icuCanonicalName  = icuCanonicalName;
    123     }
    124 
    125     /**
    126      * Ascertains if a charset is a sub set of this charset
    127      * Implements the abstract method of super class.
    128      * @param cs charset to test
    129      * @return true if the given charset is a subset of this charset
    130      * @stable ICU 3.6
    131      */
    132     @Override
    133     public boolean contains(Charset cs){
    134         if (null == cs) {
    135             return false;
    136         } else if (this.equals(cs)) {
    137             return true;
    138         }
    139         return false;
    140     }
    /**
     * Lookup table from ICU canonical charset name to the fully qualified name of
     * the class implementing it. Names that are absent from this table are
     * handled by {@code getCharset} with the MBCS implementation.
     */
    private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
    static {
        registerAlgorithmic("com.ibm.icu.charset.CharsetLMBCS",
                "LMBCS-1", "LMBCS-2", "LMBCS-3", "LMBCS-4", "LMBCS-5", "LMBCS-6",
                "LMBCS-8", "LMBCS-11", "LMBCS-16", "LMBCS-17", "LMBCS-18", "LMBCS-19");
        registerAlgorithmic("com.ibm.icu.charset.CharsetBOCU1", "BOCU-1");
        registerAlgorithmic("com.ibm.icu.charset.CharsetSCSU", "SCSU");
        registerAlgorithmic("com.ibm.icu.charset.CharsetASCII", "US-ASCII");
        registerAlgorithmic("com.ibm.icu.charset.Charset88591", "ISO-8859-1");

        // UTF-16 family
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF16", "UTF-16");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF16BE", "UTF-16BE", "UTF-16BE,version=1");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF16LE",
                "UTF-16LE", "UTF-16LE,version=1", "UTF16_OppositeEndian");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF16", "UTF16_PlatformEndian");

        // UTF-32 family
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF32", "UTF-32");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF32BE", "UTF-32BE");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF32LE", "UTF-32LE", "UTF32_OppositeEndian");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF32", "UTF32_PlatformEndian");

        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF8", "UTF-8");
        registerAlgorithmic("com.ibm.icu.charset.CharsetCESU8", "CESU-8");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF7", "UTF-7");
        registerAlgorithmic("com.ibm.icu.charset.CharsetISCII",
                "ISCII,version=0", "ISCII,version=1", "ISCII,version=2",
                "ISCII,version=3", "ISCII,version=4", "ISCII,version=5",
                "ISCII,version=6", "ISCII,version=7", "ISCII,version=8");
        registerAlgorithmic("com.ibm.icu.charset.CharsetUTF7", "IMAP-mailbox-name");
        registerAlgorithmic("com.ibm.icu.charset.CharsetHZ", "HZ");
        registerAlgorithmic("com.ibm.icu.charset.CharsetISO2022",
                "ISO_2022,locale=ja,version=0", "ISO_2022,locale=ja,version=1",
                "ISO_2022,locale=ja,version=2", "ISO_2022,locale=ja,version=3",
                "ISO_2022,locale=ja,version=4",
                "ISO_2022,locale=zh,version=0", "ISO_2022,locale=zh,version=1",
                "ISO_2022,locale=zh,version=2",
                "ISO_2022,locale=ko,version=0", "ISO_2022,locale=ko,version=1");
        registerAlgorithmic("com.ibm.icu.charset.CharsetCompoundText", "x11-compound-text");
    }

    /**
     * Maps every given ICU canonical name to the same implementation class name.
     */
    private static void registerAlgorithmic(String className, String... icuNames) {
        for (String icuName : icuNames) {
            algorithmicCharsets.put(icuName, className);
        }
    }
    197 
    198     /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
    199        String className = algorithmicCharsets.get(icuCanonicalName);
    200        if(className==null){
    201            //all the cnv files are loaded as MBCS
    202            className = "com.ibm.icu.charset.CharsetMBCS";
    203        }
    204        try{
    205            CharsetICU conv = null;
    206            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
    207            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class,  String[].class};
    208            final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
    209            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
    210 
    211            // Run constructor
    212            try {
    213                conv = c.newInstance(params);
    214                if (conv != null) {
    215                    return conv;
    216                }
    217            }catch (InvocationTargetException e) {
    218                Throwable cause = e.getCause();
    219                UnsupportedCharsetException e2 = new UnsupportedCharsetException(
    220                        icuCanonicalName + ": " + "Could not load " + className + ". Exception: " + cause);
    221                e2.initCause(cause);
    222                throw e2;
    223            }
    224        }catch(ClassNotFoundException ex){
    225        }catch(NoSuchMethodException ex){
    226        }catch (IllegalAccessException ex){
    227        }catch (InstantiationException ex){
    228        }
    229        throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
    230     }
    231 
    232     static final boolean isSurrogate(int c){
    233         return (((c)&0xfffff800)==0xd800);
    234     }
    235 
    236     /*
    237      * Returns the default charset name
    238      */
    239 //    static final String getDefaultCharsetName(){
    240 //        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
    241 //        return defaultEncoding;
    242 //    }
    243 
    244     /**
    245      * Returns a charset object for the named charset.
    246      * This method gurantee that ICU charset is returned when
    247      * available.  If the ICU charset provider does not support
    248      * the specified charset, then try other charset providers
    249      * including the standard Java charset provider.
    250      *
    251      * @param charsetName The name of the requested charset,
    252      * may be either a canonical name or an alias
    253      * @return A charset object for the named charset
    254      * @throws IllegalCharsetNameException If the given charset name
    255      * is illegal
    256      * @throws UnsupportedCharsetException If no support for the
    257      * named charset is available in this instance of th Java
    258      * virtual machine
    259      * @stable ICU 3.6
    260      */
    261     public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
    262         CharsetProviderICU icuProvider = new CharsetProviderICU();
    263         CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
    264         if (cs != null) {
    265             return cs;
    266         }
    267         return Charset.forName(charsetName);
    268     }
    269 
    270 //    /**
    271 //     * @see java.lang.Comparable#compareTo(java.lang.Object)
    272 //     * @stable 3.8
    273 //     */
    274 //    public int compareTo(Object otherObj) {
    275 //        if (!(otherObj instanceof CharsetICU)) {
    276 //            return -1;
    277 //        }
    278 //        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
    279 //    }
    280 
    281     /**
    282      * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
    283      * start of the stream for example U+FEFF (the Unicode BOM/signature
    284      * character) that can be ignored.
    285      *
    286      * Detects Unicode signature byte sequences at the start of the byte stream
    287      * and returns number of bytes of the BOM of the indicated Unicode charset.
    288      * 0 is returned when no Unicode signature is recognized.
    289      *
    290      */
    291     // TODO This should be proposed as CharsetDecoderICU API.
    292 //    static String detectUnicodeSignature(ByteBuffer source) {
    293 //        int signatureLength = 0; // number of bytes of the signature
    294 //        final int SIG_MAX_LEN = 5;
    295 //        String sigUniCharset = null; // states what unicode charset is the BOM
    296 //        int i = 0;
    297 //
    298 //        /*
    299 //         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
    300 //         * don't misdetect something
    301 //         */
    302 //        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
    303 //                (byte) 0xa5 };
    304 //
    305 //        while (i < source.remaining() && i < SIG_MAX_LEN) {
    306 //            start[i] = source.get(i);
    307 //            i++;
    308 //        }
    309 //
    310 //        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
    311 //            signatureLength = 2;
    312 //            sigUniCharset = "UTF-16BE";
    313 //            source.position(signatureLength);
    314 //            return sigUniCharset;
    315 //        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
    316 //            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
    317 //                signatureLength = 4;
    318 //                sigUniCharset = "UTF-32LE";
    319 //                source.position(signatureLength);
    320 //                return sigUniCharset;
    321 //            } else {
    322 //                signatureLength = 2;
    323 //                sigUniCharset = "UTF-16LE";
    324 //                source.position(signatureLength);
    325 //                return sigUniCharset;
    326 //            }
    327 //        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
    328 //                && start[2] == (byte) 0xBF) {
    329 //            signatureLength = 3;
    330 //            sigUniCharset = "UTF-8";
    331 //            source.position(signatureLength);
    332 //            return sigUniCharset;
    333 //        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
    334 //                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
    335 //            signatureLength = 4;
    336 //            sigUniCharset = "UTF-32BE";
    337 //            source.position(signatureLength);
    338 //            return sigUniCharset;
    339 //        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
    340 //                && start[2] == (byte) 0xFF) {
    341 //            signatureLength = 3;
    342 //            sigUniCharset = "SCSU";
    343 //            source.position(signatureLength);
    344 //            return sigUniCharset;
    345 //        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
    346 //                && start[2] == (byte) 0x28) {
    347 //            signatureLength = 3;
    348 //            sigUniCharset = "BOCU-1";
    349 //            source.position(signatureLength);
    350 //            return sigUniCharset;
    351 //        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
    352 //                && start[2] == (byte) 0x76) {
    353 //
    354 //            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
    355 //                signatureLength = 5;
    356 //                sigUniCharset = "UTF-7";
    357 //                source.position(signatureLength);
    358 //                return sigUniCharset;
    359 //            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
    360 //                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
    361 //                signatureLength = 4;
    362 //                sigUniCharset = "UTF-7";
    363 //                source.position(signatureLength);
    364 //                return sigUniCharset;
    365 //            }
    366 //        } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
    367 //                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
    368 //            signatureLength = 4;
    369 //            sigUniCharset = "UTF-EBCDIC";
    370 //            source.position(signatureLength);
    371 //            return sigUniCharset;
    372 //        }
    373 //
    374 //        /* no known Unicode signature byte sequence recognized */
    375 //        return null;
    376 //    }
    377 
    378 
    379     abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
    380 
    381     /**
    382     * Returns the set of Unicode code points that can be converted by an ICU Converter.
    383     *
    384     * <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
    385     * roundtrip-converted (converted without any data loss) with the converter This set will not include code points that have fallback
    386     * mappings or are only the result of reverse fallback mappings.  See UTR #22 "Character Mapping Markup Language" at  <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
    387     *
    388     * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
    389     *
    390     * <p>This is useful for example for
    391     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
    392     *   without/before actually performing the conversion</li>
    393     * <li>testing if a converter can be used for text for typical text for a certain locale,
    394     *   by comparing its roundtrip set with the set of ExemplarCharacters from
    395     *   ICU's locale data or other sources</li></ul>
    396     *
    397     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
    398     *                   the converter's specific set is filled in.
    399     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
    400     * @throws IllegalArgumentException if the parameters does not match.
    401     * @stable ICU 4.0
    402     */
    403        public void getUnicodeSet(UnicodeSet setFillIn, int which){
    404            if( setFillIn == null || which != ROUNDTRIP_SET ){
    405                throw new IllegalArgumentException();
    406            }
    407            setFillIn.clear();
    408            getUnicodeSetImpl(setFillIn, which);
    409        }
    410 
    411        /**
    412         * Returns whether or not the charset of the converter has a fixed number of bytes
    413         * per charset character.
    414         * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
    415         * Another example is UTF-32 which is always 4 bytes per character.  A UTF-32 code point
    416         * may represent more than one UTF-8 or UTF-16 code units but always have size of 4 bytes.
    417         * Note: This method is not intended to be used to determine whether the charset has a
    418         * fixed ratio of bytes to Unicode codes units for any particular Unicode encoding form.
    419         * @return true if the converter is fixed-width
    420         * @stable ICU 4.8
    421         */
    422        public boolean isFixedWidth() {
    423            if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
    424                return true;
    425            }
    426 
    427            if (this instanceof CharsetMBCS) {
    428                if (((CharsetMBCS)this).sharedData.staticData.maxBytesPerChar == ((CharsetMBCS)this).sharedData.staticData.minBytesPerChar) {
    429                    return true;
    430                }
    431            }
    432 
    433            return false;
    434        }
    435 
    436        static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
    437            setFillIn.add(0, 0xd7ff);
    438            setFillIn.add(0xe000, 0x10ffff);
    439        }
    440 
    441        static void getCompleteUnicodeSet(UnicodeSet setFillIn){
    442            setFillIn.add(0, 0x10ffff);
    443        }
    444 }
    445