android-9.0.0_r1.0/s

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 2006-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

package com.ibm.icu.charset;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;

import com.ibm.icu.text.UnicodeSet;

/**
 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
 * This API is used to convert codepage or character encoded data to and
 * from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
 * converter, you can get its properties, set options, convert your data.
 *
 * <p>Since many software programs recognize different converter names for
 * different types of converters, there are other functions in this API to
 * iterate over the converter aliases.
 *
 * <p>Note that {@link #name()} cannot always return a unique charset name.
 * {@link Charset} documents that,
 * for charsets listed in the IANA Charset Registry,
 * the {@link #name()} must be listed there,
 * and it must be the MIME-preferred name if there are multiple names.
 *
 * <p>However, there are different implementations of many if not most charsets,
 * ICU provides multiple variants for some of them,
 * ICU provides variants of some java.nio-system-supported charsets,
 * and ICU users are free to add more variants.
 * This is so that applications can be compatible with multiple implementations at the same time.
 *
 * <p>This is in conflict with the {@link Charset#name()} requirements.
 * It is not possible to offer variants of an IANA charset and
 * always use the MIME-preferred name and also have those names be unique.
 *
 * <p>{@link #name()} returns the MIME-preferred name, or IANA name,
 * so that it can always be used for the charset field in internet protocols.
 *
 * <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
 * by using unique aliases (e.g., the ICU-canonical names).
 *
 * <p>{@link Charset} also documents that
 * "Two charsets are equal if, and only if, they have the same canonical names."
 * This is not possible.
 *
 * <p>Unfortunately, {@link Charset#equals} is final, and
 * {@link Charset#availableCharsets} returns
 * a sorted map from canonical charset names to charset objects.
 * Since {@link #name()} cannot be unique,
 * {@link #equals} cannot work properly in such cases, and
 * {@link Charset#availableCharsets} can only include one variant for a name.
 *
 * @stable ICU 3.6
 */
public abstract class CharsetICU extends Charset{

     // ICU canonical name of this charset; assigned in the constructor from its
     // icuCanonicalName argument.
     String icuCanonicalName;
     // NOTE(review): not referenced elsewhere in this class; presumably converter
     // option flags maintained by subclasses -- confirm against the subclasses.
     int options;

     // NOTE(review): not referenced elsewhere in this class; presumably filled in
     // by subclasses -- confirm.
     float  maxCharsPerByte;

     // The trailing comments on the fields below carry "+offset: size" annotations.
     // NOTE(review): these look like byte offsets of a native converter static-data
     // struct that the fields mirror -- confirm against the ICU4C sources.
     String name; /* +4: 60  internal name of the converter- invariant chars */

     int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */

     byte platform;                /* +68: 1 platform of the converter (only IBM now) */
     byte conversionType;          /* +69: 1 conversion type */

     int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
     int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */

     byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
     byte subCharLen;              /* +76: 1 */

     byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
     byte hasFromUnicodeFallback; /* +78: 1 */
     short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
     byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
     //byte reserved[/*19*/];           /* +81: 19 to round out the structure */


    // typedef enum UConverterUnicodeSet {
     /**
      * Parameter that selects the set of roundtrippable Unicode code points.
      * @stable ICU 4.0
      */
      public static final int ROUNDTRIP_SET=0;
      /**
       * Select the set of Unicode code points with roundtrip or fallback mappings.
       * Not supported at this point.
       * @internal
       * @deprecated This API is ICU internal only.
       */
      @Deprecated
      public static final int ROUNDTRIP_AND_FALLBACK_SET =1;

    //} UConverterUnicodeSet;

    /**
     * Initializes the charset with its ICU canonical name and with the names
     * that are handed on to the {@link Charset} constructor.
     *
     * @param icuCanonicalName the ICU canonical name, stored in this instance
     * @param canonicalName the canonical name passed to {@link Charset}; must not be empty
     * @param aliases the aliases passed to {@link Charset}
     * @throws IllegalCharsetNameException if {@code canonicalName} is the empty string
     * @stable ICU 3.6
     */
    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
        // The superclass constructor has to run first, so the empty-name check
        // can only happen afterwards.
        if (canonicalName.isEmpty()) {
            throw new IllegalCharsetNameException(canonicalName);
        }
        this.icuCanonicalName = icuCanonicalName;
    }

    /**
     * Tells whether the given charset is contained in this charset.
     * Implements the abstract method of the superclass; the only charset
     * reported as contained is one that is equal to this charset.
     * @param cs charset to test, may be null
     * @return true if {@code cs} is non-null and equal to this charset, false otherwise
     * @stable ICU 3.6
     */
    @Override
    public boolean contains(Charset cs){
        return cs != null && this.equals(cs);
    }
    /**
     * Lookup table from the ICU canonical name of a converter to the fully
     * qualified name of the class that implements it. Populated once by the
     * static initializer below and only read afterwards by getCharset.
     */
    private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
    static{
        // LMBCS: one entry per listed variant number.
        final int[] lmbcsVariants = { 1, 2, 3, 4, 5, 6, 8, 11, 16, 17, 18, 19 };
        for (int variant : lmbcsVariants) {
            algorithmicCharsets.put("LMBCS-" + variant, "com.ibm.icu.charset.CharsetLMBCS");
        }

        algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1");
        algorithmicCharsets.put("SCSU", "com.ibm.icu.charset.CharsetSCSU");
        algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII");
        algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591");

        // UTF-16 family.
        algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16");
        algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE");
        algorithmicCharsets.put("UTF-16BE,version=1", "com.ibm.icu.charset.CharsetUTF16BE");
        algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE");
        algorithmicCharsets.put("UTF-16LE,version=1", "com.ibm.icu.charset.CharsetUTF16LE");
        algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE");
        algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16");

        // UTF-32 family.
        algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32");
        algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE");
        algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE");
        algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE");
        algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32");

        // Remaining UTF variants.
        algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8");
        algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8");
        algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7");
        algorithmicCharsets.put("IMAP-mailbox-name", "com.ibm.icu.charset.CharsetUTF7");

        // ISCII: versions 0 through 8.
        for (int version = 0; version <= 8; version++) {
            algorithmicCharsets.put("ISCII,version=" + version, "com.ibm.icu.charset.CharsetISCII");
        }

        algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ");

        // ISO_2022: versions 0..4 for ja, 0..2 for zh and 0..1 for ko.
        final String[] iso2022Locales = { "ja", "zh", "ko" };
        final int[] iso2022VersionCounts = { 5, 3, 2 };
        for (int i = 0; i < iso2022Locales.length; i++) {
            for (int version = 0; version < iso2022VersionCounts[i]; version++) {
                algorithmicCharsets.put(
                        "ISO_2022,locale=" + iso2022Locales[i] + ",version=" + version,
                        "com.ibm.icu.charset.CharsetISO2022");
            }
        }

        algorithmicCharsets.put("x11-compound-text", "com.ibm.icu.charset.CharsetCompoundText");
    }

    /**
     * Instantiates the charset implementation registered for an ICU canonical name.
     *
     * <p>The implementing class name is looked up in {@code algorithmicCharsets};
     * names not listed there fall back to {@code com.ibm.icu.charset.CharsetMBCS}.
     * The class is loaded reflectively and its public
     * {@code (String, String, String[])} constructor is invoked.
     *
     * @param icuCanonicalName ICU canonical converter name, used as the lookup key
     * @param javaCanonicalName canonical name passed to the charset constructor
     * @param aliases aliases passed to the charset constructor
     * @return the newly constructed charset
     * @throws UnsupportedCharsetException if the class cannot be found, has no
     *         matching constructor, cannot be accessed or instantiated, or if the
     *         constructor itself throws; the underlying failure is attached as the cause
     */
    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        String className = algorithmicCharsets.get(icuCanonicalName);
        if (className == null) {
            // all the cnv files are loaded as MBCS
            className = "com.ibm.icu.charset.CharsetMBCS";
        }

        // Reflective failure to report below; every catch that falls through sets it.
        Exception failure;
        try {
            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class, String[].class };
            Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases };

            // Constructor.newInstance never returns null, so no null check is needed.
            return c.newInstance(params);
        } catch (InvocationTargetException e) {
            // The constructor itself threw: report its exception, not the reflection wrapper.
            Throwable cause = e.getCause();
            UnsupportedCharsetException e2 = new UnsupportedCharsetException(
                    icuCanonicalName + ": " + "Could not load " + className + ". Exception: " + cause);
            e2.initCause(cause);
            throw e2;
        } catch (ClassNotFoundException ex) {
            failure = ex;
        } catch (NoSuchMethodException ex) {
            failure = ex;
        } catch (IllegalAccessException ex) {
            failure = ex;
        } catch (InstantiationException ex) {
            failure = ex;
        }

        // Same exception type and message as before, but the reason is no longer discarded.
        UnsupportedCharsetException unsupported =
                new UnsupportedCharsetException(icuCanonicalName + ": " + "Could not load " + className);
        unsupported.initCause(failure);
        throw unsupported;
    }

    /**
     * Tells whether the given value lies in the surrogate range 0xD800..0xDFFF.
     *
     * @param c the code point value to test
     * @return true if {@code c} is between 0xD800 and 0xDFFF inclusive
     */
    static final boolean isSurrogate(int c){
        return 0xd800 <= c && c <= 0xdfff;
    }

    /*
     * Returns the default charset name
     */
//    static final String getDefaultCharsetName(){
//        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
//        return defaultEncoding;
//    }

    /**
     * Returns a charset object for the named charset.
     * This method guarantees that the ICU charset is returned when
     * available. If the ICU charset provider does not support
     * the specified charset, the lookup falls back to
     * {@link Charset#forName}, which consults the other charset providers
     * including the standard Java charset provider.
     *
     * @param charsetName The name of the requested charset,
     * may be either a canonical name or an alias
     * @return A charset object for the named charset
     * @throws IllegalCharsetNameException If the given charset name
     * is illegal
     * @throws UnsupportedCharsetException If no support for the
     * named charset is available in this instance of the Java
     * virtual machine
     * @stable ICU 3.6
     */
    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
        // Ask the ICU provider first; a null answer means it does not know the name.
        CharsetICU icuCharset = (CharsetICU) new CharsetProviderICU().charsetForName(charsetName);
        return icuCharset != null ? icuCharset : Charset.forName(charsetName);
    }

//    /**
//     * @see java.lang.Comparable#compareTo(java.lang.Object)
//     * @stable 3.8
//     */
//    public int compareTo(Object otherObj) {
//        if (!(otherObj instanceof CharsetICU)) {
//            return -1;
//        }
//        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
//    }

    /**
     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
     * start of the stream for example U+FEFF (the Unicode BOM/signature
     * character) that can be ignored.
     *
     * Detects Unicode signature byte sequences at the start of the byte stream
     * and returns number of bytes of the BOM of the indicated Unicode charset.
     * 0 is returned when no Unicode signature is recognized.
     *
     */
    // TODO This should be proposed as CharsetDecoderICU API.
//    static String detectUnicodeSignature(ByteBuffer source) {
//        int signatureLength = 0; // number of bytes of the signature
//        final int SIG_MAX_LEN = 5;
//        String sigUniCharset = null; // states what unicode charset is the BOM
//        int i = 0;
//
//        /*
//         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
//         * don't misdetect something
//         */
//        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
//                (byte) 0xa5 };
//
//        while (i < source.remaining() && i < SIG_MAX_LEN) {
//            start[i] = source.get(i);
//            i++;
//        }
//
//        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
//            signatureLength = 2;
//            sigUniCharset = "UTF-16BE";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
//            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
//                signatureLength = 4;
//                sigUniCharset = "UTF-32LE";
//                source.position(signatureLength);
//                return sigUniCharset;
//            } else {
//                signatureLength = 2;
//                sigUniCharset = "UTF-16LE";
//                source.position(signatureLength);
//                return sigUniCharset;
//            }
//        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
//                && start[2] == (byte) 0xBF) {
//            signatureLength = 3;
//            sigUniCharset = "UTF-8";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
//                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
//            signatureLength = 4;
//            sigUniCharset = "UTF-32BE";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
//                && start[2] == (byte) 0xFF) {
//            signatureLength = 3;
//            sigUniCharset = "SCSU";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
//                && start[2] == (byte) 0x28) {
//            signatureLength = 3;
//            sigUniCharset = "BOCU-1";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
//                && start[2] == (byte) 0x76) {
//
//            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
//                signatureLength = 5;
//                sigUniCharset = "UTF-7";
//                source.position(signatureLength);
//                return sigUniCharset;
//            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
//                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
//                signatureLength = 4;
//                sigUniCharset = "UTF-7";
//                source.position(signatureLength);
//                return sigUniCharset;
//            }
//        } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
//                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
//            signatureLength = 4;
//            sigUniCharset = "UTF-EBCDIC";
//            source.position(signatureLength);
//            return sigUniCharset;
//        }
//
//        /* no known Unicode signature byte sequence recognized */
//        return null;
//    }


    /**
     * Subclass hook that fills in the charset-specific set of Unicode code points.
     * Within this class it is called only from {@link #getUnicodeSet}, after the
     * arguments have been validated and {@code setFillIn} has been cleared.
     *
     * @param setFillIn the set to add code points to; non-null when reached via getUnicodeSet
     * @param which the set selector; getUnicodeSet only passes ROUNDTRIP_SET
     */
    abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);

    /**
    * Returns the set of Unicode code points that can be converted by an ICU Converter.
    *
    * <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): the set of all
    * Unicode code points that can be roundtrip-converted (converted without any data loss) with the
    * converter. This set will not include code points that have fallback mappings or are only the
    * result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at
    * <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
    *
    * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
    *
    * <p>This is useful, for example, for
    * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
    *   without/before actually performing the conversion</li>
    * <li>testing if a converter can be used for typical text for a certain locale,
    *   by comparing its roundtrip set with the set of ExemplarCharacters from
    *   ICU's locale data or other sources</li></ul>
    *
    * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
    *                   the converter's specific set is filled in.
    * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
    * @throws IllegalArgumentException if setFillIn is null or which is not ROUNDTRIP_SET.
    * @stable ICU 4.0
    */
       public void getUnicodeSet(UnicodeSet setFillIn, int which){
           final boolean supportedRequest = setFillIn != null && which == ROUNDTRIP_SET;
           if (!supportedRequest) {
               throw new IllegalArgumentException();
           }
           // Start from an empty set, then let the concrete charset add its code points.
           setFillIn.clear();
           getUnicodeSetImpl(setFillIn, which);
       }

       /**
        * Returns whether or not the charset of the converter has a fixed number of bytes
        * per charset character.
        * Examples are converters of the type UCNV_SBCS or UCNV_DBCS, and UTF-32, which
        * always uses 4 bytes per character. A UTF-32 code point may correspond to more
        * than one UTF-8 or UTF-16 code unit but always has a size of 4 bytes.
        * Note: This method is not intended to be used to determine whether the charset has a
        * fixed ratio of bytes to Unicode code units for any particular Unicode encoding form.
        * @return true if the converter is fixed-width
        * @stable ICU 4.8
        */
       public boolean isFixedWidth() {
           // These implementations are reported as fixed-width unconditionally.
           if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
               return true;
           }
           if (!(this instanceof CharsetMBCS)) {
               return false;
           }

           // An MBCS charset is fixed-width when its minimum and maximum byte counts agree.
           final CharsetMBCS mbcs = (CharsetMBCS) this;
           return mbcs.sharedData.staticData.maxBytesPerChar == mbcs.sharedData.staticData.minBytesPerChar;
       }

       /**
        * Adds every code point except the surrogate range 0xD800..0xDFFF to the given set.
        *
        * @param setFillIn the set to add to; it is not cleared first
        */
       static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
           final int lastBelowSurrogates = 0xd7ff;
           final int firstAboveSurrogates = 0xe000;
           setFillIn.add(0, lastBelowSurrogates);
           setFillIn.add(firstAboveSurrogates, 0x10ffff);
       }

       /**
        * Adds the whole code point range 0..0x10FFFF to the given set.
        *
        * @param setFillIn the set to add to; it is not cleared first
        */
       static void getCompleteUnicodeSet(UnicodeSet setFillIn){
           final int maxCodePoint = 0x10ffff;
           setFillIn.add(0, maxCodePoint);
       }
}