Home | History | Annotate | Download | only in charset
      1 /*
      2  *  Licensed to the Apache Software Foundation (ASF) under one or more
      3  *  contributor license agreements.  See the NOTICE file distributed with
      4  *  this work for additional information regarding copyright ownership.
      5  *  The ASF licenses this file to You under the Apache License, Version 2.0
      6  *  (the "License"); you may not use this file except in compliance with
      7  *  the License.  You may obtain a copy of the License at
      8  *
      9  *     http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  *  Unless required by applicable law or agreed to in writing, software
     12  *  distributed under the License is distributed on an "AS IS" BASIS,
     13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  *  See the License for the specific language governing permissions and
     15  *  limitations under the License.
     16  */
     17 
     18 package java.nio.charset;
     19 
     20 import java.io.UnsupportedEncodingException;
     21 import java.nio.ByteBuffer;
     22 import java.nio.CharBuffer;
     23 import java.nio.charset.spi.CharsetProvider;
     24 import java.util.Collections;
     25 import java.util.HashMap;
     26 import java.util.HashSet;
     27 import java.util.Iterator;
     28 import java.util.Locale;
     29 import java.util.ServiceLoader;
     30 import java.util.Set;
     31 import java.util.SortedMap;
     32 import java.util.TreeMap;
     33 import libcore.icu.NativeConverter;
     34 
     35 /**
     36  * A charset is a named mapping between Unicode characters and byte sequences. Every
     37  * {@code Charset} can <i>decode</i>, converting a byte sequence into a sequence of characters,
     38  * and some can also <i>encode</i>, converting a sequence of characters into a byte sequence.
     39  * Use the method {@link #canEncode} to find out whether a charset supports both.
     40  *
     41  * <h4>Characters</h4>
     42  * <p>In the context of this class, <i>character</i> always refers to a Java character: a Unicode
     43  * code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.)
     44  * Not all byte sequences will represent a character, and not
     45  * all characters can necessarily be represented by a given charset. The method {@link #contains}
     46  * can be used to determine whether every character representable by one charset can also be
     47  * represented by another (meaning that a lossless transformation is possible from the contained
     48  * to the container).
     49  *
     50  * <h4>Encodings</h4>
     51  * <p>There are many possible ways to represent Unicode characters as byte sequences.
     52  * See <a href="http://www.unicode.org/reports/tr17/">UTR#17: Unicode Character Encoding Model</a>
     53  * for detailed discussion.
     54  *
     55  * <p>The most important mappings capable of representing every character are the Unicode
     56  * Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most
     57  * common. UTF-8 (described in <a href="http://www.ietf.org/rfc/rfc3629.txt">RFC 3629</a>)
     58  * encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially
     59  * wasting space, but allowing efficient random access into BMP text), and UTF-32 uses
     60  * exactly 4 bytes per character (trading off even more space for efficient random access into text
     61  * that includes supplementary characters).
     62  *
     63  * <p>UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte
     64  * integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or
     65  * little-endian. To assist decoders, Unicode includes a special <i>byte order mark</i> (BOM)
     66  * character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped
     67  * code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees
     68  * {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while
     69  * {@code 0xff, 0xfe}, would indicate a little-endian byte sequence.
     70  *
     71  * <p>UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same
     72  * byte sequence, there is no information about endianness to convey. Seeing the bytes
     73  * corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to
     74  * suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and
     75  * will appear in the output character sequence. This means that a disadvantage to including a BOM
     76  * in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a
     77  * reason to prefer UTF-8: it's one less complication to worry about.)
     78  *
     79  * <p>Because a BOM indicates how the data that follows should be interpreted, a BOM should occur
     80  * as the first character in a character sequence.
     81  *
     82  * <p>See the <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a> for
     83  * more about dealing with BOMs.
     84  *
     85  * <h4>Endianness and BOM behavior</h4>
     86  *
     87  * <p>The following tables show the endianness and BOM behavior of the UTF-16 variants.
     88  *
     89  * <p>This table shows what the encoder writes. "BE" means that the byte sequence is big-endian,
     90  * "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}).
     91  * <p><table width="100%">
     92  * <tr> <th>Charset</th>  <th>Encoder writes</th>  </tr>
     93  * <tr> <td>UTF-16BE</td> <td>BE, no BOM</td>      </tr>
     94  * <tr> <td>UTF-16LE</td> <td>LE, no BOM</td>      </tr>
     95  * <tr> <td>UTF-16</td>   <td>BE, with BE BOM</td> </tr>
     96  * </table>
     97  *
     98  * <p>The next table shows how each variant's decoder behaves when reading a byte sequence.
     99  * The exact meaning of "failure" in the table is dependent on the
    100  * {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so
    101  * "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM
    102  * triggers the malformedInputAction".
    103  *
    104  * <p>The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character.
    105  *
    106  * <p><table width="100%">
    107  * <tr> <th>Charset</th>  <th>BE BOM</th>           <th>LE BOM</th>           <th>No BOM</th> </tr>
    108  * <tr> <td>UTF-16BE</td> <td>BE, includes BOM</td> <td>BE, failure</td>      <td>BE</td>     </tr>
    109  * <tr> <td>UTF-16LE</td> <td>LE, failure</td>      <td>LE, includes BOM</td> <td>LE</td>     </tr>
    110  * <tr> <td>UTF-16</td>   <td>BE</td>               <td>LE</td>               <td>BE</td>     </tr>
    111  * </table>
    112  *
    113  * <h4>Charset names</h4>
    114  * <p>A charset has a canonical name, returned by {@link #name}. Most charsets will
    115  * also have one or more aliases, returned by {@link #aliases}. A charset can be looked up
    116  * by canonical name or any of its aliases using {@link #forName}.
    117  *
    118  * <h4>Guaranteed-available charsets</h4>
    119  * <p>The following charsets are available on every Java implementation:
    120  * <ul>
    121  * <li>ISO-8859-1
    122  * <li>US-ASCII
    123  * <li>UTF-16
    124  * <li>UTF-16BE
    125  * <li>UTF-16LE
    126  * <li>UTF-8
    127  * </ul>
    128  * <p>All of these charsets support both decoding and encoding. The charsets whose names begin
    129  * "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets
    130  * can only represent small subsets of these characters. Except when required to do otherwise for
    131  * compatibility, new code should use one of the UTF charsets listed above. The platform's default
    132  * charset is UTF-8. (This is in contrast to some older implementations, where the default charset
    133  * depended on the user's locale.)
    134  *
    135  * <p>Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or
    136  * {@link #isSupported} to see what's available. If you simply want to use a charset whenever
    137  * it's available, just call {@link #forName} and catch the exception it throws when the charset
    138  * isn't available.
    139  *
    140  * <p>Additional charsets can be made available by configuring one or more charset
    141  * providers through provider configuration files. Such files are always named
    142  * as "java.nio.charset.spi.CharsetProvider" and located in the
    143  * "META-INF/services" directory of one or more classpaths. The files should be
    144  * encoded in "UTF-8". Each line of their content specifies the class name of a
    145  * charset provider which extends {@link java.nio.charset.spi.CharsetProvider}.
    146  * A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace
    147  * is trimmed. Blank lines, and lines (after trimming) starting with "#" which are
    148  * regarded as comments, are both ignored. Duplicates of names already found are also
    149  * ignored. Both the configuration files and the provider classes will be loaded
    150  * using the thread context class loader.
    151  *
    152  * <p>Although this class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder} instances
    153  * it returns are inherently stateful.
    154  */
    155 public abstract class Charset implements Comparable<Charset> {
    156     private static final HashMap<String, Charset> CACHED_CHARSETS = new HashMap<String, Charset>();
    157 
    158     private static final Charset DEFAULT_CHARSET = getDefaultCharset();
    159 
    160     private final String canonicalName;
    161 
    162     private final HashSet<String> aliasesSet;
    163 
    164     /**
    165      * Constructs a <code>Charset</code> object. Duplicated aliases are
    166      * ignored.
    167      *
    168      * @param canonicalName
    169      *            the canonical name of the charset.
    170      * @param aliases
    171      *            an array containing all aliases of the charset. May be null.
    172      * @throws IllegalCharsetNameException
    173      *             on an illegal value being supplied for either
    174      *             <code>canonicalName</code> or for any element of
    175      *             <code>aliases</code>.
    176      */
    177     protected Charset(String canonicalName, String[] aliases) {
    178         // check whether the given canonical name is legal
    179         checkCharsetName(canonicalName);
    180         this.canonicalName = canonicalName;
    181         // check each alias and put into a set
    182         this.aliasesSet = new HashSet<String>();
    183         if (aliases != null) {
    184             for (String alias : aliases) {
    185                 checkCharsetName(alias);
    186                 this.aliasesSet.add(alias);
    187             }
    188         }
    189     }
    190 
    191     private static void checkCharsetName(String name) {
    192         if (name.isEmpty()) {
    193             throw new IllegalCharsetNameException(name);
    194         }
    195         int length = name.length();
    196         for (int i = 0; i < length; ++i) {
    197             if (!isValidCharsetNameCharacter(name.charAt(i))) {
    198                 throw new IllegalCharsetNameException(name);
    199             }
    200         }
    201     }
    202 
    203     private static boolean isValidCharsetNameCharacter(char c) {
    204         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
    205                 c == '-' || c == '.' || c == ':' || c == '_';
    206     }
    207 
    208     /**
    209      * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances.
    210      * If multiple charsets have the same canonical name, it is unspecified which is returned in
    211      * the map. This method may be slow. If you know which charset you're looking for, use
    212      * {@link #forName}.
    213      * @return an immutable case-insensitive map from canonical names to {@code Charset} instances
    214      */
    215     public static SortedMap<String, Charset> availableCharsets() {
    216         // Start with a copy of the built-in charsets...
    217         TreeMap<String, Charset> charsets = new TreeMap<String, Charset>(String.CASE_INSENSITIVE_ORDER);
    218         for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
    219             Charset charset = NativeConverter.charsetForName(charsetName);
    220             charsets.put(charset.name(), charset);
    221         }
    222 
    223         // Add all charsets provided by all charset providers...
    224         for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
    225             Iterator<Charset> it = charsetProvider.charsets();
    226             while (it.hasNext()) {
    227                 Charset cs = it.next();
    228                 // A CharsetProvider can't override a built-in Charset.
    229                 if (!charsets.containsKey(cs.name())) {
    230                     charsets.put(cs.name(), cs);
    231                 }
    232             }
    233         }
    234 
    235         return Collections.unmodifiableSortedMap(charsets);
    236     }
    237 
    238     private static Charset cacheCharset(String charsetName, Charset cs) {
    239         synchronized (CACHED_CHARSETS) {
    240             // Get the canonical name for this charset, and the canonical instance from the table.
    241             String canonicalName = cs.name();
    242             Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName);
    243             if (canonicalCharset == null) {
    244                 canonicalCharset = cs;
    245             }
    246 
    247             // Cache the charset by its canonical name...
    248             CACHED_CHARSETS.put(canonicalName, canonicalCharset);
    249 
    250             // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means
    251             // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an
    252             // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of
    253             // such names unduly.)
    254             CACHED_CHARSETS.put(charsetName, canonicalCharset);
    255 
    256             // And all its aliases...
    257             for (String alias : cs.aliasesSet) {
    258                 CACHED_CHARSETS.put(alias, canonicalCharset);
    259             }
    260 
    261             return canonicalCharset;
    262         }
    263     }
    264 
    265     /**
    266      * Returns a {@code Charset} instance for the named charset.
    267      *
    268      * @param charsetName a charset name (either canonical or an alias)
    269      * @throws IllegalCharsetNameException
    270      *             if the specified charset name is illegal.
    271      * @throws UnsupportedCharsetException
    272      *             if the desired charset is not supported by this runtime.
    273      */
    274     public static Charset forName(String charsetName) {
    275         // Is this charset in our cache?
    276         Charset cs;
    277         synchronized (CACHED_CHARSETS) {
    278             cs = CACHED_CHARSETS.get(charsetName);
    279             if (cs != null) {
    280                 return cs;
    281             }
    282         }
    283 
    284         if (charsetName == null) {
    285             throw new IllegalCharsetNameException(null);
    286         }
    287 
    288         // Is this a built-in charset supported by ICU?
    289         checkCharsetName(charsetName);
    290         cs = NativeConverter.charsetForName(charsetName);
    291         if (cs != null) {
    292             return cacheCharset(charsetName, cs);
    293         }
    294 
    295         // Does a configured CharsetProvider have this charset?
    296         for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
    297             cs = charsetProvider.charsetForName(charsetName);
    298             if (cs != null) {
    299                 return cacheCharset(charsetName, cs);
    300             }
    301         }
    302 
    303         throw new UnsupportedCharsetException(charsetName);
    304     }
    305 
    306     /**
    307      * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
    308      * which is all pre-nio code claims to throw.
    309      *
    310      * @hide internal use only
    311      */
    312     public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
    313         try {
    314             return Charset.forName(charsetName);
    315         } catch (Exception cause) {
    316             UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
    317             ex.initCause(cause);
    318             throw ex;
    319         }
    320     }
    321 
    322     /**
    323      * Determines whether the specified charset is supported by this runtime.
    324      *
    325      * @param charsetName
    326      *            the name of the charset.
    327      * @return true if the specified charset is supported, otherwise false.
    328      * @throws IllegalCharsetNameException
    329      *             if the specified charset name is illegal.
    330      */
    331     public static boolean isSupported(String charsetName) {
    332         try {
    333             forName(charsetName);
    334             return true;
    335         } catch (UnsupportedCharsetException ex) {
    336             return false;
    337         }
    338     }
    339 
    340     /**
    341      * Determines whether this charset is a superset of the given charset. A charset C1 contains
    342      * charset C2 if every character representable by C2 is also representable by C1. This means
    343      * that lossless conversion is possible from C2 to C1 (but not necessarily the other way
    344      * round). It does <i>not</i> imply that the two charsets use the same byte sequences for the
    345      * characters they share.
    346      *
    347      * <p>Note that this method is allowed to be conservative, and some implementations may return
    348      * false when this charset does contain the other charset. Android's implementation is precise,
    349      * and will always return true in such cases.
    350      *
    351      * @param charset
    352      *            a given charset.
    353      * @return true if this charset is a super set of the given charset,
    354      *         false if it's unknown or this charset is not a superset of
    355      *         the given charset.
    356      */
    357     public abstract boolean contains(Charset charset);
    358 
    359     /**
    360      * Gets a new instance of an encoder for this charset.
    361      *
    362      * @return a new instance of an encoder for this charset.
    363      */
    364     public abstract CharsetEncoder newEncoder();
    365 
    366     /**
    367      * Gets a new instance of a decoder for this charset.
    368      *
    369      * @return a new instance of a decoder for this charset.
    370      */
    371     public abstract CharsetDecoder newDecoder();
    372 
    373     /**
    374      * Gets the canonical name of this charset.
    375      *
    376      * @return this charset's name in canonical form.
    377      */
    378     public final String name() {
    379         return this.canonicalName;
    380     }
    381 
    382     /**
    383      * Gets the set of this charset's aliases.
    384      *
    385      * @return an unmodifiable set of this charset's aliases.
    386      */
    387     public final Set<String> aliases() {
    388         return Collections.unmodifiableSet(this.aliasesSet);
    389     }
    390 
    391     /**
    392      * Gets the name of this charset for the default locale.
    393      *
    394      * <p>The default implementation returns the canonical name of this charset.
    395      * Subclasses may return a localized display name.
    396      *
    397      * @return the name of this charset for the default locale.
    398      */
    399     public String displayName() {
    400         return this.canonicalName;
    401     }
    402 
    403     /**
    404      * Gets the name of this charset for the specified locale.
    405      *
    406      * <p>The default implementation returns the canonical name of this charset.
    407      * Subclasses may return a localized display name.
    408      *
    409      * @param l
    410      *            a certain locale
    411      * @return the name of this charset for the specified locale
    412      */
    413     public String displayName(Locale l) {
    414         return this.canonicalName;
    415     }
    416 
    417     /**
    418      * Indicates whether this charset is known to be registered in the IANA
    419      * Charset Registry.
    420      *
    421      * @return true if the charset is known to be registered, otherwise returns
    422      *         false.
    423      */
    424     public final boolean isRegistered() {
    425         return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-");
    426     }
    427 
    428     /**
    429      * Returns true if this charset supports encoding, false otherwise.
    430      *
    431      * @return true if this charset supports encoding, false otherwise.
    432      */
    433     public boolean canEncode() {
    434         return true;
    435     }
    436 
    437     /**
    438      * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from
    439      * {@code buffer}.
    440      * This method uses {@code CodingErrorAction.REPLACE}.
    441      *
    442      * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
    443      * for performance.
    444      *
    445      * @param buffer
    446      *            the character buffer containing the content to be encoded.
    447      * @return the result of the encoding.
    448      */
    449     public final ByteBuffer encode(CharBuffer buffer) {
    450         try {
    451             return newEncoder()
    452                     .onMalformedInput(CodingErrorAction.REPLACE)
    453                     .onUnmappableCharacter(CodingErrorAction.REPLACE).encode(
    454                             buffer);
    455         } catch (CharacterCodingException ex) {
    456             throw new Error(ex.getMessage(), ex);
    457         }
    458     }
    459 
    460     /**
    461      * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}.
    462      * This method uses {@code CodingErrorAction.REPLACE}.
    463      *
    464      * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
    465      * for performance.
    466      *
    467      * @param s the string to be encoded.
    468      * @return the result of the encoding.
    469      */
    470     public final ByteBuffer encode(String s) {
    471         return encode(CharBuffer.wrap(s));
    472     }
    473 
    474     /**
    475      * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}.
    476      * This method uses {@code CodingErrorAction.REPLACE}.
    477      *
    478      * <p>Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder}
    479      * for performance.
    480      *
    481      * @param buffer
    482      *            the byte buffer containing the content to be decoded.
    483      * @return a character buffer containing the output of the decoding.
    484      */
    485     public final CharBuffer decode(ByteBuffer buffer) {
    486         try {
    487             return newDecoder()
    488                     .onMalformedInput(CodingErrorAction.REPLACE)
    489                     .onUnmappableCharacter(CodingErrorAction.REPLACE).decode(buffer);
    490         } catch (CharacterCodingException ex) {
    491             throw new Error(ex.getMessage(), ex);
    492         }
    493     }
    494 
    495     /*
    496      * -------------------------------------------------------------------
    497      * Methods implementing parent interface Comparable
    498      * -------------------------------------------------------------------
    499      */
    500 
    501     /**
    502      * Compares this charset with the given charset. This comparison is
    503      * based on the case insensitive canonical names of the charsets.
    504      *
    505      * @param charset
    506      *            the given object to be compared with.
    507      * @return a negative integer if less than the given object, a positive
    508      *         integer if larger than it, or 0 if equal to it.
    509      */
    510     public final int compareTo(Charset charset) {
    511         return this.canonicalName.compareToIgnoreCase(charset.canonicalName);
    512     }
    513 
    514     /*
    515      * -------------------------------------------------------------------
    516      * Methods overriding parent class Object
    517      * -------------------------------------------------------------------
    518      */
    519 
    520     /**
    521      * Determines whether this charset equals to the given object. They are
    522      * considered to be equal if they have the same canonical name.
    523      *
    524      * @param obj
    525      *            the given object to be compared with.
    526      * @return true if they have the same canonical name, otherwise false.
    527      */
    528     @Override
    529     public final boolean equals(Object obj) {
    530         if (obj instanceof Charset) {
    531             Charset that = (Charset) obj;
    532             return this.canonicalName.equals(that.canonicalName);
    533         }
    534         return false;
    535     }
    536 
    537     /**
    538      * Gets the hash code of this charset.
    539      *
    540      * @return the hash code of this charset.
    541      */
    542     @Override
    543     public final int hashCode() {
    544         return this.canonicalName.hashCode();
    545     }
    546 
    547     /**
    548      * Gets a string representation of this charset. Usually this contains the
    549      * canonical name of the charset.
    550      *
    551      * @return a string representation of this charset.
    552      */
    553     @Override
    554     public final String toString() {
    555         return getClass().getName() + "[" + this.canonicalName + "]";
    556     }
    557 
    558     /**
    559      * Returns the system's default charset. This is determined during VM startup, and will not
    560      * change thereafter. On Android, the default charset is UTF-8.
    561      */
    562     public static Charset defaultCharset() {
    563         return DEFAULT_CHARSET;
    564     }
    565 
    566     private static Charset getDefaultCharset() {
    567         String encoding = System.getProperty("file.encoding", "UTF-8");
    568         try {
    569             return Charset.forName(encoding);
    570         } catch (UnsupportedCharsetException e) {
    571             return Charset.forName("UTF-8");
    572         }
    573     }
    574 }
    575