1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 package java.nio.charset; 28 29 import java.io.UnsupportedEncodingException; 30 import libcore.icu.NativeConverter; 31 import java.nio.ByteBuffer; 32 import java.nio.CharBuffer; 33 import java.nio.charset.spi.CharsetProvider; 34 import java.security.AccessController; 35 import java.security.AccessControlException; 36 import java.security.PrivilegedAction; 37 import java.util.AbstractMap; 38 import java.util.Collections; 39 import java.util.HashMap; 40 import java.util.HashSet; 41 import java.util.Iterator; 42 import java.util.Locale; 43 import java.util.Map; 44 import java.util.NoSuchElementException; 45 import java.util.Set; 46 import java.util.ServiceLoader; 47 import java.util.ServiceConfigurationError; 48 import java.util.SortedMap; 49 import java.util.TreeMap; 50 import sun.misc.ASCIICaseInsensitiveComparator; 51 import sun.nio.cs.ThreadLocalCoders; 52 import sun.security.action.GetPropertyAction; 53 54 55 /** 56 * A named mapping between sequences of sixteen-bit Unicode <a 57 * href="../../lang/Character.html#unicode">code units</a> and sequences of 58 * bytes. This class defines methods for creating decoders and encoders and 59 * for retrieving the various names associated with a charset. Instances of 60 * this class are immutable. 61 * 62 * <p> This class also defines static methods for testing whether a particular 63 * charset is supported, for locating charset instances by name, and for 64 * constructing a map that contains every charset for which support is 65 * available in the current Java virtual machine. Support for new charsets can 66 * be added via the service-provider interface defined in the {@link 67 * java.nio.charset.spi.CharsetProvider} class. 68 * 69 * <p> All of the methods defined in this class are safe for use by multiple 70 * concurrent threads. 71 * 72 * 73 * <a name="names"><a name="charenc"> 74 * <h4>Charset names</h4> 75 * 76 * <p> Charsets are named by strings composed of the following characters: 77 * 78 * <ul> 79 * 80 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> 81 * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>), 82 * 83 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> 84 * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>), 85 * 86 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt> 87 * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>), 88 * 89 * <li> The dash character <tt>'-'</tt> 90 * (<tt>'\u002d'</tt>, <small>HYPHEN-MINUS</small>), 91 * 92 * <li> The plus character <tt>'+'</tt> 93 * (<tt>'\u002b'</tt>, <small>PLUS SIGN</small>), 94 * 95 * <li> The period character <tt>'.'</tt> 96 * (<tt>'\u002e'</tt>, <small>FULL STOP</small>), 97 * 98 * <li> The colon character <tt>':'</tt> 99 * (<tt>'\u003a'</tt>, <small>COLON</small>), and 100 * 101 * <li> The underscore character <tt>'_'</tt> 102 * (<tt>'\u005f'</tt>, <small>LOW LINE</small>). 103 * 104 * </ul> 105 * 106 * A charset name must begin with either a letter or a digit. The empty string 107 * is not a legal charset name. Charset names are not case-sensitive; that is, 108 * case is always ignored when comparing charset names. Charset names 109 * generally follow the conventions documented in <a 110 * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278: IANA Charset 111 * Registration Procedures</i></a>. 112 * 113 * <p> Every charset has a <i>canonical name</i> and may also have one or more 114 * <i>aliases</i>. The canonical name is returned by the {@link #name() name} method 115 * of this class. Canonical names are, by convention, usually in upper case. 116 * The aliases of a charset are returned by the {@link #aliases() aliases} 117 * method. 118 * 119 * <a name="hn"> 120 * 121 * <p> Some charsets have an <i>historical name</i> that is defined for 122 * compatibility with previous versions of the Java platform. A charset's 123 * historical name is either its canonical name or one of its aliases. The 124 * historical name is returned by the <tt>getEncoding()</tt> methods of the 125 * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link 126 * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes. 127 * 128 * <a name="iana"> 129 * 130 * <p> If a charset listed in the <a 131 * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset 132 * Registry</i></a> is supported by an implementation of the Java platform then 133 * its canonical name must be the name listed in the registry. Many charsets 134 * are given more than one name in the registry, in which case the registry 135 * identifies one of the names as <i>MIME-preferred</i>. If a charset has more 136 * than one registry name then its canonical name must be the MIME-preferred 137 * name and the other names in the registry must be valid aliases. If a 138 * supported charset is not listed in the IANA registry then its canonical name 139 * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>. 140 * 141 * <p> The IANA charset registry does change over time, and so the canonical 142 * name and the aliases of a particular charset may also change over time. To 143 * ensure compatibility it is recommended that no alias ever be removed from a 144 * charset, and that if the canonical name of a charset is changed then its 145 * previous canonical name be made into an alias. 146 * 147 * 148 * <h4>Standard charsets</h4> 149 * 150 * <a name="standard"> 151 * 152 * <p> Every implementation of the Java platform is required to support the 153 * following standard charsets. Consult the release documentation for your 154 * implementation to see if any other charsets are supported. The behavior 155 * of such optional charsets may differ between implementations. 156 * 157 * <blockquote><table width="80%" summary="Description of standard charsets"> 158 * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr> 159 * <tr><td valign=top><tt>US-ASCII</tt></td> 160 * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>, 161 * a.k.a. the Basic Latin block of the Unicode character set</td></tr> 162 * <tr><td valign=top><tt>ISO-8859-1 </tt></td> 163 * <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr> 164 * <tr><td valign=top><tt>UTF-8</tt></td> 165 * <td>Eight-bit UCS Transformation Format</td></tr> 166 * <tr><td valign=top><tt>UTF-16BE</tt></td> 167 * <td>Sixteen-bit UCS Transformation Format, 168 * big-endian byte order</td></tr> 169 * <tr><td valign=top><tt>UTF-16LE</tt></td> 170 * <td>Sixteen-bit UCS Transformation Format, 171 * little-endian byte order</td></tr> 172 * <tr><td valign=top><tt>UTF-16</tt></td> 173 * <td>Sixteen-bit UCS Transformation Format, 174 * byte order identified by an optional byte-order mark</td></tr> 175 * </table></blockquote> 176 * 177 * <p> The <tt>UTF-8</tt> charset is specified by <a 178 * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279</i></a>; the 179 * transformation format upon which it is based is specified in 180 * Amendment 2 of ISO 10646-1 and is also described in the <a 181 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode 182 * Standard</i></a>. 183 * 184 * <p> The <tt>UTF-16</tt> charsets are specified by <a 185 * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC 2781</i></a>; the 186 * transformation formats upon which they are based are specified in 187 * Amendment 1 of ISO 10646-1 and are also described in the <a 188 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode 189 * Standard</i></a>. 190 * 191 * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are 192 * therefore sensitive to byte order. In these encodings the byte order of a 193 * stream may be indicated by an initial <i>byte-order mark</i> represented by 194 * the Unicode character <tt>'\uFEFF'</tt>. Byte-order marks are handled 195 * as follows: 196 * 197 * <ul> 198 * 199 * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt> 200 * charsets interpret the initial byte-order marks as a <small>ZERO-WIDTH 201 * NON-BREAKING SPACE</small>; when encoding, they do not write 202 * byte-order marks. </p></li> 203 204 * 205 * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets the 206 * byte-order mark at the beginning of the input stream to indicate the 207 * byte-order of the stream but defaults to big-endian if there is no 208 * byte-order mark; when encoding, it uses big-endian byte order and writes 209 * a big-endian byte-order mark. </p></li> 210 * 211 * </ul> 212 * 213 * In any case, byte order marks occuring after the first element of an 214 * input sequence are not omitted since the same code is used to represent 215 * <small>ZERO-WIDTH NON-BREAKING SPACE</small>. 216 * 217 * <p> Every instance of the Java virtual machine has a default charset, which 218 * may or may not be one of the standard charsets. The default charset is 219 * determined during virtual-machine startup and typically depends upon the 220 * locale and charset being used by the underlying operating system. </p> 221 * 222 * <p>The {@link StandardCharsets} class defines constants for each of the 223 * standard charsets. 224 * 225 * <h4>Terminology</h4> 226 * 227 * <p> The name of this class is taken from the terms used in 228 * <a href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278</i></a>. 229 * In that document a <i>charset</i> is defined as the combination of 230 * one or more coded character sets and a character-encoding scheme. 231 * (This definition is confusing; some other software systems define 232 * <i>charset</i> as a synonym for <i>coded character set</i>.) 233 * 234 * <p> A <i>coded character set</i> is a mapping between a set of abstract 235 * characters and a set of integers. US-ASCII, ISO 8859-1, 236 * JIS X 0201, and Unicode are examples of coded character sets. 237 * 238 * <p> Some standards have defined a <i>character set</i> to be simply a 239 * set of abstract characters without an associated assigned numbering. 240 * An alphabet is an example of such a character set. However, the subtle 241 * distinction between <i>character set</i> and <i>coded character set</i> 242 * is rarely used in practice; the former has become a short form for the 243 * latter, including in the Java API specification. 244 * 245 * <p> A <i>character-encoding scheme</i> is a mapping between one or more 246 * coded character sets and a set of octet (eight-bit byte) sequences. 247 * UTF-8, UTF-16, ISO 2022, and EUC are examples of 248 * character-encoding schemes. Encoding schemes are often associated with 249 * a particular coded character set; UTF-8, for example, is used only to 250 * encode Unicode. Some schemes, however, are associated with multiple 251 * coded character sets; EUC, for example, can be used to encode 252 * characters in a variety of Asian coded character sets. 253 * 254 * <p> When a coded character set is used exclusively with a single 255 * character-encoding scheme then the corresponding charset is usually 256 * named for the coded character set; otherwise a charset is usually named 257 * for the encoding scheme and, possibly, the locale of the coded 258 * character sets that it supports. Hence <tt>US-ASCII</tt> is both the 259 * name of a coded character set and of the charset that encodes it, while 260 * <tt>EUC-JP</tt> is the name of the charset that encodes the 261 * JIS X 0201, JIS X 0208, and JIS X 0212 262 * coded character sets for the Japanese language. 263 * 264 * <p> The native character encoding of the Java programming language is 265 * UTF-16. A charset in the Java platform therefore defines a mapping 266 * between sequences of sixteen-bit UTF-16 code units (that is, sequences 267 * of chars) and sequences of bytes. </p> 268 * 269 * 270 * @author Mark Reinhold 271 * @author JSR-51 Expert Group 272 * @since 1.4 273 * 274 * @see CharsetDecoder 275 * @see CharsetEncoder 276 * @see java.nio.charset.spi.CharsetProvider 277 * @see java.lang.Character 278 */ 279 280 public abstract class Charset 281 implements Comparable<Charset> 282 { 283 284 /* -- Static methods -- */ 285 286 private static volatile String bugLevel = null; 287 288 static boolean atBugLevel(String bl) { // package-private 289 String level = bugLevel; 290 if (level == null) { 291 if (!sun.misc.VM.isBooted()) 292 return false; 293 bugLevel = level = AccessController.doPrivileged( 294 new GetPropertyAction("sun.nio.cs.bugLevel", "")); 295 } 296 return level.equals(bl); 297 } 298 299 /** 300 * Checks that the given string is a legal charset name. </p> 301 * 302 * @param s 303 * A purported charset name 304 * 305 * @throws IllegalCharsetNameException 306 * If the given name is not a legal charset name 307 */ 308 private static void checkName(String s) { 309 int n = s.length(); 310 if (!atBugLevel("1.4")) { 311 if (n == 0) 312 throw new IllegalCharsetNameException(s); 313 } 314 for (int i = 0; i < n; i++) { 315 char c = s.charAt(i); 316 if (c >= 'A' && c <= 'Z') continue; 317 if (c >= 'a' && c <= 'z') continue; 318 if (c >= '0' && c <= '9') continue; 319 if (c == '-' && i != 0) continue; 320 if (c == '+' && i != 0) continue; 321 if (c == ':' && i != 0) continue; 322 if (c == '_' && i != 0) continue; 323 if (c == '.' && i != 0) continue; 324 throw new IllegalCharsetNameException(s); 325 } 326 } 327 328 /* The standard set of charsets */ 329 // Android-Removed: We use ICU's list of standard charsets. 330 // private static CharsetProvider standardProvider = new StandardCharsets(); 331 332 // Cache of the most-recently-returned charsets, 333 // along with the names that were used to find them 334 // 335 // cache1/2 usage is explained in the lookup method 336 // 337 private static volatile Map.Entry<String, Charset> cache1 = null; // "Level 1" cache 338 private static final HashMap<String, Charset> cache2 = new HashMap<>(); // "Level 2" cache 339 340 private static void cache(String charsetName, Charset cs) { 341 synchronized(cache2) { 342 String canonicalName = cs.name(); 343 Charset canonicalCharset = cache2.get(canonicalName); 344 345 if (canonicalCharset != null) { 346 cs = canonicalCharset; 347 } else { 348 cache2.put(canonicalName, cs); 349 350 for (String alias : cs.aliases()) { 351 cache2.put(alias, cs); 352 } 353 } 354 355 cache2.put(charsetName, cs); 356 } 357 358 cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs); 359 } 360 361 // Creates an iterator that walks over the available providers, ignoring 362 // those whose lookup or instantiation causes a security exception to be 363 // thrown. Should be invoked with full privileges. 364 // 365 private static Iterator providers() { 366 return new Iterator() { 367 368 ServiceLoader<CharsetProvider> sl = 369 ServiceLoader.load(CharsetProvider.class); 370 Iterator<CharsetProvider> i = sl.iterator(); 371 372 Object next = null; 373 374 private boolean getNext() { 375 while (next == null) { 376 try { 377 if (!i.hasNext()) 378 return false; 379 next = i.next(); 380 } catch (ServiceConfigurationError sce) { 381 if (sce.getCause() instanceof SecurityException) { 382 // Ignore security exceptions 383 continue; 384 } 385 throw sce; 386 } 387 } 388 return true; 389 } 390 391 public boolean hasNext() { 392 return getNext(); 393 } 394 395 public Object next() { 396 if (!getNext()) 397 throw new NoSuchElementException(); 398 Object n = next; 399 next = null; 400 return n; 401 } 402 403 public void remove() { 404 throw new UnsupportedOperationException(); 405 } 406 407 }; 408 } 409 410 // Thread-local gate to prevent recursive provider lookups 411 private static ThreadLocal<ThreadLocal> gate = new ThreadLocal<ThreadLocal>(); 412 413 private static Charset lookupViaProviders(final String charsetName) { 414 415 // The runtime startup sequence looks up standard charsets as a 416 // consequence of the VM's invocation of System.initializeSystemClass 417 // in order to, e.g., set system properties and encode filenames. At 418 // that point the application class loader has not been initialized, 419 // however, so we can't look for providers because doing so will cause 420 // that loader to be prematurely initialized with incomplete 421 // information. 422 // 423 if (!sun.misc.VM.isBooted()) 424 return null; 425 426 if (gate.get() != null) 427 // Avoid recursive provider lookups 428 return null; 429 try { 430 gate.set(gate); 431 432 return AccessController.doPrivileged( 433 new PrivilegedAction<Charset>() { 434 public Charset run() { 435 for (Iterator i = providers(); i.hasNext();) { 436 CharsetProvider cp = (CharsetProvider)i.next(); 437 Charset cs = cp.charsetForName(charsetName); 438 if (cs != null) 439 return cs; 440 } 441 return null; 442 } 443 }); 444 445 } finally { 446 gate.set(null); 447 } 448 } 449 450 // Android removed : Remove support for the extended charset provider. 451 // 452 /* The extended set of charsets */ 453 // private static Object extendedProviderLock = new Object(); 454 // private static boolean extendedProviderProbed = false; 455 // private static CharsetProvider extendedProvider = null; 456 // 457 // private static void probeExtendedProvider() { 458 // AccessController.doPrivileged(new PrivilegedAction<Object>() { 459 // public Object run() { 460 // try { 461 // Class epc 462 // = Class.forName("sun.nio.cs.ext.ExtendedCharsets"); 463 // extendedProvider = (CharsetProvider)epc.newInstance(); 464 // } catch (ClassNotFoundException x) { 465 // // Extended charsets not available 466 // // (charsets.jar not present) 467 // } catch (InstantiationException x) { 468 // throw new Error(x); 469 // } catch (IllegalAccessException x) { 470 // throw new Error(x); 471 // } 472 // return null; 473 // } 474 // }); 475 // } 476 // 477 // private static Charset lookupExtendedCharset(String charsetName) { 478 // CharsetProvider ecp = null; 479 // synchronized (extendedProviderLock) { 480 // if (!extendedProviderProbed) { 481 // probeExtendedProvider(); 482 // extendedProviderProbed = true; 483 // } 484 // ecp = extendedProvider; 485 // } 486 // return (ecp != null) ? ecp.charsetForName(charsetName) : null; 487 // } 488 489 // We expect most programs to use one Charset repeatedly, so the most recently used Charset 490 // instance is stored in the level 1 cache. We convey a hint to this effect to the VM by putting 491 // the level 1 cache miss code in a separate method. Since charsetName is not necessarily in 492 // canonical form, we store the mapping from both the canonical name and the aliases to the 493 // instance in a map for level 2 cache. 494 private static Charset lookup(String charsetName) { 495 if (charsetName == null) 496 throw new IllegalArgumentException("Null charset name"); 497 498 499 final Map.Entry<String, Charset> cached = cache1; 500 if (cached != null && charsetName.equals(cached.getKey())) 501 return cached.getValue(); 502 return lookup2(charsetName); 503 } 504 505 private static Charset lookup2(String charsetName) { 506 Charset cs; 507 synchronized (cache2) { 508 if ((cs = cache2.get(charsetName)) != null) { 509 cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs); 510 return cs; 511 } 512 } 513 514 // Android-changed: Drop support for "standard" and "extended" 515 // providers. 516 if ((cs = NativeConverter.charsetForName(charsetName)) != null || 517 (cs = lookupViaProviders(charsetName)) != null) 518 { 519 cache(charsetName, cs); 520 return cs; 521 } 522 523 /* Only need to check the name if we didn't find a charset for it */ 524 checkName(charsetName); 525 return null; 526 } 527 528 /** 529 * Tells whether the named charset is supported. </p> 530 * 531 * @param charsetName 532 * The name of the requested charset; may be either 533 * a canonical name or an alias 534 * 535 * @return <tt>true</tt> if, and only if, support for the named charset 536 * is available in the current Java virtual machine 537 * 538 * @throws IllegalCharsetNameException 539 * If the given charset name is illegal 540 * 541 * @throws IllegalArgumentException 542 * If the given <tt>charsetName</tt> is null 543 */ 544 public static boolean isSupported(String charsetName) { 545 return (lookup(charsetName) != null); 546 } 547 548 /** 549 * Returns a charset object for the named charset. </p> 550 * 551 * @param charsetName 552 * The name of the requested charset; may be either 553 * a canonical name or an alias 554 * 555 * @return A charset object for the named charset 556 * 557 * @throws IllegalCharsetNameException 558 * If the given charset name is illegal 559 * 560 * @throws IllegalArgumentException 561 * If the given <tt>charsetName</tt> is null 562 * 563 * @throws UnsupportedCharsetException 564 * If no support for the named charset is available 565 * in this instance of the Java virtual machine 566 */ 567 public static Charset forName(String charsetName) { 568 Charset cs = lookup(charsetName); 569 if (cs != null) 570 return cs; 571 throw new UnsupportedCharsetException(charsetName); 572 } 573 574 575 /** 576 * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException}, 577 * which is all pre-nio code claims to throw. 578 * 579 * @hide internal use only 580 */ 581 public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException { 582 try { 583 return Charset.forName(charsetName); 584 } catch (Exception cause) { 585 UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName); 586 ex.initCause(cause); 587 throw ex; 588 } 589 } 590 591 592 // Fold charsets from the given iterator into the given map, ignoring 593 // charsets whose names already have entries in the map. 594 // 595 private static void put(Iterator<Charset> i, Map<String,Charset> m) { 596 while (i.hasNext()) { 597 Charset cs = i.next(); 598 if (!m.containsKey(cs.name())) 599 m.put(cs.name(), cs); 600 } 601 } 602 603 /** 604 * Constructs a sorted map from canonical charset names to charset objects. 605 * 606 * <p> The map returned by this method will have one entry for each charset 607 * for which support is available in the current Java virtual machine. If 608 * two or more supported charsets have the same canonical name then the 609 * resulting map will contain just one of them; which one it will contain 610 * is not specified. </p> 611 * 612 * <p> The invocation of this method, and the subsequent use of the 613 * resulting map, may cause time-consuming disk or network I/O operations 614 * to occur. This method is provided for applications that need to 615 * enumerate all of the available charsets, for example to allow user 616 * charset selection. This method is not used by the {@link #forName 617 * forName} method, which instead employs an efficient incremental lookup 618 * algorithm. 619 * 620 * <p> This method may return different results at different times if new 621 * charset providers are dynamically made available to the current Java 622 * virtual machine. In the absence of such changes, the charsets returned 623 * by this method are exactly those that can be retrieved via the {@link 624 * #forName forName} method. </p> 625 * 626 * @return An immutable, case-insensitive map from canonical charset names 627 * to charset objects 628 */ 629 public static SortedMap<String,Charset> availableCharsets() { 630 return AccessController.doPrivileged( 631 new PrivilegedAction<SortedMap<String,Charset>>() { 632 public SortedMap<String,Charset> run() { 633 TreeMap<String,Charset> m = 634 new TreeMap<String,Charset>( 635 ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER); 636 for (String charsetName : NativeConverter.getAvailableCharsetNames()) { 637 Charset charset = NativeConverter.charsetForName(charsetName); 638 m.put(charset.name(), charset); 639 } 640 // Android-changed: No more "standard" provider. 641 // put(standardProvider.charsets(), m); 642 for (Iterator i = providers(); i.hasNext();) { 643 CharsetProvider cp = (CharsetProvider)i.next(); 644 put(cp.charsets(), m); 645 } 646 return Collections.unmodifiableSortedMap(m); 647 } 648 }); 649 } 650 651 private static Charset defaultCharset; 652 653 /** 654 * Returns the default charset of this Java virtual machine. 655 * 656 * <p> The default charset is determined during virtual-machine startup and 657 * typically depends upon the locale and charset of the underlying 658 * operating system. 659 * 660 * @return A charset object for the default charset 661 * 662 * @since 1.5 663 */ 664 public static Charset defaultCharset() { 665 // Android changed : Use UTF_8 unconditionally. 666 synchronized (Charset.class) { 667 if (defaultCharset == null) { 668 defaultCharset = java.nio.charset.StandardCharsets.UTF_8; 669 } 670 671 return defaultCharset; 672 } 673 } 674 675 676 /* -- Instance fields and methods -- */ 677 678 private final String name; // tickles a bug in oldjavac 679 private final String[] aliases; // tickles a bug in oldjavac 680 private Set<String> aliasSet = null; 681 682 /** 683 * Initializes a new charset with the given canonical name and alias 684 * set. </p> 685 * 686 * @param canonicalName 687 * The canonical name of this charset 688 * 689 * @param aliases 690 * An array of this charset's aliases, or null if it has no aliases 691 * 692 * @throws IllegalCharsetNameException 693 * If the canonical name or any of the aliases are illegal 694 */ 695 protected Charset(String canonicalName, String[] aliases) { 696 checkName(canonicalName); 697 String[] as = (aliases == null) ? new String[0] : aliases; 698 for (int i = 0; i < as.length; i++) 699 checkName(as[i]); 700 this.name = canonicalName; 701 this.aliases = as; 702 } 703 704 /** 705 * Returns this charset's canonical name. </p> 706 * 707 * @return The canonical name of this charset 708 */ 709 public final String name() { 710 return name; 711 } 712 713 /** 714 * Returns a set containing this charset's aliases. </p> 715 * 716 * @return An immutable set of this charset's aliases 717 */ 718 public final Set<String> aliases() { 719 if (aliasSet != null) 720 return aliasSet; 721 int n = aliases.length; 722 HashSet<String> hs = new HashSet<String>(n); 723 for (int i = 0; i < n; i++) 724 hs.add(aliases[i]); 725 aliasSet = Collections.unmodifiableSet(hs); 726 return aliasSet; 727 } 728 729 /** 730 * Returns this charset's human-readable name for the default locale. 731 * 732 * <p> The default implementation of this method simply returns this 733 * charset's canonical name. Concrete subclasses of this class may 734 * override this method in order to provide a localized display name. </p> 735 * 736 * @return The display name of this charset in the default locale 737 */ 738 public String displayName() { 739 return name; 740 } 741 742 /** 743 * Tells whether or not this charset is registered in the <a 744 * href="http://www.iana.org/assignments/character-sets">IANA Charset 745 * Registry</a>. </p> 746 * 747 * @return <tt>true</tt> if, and only if, this charset is known by its 748 * implementor to be registered with the IANA 749 */ 750 public final boolean isRegistered() { 751 return !name.startsWith("X-") && !name.startsWith("x-"); 752 } 753 754 /** 755 * Returns this charset's human-readable name for the given locale. 756 * 757 * <p> The default implementation of this method simply returns this 758 * charset's canonical name. Concrete subclasses of this class may 759 * override this method in order to provide a localized display name. </p> 760 * 761 * @param locale 762 * The locale for which the display name is to be retrieved 763 * 764 * @return The display name of this charset in the given locale 765 */ 766 public String displayName(Locale locale) { 767 return name; 768 } 769 770 /** 771 * Tells whether or not this charset contains the given charset. 772 * 773 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if, 774 * and only if, every character representable in <i>D</i> is also 775 * representable in <i>C</i>. If this relationship holds then it is 776 * guaranteed that every string that can be encoded in <i>D</i> can also be 777 * encoded in <i>C</i> without performing any replacements. 778 * 779 * <p> That <i>C</i> contains <i>D</i> does not imply that each character 780 * representable in <i>C</i> by a particular byte sequence is represented 781 * in <i>D</i> by the same byte sequence, although sometimes this is the 782 * case. 783 * 784 * <p> Every charset contains itself. 785 * 786 * <p> This method computes an approximation of the containment relation: 787 * If it returns <tt>true</tt> then the given charset is known to be 788 * contained by this charset; if it returns <tt>false</tt>, however, then 789 * it is not necessarily the case that the given charset is not contained 790 * in this charset. 791 * 792 * @return <tt>true</tt> if the given charset is contained in this charset 793 */ 794 public abstract boolean contains(Charset cs); 795 796 /** 797 * Constructs a new decoder for this charset. </p> 798 * 799 * @return A new decoder for this charset 800 */ 801 public abstract CharsetDecoder newDecoder(); 802 803 /** 804 * Constructs a new encoder for this charset. </p> 805 * 806 * @return A new encoder for this charset 807 * 808 * @throws UnsupportedOperationException 809 * If this charset does not support encoding 810 */ 811 public abstract CharsetEncoder newEncoder(); 812 813 /** 814 * Tells whether or not this charset supports encoding. 815 * 816 * <p> Nearly all charsets support encoding. The primary exceptions are 817 * special-purpose <i>auto-detect</i> charsets whose decoders can determine 818 * which of several possible encoding schemes is in use by examining the 819 * input byte sequence. Such charsets do not support encoding because 820 * there is no way to determine which encoding should be used on output. 821 * Implementations of such charsets should override this method to return 822 * <tt>false</tt>. </p> 823 * 824 * @return <tt>true</tt> if, and only if, this charset supports encoding 825 */ 826 public boolean canEncode() { 827 return true; 828 } 829 830 /** 831 * Convenience method that decodes bytes in this charset into Unicode 832 * characters. 833 * 834 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 835 * same result as the expression 836 * 837 * <pre> 838 * cs.newDecoder() 839 * .onMalformedInput(CodingErrorAction.REPLACE) 840 * .onUnmappableCharacter(CodingErrorAction.REPLACE) 841 * .decode(bb); </pre> 842 * 843 * except that it is potentially more efficient because it can cache 844 * decoders between successive invocations. 845 * 846 * <p> This method always replaces malformed-input and unmappable-character 847 * sequences with this charset's default replacement byte array. In order 848 * to detect such sequences, use the {@link 849 * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly. </p> 850 * 851 * @param bb The byte buffer to be decoded 852 * 853 * @return A char buffer containing the decoded characters 854 */ 855 public final CharBuffer decode(ByteBuffer bb) { 856 try { 857 return ThreadLocalCoders.decoderFor(this) 858 .onMalformedInput(CodingErrorAction.REPLACE) 859 .onUnmappableCharacter(CodingErrorAction.REPLACE) 860 .decode(bb); 861 } catch (CharacterCodingException x) { 862 throw new Error(x); // Can't happen 863 } 864 } 865 866 /** 867 * Convenience method that encodes Unicode characters into bytes in this 868 * charset. 869 * 870 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 871 * same result as the expression 872 * 873 * <pre> 874 * cs.newEncoder() 875 * .onMalformedInput(CodingErrorAction.REPLACE) 876 * .onUnmappableCharacter(CodingErrorAction.REPLACE) 877 * .encode(bb); </pre> 878 * 879 * except that it is potentially more efficient because it can cache 880 * encoders between successive invocations. 881 * 882 * <p> This method always replaces malformed-input and unmappable-character 883 * sequences with this charset's default replacement string. In order to 884 * detect such sequences, use the {@link 885 * CharsetEncoder#encode(java.nio.CharBuffer)} method directly. </p> 886 * 887 * @param cb The char buffer to be encoded 888 * 889 * @return A byte buffer containing the encoded characters 890 */ 891 public final ByteBuffer encode(CharBuffer cb) { 892 try { 893 return ThreadLocalCoders.encoderFor(this) 894 .onMalformedInput(CodingErrorAction.REPLACE) 895 .onUnmappableCharacter(CodingErrorAction.REPLACE) 896 .encode(cb); 897 } catch (CharacterCodingException x) { 898 throw new Error(x); // Can't happen 899 } 900 } 901 902 /** 903 * Convenience method that encodes a string into bytes in this charset. 904 * 905 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 906 * same result as the expression 907 * 908 * <pre> 909 * cs.encode(CharBuffer.wrap(s)); </pre> 910 * 911 * @param str The string to be encoded 912 * 913 * @return A byte buffer containing the encoded characters 914 */ 915 public final ByteBuffer encode(String str) { 916 return encode(CharBuffer.wrap(str)); 917 } 918 919 /** 920 * Compares this charset to another. 921 * 922 * <p> Charsets are ordered by their canonical names, without regard to 923 * case. </p> 924 * 925 * @param that 926 * The charset to which this charset is to be compared 927 * 928 * @return A negative integer, zero, or a positive integer as this charset 929 * is less than, equal to, or greater than the specified charset 930 */ 931 public final int compareTo(Charset that) { 932 return (name().compareToIgnoreCase(that.name())); 933 } 934 935 /** 936 * Computes a hashcode for this charset. </p> 937 * 938 * @return An integer hashcode 939 */ 940 public final int hashCode() { 941 return name().hashCode(); 942 } 943 944 /** 945 * Tells whether or not this object is equal to another. 946 * 947 * <p> Two charsets are equal if, and only if, they have the same canonical 948 * names. A charset is never equal to any other type of object. </p> 949 * 950 * @return <tt>true</tt> if, and only if, this charset is equal to the 951 * given object 952 */ 953 public final boolean equals(Object ob) { 954 if (!(ob instanceof Charset)) 955 return false; 956 if (this == ob) 957 return true; 958 return name.equals(((Charset)ob).name()); 959 } 960 961 /** 962 * Returns a string describing this charset. </p> 963 * 964 * @return A string describing this charset 965 */ 966 public final String toString() { 967 return name(); 968 } 969 } 970