/**
*******************************************************************************
* Copyright (C) 2006-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

package com.ibm.icu.charset;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;

import com.ibm.icu.text.UnicodeSet;

/**
 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
 * This API is used to convert codepage or character encoded data to and
 * from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
 * converter, you can get its properties, set options, convert your data.
 *
 * <p>Since many software programs recognize different converter names for
 * different types of converters, there are other functions in this API to
 * iterate over the converter aliases.
 *
 * <p>Note that {@link #name()} cannot always return a unique charset name.
 * {@link Charset} documents that,
 * for charsets listed in the IANA Charset Registry,
 * the {@link #name()} must be listed there,
 * and it must be the MIME-preferred name if there are multiple names.
 *
 * <p>However, there are different implementations of many if not most charsets,
 * ICU provides multiple variants for some of them,
 * ICU provides variants of some java.nio-system-supported charsets,
 * and ICU users are free to add more variants.
 * This is so that applications can be compatible with multiple implementations at the same time.
 *
 * <p>This is in conflict with the {@link Charset#name()} requirements.
 * It is not possible to offer variants of an IANA charset and
 * always use the MIME-preferred name and also have those names be unique.
 *
 * <p>{@link #name()} returns the MIME-preferred name, or IANA name,
 * so that it can always be used for the charset field in internet protocols.
 *
 * <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
 * by using unique aliases (e.g., the ICU-canonical names).
 *
 * <p>{@link Charset} also documents that
 * Two charsets are equal if, and only if, they have the same canonical names.
 * This is not possible.
 *
 * <p>Unfortunately, {@link Charset#equals} is final, and
 * {@link Charset#availableCharsets} returns
 * a sorted map from canonical charset names to charset objects.
 * Since {@link #name()} cannot be unique,
 * {@link #equals} cannot work properly in such cases, and
 * {@link Charset#availableCharsets} can only include one variant for a name.
 *
 * @stable ICU 3.6
 */
public abstract class CharsetICU extends Charset{

    String icuCanonicalName;
    int options;

    float maxCharsPerByte;

    String name; /* +4: 60 internal name of the converter- invariant chars */

    int codepage; /* +64: 4 codepage # (now IBM-$codepage) */

    byte platform; /* +68: 1 platform of the converter (only IBM now) */
    byte conversionType; /* +69: 1 conversion type */

    int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
    int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */

    byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
    byte subCharLen; /* +76: 1 */

    byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
    byte hasFromUnicodeFallback; /* +78: 1 */
    short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
    byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
    //byte reserved[/*19*/]; /* +81: 19 to round out the structure */


    // typedef enum UConverterUnicodeSet {
    /**
     * Parameter that select the set of roundtrippable Unicode code points.
     * @stable ICU 4.0
     */
    public static final int ROUNDTRIP_SET=0;
    /**
     * Select the set of Unicode code points with roundtrip or fallback mappings.
     * Not supported at this point.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final int ROUNDTRIP_AND_FALLBACK_SET =1;

    //} UConverterUnicodeSet;

    /**
     * Creates a charset with the given ICU and Java canonical names.
     *
     * @param icuCanonicalName the ICU-canonical converter name; stored on this instance
     * @param canonicalName the canonical name passed on to {@link Charset}; must not be empty
     * @param aliases the alias names passed on to {@link Charset}
     * @throws IllegalCharsetNameException if {@code canonicalName} is empty
     * @stable ICU 3.6
     */
    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
        super(canonicalName,aliases);
        // Explicit empty-name check, independent of whatever validation the superclass performs.
        if(canonicalName.length() == 0){
            throw new IllegalCharsetNameException(canonicalName);
        }
        this.icuCanonicalName = icuCanonicalName;
    }

    /**
     * Ascertains if a charset is a sub set of this charset
     * Implements the abstract method of super class.
     * This implementation only reports {@code true} when the given charset
     * is equal to this one.
     *
     * @param cs charset to test
     * @return true if the given charset is a subset of this charset
     * @stable ICU 3.6
     */
    @Override
    public boolean contains(Charset cs){
        return cs != null && this.equals(cs);
    }

    /**
     * Maps ICU-canonical converter names to the fully qualified name of the
     * class implementing them. Names that are absent from this table are
     * handled by {@code CharsetMBCS} (see {@link #getCharset}).
     */
    private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
    static{
        algorithmicCharsets.put("LMBCS-1", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-2", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-3", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-4", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-5", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-6", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-8", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-11", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-16", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-17", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-18", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-19", "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1" );
        algorithmicCharsets.put("SCSU", "com.ibm.icu.charset.CharsetSCSU" );
        algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII" );
        algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591" );
        algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE" );
        algorithmicCharsets.put("UTF-16BE,version=1", "com.ibm.icu.charset.CharsetUTF16BE" );
        algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF-16LE,version=1", "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32" );
        algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE" );
        algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE" );
        algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
        algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32" );
        algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8" );
        algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8" );
        algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7" );
        algorithmicCharsets.put("ISCII,version=0", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=1", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=2", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=3", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=4", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=5", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=6", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=7", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=8", "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("IMAP-mailbox-name", "com.ibm.icu.charset.CharsetUTF7" );
        algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=0", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=1", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=2", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=3", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=4", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=0", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=1", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=2", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ko,version=0", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ko,version=1", "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("x11-compound-text", "com.ibm.icu.charset.CharsetCompoundText" );
    }

    /**
     * Instantiates the charset implementation registered for the given
     * ICU-canonical name by reflectively invoking its
     * {@code (String, String, String[])} constructor.
     *
     * @param icuCanonicalName the ICU-canonical converter name, used to look up the implementing class
     * @param javaCanonicalName the canonical name handed to the charset constructor
     * @param aliases the aliases handed to the charset constructor
     * @return the newly created charset
     * @throws UnsupportedCharsetException if the implementing class cannot be
     *         found, accessed or instantiated, or if its constructor throws;
     *         the underlying failure is attached as the cause
     */
    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        String className = algorithmicCharsets.get(icuCanonicalName);
        if(className==null){
            //all the cnv files are loaded as MBCS
            className = "com.ibm.icu.charset.CharsetMBCS";
        }
        final String failureMessage = icuCanonicalName+": "+"Could not load " + className;
        try{
            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class, String[].class};
            final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};

            // Run constructor
            return c.newInstance(params);
        }catch (InvocationTargetException e) {
            // The constructor itself threw: surface the real exception, not the reflective wrapper.
            Throwable cause = e.getCause();
            throw unsupportedCharset(failureMessage + ". Exception: " + cause, cause);
        }catch(ClassNotFoundException ex){
            throw unsupportedCharset(failureMessage, ex);
        }catch(NoSuchMethodException ex){
            throw unsupportedCharset(failureMessage, ex);
        }catch (IllegalAccessException ex){
            throw unsupportedCharset(failureMessage, ex);
        }catch (InstantiationException ex){
            throw unsupportedCharset(failureMessage, ex);
        }
    }

    /**
     * Builds an {@link UnsupportedCharsetException} that carries the given cause.
     * The exception type has no cause-accepting constructor, hence {@code initCause}.
     */
    private static UnsupportedCharsetException unsupportedCharset(String message, Throwable cause) {
        UnsupportedCharsetException result = new UnsupportedCharsetException(message);
        result.initCause(cause);
        return result;
    }

    /**
     * Tests whether the given code point is a surrogate, i.e. lies in the
     * range U+D800..U+DFFF.
     *
     * @param c the code point to test
     * @return true if {@code c} is a lead or trail surrogate
     */
    static final boolean isSurrogate(int c){
        return (((c)&0xfffff800)==0xd800);
    }

    /*
     * Returns the default charset name
     */
    // static final String getDefaultCharsetName(){
    //     String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
    //     return defaultEncoding;
    // }

    /**
     * Returns a charset object for the named charset.
     * This method guarantees that an ICU charset is returned when
     * available. If the ICU charset provider does not support
     * the specified charset, then try other charset providers
     * including the standard Java charset provider.
     *
     * @param charsetName The name of the requested charset,
     *                    may be either a canonical name or an alias
     * @return A charset object for the named charset
     * @throws IllegalCharsetNameException If the given charset name
     *         is illegal
     * @throws UnsupportedCharsetException If no support for the
     *         named charset is available in this instance of the Java
     *         virtual machine
     * @stable ICU 3.6
     */
    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
        CharsetProviderICU icuProvider = new CharsetProviderICU();
        CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
        if (cs != null) {
            return cs;
        }
        // Not provided by ICU: fall back to the regular java.nio lookup.
        return Charset.forName(charsetName);
    }

    // /**
    //  * @see java.lang.Comparable#compareTo(java.lang.Object)
    //  * @stable 3.8
    //  */
    // public int compareTo(Object otherObj) {
    //     if (!(otherObj instanceof CharsetICU)) {
    //         return -1;
    //     }
    //     return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
    // }

    /*
     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
     * start of the stream for example U+FEFF (the Unicode BOM/signature
     * character) that can be ignored.
     *
     * Detects Unicode signature byte sequences at the start of the byte stream
     * and returns number of bytes of the BOM of the indicated Unicode charset.
     * 0 is returned when no Unicode signature is recognized.
     *
     */
    // TODO This should be proposed as CharsetDecoderICU API.
    // static String detectUnicodeSignature(ByteBuffer source) {
    //     int signatureLength = 0; // number of bytes of the signature
    //     final int SIG_MAX_LEN = 5;
    //     String sigUniCharset = null; // states what unicode charset is the BOM
    //     int i = 0;
    //
    //     /*
    //      * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
    //      * don't misdetect something
    //      */
    //     byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
    //             (byte) 0xa5 };
    //
    //     while (i < source.remaining() && i < SIG_MAX_LEN) {
    //         start[i] = source.get(i);
    //         i++;
    //     }
    //
    //     if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
    //         signatureLength = 2;
    //         sigUniCharset = "UTF-16BE";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
    //         if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
    //             signatureLength = 4;
    //             sigUniCharset = "UTF-32LE";
    //             source.position(signatureLength);
    //             return sigUniCharset;
    //         } else {
    //             signatureLength = 2;
    //             sigUniCharset = "UTF-16LE";
    //             source.position(signatureLength);
    //             return sigUniCharset;
    //         }
    //     } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
    //             && start[2] == (byte) 0xBF) {
    //         signatureLength = 3;
    //         sigUniCharset = "UTF-8";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
    //             && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
    //         signatureLength = 4;
    //         sigUniCharset = "UTF-32BE";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
    //             && start[2] == (byte) 0xFF) {
    //         signatureLength = 3;
    //         sigUniCharset = "SCSU";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
    //             && start[2] == (byte) 0x28) {
    //         signatureLength = 3;
    //         sigUniCharset = "BOCU-1";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
    //             && start[2] == (byte) 0x76) {
    //
    //         if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
    //             signatureLength = 5;
    //             sigUniCharset = "UTF-7";
    //             source.position(signatureLength);
    //             return sigUniCharset;
    //         } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
    //                 || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
    //             signatureLength = 4;
    //             sigUniCharset = "UTF-7";
    //             source.position(signatureLength);
    //             return sigUniCharset;
    //         }
    //     } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
    //             && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
    //         signatureLength = 4;
    //         sigUniCharset = "UTF-EBCDIC";
    //         source.position(signatureLength);
    //         return sigUniCharset;
    //     }
    //
    //     /* no known Unicode signature byte sequence recognized */
    //     return null;
    // }


    /**
     * Charset-specific part of {@link #getUnicodeSet}. It is invoked after the
     * arguments have been validated and {@code setFillIn} has been cleared.
     *
     * @param setFillIn the (already cleared) set to fill in
     * @param which the set selector that was passed to {@link #getUnicodeSet}
     */
    abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);

    /**
     * Returns the set of Unicode code points that can be converted by an ICU Converter.
     *
     * <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
     * roundtrip-converted (converted without any data loss) with the converter. This set will not include code points that have fallback
     * mappings or are only the result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
     *
     * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
     *
     * <p>This is useful for example for
     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
     * without/before actually performing the conversion</li>
     * <li>testing if a converter can be used for text for typical text for a certain locale,
     * by comparing its roundtrip set with the set of ExemplarCharacters from
     * ICU's locale data or other sources</li></ul>
     *
     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
     *                  the converter's specific set is filled in.
     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
     * @throws IllegalArgumentException if the parameters do not match.
     * @stable ICU 4.0
     */
    public void getUnicodeSet(UnicodeSet setFillIn, int which){
        if( setFillIn == null || which != ROUNDTRIP_SET ){
            throw new IllegalArgumentException();
        }
        setFillIn.clear();
        getUnicodeSetImpl(setFillIn, which);
    }

    /**
     * Returns whether or not the charset of the converter has a fixed number of bytes
     * per charset character.
     * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
     * Another example is UTF-32 which is always 4 bytes per character. A UTF-32 code point
     * may represent more than one UTF-8 or UTF-16 code units but always have size of 4 bytes.
     * Note: This method is not intended to be used to determine whether the charset has a
     * fixed ratio of bytes to Unicode codes units for any particular Unicode encoding form.
     * @return true if the converter is fixed-width
     * @stable ICU 4.8
     */
    public boolean isFixedWidth() {
        if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
            return true;
        }

        if (this instanceof CharsetMBCS) {
            // An MBCS converter is fixed-width when its minimum and maximum byte counts coincide.
            CharsetMBCS mbcs = (CharsetMBCS)this;
            return mbcs.sharedData.staticData.maxBytesPerChar == mbcs.sharedData.staticData.minBytesPerChar;
        }

        return false;
    }

    /**
     * Adds every code point except the surrogates (U+D800..U+DFFF) to the given set.
     * The set is not cleared first.
     *
     * @param setFillIn the set to add to
     */
    static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
        setFillIn.add(0, 0xd7ff);
        setFillIn.add(0xe000, 0x10ffff);
    }

    /**
     * Adds the complete code point range U+0000..U+10FFFF to the given set.
     *
     * @param setFillIn the set to add to
     */
    static void getCompleteUnicodeSet(UnicodeSet setFillIn){
        setFillIn.add(0, 0x10ffff);
    }
}