// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 2006-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
      9 
     10 package com.ibm.icu.charset;
     11 
     12 import java.io.IOException;
     13 import java.nio.charset.Charset;
     14 import java.nio.charset.UnsupportedCharsetException;
     15 import java.nio.charset.spi.CharsetProvider;
     16 import java.util.Collections;
     17 import java.util.Iterator;
     18 import java.util.LinkedList;
     19 import java.util.List;
     20 
     21 import com.ibm.icu.impl.InvalidFormatException;
     22 
     23 
     24 /**
     25  * A concrete subclass of CharsetProvider for loading and providing charset converters
     26  * in ICU.
     27  * @stable ICU 3.6
     28  */
     29 public final class CharsetProviderICU extends CharsetProvider{
     30     /**
     31      * List of available ICU Charsets, empty during static initialization.
     32      * Not a Set or Map, so that we can add different Charset objects with the same name(),
     33      * which means that they are .equals(). See ICU ticket #11493.
     34      */
     35     private static List<Charset> icuCharsets = Collections.<Charset>emptyList();
     36 
     37     /**
     38      * Default constructor
     39      * @stable ICU 3.6
     40      */
     41     public CharsetProviderICU() {
     42     }
     43 
     44     /**
     45      * Constructs a Charset for the given charset name.
     46      * Implements the abstract method of super class.
     47      * @param charsetName charset name
     48      * @return Charset object for the given charset name, null if unsupported
     49      * @stable ICU 3.6
     50      */
     51     @Override
     52     public final Charset charsetForName(String charsetName){
     53         try{
     54             // extract the options from the charset name
     55             String optionsString = "";
     56             if (charsetName.endsWith(UConverterConstants.OPTION_SWAP_LFNL_STRING)) {
     57                 /* Remove and save the swap lfnl option string portion of the charset name. */
     58                 optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING;
     59                 charsetName = charsetName.substring(0, charsetName.length() - optionsString.length());
     60             }
     61             // get the canonical name
     62             String icuCanonicalName = getICUCanonicalName(charsetName);
     63 
     64             // create the converter object and return it
     65             if(icuCanonicalName==null || icuCanonicalName.length()==0){
     66                 // Try the original name, may be something added and not in the alias table.
     67                 // Will get an unsupported encoding exception if it doesn't work.
     68                 icuCanonicalName = charsetName;
     69             }
     70             return getCharset(icuCanonicalName, optionsString);
     71         }catch(UnsupportedCharsetException ex){
     72         }catch(IOException ex){
     73         }
     74         return null;
     75     }
     76 
     77     /**
     78      * Constructs a charset for the given ICU conversion table from the specified class path.
     79      * Example use: <code>cnv = CharsetProviderICU.charsetForName("myConverter", "com/myCompany/myDataPackage");</code>.
     80      * In this example myConverter.cnv would exist in the com/myCompany/myDataPackage Java package.
     81      * Conversion tables can be made with ICU4C's makeconv tool.
     82      * This function allows you to allows you to load user defined conversion
     83      * tables that are outside of ICU's core data.
     84      * @param charsetName The name of the charset conversion table.
     85      * @param classPath The class path that contain the conversion table.
     86      * @return charset object for the given charset name, null if unsupported
     87      * @stable ICU 3.8
     88      */
     89     public final Charset charsetForName(String charsetName, String classPath) {
     90         return charsetForName(charsetName, classPath, null);
     91     }
     92 
     93     /**
     94      * Constructs a charset for the given ICU conversion table from the specified class path.
     95      * This function is similar to {@link #charsetForName(String, String)}.
     96      * @param charsetName The name of the charset conversion table.
     97      * @param classPath The class path that contain the conversion table.
     98      * @param loader the class object from which to load the charset conversion table
     99      * @return charset object for the given charset name, null if unsupported
    100      * @stable ICU 3.8
    101      */
    102     public Charset charsetForName(String charsetName, String classPath, ClassLoader loader) {
    103         CharsetMBCS cs = null;
    104         try {
    105              cs = new CharsetMBCS(charsetName, charsetName, new String[0], classPath, loader);
    106         } catch (InvalidFormatException e) {
    107             // return null;
    108         }
    109         return cs;
    110     }
    111 
    112     /**
    113      * Gets the canonical name of the converter as defined by Java
    114      * @param enc converter name
    115      * @return canonical name of the converter
    116      * @internal
    117      * @deprecated This API is ICU internal only.
    118      */
    119      @Deprecated
    120      public static final String getICUCanonicalName(String enc)
    121                                 throws UnsupportedCharsetException{
    122         String canonicalName = null;
    123         String ret = null;
    124         try{
    125             if(enc!=null){
    126                  if((canonicalName = UConverterAlias.getCanonicalName(enc, "MIME"))!=null){
    127                     ret = canonicalName;
    128                 } else if((canonicalName = UConverterAlias.getCanonicalName(enc, "IANA"))!=null){
    129                     ret = canonicalName;
    130                 } else if((canonicalName = UConverterAlias.getAlias(enc, 0))!=null){
    131                     /* we have some aliases in the form x-blah .. match those */
    132                     ret = canonicalName;
    133                 }/*else if((canonicalName = UConverterAlias.getCanonicalName(enc, ""))!=null){
    134                     ret = canonicalName;
    135                 }*/else if(enc.indexOf("x-")==0 || enc.indexOf("X-")==0){
    136                     /* TODO: Match with getJavaCanonicalName method */
    137                     /*
    138                     char temp[ UCNV_MAX_CONVERTER_NAME_LENGTH] = {0};
    139                     strcpy(temp, encName+2);
    140                     */
    141                     // Remove the 'x-' and get the ICU canonical name
    142                     if ((canonicalName = UConverterAlias.getAlias(enc.substring(2), 0))!=null) {
    143                         ret = canonicalName;
    144                     } else {
    145                         ret = "";
    146                     }
    147 
    148                 }else{
    149                     /* unsupported encoding */
    150                    ret = "";
    151                 }
    152             }
    153             return ret;
    154         }catch(IOException ex){
    155             throw new UnsupportedCharsetException(enc);
    156         }
    157     }
    158     private static final Charset getCharset(String icuCanonicalName, String optionsString)
    159             throws IOException {
    160        String[] aliases = getAliases(icuCanonicalName);
    161        String canonicalName = getJavaCanonicalName(icuCanonicalName);
    162 
    163        /* Concat the option string to the icuCanonicalName so that the options can be handled properly
    164         * by the actual charset.
    165         */
    166        return (CharsetICU.getCharset(icuCanonicalName + optionsString, canonicalName, aliases));
    167     }
    168     /**
    169      * Gets the canonical name of the converter as defined by Java
    170      * @param charsetName converter name
    171      * @return canonical name of the converter
    172      * @internal
    173      * @deprecated This API is ICU internal only.
    174      */
    175     @Deprecated
    176     public static String getJavaCanonicalName(String charsetName){
    177         /*
    178         If a charset listed in the IANA Charset Registry is supported by an implementation
    179         of the Java platform then its canonical name must be the name listed in the registry.
    180         Many charsets are given more than one name in the registry, in which case the registry
    181         identifies one of the names as MIME-preferred. If a charset has more than one registry
    182         name then its canonical name must be the MIME-preferred name and the other names in
    183         the registry must be valid aliases. If a supported charset is not listed in the IANA
    184         registry then its canonical name must begin with one of the strings "X-" or "x-".
    185         */
    186         if(charsetName==null ){
    187             return null;
    188         }
    189         try{
    190             String cName = null;
    191             /* find out the alias with MIME tag */
    192             if((cName=UConverterAlias.getStandardName(charsetName, "MIME"))!=null){
    193             /* find out the alias with IANA tag */
    194             }else if((cName=UConverterAlias.getStandardName(charsetName, "IANA"))!=null){
    195             }else {
    196                 /*
    197                     check to see if an alias already exists with x- prefix, if yes then
    198                     make that the canonical name
    199                 */
    200                 int aliasNum = UConverterAlias.countAliases(charsetName);
    201                 String name;
    202                 for(int i=0;i<aliasNum;i++){
    203                     name = UConverterAlias.getAlias(charsetName, i);
    204                     if(name!=null && name.indexOf("x-")==0){
    205                         cName = name;
    206                         break;
    207                     }
    208                 }
    209                 /* last resort just append x- to any of the alias and
    210                 make it the canonical name */
    211                 if((cName==null || cName.length()==0)){
    212                     name = UConverterAlias.getStandardName(charsetName, "UTR22");
    213                     if(name==null && charsetName.indexOf(",")!=-1){
    214                         name = UConverterAlias.getAlias(charsetName, 1);
    215                     }
    216                     /* if there is no UTR22 canonical name .. then just return itself*/
    217                     if(name==null){
    218                         name = charsetName;
    219                     }
    220                     cName = "x-"+ name;
    221                 }
    222             }
    223             return cName;
    224         }catch (IOException ex){
    225 
    226         }
    227         return null;
    228      }
    229 
    230     /**
    231      * Gets the aliases associated with the converter name
    232      * @param encName converter name
    233      * @return converter names as elements in an object array
    234      * @internal
    235      * @deprecated This API is ICU internal only.
    236      */
    237     @Deprecated
    238     private static final String[] getAliases(String encName)throws IOException{
    239         String[] ret = null;
    240         int aliasNum = 0;
    241         int i=0;
    242         int j=0;
    243         String aliasArray[/*50*/] = new String[50];
    244 
    245         if(encName != null){
    246             aliasNum = UConverterAlias.countAliases(encName);
    247             for(i=0,j=0;i<aliasNum;i++){
    248                 String name = UConverterAlias.getAlias(encName,i);
    249                 if(name.indexOf(',')==-1){
    250                     aliasArray[j++]= name;
    251                 }
    252             }
    253             ret = new String[j];
    254             for(;--j>=0;) {
    255                 ret[j] = aliasArray[j];
    256             }
    257 
    258         }
    259         return (ret);
    260 
    261     }
    262 
    263     /**
    264      * Lazy-init the icuCharsets list.
    265      * Could be done during static initialization if constructing all of the Charsets
    266      * were cheap enough. See ICU ticket #11481.
    267      */
    268     private static final synchronized void loadAvailableICUCharsets() {
    269         if (!icuCharsets.isEmpty()) {
    270             return;
    271         }
    272         List<Charset> icucs = new LinkedList<Charset>();
    273         int num = UConverterAlias.countAvailable();
    274         for (int i = 0; i < num; ++i) {
    275             String name = UConverterAlias.getAvailableName(i);
    276             try {
    277                 Charset cs = getCharset(name, "");
    278                 icucs.add(cs);
    279             } catch(UnsupportedCharsetException ex) {
    280             } catch(IOException e) {
    281             }
    282             // add only charsets that can be created!
    283         }
    284         // Unmodifiable so that charsets().next().remove() cannot change it.
    285         icuCharsets = Collections.unmodifiableList(icucs);
    286     }
    287 
    288     /**
    289      * Returns an iterator for the available ICU Charsets.
    290      * Implements the abstract method of super class.
    291      * @return the Charset iterator
    292      * @stable ICU 3.6
    293      */
    294     @Override
    295     public final Iterator<Charset> charsets() {
    296         loadAvailableICUCharsets();
    297         return icuCharsets.iterator();
    298     }
    299 
    300     /**
    301      * Gets the canonical names of available ICU converters
    302      * @return array of available converter names
    303      * @internal
    304      * @deprecated This API is ICU internal only.
    305      */
    306     @Deprecated
    307      public static final String[] getAvailableNames() {
    308         loadAvailableICUCharsets();
    309         String[] names = new String[icuCharsets.size()];
    310         int i = 0;
    311         for (Charset cs : icuCharsets) {
    312             names[i++] = cs.name();
    313         }
    314         return names;
    315     }
    316 
    317     /**
    318      * Return all names available
    319      * @return String[] an array of all available names
    320      * @internal
    321      * @deprecated This API is ICU internal only.
    322      */
    323     @Deprecated
    324      public static final String[] getAllNames(){
    325         int num = UConverterAlias.countAvailable();
    326         String[] names = new String[num];
    327         for(int i=0;i<num;i++) {
    328             names[i] = UConverterAlias.getAvailableName(i);
    329         }
    330         return names;
    331     }
    332 }
    333