Home | History | Annotate | Download | only in serializer
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements. See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership. The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the  "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *     http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 /*
     19  * $Id: Encodings.java 471981 2006-11-07 04:28:00Z minchau $
     20  */
     21 package org.apache.xml.serializer;
     22 
     23 import java.io.InputStream;
     24 import java.io.OutputStream;
     25 import java.io.OutputStreamWriter;
     26 import java.io.UnsupportedEncodingException;
     27 import java.io.Writer;
     28 import java.util.ArrayList;
     29 import java.util.Enumeration;
     30 import java.util.Hashtable;
     31 import java.util.List;
     32 import java.util.Properties;
     33 import java.util.StringTokenizer;
     34 
     35 
     36 /**
     37  * Provides information about encodings. Depends on the Java runtime
     38  * to provide writers for the different encodings.
     39  * <p>
     40  * This class is not a public API. It is only public because it
     41  * is used outside of this package.
     42  *
     43  * @xsl.usage internal
     44  */
     45 
     46 public final class Encodings extends Object
     47 {
     48     /**
     49      * Standard filename for properties file with encodings data.
     50      */
     51     private static final String ENCODINGS_FILE = SerializerBase.PKG_PATH+"/Encodings.properties";
     52 
     53     /**
     54      * Returns a writer for the specified encoding based on
     55      * an output stream.
     56      * <p>
     57      * This is not a public API.
     58      * @param output The output stream
     59      * @param encoding The encoding MIME name, not a Java name for the encoding.
     60      * @return A suitable writer
     61      * @throws UnsupportedEncodingException There is no convertor
     62      *  to support this encoding
     63      * @xsl.usage internal
     64      */
     65     static Writer getWriter(OutputStream output, String encoding)
     66         throws UnsupportedEncodingException
     67     {
     68 
     69         for (int i = 0; i < _encodings.length; ++i)
     70         {
     71             if (_encodings[i].name.equalsIgnoreCase(encoding))
     72             {
     73                 try
     74                 {
     75                     String javaName = _encodings[i].javaName;
     76                 	OutputStreamWriter osw = new OutputStreamWriter(output,javaName);
     77                     return osw;
     78                 }
     79                 catch (java.lang.IllegalArgumentException iae) // java 1.1.8
     80                 {
     81                     // keep trying
     82                 }
     83                 catch (UnsupportedEncodingException usee)
     84                 {
     85 
     86                     // keep trying
     87                 }
     88             }
     89         }
     90 
     91         try
     92         {
     93             return new OutputStreamWriter(output, encoding);
     94         }
     95         catch (java.lang.IllegalArgumentException iae) // java 1.1.8
     96         {
     97             throw new UnsupportedEncodingException(encoding);
     98         }
     99     }
    100 
    101     /**
    102      * Returns the EncodingInfo object for the specified
    103      * encoding, never null, although the encoding name
    104      * inside the returned EncodingInfo object will be if
    105      * we can't find a "real" EncodingInfo for the encoding.
    106      * <p>
    107      * This is not a public API.
    108      *
    109      * @param encoding The encoding
    110      * @return The object that is used to determine if
    111      * characters are in the given encoding.
    112      * @xsl.usage internal
    113      */
    114     static EncodingInfo getEncodingInfo(String encoding)
    115     {
    116         EncodingInfo ei;
    117 
    118         String normalizedEncoding = toUpperCaseFast(encoding);
    119         ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
    120         if (ei == null)
    121             ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
    122         if (ei == null) {
    123             // We shouldn't have to do this, but just in case.
    124             ei = new EncodingInfo(null,null, '\u0000');
    125         }
    126 
    127         return ei;
    128     }
    129 
    130     /**
    131      * Determines if the encoding specified was recognized by the
    132      * serializer or not.
    133      *
    134      * @param encoding The encoding
    135      * @return boolean - true if the encoding was recognized else false
    136      */
    137     public static boolean isRecognizedEncoding(String encoding)
    138     {
    139         EncodingInfo ei;
    140 
    141         String normalizedEncoding = encoding.toUpperCase();
    142         ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
    143         if (ei == null)
    144             ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
    145         if (ei != null)
    146             return true;
    147         return false;
    148     }
    149 
    150     /**
    151      * A fast and cheap way to uppercase a String that is
    152      * only made of printable ASCII characters.
    153      * <p>
    154      * This is not a public API.
    155      * @param s a String of ASCII characters
    156      * @return an uppercased version of the input String,
    157      * possibly the same String.
    158      * @xsl.usage internal
    159      */
    160     static private String toUpperCaseFast(final String s) {
    161 
    162     	boolean different = false;
    163     	final int mx = s.length();
    164 		char[] chars = new char[mx];
    165     	for (int i=0; i < mx; i++) {
    166     		char ch = s.charAt(i);
    167             // is the character a lower case ASCII one?
    168     		if ('a' <= ch && ch <= 'z') {
    169                 // a cheap and fast way to uppercase that is good enough
    170     			ch = (char) (ch + ('A' - 'a'));
    171     			different = true; // the uppercased String is different
    172     		}
    173     		chars[i] = ch;
    174     	}
    175 
    176     	// A little optimization, don't call String.valueOf() if
    177     	// the uppercased string is the same as the input string.
    178     	final String upper;
    179     	if (different)
    180     		upper = String.valueOf(chars);
    181     	else
    182     		upper = s;
    183 
    184     	return upper;
    185     }
    186 
    187     /** The default encoding, ISO style, ISO style.   */
    188     static final String DEFAULT_MIME_ENCODING = "UTF-8";
    189 
    190     /**
    191      * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
    192      * attribute specifies the preferred encoding to use for outputting the result
    193      * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
    194      * For other values, if the XSLT processor does not support the specified
    195      * encoding it may signal an error; if it does not signal an error it should
    196      * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
    197      * whose name does not match the EncName production of the XML Recommendation
    198      * [XML]. If no encoding attribute is specified, then the XSLT processor should
    199      * use either UTF-8 or UTF-16."
    200      * <p>
    201      * This is not a public API.
    202      *
    203      * @param encoding Reference to java-style encoding string, which may be null,
    204      * in which case a default will be found.
    205      *
    206      * @return The ISO-style encoding string, or null if failure.
    207      * @xsl.usage internal
    208      */
    209     static String getMimeEncoding(String encoding)
    210     {
    211 
    212         if (null == encoding)
    213         {
    214             try
    215             {
    216 
    217                 // Get the default system character encoding.  This may be
    218                 // incorrect if they passed in a writer, but right now there
    219                 // seems to be no way to get the encoding from a writer.
    220                 encoding = System.getProperty("file.encoding", "UTF8");
    221 
    222                 if (null != encoding)
    223                 {
    224 
    225                     /*
    226                     * See if the mime type is equal to UTF8.  If you don't
    227                     * do that, then  convertJava2MimeEncoding will convert
    228                     * 8859_1 to "ISO-8859-1", which is not what we want,
    229                     * I think, and I don't think I want to alter the tables
    230                     * to convert everything to UTF-8.
    231                     */
    232                     String jencoding =
    233                         (encoding.equalsIgnoreCase("Cp1252")
    234                             || encoding.equalsIgnoreCase("ISO8859_1")
    235                             || encoding.equalsIgnoreCase("8859_1")
    236                             || encoding.equalsIgnoreCase("UTF8"))
    237                             ? DEFAULT_MIME_ENCODING
    238                             : convertJava2MimeEncoding(encoding);
    239 
    240                     encoding =
    241                         (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
    242                 }
    243                 else
    244                 {
    245                     encoding = DEFAULT_MIME_ENCODING;
    246                 }
    247             }
    248             catch (SecurityException se)
    249             {
    250                 encoding = DEFAULT_MIME_ENCODING;
    251             }
    252         }
    253         else
    254         {
    255             encoding = convertJava2MimeEncoding(encoding);
    256         }
    257 
    258         return encoding;
    259     }
    260 
    261     /**
    262      * Try the best we can to convert a Java encoding to a XML-style encoding.
    263      * <p>
    264      * This is not a public API.
    265      * @param encoding non-null reference to encoding string, java style.
    266      *
    267      * @return ISO-style encoding string.
    268      * @xsl.usage internal
    269      */
    270     private static String convertJava2MimeEncoding(String encoding)
    271     {
    272         EncodingInfo enc =
    273             (EncodingInfo) _encodingTableKeyJava.get(toUpperCaseFast(encoding));
    274         if (null != enc)
    275             return enc.name;
    276         return encoding;
    277     }
    278 
    279     /**
    280      * Try the best we can to convert a Java encoding to a XML-style encoding.
    281      * <p>
    282      * This is not a public API.
    283      *
    284      * @param encoding non-null reference to encoding string, java style.
    285      *
    286      * @return ISO-style encoding string.
    287      * <p>
    288      * This method is not a public API.
    289      * @xsl.usage internal
    290      */
    291     public static String convertMime2JavaEncoding(String encoding)
    292     {
    293 
    294         for (int i = 0; i < _encodings.length; ++i)
    295         {
    296             if (_encodings[i].name.equalsIgnoreCase(encoding))
    297             {
    298                 return _encodings[i].javaName;
    299             }
    300         }
    301 
    302         return encoding;
    303     }
    304 
    305     /**
    306      * Load a list of all the supported encodings.
    307      *
    308      * System property "encodings" formatted using URL syntax may define an
    309      * external encodings list. Thanks to Sergey Ushakov for the code
    310      * contribution!
    311      * @xsl.usage internal
    312      */
    313     private static EncodingInfo[] loadEncodingInfo()
    314     {
    315         try
    316         {
    317             final InputStream is;
    318 
    319             SecuritySupport ss = SecuritySupport.getInstance();
    320             is = ss.getResourceAsStream(ObjectFactory.findClassLoader(),
    321                                             ENCODINGS_FILE);
    322 
    323             Properties props = new Properties();
    324             if (is != null) {
    325                 props.load(is);
    326                 is.close();
    327             } else {
    328                 // Seems to be no real need to force failure here, let the
    329                 // system do its best... The issue is not really very critical,
    330                 // and the output will be in any case _correct_ though maybe not
    331                 // always human-friendly... :)
    332                 // But maybe report/log the resource problem?
    333                 // Any standard ways to report/log errors (in static context)?
    334             }
    335 
    336             int totalEntries = props.size();
    337 
    338             List encodingInfo_list = new ArrayList();
    339             Enumeration keys = props.keys();
    340             for (int i = 0; i < totalEntries; ++i)
    341             {
    342                 String javaName = (String) keys.nextElement();
    343                 String val = props.getProperty(javaName);
    344                 int len = lengthOfMimeNames(val);
    345 
    346                 String mimeName;
    347                 char highChar;
    348                 if (len == 0)
    349                 {
    350                     // There is no property value, only the javaName, so try and recover
    351                     mimeName = javaName;
    352                     highChar = '\u0000'; // don't know the high code point, will need to test every character
    353                 }
    354                 else
    355                 {
    356                     try {
    357                         // Get the substring after the Mime names
    358                         final String highVal = val.substring(len).trim();
    359                         highChar = (char) Integer.decode(highVal).intValue();
    360                     }
    361                     catch( NumberFormatException e) {
    362                         highChar = 0;
    363                     }
    364                     String mimeNames = val.substring(0, len);
    365                     StringTokenizer st =
    366                         new StringTokenizer(mimeNames, ",");
    367                     for (boolean first = true;
    368                         st.hasMoreTokens();
    369                         first = false)
    370                     {
    371                         mimeName = st.nextToken();
    372                         EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar);
    373                         encodingInfo_list.add(ei);
    374                         _encodingTableKeyMime.put(mimeName.toUpperCase(), ei);
    375                         if (first)
    376                             _encodingTableKeyJava.put(javaName.toUpperCase(), ei);
    377                     }
    378                 }
    379             }
    380             // Convert the Vector of EncodingInfo objects into an array of them,
    381             // as that is the kind of thing this method returns.
    382             EncodingInfo[] ret_ei = new EncodingInfo[encodingInfo_list.size()];
    383             encodingInfo_list.toArray(ret_ei);
    384             return ret_ei;
    385         }
    386         catch (java.net.MalformedURLException mue)
    387         {
    388             throw new org.apache.xml.serializer.utils.WrappedRuntimeException(mue);
    389         }
    390         catch (java.io.IOException ioe)
    391         {
    392             throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe);
    393         }
    394     }
    395 
    396     /**
    397      * Get the length of the Mime names within the property value
    398      * @param val The value of the property, which should contain a comma
    399      * separated list of Mime names, followed optionally by a space and the
    400      * high char value
    401      * @return
    402      */
    403     private static int lengthOfMimeNames(String val) {
    404         // look for the space preceding the optional high char
    405         int len = val.indexOf(' ');
    406         // If len is zero it means the optional part is not there, so
    407         // the value must be all Mime names, so set the length appropriately
    408         if (len < 0)
    409             len = val.length();
    410 
    411         return len;
    412     }
    413 
    414     /**
    415      * Return true if the character is the high member of a surrogate pair.
    416      * <p>
    417      * This is not a public API.
    418      * @param ch the character to test
    419      * @xsl.usage internal
    420      */
    421     static boolean isHighUTF16Surrogate(char ch) {
    422         return ('\uD800' <= ch && ch <= '\uDBFF');
    423     }
    424     /**
    425      * Return true if the character is the low member of a surrogate pair.
    426      * <p>
    427      * This is not a public API.
    428      * @param ch the character to test
    429      * @xsl.usage internal
    430      */
    431     static boolean isLowUTF16Surrogate(char ch) {
    432         return ('\uDC00' <= ch && ch <= '\uDFFF');
    433     }
    434     /**
    435      * Return the unicode code point represented by the high/low surrogate pair.
    436      * <p>
    437      * This is not a public API.
    438      * @param highSurrogate the high char of the high/low pair
    439      * @param lowSurrogate the low char of the high/low pair
    440      * @xsl.usage internal
    441      */
    442     static int toCodePoint(char highSurrogate, char lowSurrogate) {
    443         int codePoint =
    444             ((highSurrogate - 0xd800) << 10)
    445                 + (lowSurrogate - 0xdc00)
    446                 + 0x10000;
    447         return codePoint;
    448     }
    449     /**
    450      * Return the unicode code point represented by the char.
    451      * A bit of a dummy method, since all it does is return the char,
    452      * but as an int value.
    453      * <p>
    454      * This is not a public API.
    455      * @param ch the char.
    456      * @xsl.usage internal
    457      */
    458     static int toCodePoint(char ch) {
    459         int codePoint = ch;
    460         return codePoint;
    461     }
    462 
    463     /**
    464      * Characters with values at or below the high code point are
    465      * in the encoding. Code point values above this one may or may
    466      * not be in the encoding, but lower ones certainly are.
    467      * <p>
    468      * This is for performance.
    469      *
    470      * @param encoding The encoding
    471      * @return The code point for which characters at or below this code point
    472      * are in the encoding. Characters with higher code point may or may not be
    473      * in the encoding. A value of zero is returned if the high code point is unknown.
    474      * <p>
    475      * This method is not a public API.
    476      * @xsl.usage internal
    477      */
    478     static public char getHighChar(String encoding)
    479     {
    480         final char highCodePoint;
    481         EncodingInfo ei;
    482 
    483         String normalizedEncoding = toUpperCaseFast(encoding);
    484         ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
    485         if (ei == null)
    486             ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
    487         if (ei != null)
    488             highCodePoint =  ei.getHighChar();
    489         else
    490             highCodePoint = 0;
    491         return highCodePoint;
    492     }
    493 
    494     private static final Hashtable _encodingTableKeyJava = new Hashtable();
    495     private static final Hashtable _encodingTableKeyMime = new Hashtable();
    496     private static final EncodingInfo[] _encodings = loadEncodingInfo();
    497 }
    498