Home | History | Annotate | Download | only in serializer
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements. See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership. The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the  "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *     http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 /*
     19  * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
     20  */
     21 package org.apache.xml.serializer;
     22 
     23 
     24 /**
     25  * Holds information about a given encoding, which is the Java name for the
     26  * encoding, the equivalent ISO name.
     27  * <p>
     28  * An object of this type has two useful methods
     29  * <pre>
     30  * isInEncoding(char ch);
     31  * </pre>
     32  * which can be called if the character is not the high one in
     33  * a surrogate pair and:
     34  * <pre>
     35  * isInEncoding(char high, char low);
     36  * </pre>
     37  * which can be called if the two characters from a high/low surrogate pair.
     38  * <p>
     39  * An EncodingInfo object is a node in a binary search tree. Such a node
     40  * will answer if a character is in the encoding, and do so for a given
     41  * range of unicode values (<code>m_first</code> to
     42  * <code>m_last</code>). It will handle a certain range of values
     43  * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
     44  * If the unicode point is before that explicit range, that is it
     45  * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root
     46  * of such a tree, m_before.  Likewise for values in the range
     47  * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code>
     48  * <p>
     49  * Actually figuring out if a code point is in the encoding is expensive. So the
     50  * purpose of this tree is to cache such determinations, and not to build the
     51  * entire tree of information at the start, but only build up as much of the
     52  * tree as is used during the transformation.
     53  * <p>
     54  * This Class is not a public API, and should only be used internally within
     55  * the serializer.
     56  * <p>
     57  * This class is not a public API.
     58  * @xsl.usage internal
     59  */
     60 public final class EncodingInfo extends Object
     61 {
     62 
     63     /**
     64      * Not all characters in an encoding are in on contiguous group,
     65      * however there is a lowest contiguous group starting at '\u0001'
     66      * and working up to m_highCharInContiguousGroup.
     67      * <p>
     68      * This is the char for which chars at or below this value are
     69      * definately in the encoding, although for chars
     70      * above this point they might be in the encoding.
     71      * This exists for performance, especially for ASCII characters
     72      * because for ASCII all chars in the range '\u0001' to '\u007F'
     73      * are in the encoding.
     74      *
     75      */
     76     private final char m_highCharInContiguousGroup;
     77 
     78     /**
     79      * The ISO encoding name.
     80      */
     81     final String name;
     82 
     83     /**
     84      * The name used by the Java convertor.
     85      */
     86     final String javaName;
     87 
     88     /**
     89      * A helper object that we can ask if a
     90      * single char, or a surrogate UTF-16 pair
     91      * of chars that form a single character,
     92      * is in this encoding.
     93      */
     94     private InEncoding m_encoding;
     95 
     96     /**
     97      * This is not a public API. It returns true if the
     98      * char in question is in the encoding.
     99      * @param ch the char in question.
    100      * <p>
    101      * This method is not a public API.
    102      * @xsl.usage internal
    103      */
    104     public boolean isInEncoding(char ch) {
    105         if (m_encoding == null) {
    106             m_encoding = new EncodingImpl();
    107 
    108             // One could put alternate logic in here to
    109             // instantiate another object that implements the
    110             // InEncoding interface. For example if the JRE is 1.4 or up
    111             // we could have an object that uses JRE 1.4 methods
    112         }
    113         return m_encoding.isInEncoding(ch);
    114     }
    115 
    116     /**
    117      * This is not a public API. It returns true if the
    118      * character formed by the high/low pair is in the encoding.
    119      * @param high a char that the a high char of a high/low surrogate pair.
    120      * @param low a char that is the low char of a high/low surrogate pair.
    121      * <p>
    122      * This method is not a public API.
    123      * @xsl.usage internal
    124      */
    125     public boolean isInEncoding(char high, char low) {
    126         if (m_encoding == null) {
    127             m_encoding = new EncodingImpl();
    128 
    129             // One could put alternate logic in here to
    130             // instantiate another object that implements the
    131             // InEncoding interface. For example if the JRE is 1.4 or up
    132             // we could have an object that uses JRE 1.4 methods
    133         }
    134         return m_encoding.isInEncoding(high, low);
    135     }
    136 
    137     /**
    138      * Create an EncodingInfo object based on the ISO name and Java name.
    139      * If both parameters are null any character will be considered to
    140      * be in the encoding. This is useful for when the serializer is in
    141      * temporary output state, and has no assciated encoding.
    142      *
    143      * @param name reference to the ISO name.
    144      * @param javaName reference to the Java encoding name.
    145      * @param highChar The char for which characters at or below this value are
    146      * definately in the
    147      * encoding, although for characters above this point they might be in the encoding.
    148      */
    149     public EncodingInfo(String name, String javaName, char highChar)
    150     {
    151 
    152         this.name = name;
    153         this.javaName = javaName;
    154         this.m_highCharInContiguousGroup = highChar;
    155     }
    156 
    157 
    158 
    159     /**
    160      * A simple interface to isolate the implementation.
    161      * We could also use some new JRE 1.4 methods in another implementation
    162      * provided we use reflection with them.
    163      * <p>
    164      * This interface is not a public API,
    165      * and should only be used internally within the serializer.
    166      * @xsl.usage internal
    167      */
    168     private interface InEncoding {
    169         /**
    170          * Returns true if the char is in the encoding
    171          */
    172         public boolean isInEncoding(char ch);
    173         /**
    174          * Returns true if the high/low surrogate pair forms
    175          * a character that is in the encoding.
    176          */
    177         public boolean isInEncoding(char high, char low);
    178     }
    179 
    180     /**
    181      * This class implements the
    182      */
    183     private class EncodingImpl implements InEncoding {
    184 
    185 
    186 
    187         public boolean isInEncoding(char ch1) {
    188             final boolean ret;
    189             int codePoint = Encodings.toCodePoint(ch1);
    190             if (codePoint < m_explFirst) {
    191                 // The unicode value is before the range
    192                 // that we explictly manage, so we delegate the answer.
    193 
    194                 // If we don't have an m_before object to delegate to, make one.
    195                 if (m_before == null)
    196                     m_before =
    197                         new EncodingImpl(
    198                             m_encoding,
    199                             m_first,
    200                             m_explFirst - 1,
    201                             codePoint);
    202                 ret = m_before.isInEncoding(ch1);
    203             } else if (m_explLast < codePoint) {
    204                 // The unicode value is after the range
    205                 // that we explictly manage, so we delegate the answer.
    206 
    207                 // If we don't have an m_after object to delegate to, make one.
    208                 if (m_after == null)
    209                     m_after =
    210                         new EncodingImpl(
    211                             m_encoding,
    212                             m_explLast + 1,
    213                             m_last,
    214                             codePoint);
    215                 ret = m_after.isInEncoding(ch1);
    216             } else {
    217                 // The unicode value is in the range we explitly handle
    218                 final int idx = codePoint - m_explFirst;
    219 
    220                 // If we already know the answer, just return it.
    221                 if (m_alreadyKnown[idx])
    222                     ret = m_isInEncoding[idx];
    223                 else {
    224                     // We don't know the answer, so find out,
    225                     // which may be expensive, then cache the answer
    226                     ret = inEncoding(ch1, m_encoding);
    227                     m_alreadyKnown[idx] = true;
    228                     m_isInEncoding[idx] = ret;
    229                 }
    230             }
    231             return ret;
    232         }
    233 
    234         public boolean isInEncoding(char high, char low) {
    235             final boolean ret;
    236             int codePoint = Encodings.toCodePoint(high,low);
    237             if (codePoint < m_explFirst) {
    238                 // The unicode value is before the range
    239                 // that we explictly manage, so we delegate the answer.
    240 
    241                 // If we don't have an m_before object to delegate to, make one.
    242                 if (m_before == null)
    243                     m_before =
    244                         new EncodingImpl(
    245                             m_encoding,
    246                             m_first,
    247                             m_explFirst - 1,
    248                             codePoint);
    249                 ret = m_before.isInEncoding(high,low);
    250             } else if (m_explLast < codePoint) {
    251                 // The unicode value is after the range
    252                 // that we explictly manage, so we delegate the answer.
    253 
    254                 // If we don't have an m_after object to delegate to, make one.
    255                 if (m_after == null)
    256                     m_after =
    257                         new EncodingImpl(
    258                             m_encoding,
    259                             m_explLast + 1,
    260                             m_last,
    261                             codePoint);
    262                 ret = m_after.isInEncoding(high,low);
    263             } else {
    264                 // The unicode value is in the range we explitly handle
    265                 final int idx = codePoint - m_explFirst;
    266 
    267                 // If we already know the answer, just return it.
    268                 if (m_alreadyKnown[idx])
    269                     ret = m_isInEncoding[idx];
    270                 else {
    271                     // We don't know the answer, so find out,
    272                     // which may be expensive, then cache the answer
    273                     ret = inEncoding(high, low, m_encoding);
    274                     m_alreadyKnown[idx] = true;
    275                     m_isInEncoding[idx] = ret;
    276                 }
    277             }
    278             return ret;
    279         }
    280 
    281         /**
    282          * The encoding.
    283          */
    284         final private String m_encoding;
    285         /**
    286          * m_first through m_last is the range of unicode
    287          * values that this object will return an answer on.
    288          * It may delegate to a similar object with a different
    289          * range
    290          */
    291         final private int m_first;
    292 
    293         /**
    294          * m_explFirst through m_explLast is the range of unicode
    295          * value that this object handles explicitly and does not
    296          * delegate to a similar object.
    297          */
    298         final private int m_explFirst;
    299         final private int m_explLast;
    300         final private int m_last;
    301 
    302         /**
    303          * The object, of the same type as this one,
    304          * that handles unicode values in a range before
    305          * the range explictly handled by this object, and
    306          * to which this object may delegate.
    307          */
    308         private InEncoding m_before;
    309         /**
    310          * The object, of the same type as this one,
    311          * that handles unicode values in a range after
    312          * the range explictly handled by this object, and
    313          * to which this object may delegate.
    314          */
    315         private InEncoding m_after;
    316 
    317         /**
    318          * The number of unicode values explicitly handled
    319          * by a single EncodingInfo object. This value is
    320          * tuneable, but is set to 128 because that covers the
    321          * entire low range of ASCII type chars within a single
    322          * object.
    323          */
    324         private static final int RANGE = 128;
    325 
    326         /**
    327          * A flag to record if we already know the answer
    328          * for the given unicode value.
    329          */
    330         final private boolean m_alreadyKnown[] = new boolean[RANGE];
    331         /**
    332          * A table holding the answer on whether the given unicode
    333          * value is in the encoding.
    334          */
    335         final private boolean m_isInEncoding[] = new boolean[RANGE];
    336 
    337         private EncodingImpl() {
    338             // This object will answer whether any unicode value
    339             // is in the encoding, it handles values 0 through Integer.MAX_VALUE
    340             this(javaName, 0, Integer.MAX_VALUE, (char) 0);
    341         }
    342 
    343         private EncodingImpl(String encoding, int first, int last, int codePoint) {
    344             // Set the range of unicode values that this object manages
    345             // either explicitly or implicitly.
    346             m_first = first;
    347             m_last = last;
    348 
    349             // Set the range of unicode values that this object
    350             // explicitly manages
    351             m_explFirst = codePoint;
    352             m_explLast = codePoint + (RANGE-1);
    353 
    354             m_encoding = encoding;
    355 
    356             if (javaName != null)
    357             {
    358                 // Some optimization.
    359                 if (0 <= m_explFirst && m_explFirst <= 127) {
    360                     // This particular EncodingImpl explicitly handles
    361                     // characters in the low range.
    362                     if ("UTF8".equals(javaName)
    363                         || "UTF-16".equals(javaName)
    364                         || "ASCII".equals(javaName)
    365                         || "US-ASCII".equals(javaName)
    366                         || "Unicode".equals(javaName)
    367                         || "UNICODE".equals(javaName)
    368                         || javaName.startsWith("ISO8859")) {
    369 
    370                         // Not only does this EncodingImpl object explicitly
    371                         // handle chracters in the low range, it is
    372                         // also one that we know something about, without
    373                         // needing to call inEncoding(char ch, String encoding)
    374                         // for this low range
    375                         //
    376                         // By initializing the table ahead of time
    377                         // for these low values, we prevent the expensive
    378                         // inEncoding(char ch, String encoding)
    379                         // from being called, at least for these common
    380                         // encodings.
    381                         for (int unicode = 1; unicode < 127; unicode++) {
    382                             final int idx = unicode - m_explFirst;
    383                             if (0 <= idx && idx < RANGE) {
    384                                 m_alreadyKnown[idx] = true;
    385                                 m_isInEncoding[idx] = true;
    386                             }
    387                         }
    388                     }
    389                 }
    390 
    391                 /* A little bit more than optimization.
    392                  *
    393                  * We will say that any character is in the encoding if
    394                  * we don't have an encoding.
    395                  * This is meaningful when the serializer is being used
    396                  * in temporary output state, where we are not writing to
    397                  * the final output tree.  It is when writing to the
    398                  * final output tree that we need to worry about the output
    399                  * encoding
    400                  */
    401                 if (javaName == null) {
    402                     for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
    403                         m_alreadyKnown[idx] = true;
    404                         m_isInEncoding[idx] = true;
    405                     }
    406                 }
    407             }
    408         }
    409     }
    410 
    411     /**
    412      * This is heart of the code that determines if a given character
    413      * is in the given encoding. This method is probably expensive,
    414      * and the answer should be cached.
    415      * <p>
    416      * This method is not a public API,
    417      * and should only be used internally within the serializer.
    418      * @param ch the char in question, that is not a high char of
    419      * a high/low surrogate pair.
    420      * @param encoding the Java name of the enocding.
    421      *
    422      * @xsl.usage internal
    423      *
    424      */
    425     private static boolean inEncoding(char ch, String encoding) {
    426         boolean isInEncoding;
    427         try {
    428             char cArray[] = new char[1];
    429             cArray[0] = ch;
    430             // Construct a String from the char
    431             String s = new String(cArray);
    432             // Encode the String into a sequence of bytes
    433             // using the given, named charset.
    434             byte[] bArray = s.getBytes(encoding);
    435             isInEncoding = inEncoding(ch, bArray);
    436 
    437         } catch (Exception e) {
    438             isInEncoding = false;
    439 
    440             // If for some reason the encoding is null, e.g.
    441             // for a temporary result tree, we should just
    442             // say that every character is in the encoding.
    443             if (encoding == null)
    444             	isInEncoding = true;
    445         }
    446         return isInEncoding;
    447     }
    448 
    449     /**
    450      * This is heart of the code that determines if a given high/low
    451      * surrogate pair forms a character that is in the given encoding.
    452      * This method is probably expensive, and the answer should be cached.
    453      * <p>
    454      * This method is not a public API,
    455      * and should only be used internally within the serializer.
    456      * @param high the high char of
    457      * a high/low surrogate pair.
    458      * @param low the low char of a high/low surrogate pair.
    459      * @param encoding the Java name of the encoding.
    460      *
    461      * @xsl.usage internal
    462      *
    463      */
    464     private static boolean inEncoding(char high, char low, String encoding) {
    465         boolean isInEncoding;
    466         try {
    467             char cArray[] = new char[2];
    468             cArray[0] = high;
    469             cArray[1] = low;
    470             // Construct a String from the char
    471             String s = new String(cArray);
    472             // Encode the String into a sequence of bytes
    473             // using the given, named charset.
    474             byte[] bArray = s.getBytes(encoding);
    475             isInEncoding = inEncoding(high,bArray);
    476         } catch (Exception e) {
    477             isInEncoding = false;
    478         }
    479 
    480         return isInEncoding;
    481     }
    482 
    483     /**
    484      * This method is the core of determining if character
    485      * is in the encoding. The method is not foolproof, because
    486      * s.getBytes(encoding) has specified behavior only if the
    487      * characters are in the specified encoding. However this
    488      * method tries it's best.
    489      * @param ch the char that was converted using getBytes, or
    490      * the first char of a high/low pair that was converted.
    491      * @param data the bytes written out by the call to s.getBytes(encoding);
    492      * @return true if the character is in the encoding.
    493      */
    494     private static boolean inEncoding(char ch, byte[] data) {
    495         final boolean isInEncoding;
    496         // If the string written out as data is not in the encoding,
    497         // the output is not specified according to the documentation
    498         // on the String.getBytes(encoding) method,
    499         // but we do our best here.
    500         if (data==null || data.length == 0) {
    501             isInEncoding = false;
    502         }
    503         else {
    504             if (data[0] == 0)
    505                 isInEncoding = false;
    506             else if (data[0] == '?' && ch != '?')
    507                 isInEncoding = false;
    508             /*
    509              * else if (isJapanese) {
    510              *   // isJapanese is really
    511              *   //   (    "EUC-JP".equals(javaName)
    512              *   //    ||  "EUC_JP".equals(javaName)
    513              *  //     ||  "SJIS".equals(javaName)   )
    514              *
    515              *   // Work around some bugs in JRE for Japanese
    516              *   if(data[0] == 0x21)
    517              *     isInEncoding = false;
    518              *   else if (ch == 0xA5)
    519              *     isInEncoding = false;
    520              *   else
    521              *     isInEncoding = true;
    522              * }
    523              */
    524 
    525             else {
    526                 // We don't know for sure, but it looks like it is in the encoding
    527                 isInEncoding = true;
    528             }
    529         }
    530         return isInEncoding;
    531     }
    532 
    533     /**
    534      * This method exists for performance reasons.
    535      * <p>
    536      * Except for '\u0000', if a char is less than or equal to the value
    537      * returned by this method then it in the encoding.
    538      * <p>
    539      * The characters in an encoding are not contiguous, however
    540      * there is a lowest group of chars starting at '\u0001' upto and
    541      * including the char returned by this method that are all in the encoding.
    542      * So the char returned by this method essentially defines the lowest
    543      * contiguous group.
    544      * <p>
    545      * chars above the value returned might be in the encoding, but
    546      * chars at or below the value returned are definately in the encoding.
    547      * <p>
    548      * In any case however, the isInEncoding(char) method can be used
    549      * regardless of the value of the char returned by this method.
    550      * <p>
    551      * If the value returned is '\u0000' it means that every character must be tested
    552      * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
    553      * for surrogate pairs.
    554      * <p>
    555      * This method is not a public API.
    556      * @xsl.usage internal
    557      */
    558     public final char getHighChar() {
    559         return m_highCharInContiguousGroup;
    560     }
    561 
    562 }
    563