Home | History | Annotate | Download | only in xml
      1 /*
      2  * Copyright (C) 2009 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.xml;
     18 
     19 import com.google.common.annotations.Beta;
     20 import com.google.common.annotations.GwtCompatible;
     21 import com.google.common.escape.Escaper;
     22 import com.google.common.escape.Escapers;
     23 
     24 /**
     25  * {@code Escaper} instances suitable for strings to be included in XML
     26  * attribute values and elements' text contents. When possible, avoid manual
     27  * escaping by using templating systems and high-level APIs that provide
     28  * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or
     29  * <a href="http://www.jdom.org/">JDOM</a>.
     30  *
     31  * <p><b>Note:</b> Currently the escapers provided by this class do not escape
     32  * any characters outside the ASCII character range. Unlike HTML escaping the
     33  * XML escapers will not escape non-ASCII characters to their numeric entity
     34  * replacements. These XML escapers provide the minimal level of escaping to
     35  * ensure that the output can be safely included in a Unicode XML document.
     36  *
     37  *
     38  * <p>For details on the behavior of the escapers in this class, see sections
     39  * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and
     40  * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
     41  * XML specification.
     42  *
     43  * @author Alex Matevossian
     44  * @author David Beaumont
     45  * @since 15.0
     46  */
     47 @Beta
     48 @GwtCompatible
     49 public class XmlEscapers {
     50   private XmlEscapers() {}
     51 
     52   private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
     53   private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
     54 
     55   // For each xxxEscaper() method, please add links to external reference pages
     56   // that are considered authoritative for the behavior of that escaper.
     57 
     58   /**
     59    * Returns an {@link Escaper} instance that escapes special characters in a
     60    * string so it can safely be included in an XML document as element content.
     61    * See section
     62    * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
     63    * XML specification.
     64    *
     65    * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not
     66    * safe</b> to use this escaper to escape attribute values. Use
     67    * {@link #xmlContentEscaper} if the output can appear in element content or
     68    * {@link #xmlAttributeEscaper} in attribute values.
     69    *
     70    * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
     71    * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
     72    * are not permitted in XML. For more detail see section <a
     73    * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
     74    * XML specification.
     75    *
     76    * <p>This escaper does not escape non-ASCII characters to their numeric
     77    * character references (NCR). Any non-ASCII characters appearing in the input
     78    * will be preserved in the output. Specifically "\r" (carriage return) is
     79    * preserved in the output, which may result in it being silently converted to
     80    * "\n" when the XML is parsed.
     81    *
     82    * <p>This escaper does not treat surrogate pairs specially and does not
     83    * perform Unicode validation on its input.
     84    */
     85   public static Escaper xmlContentEscaper() {
     86     return XML_CONTENT_ESCAPER;
     87   }
     88 
     89   /**
     90    * Returns an {@link Escaper} instance that escapes special characters in a
     91    * string so it can safely be included in XML document as an attribute value.
     92    * See section
     93    * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a>
     94    * of the XML specification.
     95    *
     96    * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
     97    * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
     98    * are not permitted in XML. For more detail see section <a
     99    * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
    100    * XML specification.
    101    *
    102    * <p>This escaper does not escape non-ASCII characters to their numeric
    103    * character references (NCR). However, horizontal tab {@code '\t'}, line feed
    104    * {@code '\n'} and carriage return {@code '\r'} are escaped to a
    105    * corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
    106    * respectively. Any other non-ASCII characters appearing in the input will
    107    * be preserved in the output.
    108    *
    109    * <p>This escaper does not treat surrogate pairs specially and does not
    110    * perform Unicode validation on its input.
    111    */
    112   public static Escaper xmlAttributeEscaper() {
    113     return XML_ATTRIBUTE_ESCAPER;
    114   }
    115 
    116   private static final Escaper XML_ESCAPER;
    117   private static final Escaper XML_CONTENT_ESCAPER;
    118   private static final Escaper XML_ATTRIBUTE_ESCAPER;
    119   static {
    120     Escapers.Builder builder = Escapers.builder();
    121     // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
    122     // (Unicode code points above \uFFFF are represented via surrogate pairs
    123     // which means they are treated as pairs of safe characters).
    124     builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
    125     // Unsafe characters are replaced with the Unicode replacement character.
    126     builder.setUnsafeReplacement("\uFFFD");
    127 
    128     /*
    129      * Except for \n, \t, and \r, all ASCII control characters are replaced with
    130      * the Unicode replacement character.
    131      *
    132      * Implementation note: An alternative to the following would be to make a
    133      * map that simply replaces the allowed ASCII whitespace characters with
    134      * themselves and to set the minimum safe character to 0x20. However this
    135      * would slow down the escaping of simple strings that contain \t, \n, or
    136      * \r.
    137      */
    138     for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
    139       if (c != '\t' && c != '\n' && c != '\r') {
    140         builder.addEscape(c, "\uFFFD");
    141       }
    142     }
    143 
    144     // Build the content escaper first and then add quote escaping for the
    145     // general escaper.
    146     builder.addEscape('&', "&amp;");
    147     builder.addEscape('<', "&lt;");
    148     builder.addEscape('>', "&gt;");
    149     XML_CONTENT_ESCAPER = builder.build();
    150     builder.addEscape('\'', "&apos;");
    151     builder.addEscape('"', "&quot;");
    152     XML_ESCAPER = builder.build();
    153     builder.addEscape('\t', "&#x9;");
    154     builder.addEscape('\n', "&#xA;");
    155     builder.addEscape('\r', "&#xD;");
    156     XML_ATTRIBUTE_ESCAPER = builder.build();
    157   }
    158 }
    159