Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2012, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import java.io.IOException;
     32 
     33 import com.google.common.annotations.VisibleForTesting;
     34 
     35 /** Encoders and decoders for HTML. */
     36 final class Encoding {
     37 
     38   /**
     39    * Decodes HTML entities to produce a string containing only valid
     40    * Unicode scalar values.
     41    */
     42   @VisibleForTesting
     43   static String decodeHtml(String s) {
     44     int firstAmp = s.indexOf('&');
     45     int safeLimit = longestPrefixOfGoodCodeunits(s);
     46     if ((firstAmp & safeLimit) < 0) { return s; }
     47 
     48     StringBuilder sb;
     49     {
     50       int n = s.length();
     51       sb = new StringBuilder(n);
     52       int pos = 0;
     53       int amp = firstAmp;
     54       while (amp >= 0) {
     55         long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n);
     56         int end = (int) (endAndCodepoint >>> 32);
     57         int codepoint = (int) endAndCodepoint;
     58         sb.append(s, pos, amp).appendCodePoint(codepoint);
     59         pos = end;
     60         amp = s.indexOf('&', end);
     61       }
     62       sb.append(s, pos, n);
     63     }
     64 
     65     stripBannedCodeunits(
     66         sb,
     67         firstAmp < 0
     68           ? safeLimit : safeLimit < 0
     69           ? firstAmp : Math.min(firstAmp, safeLimit));
     70 
     71     return sb.toString();
     72   }
     73 
     74   /**
     75    * Returns the portion of its input that consists of XML safe chars.
     76    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
     77    */
     78   @TCB
     79   static String stripBannedCodeunits(String s) {
     80     int safeLimit = longestPrefixOfGoodCodeunits(s);
     81     if (safeLimit < 0) { return s; }
     82 
     83     StringBuilder sb = new StringBuilder(s);
     84     stripBannedCodeunits(sb, safeLimit);
     85     return sb.toString();
     86   }
     87 
     88   /**
     89    * Leaves in the input buffer only code-units that comprise XML safe chars.
     90    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
     91    */
     92   @TCB
     93   static void stripBannedCodeunits(StringBuilder sb) {
     94     stripBannedCodeunits(sb, 0);
     95   }
     96 
     97   @TCB
     98   private static void stripBannedCodeunits(StringBuilder sb, int start) {
     99     int k = start;
    100     for (int i = start, n = sb.length(); i < n; ++i) {
    101       char ch = sb.charAt(i);
    102       if (ch < 0x20) {
    103         if (IS_BANNED_ASCII[ch]) {
    104           continue;
    105         }
    106       } else if (0xd800 <= ch) {
    107         if (ch <= 0xdfff) {
    108           if (i+1 < n) {
    109             char next = sb.charAt(i+1);
    110             if (Character.isSurrogatePair(ch, next)) {
    111               sb.setCharAt(k++, ch);
    112               sb.setCharAt(k++, next);
    113               ++i;
    114             }
    115           }
    116           continue;
    117         } else if ((ch & 0xfffe) == 0xfffe) {
    118           continue;
    119         }
    120       }
    121       sb.setCharAt(k++, ch);
    122     }
    123     sb.setLength(k);
    124   }
    125 
    126   /**
    127    * The number of code-units at the front of s that form code-points in the
    128    * XML Character production.
    129    * @return -1 if all of s is in the XML Character production.
    130    */
    131   @TCB
    132   private static int longestPrefixOfGoodCodeunits(String s) {
    133     int n = s.length(), i;
    134     for (i = 0; i < n; ++i) {
    135       char ch = s.charAt(i);
    136       if (ch < 0x20) {
    137         if (IS_BANNED_ASCII[ch]) {
    138           return i;
    139         }
    140       } else if (0xd800 <= ch) {
    141         if (ch <= 0xdfff) {
    142           if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
    143             ++i;  // Skip over low surrogate since we know it's ok.
    144           } else {
    145             return i;
    146           }
    147         } else if ((ch & 0xfffe) == 0xfffe) {
    148           return i;
    149         }
    150       }
    151     }
    152     return -1;
    153   }
    154 
    155   /**
    156    * Writes the HTML equivalent of the given plain text to output.
    157    * For example, {@code escapeHtmlOnto("1 < 2", w)},
    158    * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer
    159    * smaller appends.
    160    * Elides code-units that are not valid XML Characters.
    161    * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a>
    162    */
    163   @TCB
    164   static void encodeHtmlOnto(String plainText, Appendable output)
    165       throws IOException {
    166     int n = plainText.length();
    167     int pos = 0;
    168     for (int i = 0; i < n; ++i) {
    169       char ch = plainText.charAt(i);
    170       if (ch < REPLACEMENTS.length) {
    171         String repl = REPLACEMENTS[ch];
    172         if (repl != null) {
    173           output.append(plainText, pos, i).append(repl);
    174           pos = i + 1;
    175         }
    176       } else if (((char) 0xd800) <= ch) {
    177         if (ch <= ((char) 0xdfff)) {
    178           char next;
    179           if (i + 1 < n
    180               && Character.isSurrogatePair(
    181                   ch, next = plainText.charAt(i + 1))) {
    182             // Emit supplemental codepoints as entity so that they cannot
    183             // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
    184             // and get involved in UTF-16/UCS-2 confusion.
    185             int codepoint = Character.toCodePoint(ch, next);
    186             output.append(plainText, pos, i);
    187             appendNumericEntity(codepoint, output);
    188             ++i;
    189             pos = i + 1;
    190           } else {
    191             output.append(plainText, pos, i);
    192             // Elide the orphaned surrogate.
    193             pos = i + 1;
    194           }
    195         } else if (0xff00 <= ch) {
    196           output.append(plainText, pos, i);
    197           pos = i + 1;
    198           // Is a control character or possible full-width version of a
    199           // special character.
    200           if ((ch & 0xfffe) == 0xfffe) {
    201             // Elide since not an the XML Character.
    202           } else {
    203             appendNumericEntity(ch, output);
    204           }
    205         }
    206       }
    207     }
    208     output.append(plainText, pos, n);
    209   }
    210 
    211   @TCB
    212   static void appendNumericEntity(int codepoint, Appendable output)
    213       throws IOException {
    214     if (codepoint < 100) {
    215       // TODO: is this dead code due to REPLACEMENTS above.
    216       output.append("&#");
    217       if (codepoint < 10) {
    218         output.append((char) ('0' + codepoint));
    219       } else {
    220         output.append((char) ('0' + (codepoint / 10)));
    221         output.append((char) ('0' + (codepoint % 10)));
    222       }
    223       output.append(";");
    224     } else {
    225       int nDigits = (codepoint < 0x1000
    226                      ? codepoint < 0x100 ? 2 : 3
    227                      : (codepoint < 0x10000 ? 4
    228                         : codepoint < 0x100000 ? 5 : 6));
    229       output.append("&#x");
    230       for (int digit = nDigits; --digit >= 0;) {
    231         int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
    232         output.append(HEX_NUMERAL[hexDigit]);
    233       }
    234       output.append(";");
    235     }
    236   }
    237 
    238   private static final char[] HEX_NUMERAL = {
    239    '0', '1', '2', '3', '4', '5', '6', '7',
    240    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    241   };
    242 
    243   /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
    244   static final String[] REPLACEMENTS = new String[0x61];
    245   static {
    246     for (int i = 0; i < ' '; ++i) {
    247       // We elide control characters so that we can ensure that our output is
    248       // in the intersection of valid HTML5 and XML.  According to
    249       // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets
    250       // Char      ::=          #x9 | #xA | #xD | [#x20-#xD7FF]
    251       //             |          [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    252       if (i != '\t' && i != '\n' && i != '\r') {
    253         REPLACEMENTS[i] = "";  // Elide
    254       }
    255     }
    256     // "&#34;" is shorter than "&quot;"
    257     REPLACEMENTS['"']  = "&#" + ((int) '"')  + ";";  // Attribute delimiter.
    258     REPLACEMENTS['&']  = "&amp;";                    // HTML special.
    259     // We don't use &apos; since that is not in the intersection of HTML&XML.
    260     REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";  // Attribute delimiter.
    261     REPLACEMENTS['+']  = "&#" + ((int) '+')  + ";";  // UTF-7 special.
    262     REPLACEMENTS['<']  = "&lt;";                     // HTML special.
    263     REPLACEMENTS['=']  = "&#" + ((int) '=')  + ";";  // Special in attributes.
    264     REPLACEMENTS['>']  = "&gt;";                     // HTML special.
    265     REPLACEMENTS['@']  = "&#" + ((int) '@')  + ";";  // Conditional compilation.
    266     REPLACEMENTS['`']  = "&#" + ((int) '`')  + ";";  // Attribute delimiter.
    267   }
    268 
    269   /**
    270    * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in
    271    * an HTML5 text node or properly quoted attribute value.
    272    */
    273   private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
    274   static {
    275     for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
    276       IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
    277     }
    278   }
    279 
    280 }
    281