Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import com.google.common.annotations.VisibleForTesting;
     32 import java.io.Closeable;
     33 import java.io.Flushable;
     34 import java.io.IOException;
     35 import java.util.Iterator;
     36 import java.util.List;
     37 import javax.annotation.WillCloseWhenClosed;
     38 import javax.annotation.concurrent.NotThreadSafe;
     39 
     40 /**
     41  * Given a series of HTML tokens, writes valid, normalized HTML to the output.
     42  * The output will have well-defined tag boundaries, but there may be orphaned
     43  * or missing close and open tags.
     44  * The result of two renderers can always be concatenated to produce a larger
     45  * snippet of HTML, but if the first was called with
     46  * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not
     47  * be interpreted as tags in the concatenated version.
     48  */
     49 @TCB
     50 @NotThreadSafe
     51 public class HtmlStreamRenderer implements HtmlStreamEventReceiver {
     52 
     53   private final Appendable output;
     54   private final Handler<? super IOException> ioExHandler;
     55   private final Handler<? super String> badHtmlHandler;
     56   private String lastTagOpened;
     57   private StringBuilder pendingUnescaped;
     58   private boolean open;
     59 
     60   /**
     61    * Factory.
     62    * @param output the buffer to which HTML is streamed.
     63    * @param ioExHandler called with any exception raised by output.
     64    * @param badHtmlHandler receives alerts when HTML cannot be rendered because
     65    *    there is not valid HTML tree that results from that series of calls.
     66    *    E.g. it is not possible to create an HTML {@code <style>} element whose
     67    *    textual content is {@code "</style>"}.
     68    */
     69   public static HtmlStreamRenderer create(
     70       @WillCloseWhenClosed Appendable output,
     71       Handler<? super IOException> ioExHandler,
     72       Handler<? super String> badHtmlHandler) {
     73     if (output instanceof Closeable) {
     74       return new CloseableHtmlStreamRenderer(
     75           output, ioExHandler, badHtmlHandler);
     76     } else {
     77       return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler);
     78     }
     79   }
     80 
     81   /**
     82    * Factory.
     83    * @param output the buffer to which HTML is streamed.
     84    * @param badHtmlHandler receives alerts when HTML cannot be rendered because
     85    *    there is not valid HTML tree that results from that series of calls.
     86    *    E.g. it is not possible to create an HTML {@code <style>} element whose
     87    *    textual content is {@code "</style>"}.
     88    */
     89   public static HtmlStreamRenderer create(
     90       StringBuilder output, Handler<? super String> badHtmlHandler) {
     91     // Propagate since StringBuilder should not throw IOExceptions.
     92     return create(output, Handler.PROPAGATE, badHtmlHandler);
     93   }
     94 
     95   private HtmlStreamRenderer(
     96       Appendable output, Handler<? super IOException> ioExHandler,
     97       Handler<? super String> badHtmlHandler) {
     98     this.output = output;
     99     this.ioExHandler = ioExHandler;
    100     this.badHtmlHandler = badHtmlHandler;
    101   }
    102 
    103   /**
    104    * Called when the series of calls make no sense.
    105    * May be overridden to throw an unchecked throwable, to log, or to take some
    106    * other action.
    107    *
    108    * @param message for human consumption.
    109    * @param identifier an HTML identifier associated with the message.
    110    */
    111   private final void error(String message, CharSequence identifier) {
    112     if (badHtmlHandler != Handler.DO_NOTHING) {   // Avoid string append.
    113       badHtmlHandler.handle(message + " : " + identifier);
    114     }
    115   }
    116 
    117   public final void openDocument() throws IllegalStateException {
    118     if (open) { throw new IllegalStateException(); }
    119     open = true;
    120   }
    121 
    122   public final void closeDocument() throws IllegalStateException {
    123     if (!open) { throw new IllegalStateException(); }
    124     if (pendingUnescaped != null) {
    125       closeTag(lastTagOpened);
    126     }
    127     open = false;
    128     if (output instanceof Flushable) {
    129       try {
    130         ((Flushable) output).flush();
    131       } catch (IOException ex) {
    132         ioExHandler.handle(ex);
    133       }
    134     }
    135   }
    136 
    137   public final boolean isDocumentOpen() {
    138     return open;
    139   }
    140 
    141   public final void openTag(String elementName, List<String> attrs) {
    142     try {
    143       writeOpenTag(elementName, attrs);
    144     } catch (IOException ex) {
    145       ioExHandler.handle(ex);
    146     }
    147   }
    148 
    149   private void writeOpenTag(String elementName, List<? extends String> attrs)
    150       throws IOException {
    151     if (!open) { throw new IllegalStateException(); }
    152     elementName = safeName(elementName);
    153     if (!isValidHtmlName(elementName)) {
    154       error("Invalid element name", elementName);
    155       return;
    156     }
    157     if (pendingUnescaped != null) {
    158       error("Tag content cannot appear inside CDATA element", elementName);
    159       return;
    160     }
    161 
    162     switch (HtmlTextEscapingMode.getModeForTag(elementName)) {
    163       case CDATA_SOMETIMES:
    164       case CDATA:
    165       case PLAIN_TEXT:
    166         lastTagOpened = elementName;
    167         pendingUnescaped = new StringBuilder();
    168         break;
    169       default:
    170     }
    171 
    172     output.append('<').append(elementName);
    173 
    174     for (Iterator<? extends String> attrIt = attrs.iterator();
    175          attrIt.hasNext();) {
    176       String name = attrIt.next();
    177       String value = attrIt.next();
    178       name = HtmlLexer.canonicalName(name);
    179       if (!isValidHtmlName(name)) {
    180         error("Invalid attr name", name);
    181         continue;
    182       }
    183       output.append(' ').append(name).append('=').append('"');
    184       Encoding.encodeHtmlOnto(value, output);
    185       if (value.indexOf('`') != -1) {
    186         // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
    187         // values.  Given
    188         //     <div attr="``foo=bar">
    189         // we encode &#96; but if JavaScript does:
    190         //    nodeA.innerHTML = nodeB.innerHTML;
    191         // and nodeB contains the DIV above, then IE8 will produce
    192         //     <div attr=``foo=bar>
    193         // as the value of nodeB.innerHTML and assign it to nodeA.
    194         // IE8's HTML parser treats `` as a blank attribute value and foo=bar
    195         // becomes a separate attribute.
    196         // Adding a space at the end of the attribute prevents this by forcing
    197         // IE8 to put double quotes around the attribute when computing
    198         // nodeB.innerHTML.
    199         output.append(' ');
    200       }
    201       output.append('"');
    202     }
    203 
    204     // Limit our output to the intersection of valid XML and valid HTML5 when
    205     // the output contains no special HTML5 elements like <title>, <script>, or
    206     // <textarea>.
    207     if (HtmlTextEscapingMode.isVoidElement(elementName)) {
    208       output.append(" /");
    209     }
    210 
    211     output.append('>');
    212   }
    213 
    214   public final void closeTag(String elementName) {
    215     try {
    216       writeCloseTag(safeName(elementName));
    217     } catch (IOException ex) {
    218       ioExHandler.handle(ex);
    219     }
    220   }
    221 
    222   private final void writeCloseTag(String elementName)
    223       throws IOException {
    224     if (!open) { throw new IllegalStateException(); }
    225     elementName = HtmlLexer.canonicalName(elementName);
    226     if (!isValidHtmlName(elementName)) {
    227       error("Invalid element name", elementName);
    228       return;
    229     }
    230 
    231     if (pendingUnescaped != null) {
    232       if (!lastTagOpened.equals(elementName)) {
    233         error("Tag content cannot appear inside CDATA element", elementName);
    234         return;
    235       } else {
    236         StringBuilder cdataContent = pendingUnescaped;
    237         pendingUnescaped = null;
    238         Encoding.stripBannedCodeunits(cdataContent);
    239         int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent);
    240         if (problemIndex == -1) {
    241           output.append(cdataContent);
    242         } else {
    243           error(
    244               "Invalid CDATA text content",
    245               cdataContent.subSequence(
    246                   problemIndex,
    247                   Math.min(problemIndex + 10, cdataContent.length())));
    248           // Still output the close tag.
    249         }
    250       }
    251       if ("plaintext".equals(elementName)) { return; }
    252     }
    253     output.append("</").append(elementName).append(">");
    254   }
    255 
    256   public final void text(String text) {
    257     try {
    258       writeText(text);
    259     } catch (IOException ex) {
    260       ioExHandler.handle(ex);
    261     }
    262   }
    263 
    264   private final void writeText(String text) throws IOException {
    265     if (!open) { throw new IllegalStateException(); }
    266     if (pendingUnescaped != null) {
    267       pendingUnescaped.append(text);
    268     } else {
    269       Encoding.encodeHtmlOnto(text, output);  // Works for RCDATA.
    270     }
    271   }
    272 
    273   private static int checkHtmlCdataCloseable(
    274       String localName, StringBuilder sb) {
    275     int escapingTextSpanStart = -1;
    276     for (int i = 0, n = sb.length(); i < n; ++i) {
    277       char ch = sb.charAt(i);
    278       switch (ch) {
    279         case '<':
    280           if (i + 3 < n
    281               && '!' == sb.charAt(i + 1)
    282               && '-' == sb.charAt(i + 2)
    283               && '-' == sb.charAt(i + 3)) {
    284             if (escapingTextSpanStart == -1) {
    285               escapingTextSpanStart = i;
    286             } else {
    287               return i;
    288             }
    289           } else if (i + 1 + localName.length() < n
    290                      && '/' == sb.charAt(i + 1)
    291                      && Strings.regionMatchesIgnoreCase(
    292                          sb, i + 2, localName, 0, localName.length())) {
    293             // A close tag contained in the content.
    294             if (escapingTextSpanStart < 0) {
    295               // We could try some recovery strategies here.
    296               // E.g. prepending "/<!--\n" to sb if "script".equals(localName)
    297               return i;
    298             }
    299             if (!"script".equals(localName)) {
    300               // Script tags are commonly included inside script tags.
    301               // <script><!--document.write('<script>f()</script>');--></script>
    302               // but this does not happen in other CDATA element types.
    303               // Actually allowing an end tag inside others is problematic.
    304               // Specifically,
    305               // <style><!--</style>-->/* foo */</style>
    306               // displays the text "/* foo */" on some browsers.
    307               return i;
    308             }
    309           }
    310           break;
    311         case '>':
    312           // From the HTML5 spec:
    313           //    The text in style, script, title, and textarea elements must not
    314           //    have an escaping text span start that is not followed by an
    315           //    escaping text span end.
    316           // We look left since the HTML 5 spec allows the escaping text span
    317           // end to share dashes with the start.
    318           if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
    319             if (escapingTextSpanStart < 0) { return i - 2; }
    320             escapingTextSpanStart = -1;
    321           }
    322           break;
    323         default:
    324           break;
    325       }
    326     }
    327     if (escapingTextSpanStart >= 0) {
    328       // We could try recovery strategies here.
    329       // E.g. appending "//-->" to the buffer if "script".equals(localName)
    330       return escapingTextSpanStart;
    331     }
    332     return -1;
    333   }
    334 
    335 
    336   @VisibleForTesting
    337   static boolean isValidHtmlName(String name) {
    338     int n = name.length();
    339     if (n == 0) { return false; }
    340     if (n > 128) { return false; }
    341     boolean isNamespaced = false;
    342     for (int i = 0; i < n; ++i) {
    343       char ch = name.charAt(i);
    344       switch (ch) {
    345         case ':':
    346           if (isNamespaced) { return false; }
    347           isNamespaced = true;
    348           if (i == 0 || i + 1 == n) { return false; }
    349           break;
    350         case '-':
    351           if (i == 0 || i + 1 == n) { return false; }
    352           break;
    353         default:
    354           if (ch <= '9') {
    355             if (i == 0 || ch < '0') { return false; }
    356           } else if ('A' <= ch && ch <= 'z') {
    357             if ('Z' < ch && ch < 'a') { return false; }
    358           } else {
    359             return false;
    360           }
    361           break;
    362       }
    363     }
    364     return true;
    365   }
    366 
    367   /**
    368    * Canonicalizes the element name and possibly substitutes an alternative
    369    * that has more consistent semantics.
    370    */
    371   static String safeName(String elementName) {
    372     elementName = HtmlLexer.canonicalName(elementName);
    373 
    374     // Substitute a reliably non-raw-text element for raw-text and
    375     // plain-text elements.
    376     switch (elementName.length()) {
    377       case 3:
    378         if ("xmp".equals(elementName)) { return "pre"; }
    379         break;
    380       case 7:
    381         if ("listing".equals(elementName)) { return "pre"; }
    382         break;
    383       case 9:
    384         if ("plaintext".equals(elementName)) { return "pre"; }
    385         break;
    386     }
    387     return elementName;
    388   }
    389 
    390   static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer
    391       implements Closeable {
    392     private final Closeable closeable;
    393 
    394     CloseableHtmlStreamRenderer(
    395         @WillCloseWhenClosed
    396         Appendable output, Handler<? super IOException> errorHandler,
    397         Handler<? super String> badHtmlHandler) {
    398       super(output, errorHandler, badHtmlHandler);
    399       this.closeable = (Closeable) output;
    400     }
    401 
    402     public void close() throws IOException {
    403       if (isDocumentOpen()) { closeDocument(); }
    404       closeable.close();
    405     }
    406   }
    407 }
    408