Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import junit.framework.TestCase;
     32 
     33 import javax.annotation.Nullable;
     34 
     35 import org.junit.Test;
     36 
     37 
     38 public class HtmlSanitizerTest extends TestCase {
     39 
     40   @Test
     41   public static final void testEmpty() throws Exception {
     42     assertEquals("", sanitize(""));
     43     assertEquals("", sanitize(null));
     44   }
     45 
     46   @Test
     47   public static final void testSimpleText() throws Exception {
     48     assertEquals("hello world", sanitize("hello world"));
     49   }
     50 
     51   @Test
     52   public static final void testEntities1() throws Exception {
     53     assertEquals("<hello world>", sanitize("<hello world>"));
     54   }
     55 
     56   @Test
     57   public static final void testEntities2() throws Exception {
     58     assertEquals("<b>hello <i>world</i></b>",
     59                  sanitize("<b>hello <i>world</i></b>"));
     60   }
     61 
     62   @Test
     63   public static final void testUnknownTagsRemoved() throws Exception {
     64     assertEquals("<b>hello <i>world</i></b>",
     65                  sanitize("<b>hello <bogus></bogus><i>world</i></b>"));
     66   }
     67 
     68   @Test
     69   public static final void testUnsafeTagsRemoved() throws Exception {
     70     assertEquals("<b>hello <i>world</i></b>",
     71                  sanitize("<b>hello <i>world</i>"
     72                           + "<script src=foo.js></script></b>"));
     73   }
     74 
     75   @Test
     76   public static final void testUnsafeAttributesRemoved() throws Exception {
     77     assertEquals(
     78         "<b>hello <i>world</i></b>",
     79         sanitize("<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>"));
     80   }
     81 
     82   @Test
     83   public static final void testCruftEscaped() throws Exception {
     84     assertEquals("<b>hello <i>world&lt;</i></b> &amp; tomorrow the universe",
     85                  sanitize(
     86                      "<b>hello <i>world<</i></b> & tomorrow the universe"));
     87   }
     88 
     89   @Test
     90   public static final void testTagCruftRemoved() throws Exception {
     91     assertEquals("<b id=\"p-foo\">hello <i>world&lt;</i></b>",
     92                  sanitize("<b id=\"foo\" / -->hello <i>world<</i></b>"));
     93   }
     94 
     95   @Test
     96   public static final void testIdsAndClassesPrefixed() throws Exception {
     97     assertEquals(
     98         "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">"
     99         + "hello <i>world&lt;</i></b>",
    100         sanitize(
    101             "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>"));
    102   }
    103 
    104   @Test
    105   public static final void testSpecialCharsInAttributes() throws Exception {
    106     assertEquals(
    107         "<b title=\"a&lt;b &amp;&amp; c&gt;b\">bar</b>",
    108         sanitize("<b title=\"a<b && c>b\">bar</b>"));
    109   }
    110 
    111   @Test
    112   public static final void testUnclosedTags() throws Exception {
    113     assertEquals("<div id=\"p-foo\">Bar<br />Baz</div>",
    114                  sanitize("<div id=\"foo\">Bar<br>Baz"));
    115   }
    116 
    117   @Test
    118   public static final void testUnopenedTags() throws Exception {
    119     assertEquals("Foo<b>Bar</b>Baz",
    120                  sanitize("Foo<b></select>Bar</b></b>Baz</select>"));
    121   }
    122 
    123   @Test
    124   public static final void testUnsafeEndTags() throws Exception {
    125     assertEquals(
    126         "",
    127         sanitize(
    128             "</meta http-equiv=\"refesh\""
    129             + " content=\"1;URL=http://evilgadget.com\">"));
    130   }
    131 
    132   @Test
    133   public static final void testEmptyEndTags() throws Exception {
    134     assertEquals("<input />", sanitize("<input></input>"));
    135   }
    136 
    137   @Test
    138   public static final void testOnLoadStripped() throws Exception {
    139     assertEquals(
    140         "<img />",
    141         sanitize("<img src=http://foo.com/bar ONLOAD=alert(1)>"));
    142   }
    143 
    144   @Test
    145   public static final void testClosingTagParameters() throws Exception {
    146     assertEquals(
    147         "<p>Hello world</p>",
    148         sanitize("<p>Hello world</b style=\"width:expression(alert(1))\">"));
    149   }
    150 
    151   @Test
    152   public static final void testOptionalEndTags() throws Exception {
    153     // Should not be
    154     //     "<ol> <li>A</li> <li>B<li>C </li></li></ol>"
    155     // The difference is significant because in the first, the item contains no
    156     // space after 'A", but in the third, the item contains 'C' and a space.
    157     assertEquals(
    158         "<ol><li>A</li><li>B</li><li>C </li></ol>",
    159         sanitize("<ol> <li>A</li> <li>B<li>C </ol>"));
    160   }
    161 
    162   @Test
    163   public static final void testFoldingOfHtmlAndBodyTags() throws Exception {
    164     assertEquals(
    165         "<p>P 1</p>",
    166         sanitize("<html><head><title>Foo</title></head>"
    167                  + "<body><p>P 1</p></body></html>"));
    168     assertEquals(
    169         "Hello",
    170         sanitize("<body bgcolor=\"blue\">Hello</body>"));
    171     assertEquals(
    172         "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>",
    173         sanitize(
    174             "<html>"
    175             + "<head>"
    176             + "<title>Blah</title>"
    177             + "<p>Foo</p>"
    178             + "</head>"
    179             + "<body>"
    180             + "<p>One"
    181             + "<p>Two</p>"
    182             + "Three"
    183             + "<p>Four</p>"
    184             + "</body>"
    185             + "</html>"));
    186   }
    187 
    188   @Test
    189   public static final void testEmptyAndValuelessAttributes() throws Exception {
    190     assertEquals(
    191         "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />",
    192         sanitize("<input checked type=checkbox id=\"\" class=>"));
    193   }
    194 
    195   @Test
    196   public static final void testSgmlShortTags() throws Exception {
    197     // We make no attempt to correctly handle SGML short tags since they are
    198     // not implemented consistently across browsers, and have been removed from
    199     // HTML 5.
    200     //
    201     // According to http://www.w3.org/QA/2007/10/shorttags.html
    202     //      Shorttags - the odd side of HTML 4.01
    203     //      ...
    204     //      It uses an ill-known feature of SGML called shorthand markup, which
    205     //      was authorized in HTML up to HTML 4.01. But what used to be a "cool"
    206     //      feature for SGML experts becomes a liability in HTML, where the
    207     //      construct is more likely to appear as a typo than as a conscious
    208     //      choice.
    209     //
    210     //      All could be fine if this form typo-that-happens-to-be-legal was
    211     //      properly implemented in contemporary HTML user-agents. It is not.
    212     assertEquals("<p></p>", sanitize("<p/b/"));  // Short-tag discarded.
    213     assertEquals("<p></p>", sanitize("<p<b>"));  // Discard <b attribute
    214     assertEquals(
    215         // This behavior for short tags is not ideal, but it is safe.
    216         "<p href=\"/\">first part of the text&lt;/&gt; second part</p>",
    217         sanitize("<p<a href=\"/\">first part of the text</> second part"));
    218   }
    219 
    220   @Test
    221   public static final void testNul() throws Exception {
    222     assertEquals(
    223         "<a title="
    224         + "\"harmless  SCRIPT&#61;javascript:alert(1) ignored&#61;ignored\">"
    225         + "</a>",
    226         sanitize(
    227             "<A TITLE="
    228             + "\"harmless\0  SCRIPT=javascript:alert(1) ignored=ignored\">"
    229             ));
    230   }
    231 
    232   @Test
    233   public static final void testDigitsInAttrNames() throws Exception {
    234     // See bug 614 for details.
    235     assertEquals(
    236         "<div>Hello</div>",
    237         sanitize(
    238             "<div style1=\"expression(\'alert(1)\")\">Hello</div>"
    239             ));
    240   }
    241 
    242   @Test
    243   public static final void testSupplementaryCodepointEncoding()
    244       throws Exception {
    245     // &#xd87e;&#xdc1a; is not appropriate.
    246     // &#x2f81a; is appropriate as is the unencoded form.
    247     assertEquals(
    248         "&#x2f81a; | &#x2f81a; | &#x2f81a;",
    249         sanitize("&#x2F81A; | \ud87e\udc1a | &#xd87e;&#xdc1a;"));
    250   }
    251 
    252   @Test
    253   public static final void testDeeplyNestedTagsDoS() throws Exception {
    254     String sanitized = sanitize(stringRepeatedTimes("<div>", 20000));
    255     int n = sanitized.length() / "<div></div>".length();
    256     assertTrue("" + n, 50 <= n && n <= 1000);
    257     int middle = n * "<div>".length();
    258     assertEquals(sanitized.substring(0, middle),
    259                  stringRepeatedTimes("<div>", n));
    260     assertEquals(sanitized.substring(middle),
    261                  stringRepeatedTimes("</div>", n));
    262   }
    263 
    264   @Test
    265   public static final void testInnerHTMLIE8() throws Exception {
    266     // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
    267     // values.  Given
    268     //     <div attr="``foo=bar">
    269     // we encode &#96; but if JavaScript does:
    270     //    nodeA.innerHTML = nodeB.innerHTML;
    271     // and nodeB contains the DIV above, then IE8 will produce
    272     //     <div attr=``foo=bar>
    273     // as the value of nodeB.innerHTML and assign it to nodeA.
    274     // IE8's HTML parser treats `` as a blank attribute value and foo=bar
    275     // becomes a separate attribute.
    276     // Adding a space at the end of the attribute prevents this by forcing
    277     // IE8 to put double quotes around the attribute when computing
    278     // nodeB.innerHTML.
    279     assertEquals(
    280         "<div title=\"&#96;&#96;onmouseover&#61;alert(1337) \"></div>",
    281         sanitize("<div title=\"``onmouseover=alert(1337)\">"));
    282   }
    283 
    284   @Test
    285   public static final void testNabobsOfNegativism() throws Exception {
    286     // Treating <noscript> as raw-text gains us nothing security-wise.
    287     assertEquals("<noscript></noscript>",
    288                  sanitize("<noscript><evil></noscript>"));
    289     assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
    290                  sanitize("<noscript>I <b><3</b> Ponies</noscript>"));
    291     assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
    292                  sanitize("<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>"));
    293     assertEquals("<noframes>I <b>&lt;3</b> Ponies</noframes>",
    294                  sanitize("<noframes>I <b><3</b> Ponies</noframes><evil>"));
    295     assertEquals("<noembed>I <b>&lt;3</b> Ponies</noembed>",
    296                  sanitize("<noembed>I <b><3</b> Ponies</noembed><evil>"));
    297     assertEquals("<noxss>I <b>&lt;3</b> Ponies</noxss>",
    298                  sanitize("<noxss>I <b><3</b> Ponies</noxss><evil>"));
    299     assertEquals(
    300         "&lt;noscript&gt;I &lt;b&gt;&lt;3&lt;/b&gt; Ponies&lt;/noscript&gt;",
    301         sanitize("<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>"));
    302   }
    303 
    304   @Test
    305   public static final void testNULs() throws Exception {
    306     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000</b>"));
    307     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000"));
    308     assertEquals("",               sanitize("\u0000"));
    309     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, &#0;</b>"));
    310     assertEquals("",               sanitize("&#0;"));
    311   }
    312 
    313   @Test
    314   public static final void testQMarkMeta() throws Exception {
    315     assertEquals(
    316         "Hello, <b>World</b>!",
    317         sanitize(
    318             ""
    319             // An XML Prologue.
    320             // HTML5 treats it as ignorable content via the bogus comment state.
    321             + "<?xml version=\"1\" ?>"
    322             + "Hello, "
    323             // An XML Processing instruction.
    324             // HTML5 treats it as ignorable content via the bogus comment state.
    325             + "<?processing instruction?>"
    326             + "<b>World"
    327             // Appears in HTML copied from outlook.
    328             + "<?xml:namespace prefix = o ns = "
    329             + "\"urn:schemas-microsoft-com:office:office\" />"
    330             + "</b>!"));
    331   }
    332 
    333   @Test
    334   public static final void testScriptInIframe() throws Exception {
    335     assertEquals(
    336         "<iframe></iframe>",
    337         sanitize(
    338             "<iframe>\n"
    339             + "  <script>alert(Hi)</script>\n"
    340             + "</iframe>"));
    341   }
    342 
    343   private static String sanitize(@Nullable String html) throws Exception {
    344     StringBuilder sb = new StringBuilder();
    345     HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
    346         sb,
    347         new Handler<String>() {
    348           public void handle(String errorMessage) {
    349             fail(errorMessage);
    350           }
    351         });
    352 
    353     HtmlSanitizer.Policy policy = new HtmlPolicyBuilder()
    354         // Allow these tags.
    355        .allowElements(
    356            "a", "b", "br", "div", "i", "iframe", "img", "input", "li",
    357            "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss")
    358        // And these attributes.
    359        .allowAttributes(
    360            "dir", "checked", "class", "href", "id", "target", "title", "type")
    361        .globally()
    362        // Cleanup IDs and CLASSes and prefix them with p- to move to a separate
    363        // name-space.
    364        .allowAttributes("id", "class")
    365        .matching(
    366            new AttributePolicy() {
    367             public String apply(
    368                 String elementName, String attributeName, String value) {
    369               return value.replaceAll("(?:^|\\s)([a-zA-Z])", " p-$1")
    370                   .replaceAll("\\s+", " ")
    371                   .trim();
    372             }
    373            })
    374        .globally()
    375        // Don't throw out useless <img> and <input> elements to ease debugging.
    376        .allowWithoutAttributes("img", "input")
    377        .build(renderer);
    378 
    379     HtmlSanitizer.sanitize(html, policy);
    380 
    381     return sb.toString();
    382   }
    383 
    384   private static final String stringRepeatedTimes(String s, int n) {
    385     StringBuilder sb = new StringBuilder(s.length() * n);
    386     while (--n >= 0) {
    387       sb.append(s);
    388     }
    389     return sb.toString();
    390   }
    391 }
    392