// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;

import java.util.regex.Pattern;

import junit.framework.TestCase;

import javax.annotation.Nullable;

import org.junit.Test;


/**
 * End-to-end tests that push HTML snippets through
 * {@link HtmlSanitizer#sanitize} using the fixed policy built in
 * {@link #sanitize(String)} and compare the rendered output with an
 * expected string.
 *
 * <p>NOTE(review): several literals below contain raw {@code <}, {@code &},
 * back-ticks or U+FFFD in places where the neighbouring comments talk about
 * "encoded" forms (see {@code testSupplementaryCodepointEncoding} and
 * {@code testInnerHTMLIE8}).  It looks as if HTML character references were
 * decoded somewhere before this copy was produced.  The literals are kept
 * exactly as found; confirm them against the upstream file before trusting
 * a failure or a pass.
 */
public class HtmlSanitizerTest extends TestCase {

  /**
   * Matches an ASCII letter that starts the value or follows whitespace,
   * capturing the letter so it can be re-emitted after a {@code p-} prefix.
   */
  private static final Pattern NAME_START =
      Pattern.compile("(?:^|\\s)([a-zA-Z])");

  /** Matches a run of one or more whitespace characters. */
  private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

  /** Empty and null inputs both render as the empty string. */
  @Test
  public static final void testEmpty() throws Exception {
    assertEquals("", sanitize(""));
    assertEquals("", sanitize(null));
  }

  /** Plain text with no markup passes through unchanged. */
  @Test
  public static final void testSimpleText() throws Exception {
    assertEquals("hello world", sanitize("hello world"));
  }

  @Test
  public static final void testEntities1() throws Exception {
    // NOTE(review): given the test name, both literals presumably held
    // entity-encoded angle brackets originally -- confirm against upstream.
    assertEquals("<hello world>", sanitize("<hello world>"));
  }

  /** Allowed elements are emitted as-is. */
  @Test
  public static final void testEntities2() throws Exception {
    assertEquals("<b>hello <i>world</i></b>",
                 sanitize("<b>hello <i>world</i></b>"));
  }

  /** Elements that are not in the policy's allow-list are dropped. */
  @Test
  public static final void testUnknownTagsRemoved() throws Exception {
    assertEquals("<b>hello <i>world</i></b>",
                 sanitize("<b>hello <bogus></bogus><i>world</i></b>"));
  }

  /** A script element is dropped. */
  @Test
  public static final void testUnsafeTagsRemoved() throws Exception {
    assertEquals("<b>hello <i>world</i></b>",
                 sanitize("<b>hello <i>world</i>"
                          + "<script src=foo.js></script></b>"));
  }

  /** An attribute outside the allow-list (here onclick) is dropped. */
  @Test
  public static final void testUnsafeAttributesRemoved() throws Exception {
    assertEquals(
        "<b>hello <i>world</i></b>",
        sanitize("<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>"));
  }

  @Test
  public static final void testCruftEscaped() throws Exception {
    assertEquals("<b>hello <i>world<</i></b> & tomorrow the universe",
                 sanitize(
                     "<b>hello <i>world<</i></b> & tomorrow the universe"));
  }

  @Test
  public static final void testTagCruftRemoved() throws Exception {
    assertEquals("<b id=\"p-foo\">hello <i>world<</i></b>",
                 sanitize("<b id=\"foo\" / -->hello <i>world<</i></b>"));
  }

  /** Every id and class token gets the policy's {@code p-} prefix. */
  @Test
  public static final void testIdsAndClassesPrefixed() throws Exception {
    assertEquals(
        "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">"
        + "hello <i>world<</i></b>",
        sanitize(
            "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>"));
  }

  @Test
  public static final void testSpecialCharsInAttributes() throws Exception {
    assertEquals(
        "<b title=\"a<b && c>b\">bar</b>",
        sanitize("<b title=\"a<b && c>b\">bar</b>"));
  }

  /** Elements left open at end of input are closed in the output. */
  @Test
  public static final void testUnclosedTags() throws Exception {
    assertEquals("<div id=\"p-foo\">Bar<br />Baz</div>",
                 sanitize("<div id=\"foo\">Bar<br>Baz"));
  }

  /** End tags with no matching open element are discarded. */
  @Test
  public static final void testUnopenedTags() throws Exception {
    assertEquals("Foo<b>Bar</b>Baz",
                 sanitize("Foo<b></select>Bar</b></b>Baz</select>"));
  }

  /** An end tag carrying attributes for a disallowed element yields nothing. */
  @Test
  public static final void testUnsafeEndTags() throws Exception {
    assertEquals(
        "",
        sanitize(
            "</meta http-equiv=\"refesh\""
            + " content=\"1;URL=http://evilgadget.com\">"));
  }

  /** An explicit end tag on an input element is folded into {@code <input />}. */
  @Test
  public static final void testEmptyEndTags() throws Exception {
    assertEquals("<input />", sanitize("<input></input>"));
  }

  /** Upper-case ONLOAD and the non-allowed src are both removed. */
  @Test
  public static final void testOnLoadStripped() throws Exception {
    assertEquals(
        "<img />",
        sanitize("<img src=http://foo.com/bar ONLOAD=alert(1)>"));
  }

  /** Attributes that appear on a closing tag never reach the output. */
  @Test
  public static final void testClosingTagParameters() throws Exception {
    assertEquals(
        "<p>Hello world</p>",
        sanitize("<p>Hello world</b style=\"width:expression(alert(1))\">"));
  }

  @Test
  public static final void testOptionalEndTags() throws Exception {
    // Should not be
    //     "<ol> <li>A</li> <li>B<li>C </li></li></ol>"
    // The difference is significant because in the first, the item contains no
    // space after 'A', but in the third, the item contains 'C' and a space.
    assertEquals(
        "<ol><li>A</li><li>B</li><li>C </li></ol>",
        sanitize("<ol> <li>A</li> <li>B<li>C </ol>"));
  }

  /** html, head, title and body wrappers disappear; allowed content remains. */
  @Test
  public static final void testFoldingOfHtmlAndBodyTags() throws Exception {
    assertEquals(
        "<p>P 1</p>",
        sanitize("<html><head><title>Foo</title></head>"
                 + "<body><p>P 1</p></body></html>"));
    assertEquals(
        "Hello",
        sanitize("<body bgcolor=\"blue\">Hello</body>"));
    assertEquals(
        "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>",
        sanitize(
            "<html>"
            + "<head>"
            + "<title>Blah</title>"
            + "<p>Foo</p>"
            + "</head>"
            + "<body>"
            + "<p>One"
            + "<p>Two</p>"
            + "Three"
            + "<p>Four</p>"
            + "</body>"
            + "</html>"));
  }

  /** Valueless attributes are expanded; empty values are kept as empty. */
  @Test
  public static final void testEmptyAndValuelessAttributes() throws Exception {
    assertEquals(
        "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />",
        sanitize("<input checked type=checkbox id=\"\" class=>"));
  }

  @Test
  public static final void testSgmlShortTags() throws Exception {
    // We make no attempt to correctly handle SGML short tags since they are
    // not implemented consistently across browsers, and have been removed from
    // HTML 5.
    //
    // According to http://www.w3.org/QA/2007/10/shorttags.html
    //      Shorttags - the odd side of HTML 4.01
    //      ...
    //      It uses an ill-known feature of SGML called shorthand markup, which
    //      was authorized in HTML up to HTML 4.01. But what used to be a "cool"
    //      feature for SGML experts becomes a liability in HTML, where the
    //      construct is more likely to appear as a typo than as a conscious
    //      choice.
    //
    //      All could be fine if this form typo-that-happens-to-be-legal was
    //      properly implemented in contemporary HTML user-agents. It is not.
    assertEquals("<p></p>", sanitize("<p/b/"));  // Short-tag discarded.
    assertEquals("<p></p>", sanitize("<p<b>"));  // Discard <b attribute
    assertEquals(
        // This behavior for short tags is not ideal, but it is safe.
        "<p href=\"/\">first part of the text</> second part</p>",
        sanitize("<p<a href=\"/\">first part of the text</> second part"));
  }

  /** A NUL inside an attribute value must not split the attribute. */
  @Test
  public static final void testNul() throws Exception {
    assertEquals(
        "<a title="
        + "\"harmless SCRIPT=javascript:alert(1) ignored=ignored\">"
        + "</a>",
        sanitize(
            "<A TITLE="
            + "\"harmless\0 SCRIPT=javascript:alert(1) ignored=ignored\">"
            ));
  }

  @Test
  public static final void testDigitsInAttrNames() throws Exception {
    // See bug 614 for details.
    assertEquals(
        "<div>Hello</div>",
        sanitize(
            "<div style1=\"expression(\'alert(1)\")\">Hello</div>"
            ));
  }

  @Test
  public static final void testSupplementaryCodepointEncoding()
      throws Exception {
    // �� is not appropriate.
    // 冬 is appropriate as is the unencoded form.
    //
    // NOTE(review): "as is the unencoded form" implies the two lines above
    // and the first/third segments of the literals below were numeric
    // character references originally; they appear decoded here -- confirm
    // against upstream.
    assertEquals(
        "冬 | 冬 | 冬",
        sanitize("冬 | \ud87e\udc1a | ��"));
  }

  /**
   * Feeds 20000 unclosed div open tags and checks that the output is capped:
   * between 50 and 1000 divs survive, rendered as n open tags followed by n
   * close tags.
   */
  @Test
  public static final void testDeeplyNestedTagsDoS() throws Exception {
    String sanitized = sanitize(stringRepeatedTimes("<div>", 20000));
    int n = sanitized.length() / "<div></div>".length();
    assertTrue("" + n, 50 <= n && n <= 1000);
    int middle = n * "<div>".length();
    // Expected value first so that JUnit's failure message reads correctly.
    assertEquals(stringRepeatedTimes("<div>", n),
                 sanitized.substring(0, middle));
    assertEquals(stringRepeatedTimes("</div>", n),
                 sanitized.substring(middle));
  }

  @Test
  public static final void testInnerHTMLIE8() throws Exception {
    // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
    // values.  Given
    //     <div attr="``foo=bar">
    // we encode ` but if JavaScript does:
    //    nodeA.innerHTML = nodeB.innerHTML;
    // and nodeB contains the DIV above, then IE8 will produce
    //     <div attr=``foo=bar>
    // as the value of nodeB.innerHTML and assign it to nodeA.
    // IE8's HTML parser treats `` as a blank attribute value and foo=bar
    // becomes a separate attribute.
    // Adding a space at the end of the attribute prevents this by forcing
    // IE8 to put double quotes around the attribute when computing
    // nodeB.innerHTML.
    //
    // NOTE(review): the comment says back-ticks are encoded, yet the expected
    // literal shows them raw -- presumably decoded in transit; confirm.
    assertEquals(
        "<div title=\"``onmouseover=alert(1337) \"></div>",
        sanitize("<div title=\"``onmouseover=alert(1337)\">"));
  }

  @Test
  public static final void testNabobsOfNegativism() throws Exception {
    // Treating <noscript> as raw-text gains us nothing security-wise.
    assertEquals("<noscript></noscript>",
                 sanitize("<noscript><evil></noscript>"));
    assertEquals("<noscript>I <b><3</b> Ponies</noscript>",
                 sanitize("<noscript>I <b><3</b> Ponies</noscript>"));
    assertEquals("<noscript>I <b><3</b> Ponies</noscript>",
                 sanitize("<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>"));
    assertEquals("<noframes>I <b><3</b> Ponies</noframes>",
                 sanitize("<noframes>I <b><3</b> Ponies</noframes><evil>"));
    assertEquals("<noembed>I <b><3</b> Ponies</noembed>",
                 sanitize("<noembed>I <b><3</b> Ponies</noembed><evil>"));
    assertEquals("<noxss>I <b><3</b> Ponies</noxss>",
                 sanitize("<noxss>I <b><3</b> Ponies</noxss><evil>"));
    assertEquals(
        "<noscript>I <b><3</b> Ponies</noscript>",
        sanitize("<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>"));
  }

  /** NUL characters in text are removed from the output. */
  @Test
  public static final void testNULs() throws Exception {
    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000</b>"));
    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000"));
    assertEquals("", sanitize("\u0000"));
    // NOTE(review): the two inputs below contain U+FFFD; presumably they were
    // numeric references to NUL before being decoded -- confirm.
    assertEquals("<b>Hello, </b>", sanitize("<b>Hello, �</b>"));
    assertEquals("", sanitize("�"));
  }

  @Test
  public static final void testQMarkMeta() throws Exception {
    assertEquals(
        "Hello, <b>World</b>!",
        sanitize(
            ""
            // An XML Prologue.
            // HTML5 treats it as ignorable content via the bogus comment state.
            + "<?xml version=\"1\" ?>"
            + "Hello, "
            // An XML Processing instruction.
            // HTML5 treats it as ignorable content via the bogus comment state.
            + "<?processing instruction?>"
            + "<b>World"
            // Appears in HTML copied from outlook.
            + "<?xml:namespace prefix = o ns = "
            + "\"urn:schemas-microsoft-com:office:office\" />"
            + "</b>!"));
  }

  /** Content nested inside an iframe element is dropped entirely. */
  @Test
  public static final void testScriptInIframe() throws Exception {
    assertEquals(
        "<iframe></iframe>",
        sanitize(
            "<iframe>\n"
            + " <script>alert(Hi)</script>\n"
            + "</iframe>"));
  }

  /**
   * Sanitizes {@code html} with the policy shared by every test in this
   * class and returns what the renderer wrote.
   *
   * <p>The policy allows a fixed list of elements and global attributes,
   * rewrites {@code id} and {@code class} values so each token that starts
   * with an ASCII letter is prefixed with {@code p-}, and keeps {@code img}
   * and {@code input} even when all their attributes were removed.  Any error
   * reported by the renderer fails the running test.
   *
   * @param html the markup to sanitize; may be null.
   * @return the sanitized markup accumulated by the renderer.
   */
  private static String sanitize(@Nullable String html) throws Exception {
    StringBuilder sb = new StringBuilder();
    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
        sb,
        new Handler<String>() {
          public void handle(String errorMessage) {
            fail(errorMessage);
          }
        });

    HtmlSanitizer.Policy policy = new HtmlPolicyBuilder()
        // Allow these tags.
       .allowElements(
           "a", "b", "br", "div", "i", "iframe", "img", "input", "li",
           "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss")
       // And these attributes.
       .allowAttributes(
           "dir", "checked", "class", "href", "id", "target", "title", "type")
       .globally()
       // Cleanup IDs and CLASSes and prefix them with p- to move to a separate
       // name-space.
       .allowAttributes("id", "class")
       .matching(
           new AttributePolicy() {
            public String apply(
                String elementName, String attributeName, String value) {
              // Prefix each letter-initial token, then normalize the
              // whitespace the first replacement introduced.
              String prefixed = NAME_START.matcher(value).replaceAll(" p-$1");
              return WHITESPACE_RUN.matcher(prefixed).replaceAll(" ").trim();
            }
           })
       .globally()
       // Don't throw out useless <img> and <input> elements to ease debugging.
       .allowWithoutAttributes("img", "input")
       .build(renderer);

    HtmlSanitizer.sanitize(html, policy);

    return sb.toString();
  }

  /**
   * Returns {@code s} concatenated with itself {@code n} times.
   *
   * @param s the string to repeat.
   * @param n the number of copies; callers in this class pass positive values.
   */
  private static final String stringRepeatedTimes(String s, int n) {
    StringBuilder sb = new StringBuilder(s.length() * n);
    while (--n >= 0) {
      sb.append(s);
    }
    return sb.toString();
  }
}