// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;

import junit.framework.TestCase;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;

/**
 * Tests for {@link HtmlLexer}: one golden-file comparison of a full token
 * dump, plus focused checks of the token sequence produced for truncated or
 * unusual markup.
 */
public class HtmlLexerTest extends TestCase {

  /**
   * Lexes the {@code htmllexerinput1.html} resource and compares the
   * formatted token dump produced by {@link #lex} with the contents of the
   * {@code htmllexergolden1.txt} resource.
   */
  @Test
  public final void testHtmlLexer() throws Exception {
    // Do the lexing.
    String input = Resources.toString(
        Resources.getResource(getClass(), "htmllexerinput1.html"),
        Charsets.UTF_8);
    StringBuilder actual = new StringBuilder();
    lex(input, actual);

    // Get the golden.
    String golden = Resources.toString(
        Resources.getResource(getClass(), "htmllexergolden1.txt"),
        Charsets.UTF_8);

    // Compare.
    assertEquals(golden, actual.toString());
  }

  /**
   * Input that ends inside an open or close tag, with or without a trailing
   * newline, is expected to yield a single TAGBEGIN token whose text is the
   * tag start without the newline.
   */
  @Test
  public static final void testEofInTag() throws Exception {
    assertTokens("<div", "TAGBEGIN: <div");
    assertTokens("</div", "TAGBEGIN: </div");
    assertTokens("<div\n", "TAGBEGIN: <div");
    assertTokens("</div\n", "TAGBEGIN: </div");
  }

  /**
   * A partial close tag ({@code </b}) inside a script body is expected to
   * stay inside the UNESCAPED token rather than start a new tag.
   */
  @Test
  public static final void testPartialTagInCData() throws Exception {
    assertTokens(
        "<script>w('</b')</script>",
        "TAGBEGIN: <script",
        "TAGEND: >",
        "UNESCAPED: w('</b')",
        "TAGBEGIN: </script",
        "TAGEND: >");
  }

  /**
   * The trailing slash of an unquoted attribute value is expected to be part
   * of the ATTRVALUE token, with the tag closed by a plain {@code >}.
   */
  @Test
  public static final void testUrlEndingInSlashOutsideQuotes()
      throws Exception {
    assertTokens(
        "<a href=http://foo.com/>Clicky</a>",
        "TAGBEGIN: <a",
        "ATTRNAME: href",
        "ATTRVALUE: http://foo.com/",
        "TAGEND: >",
        "TEXT: Clicky",
        "TAGBEGIN: </a",
        "TAGEND: >");
  }

  /**
   * Pins the token sequences expected for SGML-style short tags, which the
   * lexer does not treat specially.
   */
  @Test
  public static final void testShortTags() throws Exception {
    // See comments in html-sanitizer-test.js as to why we don't bother with
    // short tags.  In short, they are not in HTML5 and not implemented properly
    // in existing HTML4 clients.
    assertTokens(
        "<p<a href=\"/\">first part of the text</> second part",
        "TAGBEGIN: <p",
        "ATTRNAME: <a",
        "ATTRNAME: href",
        "ATTRVALUE: \"/\"",
        "TAGEND: >",
        "TEXT: first part of the text</> second part");
    assertTokens(
        "<p/b/",
        "TAGBEGIN: <p",
        "ATTRNAME: /",
        "ATTRNAME: b/");
    assertTokens(
        "<p<b>",
        "TAGBEGIN: <p",
        "ATTRNAME: <b",
        "TAGEND: >");
  }

  /**
   * Lexes {@code input} and writes one line per token to {@code out} in the
   * form {@code TYPE [text] : start-end}, with the type name right-padded
   * with spaces to the length of the longest {@link HtmlTokenType} name.
   *
   * @param input the markup to lex.
   * @param out receives the formatted token dump.
   * @throws Exception declared broadly; the {@link Appendable} calls can
   *     throw {@link java.io.IOException}.
   */
  private static void lex(String input, Appendable out) throws Exception {
    HtmlLexer lexer = new HtmlLexer(input);

    // Width of the type column: the longest token type name.
    int maxTypeLength = 0;
    for (HtmlTokenType t : HtmlTokenType.values()) {
      maxTypeLength = Math.max(maxTypeLength, t.name().length());
    }

    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      // Do C style escaping of the token text so that each token in the golden
      // file can fit on one line.
      String escaped = input.substring(t.start, t.end)
          .replace("\\", "\\\\").replace("\n", "\\n");
      String type = t.type.toString();
      out.append(type);
      for (int pad = maxTypeLength - type.length(); pad > 0; --pad) {
        out.append(' ');
      }
      out.append(" [").append(escaped).append("] : ")
          .append(String.valueOf(t.start)).append('-')
          .append(String.valueOf(t.end))
          .append("\n");
    }
  }

  /**
   * Lexes {@code markup} and asserts that the resulting tokens, each rendered
   * as {@code TYPE: text}, equal {@code golden} in order.
   *
   * @param markup the markup to lex.
   * @param golden the expected {@code TYPE: text} strings, one per token.
   */
  private static void assertTokens(String markup, String... golden) {
    HtmlLexer lexer = new HtmlLexer(markup);
    List<String> actual = Lists.newArrayList();
    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      actual.add(t.type + ": " + markup.substring(t.start, t.end));
    }
    assertEquals(Arrays.asList(golden), actual);
  }
}