Home | History | Annotate | Download | only in html
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html;
     30 
     31 import junit.framework.TestCase;
     32 
     33 import java.util.Arrays;
     34 import java.util.List;
     35 
     36 import org.junit.Test;
     37 
     38 import com.google.common.base.Charsets;
     39 import com.google.common.collect.Lists;
     40 import com.google.common.io.Resources;
     41 
     42 public class HtmlLexerTest extends TestCase {
     43 
     44   @Test
     45   public final void testHtmlLexer() throws Exception {
     46     // Do the lexing.
     47     String input = Resources.toString(
     48         Resources.getResource(getClass(), "htmllexerinput1.html"),
     49         Charsets.UTF_8);
     50     StringBuilder actual = new StringBuilder();
     51     lex(input, actual);
     52 
     53     // Get the golden.
     54     String golden = Resources.toString(
     55         Resources.getResource(getClass(), "htmllexergolden1.txt"),
     56         Charsets.UTF_8);
     57 
     58     // Compare.
     59     assertEquals(golden, actual.toString());
     60   }
     61 
     62   @Test
     63   public static final void testEofInTag() throws Exception {
     64     assertTokens("<div", "TAGBEGIN: <div");
     65     assertTokens("</div", "TAGBEGIN: </div");
     66     assertTokens("<div\n", "TAGBEGIN: <div");
     67     assertTokens("</div\n", "TAGBEGIN: </div");
     68     assertTokens("<div", "TAGBEGIN: <div");
     69     assertTokens("</div", "TAGBEGIN: </div");
     70     assertTokens("<div\n", "TAGBEGIN: <div");
     71     assertTokens("</div\n", "TAGBEGIN: </div");
     72   }
     73 
     74   @Test
     75   public static final void testPartialTagInCData() throws Exception {
     76     assertTokens(
     77         "<script>w('</b')</script>",
     78         "TAGBEGIN: <script",
     79         "TAGEND: >",
     80         "UNESCAPED: w('</b')",
     81         "TAGBEGIN: </script",
     82         "TAGEND: >");
     83   }
     84 
     85   @Test
     86   public static final void testUrlEndingInSlashOutsideQuotes()
     87       throws Exception {
     88     assertTokens(
     89         "<a href=http://foo.com/>Clicky</a>",
     90         "TAGBEGIN: <a",
     91         "ATTRNAME: href",
     92         "ATTRVALUE: http://foo.com/",
     93         "TAGEND: >",
     94         "TEXT: Clicky",
     95         "TAGBEGIN: </a",
     96         "TAGEND: >");
     97   }
     98 
     99   @Test
    100   public static final void testShortTags() throws Exception {
    101     // See comments in html-sanitizer-test.js as to why we don't bother with
    102     // short tags.  In short, they are not in HTML5 and not implemented properly
    103     // in existing HTML4 clients.
    104     assertTokens(
    105         "<p<a href=\"/\">first part of the text</> second part",
    106         "TAGBEGIN: <p",
    107         "ATTRNAME: <a",
    108         "ATTRNAME: href",
    109         "ATTRVALUE: \"/\"",
    110         "TAGEND: >",
    111         "TEXT: first part of the text</> second part");
    112     assertTokens(
    113         "<p/b/",
    114         "TAGBEGIN: <p",
    115         "ATTRNAME: /",
    116         "ATTRNAME: b/");
    117     assertTokens(
    118         "<p<b>",
    119         "TAGBEGIN: <p",
    120         "ATTRNAME: <b",
    121         "TAGEND: >");
    122   }
    123 
    124   private static void lex(String input, Appendable out) throws Exception {
    125     HtmlLexer lexer = new HtmlLexer(input);
    126     int maxTypeLength = 0;
    127     for (HtmlTokenType t : HtmlTokenType.values()) {
    128       maxTypeLength = Math.max(maxTypeLength, t.name().length());
    129     }
    130 
    131     while (lexer.hasNext()) {
    132       HtmlToken t = lexer.next();
    133       // Do C style escaping of the token text so that each token in the golden
    134       // file can fit on one line.
    135       String escaped = input.substring(t.start, t.end)
    136           .replace("\\", "\\\\").replace("\n", "\\n");
    137       String type = t.type.toString();
    138       int nPadding = maxTypeLength - type.length();
    139       out.append(type);
    140       while (--nPadding >= 0) { out.append(' '); }
    141       out.append(" [").append(escaped).append("]  :  ")
    142           .append(String.valueOf(t.start)).append('-')
    143           .append(String.valueOf(t.end))
    144           .append("\n");
    145     }
    146   }
    147 
    148   private static void assertTokens(String markup, String... golden) {
    149     HtmlLexer lexer = new HtmlLexer(markup);
    150     List<String> actual = Lists.newArrayList();
    151     while (lexer.hasNext()) {
    152       HtmlToken t = lexer.next();
    153       actual.add(t.type + ": " + markup.substring(t.start, t.end));
    154     }
    155     assertEquals(Arrays.asList(golden), actual);
    156   }
    157 }
    158