Home | History | Annotate | Download | only in examples
      1 // Copyright (c) 2011, Mike Samuel
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions
      6 // are met:
      7 //
      8 // Redistributions of source code must retain the above copyright
      9 // notice, this list of conditions and the following disclaimer.
     10 // Redistributions in binary form must reproduce the above copyright
     11 // notice, this list of conditions and the following disclaimer in the
     12 // documentation and/or other materials provided with the distribution.
     13 // Neither the name of the OWASP nor the names of its contributors may
     14 // be used to endorse or promote products derived from this software
     15 // without specific prior written permission.
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27 // POSSIBILITY OF SUCH DAMAGE.
     28 
     29 package org.owasp.html.examples;
     30 
     31 import java.io.IOException;
     32 import java.io.InputStreamReader;
     33 import java.util.regex.Pattern;
     34 
     35 import org.owasp.html.Handler;
     36 import org.owasp.html.HtmlPolicyBuilder;
     37 import org.owasp.html.HtmlSanitizer;
     38 import org.owasp.html.HtmlStreamRenderer;
     39 import org.owasp.html.PolicyFactory;
     40 
     41 import com.google.common.base.Charsets;
     42 import com.google.common.base.Predicate;
     43 import com.google.common.base.Throwables;
     44 import com.google.common.io.CharStreams;
     45 
     46 /**
     47  * Based on the
     48  * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
     49  * <blockquote>
     50  * eBay (http://www.ebay.com/) is the most popular online auction site in the
     51  * universe, as far as I can tell. It is a public site so anyone is allowed to
     52  * post listings with rich HTML content. It's not surprising that given the
     53  * attractiveness of eBay as a target that it has been subject to a few complex
     54  * XSS attacks. Listings are allowed to contain much more rich content than,
     55  * say, Slashdot- so it's attack surface is considerably larger. The following
     56  * tags appear to be accepted by eBay (they don't publish rules):
     57  * {@code <a>},...
     58  * </blockquote>
     59  */
     60 public class EbayPolicyExample {
     61 
     62   // Some common regular expression definitions.
     63 
     64   // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
     65   private static final Pattern COLOR_NAME = Pattern.compile(
     66       "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
     67       + "|red|silver|teal|white|yellow)");
     68 
     69   // HTML/CSS Spec allows 3 or 6 digit hex to specify color
     70   private static final Pattern COLOR_CODE = Pattern.compile(
     71       "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");
     72 
     73   private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
     74       "[0-9]+%?");
     75   private static final Pattern PARAGRAPH = Pattern.compile(
     76       "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
     77   private static final Pattern HTML_ID = Pattern.compile(
     78       "[a-zA-Z0-9\\:\\-_\\.]+");
     79   // force non-empty with a '+' at the end instead of '*'
     80   private static final Pattern HTML_TITLE = Pattern.compile(
     81       "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
     82   private static final Pattern HTML_CLASS = Pattern.compile(
     83       "[a-zA-Z0-9\\s,\\-_]+");
     84 
     85   private static final Pattern ONSITE_URL = Pattern.compile(
     86       "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
     87   private static final Pattern OFFSITE_URL = Pattern.compile(
     88       "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
     89       + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*+\\s*");
     90 
     91   private static final Pattern NUMBER = Pattern.compile(
     92       "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");
     93 
     94   private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");
     95 
     96   private static final Pattern ALIGN = Pattern.compile(
     97       "(?i)center|left|right|justify|char");
     98 
     99   private static final Pattern VALIGN = Pattern.compile(
    100       "(?i)baseline|bottom|middle|top");
    101 
    102   private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
    103       = new Predicate<String>() {
    104         public boolean apply(String s) {
    105           return COLOR_NAME.matcher(s).matches()
    106               || COLOR_CODE.matcher(s).matches();
    107         }
    108       };
    109 
    110   private static final Predicate<String> ONSITE_OR_OFFSITE_URL
    111       = new Predicate<String>() {
    112         public boolean apply(String s) {
    113           return ONSITE_URL.matcher(s).matches()
    114               || OFFSITE_URL.matcher(s).matches();
    115         }
    116       };
    117 
    118   private static final Pattern HISTORY_BACK = Pattern.compile(
    119       "(?:javascript:)?\\Qhistory.go(-1)\\E");
    120 
    121   private static final Pattern ONE_CHAR = Pattern.compile(
    122       ".?", Pattern.DOTALL);
    123 
    124 
    125 
    126   public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
    127           .allowAttributes("id").matching(HTML_ID).globally()
    128           .allowAttributes("class").matching(HTML_CLASS).globally()
    129           .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
    130               .globally()
    131           .allowAttributes("title").matching(HTML_TITLE).globally()
    132           .allowStyling()
    133           .allowAttributes("align").matching(ALIGN).onElements("p")
    134           .allowAttributes("for").matching(HTML_ID).onElements("label")
    135           .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
    136               .onElements("font")
    137           .allowAttributes("face")
    138               .matching(Pattern.compile("[\\w;, \\-]+"))
    139               .onElements("font")
    140           .allowAttributes("size").matching(NUMBER).onElements("font")
    141           .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
    142               .onElements("a")
    143           .allowStandardUrlProtocols()
    144           .allowAttributes("nohref").onElements("a")
    145           .allowAttributes("name").matching(NAME).onElements("a")
    146           .allowAttributes(
    147               "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
    148               .matching(HISTORY_BACK).onElements("a")
    149           .requireRelNofollowOnLinks()
    150           .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
    151               .onElements("img")
    152           .allowAttributes("name").matching(NAME)
    153               .onElements("img")
    154           .allowAttributes("alt").matching(PARAGRAPH)
    155               .onElements("img")
    156           .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
    157               .onElements("img")
    158           .allowAttributes("border", "cellpadding", "cellspacing")
    159               .matching(NUMBER).onElements("table")
    160           .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
    161               .onElements("table")
    162           .allowAttributes("background").matching(ONSITE_URL)
    163               .onElements("table")
    164           .allowAttributes("align").matching(ALIGN)
    165               .onElements("table")
    166           .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
    167               .onElements("table")
    168           .allowAttributes("background").matching(ONSITE_URL)
    169               .onElements("td", "th", "tr")
    170           .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
    171               .onElements("td", "th")
    172           .allowAttributes("abbr").matching(PARAGRAPH)
    173               .onElements("td", "th")
    174           .allowAttributes("axis", "headers").matching(NAME)
    175               .onElements("td", "th")
    176           .allowAttributes("scope")
    177               .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
    178               .onElements("td", "th")
    179           .allowAttributes("nowrap")
    180               .onElements("td", "th")
    181           .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
    182               .onElements("table", "td", "th", "tr", "img")
    183           .allowAttributes("align").matching(ALIGN)
    184               .onElements("thead", "tbody", "tfoot", "img",
    185                                "td", "th", "tr", "colgroup", "col")
    186           .allowAttributes("valign").matching(VALIGN)
    187               .onElements("thead", "tbody", "tfoot",
    188                               "td", "th", "tr", "colgroup", "col")
    189           .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
    190               .onElements("td", "th", "tr", "colgroup", "col",
    191                               "thead", "tbody", "tfoot")
    192           .allowAttributes("char").matching(ONE_CHAR)
    193               .onElements("td", "th", "tr", "colgroup", "col",
    194                                "thead", "tbody", "tfoot")
    195           .allowAttributes("colspan", "rowspan").matching(NUMBER)
    196               .onElements("td", "th")
    197           .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
    198               .onElements("colgroup", "col")
    199           .allowElements(
    200               "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
    201               "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
    202               "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
    203               "hr", "br", "col", "font", "map", "span", "div", "img",
    204               "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
    205               "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
    206           .toFactory();
    207 
    208   public static void main(String[] args) throws IOException {
    209     if (args.length != 0) {
    210       System.err.println("Reads from STDIN and writes to STDOUT");
    211       System.exit(-1);
    212     }
    213     System.err.println("[Reading from STDIN]");
    214     // Fetch the HTML to sanitize.
    215     String html = CharStreams.toString(
    216         new InputStreamReader(System.in, Charsets.UTF_8));
    217     // Set up an output channel to receive the sanitized HTML.
    218     HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
    219         System.out,
    220         // Receives notifications on a failure to write to the output.
    221         new Handler<IOException>() {
    222           public void handle(IOException ex) {
    223             Throwables.propagate(ex);  // System.out suppresses IOExceptions
    224           }
    225         },
    226         // Our HTML parser is very lenient, but this receives notifications on
    227         // truly bizarre inputs.
    228         new Handler<String>() {
    229           public void handle(String x) {
    230             throw new AssertionError(x);
    231           }
    232         });
    233     // Use the policy defined above to sanitize the HTML.
    234     HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
    235   }
    236 }
    237