// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
28 29 package org.owasp.html.examples; 30 31 import java.io.IOException; 32 import java.io.InputStreamReader; 33 import java.util.regex.Pattern; 34 35 import org.owasp.html.Handler; 36 import org.owasp.html.HtmlPolicyBuilder; 37 import org.owasp.html.HtmlSanitizer; 38 import org.owasp.html.HtmlStreamRenderer; 39 import org.owasp.html.PolicyFactory; 40 41 import com.google.common.base.Charsets; 42 import com.google.common.base.Predicate; 43 import com.google.common.base.Throwables; 44 import com.google.common.io.CharStreams; 45 46 /** 47 * Based on the 48 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>. 49 * <blockquote> 50 * eBay (http://www.ebay.com/) is the most popular online auction site in the 51 * universe, as far as I can tell. It is a public site so anyone is allowed to 52 * post listings with rich HTML content. It's not surprising that given the 53 * attractiveness of eBay as a target that it has been subject to a few complex 54 * XSS attacks. Listings are allowed to contain much more rich content than, 55 * say, Slashdot- so it's attack surface is considerably larger. The following 56 * tags appear to be accepted by eBay (they don't publish rules): 57 * {@code <a>},... 58 * </blockquote> 59 */ 60 public class EbayPolicyExample { 61 62 // Some common regular expression definitions. 
63 64 // The 16 colors defined by the HTML Spec (also used by the CSS Spec) 65 private static final Pattern COLOR_NAME = Pattern.compile( 66 "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple" 67 + "|red|silver|teal|white|yellow)"); 68 69 // HTML/CSS Spec allows 3 or 6 digit hex to specify color 70 private static final Pattern COLOR_CODE = Pattern.compile( 71 "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))"); 72 73 private static final Pattern NUMBER_OR_PERCENT = Pattern.compile( 74 "[0-9]+%?"); 75 private static final Pattern PARAGRAPH = Pattern.compile( 76 "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*"); 77 private static final Pattern HTML_ID = Pattern.compile( 78 "[a-zA-Z0-9\\:\\-_\\.]+"); 79 // force non-empty with a '+' at the end instead of '*' 80 private static final Pattern HTML_TITLE = Pattern.compile( 81 "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*"); 82 private static final Pattern HTML_CLASS = Pattern.compile( 83 "[a-zA-Z0-9\\s,\\-_]+"); 84 85 private static final Pattern ONSITE_URL = Pattern.compile( 86 "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)"); 87 private static final Pattern OFFSITE_URL = Pattern.compile( 88 "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]" 89 + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*+\\s*"); 90 91 private static final Pattern NUMBER = Pattern.compile( 92 "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)"); 93 94 private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+"); 95 96 private static final Pattern ALIGN = Pattern.compile( 97 "(?i)center|left|right|justify|char"); 98 99 private static final Pattern VALIGN = Pattern.compile( 100 "(?i)baseline|bottom|middle|top"); 101 102 private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE 103 = new Predicate<String>() { 104 public boolean apply(String s) { 105 return COLOR_NAME.matcher(s).matches() 106 || COLOR_CODE.matcher(s).matches(); 107 } 108 }; 109 110 private static final Predicate<String> 
ONSITE_OR_OFFSITE_URL 111 = new Predicate<String>() { 112 public boolean apply(String s) { 113 return ONSITE_URL.matcher(s).matches() 114 || OFFSITE_URL.matcher(s).matches(); 115 } 116 }; 117 118 private static final Pattern HISTORY_BACK = Pattern.compile( 119 "(?:javascript:)?\\Qhistory.go(-1)\\E"); 120 121 private static final Pattern ONE_CHAR = Pattern.compile( 122 ".?", Pattern.DOTALL); 123 124 125 126 public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder() 127 .allowAttributes("id").matching(HTML_ID).globally() 128 .allowAttributes("class").matching(HTML_CLASS).globally() 129 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}")) 130 .globally() 131 .allowAttributes("title").matching(HTML_TITLE).globally() 132 .allowStyling() 133 .allowAttributes("align").matching(ALIGN).onElements("p") 134 .allowAttributes("for").matching(HTML_ID).onElements("label") 135 .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE) 136 .onElements("font") 137 .allowAttributes("face") 138 .matching(Pattern.compile("[\\w;, \\-]+")) 139 .onElements("font") 140 .allowAttributes("size").matching(NUMBER).onElements("font") 141 .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL) 142 .onElements("a") 143 .allowStandardUrlProtocols() 144 .allowAttributes("nohref").onElements("a") 145 .allowAttributes("name").matching(NAME).onElements("a") 146 .allowAttributes( 147 "onfocus", "onblur", "onclick", "onmousedown", "onmouseup") 148 .matching(HISTORY_BACK).onElements("a") 149 .requireRelNofollowOnLinks() 150 .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL) 151 .onElements("img") 152 .allowAttributes("name").matching(NAME) 153 .onElements("img") 154 .allowAttributes("alt").matching(PARAGRAPH) 155 .onElements("img") 156 .allowAttributes("border", "hspace", "vspace").matching(NUMBER) 157 .onElements("img") 158 .allowAttributes("border", "cellpadding", "cellspacing") 159 .matching(NUMBER).onElements("table") 160 
.allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE) 161 .onElements("table") 162 .allowAttributes("background").matching(ONSITE_URL) 163 .onElements("table") 164 .allowAttributes("align").matching(ALIGN) 165 .onElements("table") 166 .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize")) 167 .onElements("table") 168 .allowAttributes("background").matching(ONSITE_URL) 169 .onElements("td", "th", "tr") 170 .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE) 171 .onElements("td", "th") 172 .allowAttributes("abbr").matching(PARAGRAPH) 173 .onElements("td", "th") 174 .allowAttributes("axis", "headers").matching(NAME) 175 .onElements("td", "th") 176 .allowAttributes("scope") 177 .matching(Pattern.compile("(?i)(?:row|col)(?:group)?")) 178 .onElements("td", "th") 179 .allowAttributes("nowrap") 180 .onElements("td", "th") 181 .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT) 182 .onElements("table", "td", "th", "tr", "img") 183 .allowAttributes("align").matching(ALIGN) 184 .onElements("thead", "tbody", "tfoot", "img", 185 "td", "th", "tr", "colgroup", "col") 186 .allowAttributes("valign").matching(VALIGN) 187 .onElements("thead", "tbody", "tfoot", 188 "td", "th", "tr", "colgroup", "col") 189 .allowAttributes("charoff").matching(NUMBER_OR_PERCENT) 190 .onElements("td", "th", "tr", "colgroup", "col", 191 "thead", "tbody", "tfoot") 192 .allowAttributes("char").matching(ONE_CHAR) 193 .onElements("td", "th", "tr", "colgroup", "col", 194 "thead", "tbody", "tfoot") 195 .allowAttributes("colspan", "rowspan").matching(NUMBER) 196 .onElements("td", "th") 197 .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT) 198 .onElements("colgroup", "col") 199 .allowElements( 200 "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6", 201 "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code", 202 "cite", "samp", "sub", "sup", "strike", "center", "blockquote", 203 "hr", "br", "col", "font", "map", "span", "div", "img", 
204 "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot", 205 "table", "td", "th", "tr", "colgroup", "fieldset", "legend") 206 .toFactory(); 207 208 public static void main(String[] args) throws IOException { 209 if (args.length != 0) { 210 System.err.println("Reads from STDIN and writes to STDOUT"); 211 System.exit(-1); 212 } 213 System.err.println("[Reading from STDIN]"); 214 // Fetch the HTML to sanitize. 215 String html = CharStreams.toString( 216 new InputStreamReader(System.in, Charsets.UTF_8)); 217 // Set up an output channel to receive the sanitized HTML. 218 HtmlStreamRenderer renderer = HtmlStreamRenderer.create( 219 System.out, 220 // Receives notifications on a failure to write to the output. 221 new Handler<IOException>() { 222 public void handle(IOException ex) { 223 Throwables.propagate(ex); // System.out suppresses IOExceptions 224 } 225 }, 226 // Our HTML parser is very lenient, but this receives notifications on 227 // truly bizarre inputs. 228 new Handler<String>() { 229 public void handle(String x) { 230 throw new AssertionError(x); 231 } 232 }); 233 // Use the policy defined above to sanitize the HTML. 234 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer)); 235 } 236 } 237