// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
28 29 package org.owasp.html.examples; 30 31 import java.io.IOException; 32 import java.io.InputStreamReader; 33 import java.util.regex.Pattern; 34 35 import org.owasp.html.Handler; 36 import org.owasp.html.HtmlPolicyBuilder; 37 import org.owasp.html.HtmlSanitizer; 38 import org.owasp.html.HtmlStreamEventReceiver; 39 import org.owasp.html.HtmlStreamRenderer; 40 41 import com.google.common.base.Charsets; 42 import com.google.common.base.Function; 43 import com.google.common.base.Throwables; 44 import com.google.common.io.CharStreams; 45 46 /** 47 * Based on the 48 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy Slashdot example</a>. 49 * <blockquote> 50 * Slashdot (http://www.slashdot.org/) is a techie news site that allows users 51 * to respond anonymously to news posts with very limited HTML markup. Now 52 * Slashdot is not only one of the coolest sites around, it's also one that's 53 * been subject to many different successful attacks. Even more unfortunate is 54 * the fact that most of the attacks led users to the infamous goatse.cx picture 55 * (please don't go look it up). The rules for Slashdot are fairly strict: users 56 * can only submit the following HTML tags and no CSS: {@code <b>}, {@code <u>}, 57 * {@code <i>}, {@code <a>}, {@code <blockquote>}. 58 * <br> 59 * Accordingly, we've built a policy file that allows fairly similar 60 * functionality. All text-formatting tags that operate directly on the font, 61 * color or emphasis have been allowed. 62 * </blockquote> 63 */ 64 public class SlashdotPolicyExample { 65 66 /** A policy definition that matches the minimal HTML that Slashdot allows. */ 67 public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> 68 POLICY_DEFINITION = new HtmlPolicyBuilder() 69 .allowStandardUrlProtocols() 70 // Allow title="..." on any element. 71 .allowAttributes("title").globally() 72 // Allow href="..." on <a> elements. 
73 .allowAttributes("href").onElements("a") 74 // Defeat link spammers. 75 .requireRelNofollowOnLinks() 76 // Allow lang= with an alphabetic value on any element. 77 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}")) 78 .globally() 79 // The align attribute on <p> elements can have any value below. 80 .allowAttributes("align") 81 .matching(true, "center", "left", "right", "justify", "char") 82 .onElements("p") 83 // These elements are allowed. 84 .allowElements( 85 "a", "p", "div", "i", "b", "em", "blockquote", "tt", "strong", 86 "br", "ul", "ol", "li") 87 // Custom slashdot tags. 88 // These could be rewritten in the sanitizer using an ElementPolicy. 89 .allowElements("quote", "ecode") 90 .toFactory(); 91 92 public static void main(String[] args) throws IOException { 93 if (args.length != 0) { 94 System.err.println("Reads from STDIN and writes to STDOUT"); 95 System.exit(-1); 96 } 97 System.err.println("[Reading from STDIN]"); 98 // Fetch the HTML to sanitize. 99 String html = CharStreams.toString( 100 new InputStreamReader(System.in, Charsets.UTF_8)); 101 // Set up an output channel to receive the sanitized HTML. 102 HtmlStreamRenderer renderer = HtmlStreamRenderer.create( 103 System.out, 104 // Receives notifications on a failure to write to the output. 105 new Handler<IOException>() { 106 public void handle(IOException ex) { 107 Throwables.propagate(ex); // System.out suppresses IOExceptions 108 } 109 }, 110 // Our HTML parser is very lenient, but this receives notifications on 111 // truly bizarre inputs. 112 new Handler<String>() { 113 public void handle(String x) { 114 throw new AssertionError(x); 115 } 116 }); 117 // Use the policy defined above to sanitize the HTML. 118 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer)); 119 } 120 } 121