Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 package com.android.mail.utils;
     17 
     18 import android.os.Looper;
     19 import android.util.Log;
     20 
     21 import com.android.mail.perf.Timer;
     22 import com.google.common.collect.ImmutableList;
     23 import com.google.common.collect.ImmutableSet;
     24 
     25 import org.owasp.html.AttributePolicy;
     26 import org.owasp.html.CssSchema;
     27 import org.owasp.html.ElementPolicy;
     28 import org.owasp.html.FilterUrlByProtocolAttributePolicy;
     29 import org.owasp.html.Handler;
     30 import org.owasp.html.HtmlPolicyBuilder;
     31 import org.owasp.html.HtmlStreamRenderer;
     32 import org.owasp.html.PolicyFactory;
     33 
     34 import java.util.List;
     35 
     36 /**
     37  * This sanitizer is meant to strip all scripts and any malicious HTML from untrusted emails. It
     38  * uses the <a href="https://www.owasp.org/index.php/OWASP_Java_HTML_Sanitizer_Project">OWASP Java
     39  * HTML Sanitizer Project</a> to whitelist the subset of HTML elements and attributes as well as CSS
     40  * properties that are considered safe. Any unmatched HTML or CSS is discarded.
     41  *
     42  * All URLS are scrubbed to ensure they match the blessed form of "http://the.url.here",
     43  * "https://the.url.here" or "mailto:address (at) server.com" and cannot resemble "javascript:badness()"
     44  * or comparable.
     45  */
     46 public final class HtmlSanitizer {
     47 
     48     /**
     49      * This version number should be bumped each time a meaningful change is made to this sanitizer
     50      * configuration which influences its output. It is compared against a minimum target version
     51      * number. If it meets or exceeds the minimum target version, the result of the sanitizer is
     52      * free to be shown in a standard webview. If it does not meet the minimum target version then
     53      * the sanitized output is deemed untrustworthy and is shown in a sandboxed webview with
     54      * javascript execution disabled.
     55      */
     56     public static final int VERSION = 1;
     57 
     58     private static final String LOG_TAG = LogTag.getLogTag();
     59 
     60     /**
     61      * The following CSS properties do not appear in the default whitelist from OWASP, but they
     62      * improve the fidelity of the HTML display without unacceptable risk.
     63      */
     64     private static final CssSchema ADDITIONAL_CSS = CssSchema.withProperties(ImmutableSet.of(
     65             "float",
     66             "display"
     67     ));
     68 
     69     /**
     70      * Translates the body tag into the div tag
     71      */
     72     private static final ElementPolicy TRANSLATE_BODY_TO_DIV = new ElementPolicy() {
     73         public String apply(String elementName, List<String> attrs) {
     74             return "div";
     75         }
     76     };
     77 
     78     /**
     79      * Translates <div> tags surrounding quoted text into <div class="elided-text"> which allows
     80      * quoted text collapsing in ConversationViewFragment.
     81      */
     82     private static final ElementPolicy TRANSLATE_DIV_CLASS = new ElementPolicy() {
     83         public String apply(String elementName, List<String> attrs) {
     84             boolean showHideQuotedText = false;
     85 
     86             // check if the class attribute is listed
     87             final int classIndex = attrs.indexOf("class");
     88             if (classIndex >= 0) {
     89                 // remove the class attribute and its value
     90                 final String value = attrs.remove(classIndex + 1);
     91                 attrs.remove(classIndex);
     92 
     93                 // gmail and yahoo use a specific div class name to indicate quoted text
     94                 showHideQuotedText = "gmail_quote".equals(value) || "yahoo_quoted".equals(value);
     95             }
     96 
     97             // check if the id attribute is listed
     98             final int idIndex = attrs.indexOf("id");
     99             if (idIndex >= 0) {
    100                 // remove the id attribute and its value
    101                 final String value = attrs.remove(idIndex + 1);
    102                 attrs.remove(idIndex);
    103 
    104                 // AOL uses a specific id value to indicate quoted text
    105                 showHideQuotedText = value.startsWith("AOLMsgPart");
    106             }
    107 
    108             // insert a class attribute with a value of "elided-text" to hide/show quoted text
    109             if (showHideQuotedText) {
    110                 attrs.add("class");
    111                 attrs.add("elided-text");
    112             }
    113 
    114             return "div";
    115         }
    116     };
    117 
    118     /**
    119      * Disallow "cid:" and "mailto:" urls on all tags not &lt;a&gt; or &lt;img&gt;.
    120      */
    121     private static final AttributePolicy URL_PROTOCOLS =
    122             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("http", "https"));
    123 
    124     /**
    125      * Disallow the "cid:" url on links. Do allow "mailto:" urls to support sending mail.
    126      */
    127     private static final AttributePolicy A_HREF_PROTOCOLS =
    128             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("mailto", "http", "https"));
    129 
    130     /**
    131      * Disallow the "mailto:" url on images so that "Show pictures" can't be used to start composing
    132      * a bajillion emails. Do allow "cid:" urls to support inline image attachments.
    133      */
    134     private static final AttributePolicy IMG_SRC_PROTOCOLS =
    135             new FilterUrlByProtocolAttributePolicy(ImmutableList.of("cid", "http", "https"));
    136 
    137     /**
    138      * This sanitizer policy removes these elements and the content within:
    139      * <ul>
    140      *     <li>APPLET</li>
    141      *     <li>FRAMESET</li>
    142      *     <li>OBJECT</li>
    143      *     <li>SCRIPT</li>
    144      *     <li>STYLE</li>
    145      *     <li>TITLE</li>
    146      * </ul>
    147      *
    148      * This sanitizer policy removes these elements but preserves the content within:
    149      * <ul>
    150      *     <li>BASEFONT</li>
    151      *     <li>FRAME</li>
    152      *     <li>HEAD</li>
    153      *     <li>IFRAME</li>
    154      *     <li>ISINDEX</li>
    155      *     <li>LINK</li>
    156      *     <li>META</li>
    157      *     <li>NOFRAMES</li>
    158      *     <li>PARAM</li>
    159      *     <li>NOSCRIPT</li>
    160      * </ul>
    161      *
    162      * This sanitizer policy removes these attributes from all elements:
    163      * <ul>
    164      *     <li>code</li>
    165      *     <li>codebase</li>
    166      *     <li>id</li>
    167      *     <li>for</li>
    168      *     <li>headers</li>
    169      *     <li>onblur</li>
    170      *     <li>onchange</li>
    171      *     <li>onclick</li>
    172      *     <li>ondblclick</li>
    173      *     <li>onfocus</li>
    174      *     <li>onkeydown</li>
    175      *     <li>onkeypress</li>
    176      *     <li>onkeyup</li>
    177      *     <li>onload</li>
    178      *     <li>onmousedown</li>
    179      *     <li>onmousemove</li>
    180      *     <li>onmouseout</li>
    181      *     <li>onmouseover</li>
    182      *     <li>onmouseup</li>
    183      *     <li>onreset</li>
    184      *     <li>onselect</li>
    185      *     <li>onsubmit</li>
    186      *     <li>onunload</li>
    187      *     <li>tabindex</li>
    188      * </ul>
    189      */
    190     private static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
    191             .allowAttributes("dir").matching(true, "ltr", "rtl").globally()
    192             .allowUrlProtocols("cid", "http", "https", "mailto")
    193             .allowStyling(CssSchema.union(CssSchema.DEFAULT, ADDITIONAL_CSS))
    194             .disallowTextIn("applet", "frameset", "object", "script", "style", "title")
    195             .allowElements("a")
    196                 .allowAttributes("coords", "name", "shape").onElements("a")
    197                 .allowAttributes("href").matching(A_HREF_PROTOCOLS).onElements("a")
    198             .allowElements("abbr").allowAttributes("title").onElements("abbr")
    199             .allowElements("acronym").allowAttributes("title").onElements("acronym")
    200             .allowElements("address")
    201             .allowElements("area")
    202                 .allowAttributes("alt", "coords", "nohref", "name", "shape").onElements("area")
    203                 .allowAttributes("href").matching(URL_PROTOCOLS).onElements("area")
    204             .allowElements("article")
    205             .allowElements("aside")
    206             .allowElements("b")
    207             .allowElements("base")
    208                 .allowAttributes("href").matching(URL_PROTOCOLS).onElements("base")
    209             .allowElements("bdi").allowAttributes("dir").onElements("bdi")
    210             .allowElements("bdo").allowAttributes("dir").onElements("bdo")
    211             .allowElements("big")
    212             .allowElements("blockquote").allowAttributes("cite").onElements("blockquote")
    213             .allowElements(TRANSLATE_BODY_TO_DIV, "body")
    214             .allowElements("br").allowAttributes("clear").onElements("br")
    215             .allowElements("button")
    216                 .allowAttributes("autofocus", "disabled", "form", "formaction", "formenctype",
    217                         "formmethod", "formnovalidate", "formtarget", "name", "type", "value")
    218             .onElements("button")
    219             .allowElements("canvas").allowAttributes("width", "height").onElements("canvas")
    220             .allowElements("caption").allowAttributes("align").onElements("caption")
    221             .allowElements("center")
    222             .allowElements("cite")
    223             .allowElements("code")
    224             .allowElements("col")
    225                 .allowAttributes("align", "bgcolor", "char", "charoff", "span", "valign", "width")
    226             .onElements("col")
    227             .allowElements("colgroup")
    228                 .allowAttributes("align", "char", "charoff", "span", "valign", "width")
    229             .onElements("colgroup")
    230             .allowElements("datalist")
    231             .allowElements("dd")
    232             .allowElements("del").allowAttributes("cite", "datetime").onElements("del")
    233             .allowElements("details")
    234             .allowElements("dfn")
    235             .allowElements("dir").allowAttributes("compact").onElements("dir")
    236             .allowElements(TRANSLATE_DIV_CLASS, "div")
    237                 .allowAttributes("align", "background", "class", "id")
    238             .onElements("div")
    239             .allowElements("dl")
    240             .allowElements("dt")
    241             .allowElements("em")
    242             .allowElements("fieldset")
    243                 .allowAttributes("disabled", "form", "name")
    244             .onElements("fieldset")
    245             .allowElements("figcaption")
    246             .allowElements("figure")
    247             .allowElements("font").allowAttributes("color", "face", "size").onElements("font")
    248             .allowElements("footer")
    249             .allowElements("form")
    250                 .allowAttributes("accept", "action", "accept-charset", "autocomplete", "enctype",
    251                         "method", "name", "novalidate", "target")
    252             .onElements("form")
    253             .allowElements("header")
    254             .allowElements("h1").allowAttributes("align").onElements("h1")
    255             .allowElements("h2").allowAttributes("align").onElements("h2")
    256             .allowElements("h3").allowAttributes("align").onElements("h3")
    257             .allowElements("h4").allowAttributes("align").onElements("h4")
    258             .allowElements("h5").allowAttributes("align").onElements("h5")
    259             .allowElements("h6").allowAttributes("align").onElements("h6")
    260             .allowElements("hr")
    261                 .allowAttributes("align", "noshade", "size", "width")
    262             .onElements("hr")
    263             .allowElements("i")
    264             .allowElements("img")
    265                 .allowAttributes("src").matching(IMG_SRC_PROTOCOLS).onElements("img")
    266                 .allowAttributes("longdesc").matching(URL_PROTOCOLS).onElements("img")
    267                 .allowAttributes("align", "alt", "border", "crossorigin", "height", "hspace",
    268                         "ismap", "usemap", "vspace", "width")
    269             .onElements("img")
    270             .allowElements("input")
    271                 .allowAttributes("src").matching(URL_PROTOCOLS).onElements("input")
    272                 .allowAttributes("formaction").matching(URL_PROTOCOLS).onElements("input")
    273                 .allowAttributes("accept", "align", "alt", "autocomplete", "autofocus", "checked",
    274                         "disabled", "form", "formenctype", "formmethod", "formnovalidate",
    275                         "formtarget", "height", "list", "max", "maxlength", "min", "multiple",
    276                         "name", "pattern", "placeholder", "readonly", "required", "size", "step",
    277                         "type", "value", "width")
    278             .onElements("input")
    279             .allowElements("ins")
    280                 .allowAttributes("cite").matching(URL_PROTOCOLS).onElements("ins")
    281                 .allowAttributes("datetime").onElements("ins")
    282             .allowElements("kbd")
    283             .allowElements("keygen")
    284                 .allowAttributes("autofocus", "challenge", "disabled", "form", "keytype", "name")
    285             .onElements("keygen")
    286             .allowElements("label").allowAttributes("form").onElements("label")
    287             .allowElements("legend").allowAttributes("align").onElements("legend")
    288             .allowElements("li").allowAttributes("type", "value").onElements("li")
    289             .allowElements("main")
    290             .allowElements("map").allowAttributes("name").onElements("map")
    291             .allowElements("mark")
    292             .allowElements("menu").allowAttributes("label", "type").onElements("menu")
    293             .allowElements("menuitem")
    294                 .allowAttributes("icon").matching(URL_PROTOCOLS).onElements("menuitem")
    295                 .allowAttributes("checked", "command", "default", "disabled", "label", "type",
    296                         "radiogroup").onElements("menuitem")
    297             .allowElements("meter")
    298                 .allowAttributes("form", "high", "low", "max", "min", "optimum", "value")
    299             .onElements("meter")
    300             .allowElements("nav")
    301             .allowElements("ol")
    302                 .allowAttributes("compact", "reversed", "start", "type")
    303             .onElements("ol")
    304             .allowElements("optgroup").allowAttributes("disabled", "label").onElements("optgroup")
    305             .allowElements("option")
    306                 .allowAttributes("disabled", "label", "selected", "value")
    307             .onElements("option")
    308             .allowElements("output").allowAttributes("form", "name").onElements("output")
    309             .allowElements("p").allowAttributes("align").onElements("p")
    310             .allowElements("pre").allowAttributes("width").onElements("pre")
    311             .allowElements("progress").allowAttributes("max", "value").onElements("progress")
    312             .allowElements("q").allowAttributes("cite").matching(URL_PROTOCOLS).onElements("q")
    313             .allowElements("rp")
    314             .allowElements("rt")
    315             .allowElements("ruby")
    316             .allowElements("s")
    317             .allowElements("samp")
    318             .allowElements("section")
    319             .allowElements("select")
    320                 .allowAttributes("autofocus", "disabled", "form", "multiple", "name", "required",
    321                         "size")
    322             .onElements("select")
    323             .allowElements("small")
    324             .allowElements("span")
    325             .allowElements("strike")
    326             .allowElements("strong")
    327             .allowElements("sub")
    328             .allowElements("summary")
    329             .allowElements("sup")
    330             .allowElements("table")
    331                 .allowAttributes("align", "bgcolor", "border", "cellpadding", "cellspacing",
    332                         "frame", "rules", "sortable", "summary", "width")
    333             .onElements("table")
    334             .allowElements("tbody")
    335                 .allowAttributes("align", "char", "charoff", "valign").onElements("tbody")
    336             .allowElements("td")
    337                 .allowAttributes("abbr", "align", "axis", "bgcolor", "char", "charoff", "colspan",
    338                         "height", "nowrap", "rowspan", "scope", "valign", "width")
    339             .onElements("td")
    340             .allowElements("textarea")
    341                 .allowAttributes("autofocus", "cols", "disabled", "form", "maxlength", "name",
    342                         "placeholder", "readonly", "required", "rows", "wrap")
    343             .onElements("textarea")
    344             .allowElements("tfoot")
    345                 .allowAttributes("align", "char", "charoff", "valign").onElements("tfoot")
    346             .allowElements("th")
    347                 .allowAttributes("abbr", "align", "axis", "bgcolor", "char", "charoff", "colspan",
    348                         "height", "nowrap", "rowspan", "scope", "sorted", "valign", "width")
    349             .onElements("th")
    350             .allowElements("thead")
    351                 .allowAttributes("align", "char", "charoff", "valign").onElements("thead")
    352             .allowElements("time").allowAttributes("datetime").onElements("time")
    353             .allowElements("tr")
    354                 .allowAttributes("align", "bgcolor", "char", "charoff", "valign").onElements("tr")
    355             .allowElements("tt")
    356             .allowElements("u")
    357             .allowElements("ul").allowAttributes("compact", "type").onElements("ul")
    358             .allowElements("var")
    359             .allowElements("wbr")
    360             .toFactory();
    361 
    362     private HtmlSanitizer() {}
    363 
    364     /**
    365      * Sanitizing email is treated as an expensive operation; this method should be called from
    366      * a background Thread.
    367      *
    368      * @param rawHtml the unsanitized, suspicious html
    369      * @return the sanitized form of the <code>rawHtml</code>; <code>null</code> if
    370      *      <code>rawHtml</code> was <code>null</code>
    371      */
    372     public static String sanitizeHtml(final String rawHtml) {
    373         if (Looper.getMainLooper() == Looper.myLooper()) {
    374             throw new IllegalStateException("sanitizing email should not occur on the main thread");
    375         }
    376 
    377         if (rawHtml == null) {
    378             return null;
    379         }
    380 
    381         // create the builder into which the sanitized email will be written
    382         final StringBuilder htmlBuilder = new StringBuilder(rawHtml.length());
    383 
    384         // create the renderer that will write the sanitized HTML to the builder
    385         final HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
    386                 htmlBuilder,
    387                 Handler.PROPAGATE,
    388                 // log errors resulting from exceptionally bizarre inputs
    389                 new Handler<String>() {
    390                     public void handle(final String x) {
    391                         Log.wtf(LOG_TAG, "Mangled HTML content cannot be parsed: " + x);
    392                         throw new AssertionError(x);
    393                     }
    394                 }
    395         );
    396 
    397         // create a thread-specific policy
    398         final org.owasp.html.HtmlSanitizer.Policy policy = POLICY_DEFINITION.apply(renderer);
    399 
    400         // run the html through the sanitizer
    401         Timer.startTiming("sanitizingHTMLEmail");
    402         try {
    403             org.owasp.html.HtmlSanitizer.sanitize(rawHtml, policy);
    404         } finally {
    405             Timer.stopTiming("sanitizingHTMLEmail");
    406         }
    407 
    408         // return the resulting HTML from the builder
    409         return htmlBuilder.toString();
    410     }
    411 }
    412