Home | History | Annotate | Download | only in tests
      1 from __future__ import absolute_import, division, unicode_literals
      2 
      3 try:
      4     import json
      5 except ImportError:
      6     import simplejson as json
      7 
      8 from html5lib import html5parser, sanitizer, constants, treebuilders
      9 
     10 
     11 def toxmlFactory():
     12     tree = treebuilders.getTreeBuilder("etree")
     13 
     14     def toxml(element):
     15         # encode/decode roundtrip required for Python 2.6 compatibility
     16         result_bytes = tree.implementation.tostring(element, encoding="utf-8")
     17         return result_bytes.decode("utf-8")
     18 
     19     return toxml
     20 
     21 
     22 def runSanitizerTest(name, expected, input, toxml=None):
     23     if toxml is None:
     24         toxml = toxmlFactory()
     25     expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
     26                         parseFragment(expected)])
     27     expected = json.loads(json.dumps(expected))
     28     assert expected == sanitize_html(input)
     29 
     30 
     31 def sanitize_html(stream, toxml=None):
     32     if toxml is None:
     33         toxml = toxmlFactory()
     34     return ''.join([toxml(token) for token in
     35                     html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
     36                     parseFragment(stream)])
     37 
     38 
     39 def test_should_handle_astral_plane_characters():
     40     assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
     41 
     42 
     43 def test_should_allow_relative_uris():
     44     assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>')
     45 
     46 
     47 def test_sanitizer():
     48     toxml = toxmlFactory()
     49     for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
     50         if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
     51             continue  # TODO
     52         if tag_name != tag_name.lower():
     53             continue  # TODO
     54         if tag_name == 'image':
     55             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
     56                    "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
     57                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
     58                    toxml)
     59         elif tag_name == 'br':
     60             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
     61                    "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
     62                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
     63                    toxml)
     64         elif tag_name in constants.voidElements:
     65             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
     66                    "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
     67                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
     68                    toxml)
     69         else:
     70             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
     71                    "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
     72                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
     73                    toxml)
     74 
     75     for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
     76         tag_name = tag_name.upper()
     77         yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
     78                "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
     79                "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
     80                toxml)
     81 
     82     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
     83         if attribute_name != attribute_name.lower():
     84             continue  # TODO
     85         if attribute_name == 'style':
     86             continue
     87         attribute_value = 'foo'
     88         if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
     89             attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
     90         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
     91                "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
     92                "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
     93                toxml)
     94 
     95     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
     96         attribute_name = attribute_name.upper()
     97         yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
     98                "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
     99                "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
    100                toxml)
    101 
    102     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
    103         rest_of_uri = '//sub.domain.tld/path/object.ext'
    104         if protocol == 'data':
    105             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
    106         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
    107                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
    108                """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
    109                toxml)
    110 
    111     yield (runSanitizerTest, "test_invalid_data_uri",
    112            "<audio controls=\"\"></audio>",
    113            "<audio controls=\"\" src=\"data:foobar\"></audio>",
    114            toxml)
    115 
    116     yield (runSanitizerTest, "test_data_uri_disallowed_type",
    117            "<audio controls=\"\"></audio>",
    118            "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
    119            toxml)
    120 
    121     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
    122         rest_of_uri = '//sub.domain.tld/path/object.ext'
    123         if protocol == 'data':
    124             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
    125         protocol = protocol.upper()
    126         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
    127                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
    128                """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
    129                toxml)
    130