1 from __future__ import absolute_import, division, unicode_literals 2 3 try: 4 import json 5 except ImportError: 6 import simplejson as json 7 8 from html5lib import html5parser, sanitizer, constants, treebuilders 9 10 11 def toxmlFactory(): 12 tree = treebuilders.getTreeBuilder("etree") 13 14 def toxml(element): 15 # encode/decode roundtrip required for Python 2.6 compatibility 16 result_bytes = tree.implementation.tostring(element, encoding="utf-8") 17 return result_bytes.decode("utf-8") 18 19 return toxml 20 21 22 def runSanitizerTest(name, expected, input, toxml=None): 23 if toxml is None: 24 toxml = toxmlFactory() 25 expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). 26 parseFragment(expected)]) 27 expected = json.loads(json.dumps(expected)) 28 assert expected == sanitize_html(input) 29 30 31 def sanitize_html(stream, toxml=None): 32 if toxml is None: 33 toxml = toxmlFactory() 34 return ''.join([toxml(token) for token in 35 html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). 36 parseFragment(stream)]) 37 38 39 def test_should_handle_astral_plane_characters(): 40 assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>𝒵 𝔸</p>") 41 42 43 def test_should_allow_relative_uris(): 44 assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>') 45 46 47 def test_sanitizer(): 48 toxml = toxmlFactory() 49 for tag_name in sanitizer.HTMLSanitizer.allowed_elements: 50 if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: 51 continue # TODO 52 if tag_name != tag_name.lower(): 53 continue # TODO 54 if tag_name == 'image': 55 yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, 56 "<img title=\"1\"/>foo <bad>bar</bad> baz", 57 "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 58 toxml) 59 elif tag_name == 'br': 60 yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, 61 "<br title=\"1\"/>foo <bad>bar</bad> baz<br/>", 62 "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 63 toxml) 64 elif tag_name in constants.voidElements: 65 yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, 66 "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name, 67 "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 68 toxml) 69 else: 70 yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, 71 "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 72 "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 73 toxml) 74 75 for tag_name in sanitizer.HTMLSanitizer.allowed_elements: 76 tag_name = tag_name.upper() 77 yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name, 78 "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 79 "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), 80 toxml) 81 82 for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: 83 if attribute_name != attribute_name.lower(): 84 continue # TODO 85 if attribute_name == 'style': 86 continue 87 attribute_value = 'foo' 88 if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: 89 attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] 90 yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, 91 "<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), 92 "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), 93 toxml) 94 95 for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: 96 attribute_name = attribute_name.upper() 97 yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name, 98 "<p>foo <bad>bar</bad> baz</p>", 99 "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name, 100 toxml) 101 102 for protocol in sanitizer.HTMLSanitizer.allowed_protocols: 103 rest_of_uri = '//sub.domain.tld/path/object.ext' 104 if protocol == 'data': 105 rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' 106 yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, 107 "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), 108 """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), 109 toxml) 110 111 yield (runSanitizerTest, "test_invalid_data_uri", 112 "<audio controls=\"\"></audio>", 113 "<audio controls=\"\" src=\"data:foobar\"></audio>", 114 toxml) 115 116 yield (runSanitizerTest, "test_data_uri_disallowed_type", 117 "<audio controls=\"\"></audio>", 118 "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>", 119 toxml) 120 121 for protocol in sanitizer.HTMLSanitizer.allowed_protocols: 122 rest_of_uri = '//sub.domain.tld/path/object.ext' 123 if protocol == 'data': 124 rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' 125 protocol = protocol.upper() 126 yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, 127 "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), 128 """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), 129 toxml) 130