Home | History | Annotate | Download | only in html5lib
      1 from __future__ import absolute_import, division, unicode_literals
      2 
      3 import re
      4 from xml.sax.saxutils import escape, unescape
      5 from six.moves import urllib_parse as urlparse
      6 
      7 from .tokenizer import HTMLTokenizer
      8 from .constants import tokenTypes
      9 
     10 
# Pattern applied to the path component of a ``data:`` URI (everything after
# the ``data:`` scheme).  The media type is captured in the named group
# ``content_type``; it may be followed by an optional ``;charset=...`` and/or
# ``;base64`` marker in either order, then a comma and the payload itself.
# A path without a leading ``type/subtype`` does not match at all.
content_type_rgx = re.compile(r'''
                               ^
                               # Match a content type <application>/<type>
                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                               # Match any character set and encoding
                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                               # Assume the rest is data
                               ,.*
                               $
                               ''',
                              re.VERBOSE)
     23 
     24 
     25 class HTMLSanitizerMixin(object):
     26     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
     27 
     28     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
     29                            'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
     30                            'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
     31                            'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
     32                            'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
     33                            'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
     34                            'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
     35                            'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
     36                            'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
     37                            'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
     38                            'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
     39                            'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
     40                            'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
     41 
     42     mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
     43                        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
     44                        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
     45                        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
     46                        'munderover', 'none']
     47 
     48     svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
     49                     'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
     50                     'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
     51                     'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
     52                     'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
     53                     'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
     54 
     55     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
     56                              'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
     57                              'background', 'balance', 'bgcolor', 'bgproperties', 'border',
     58                              'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
     59                              'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
     60                              'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
     61                              'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
     62                              'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
     63                              'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
     64                              'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
     65                              'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
     66                              'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
     67                              'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
     68                              'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
     69                              'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
     70                              'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
     71                              'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
     72                              'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
     73                              'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
     74                              'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
     75                              'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
     76                              'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
     77                              'width', 'wrap', 'xml:lang']
     78 
     79     mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
     80                          'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
     81                          'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
     82                          'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
     83                          'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
     84                          'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
     85                          'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
     86                          'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
     87                          'xlink:type', 'xmlns', 'xmlns:xlink']
     88 
     89     svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
     90                       'arabic-form', 'ascent', 'attributeName', 'attributeType',
     91                       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
     92                       'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
     93                       'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
     94                       'fill-opacity', 'fill-rule', 'font-family', 'font-size',
     95                       'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
     96                       'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
     97                       'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
     98                       'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
     99                       'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
    100                       'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
    101                       'opacity', 'orient', 'origin', 'overline-position',
    102                       'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
    103                       'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
    104                       'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
    105                       'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
    106                       'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
    107                       'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
    108                       'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
    109                       'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
    110                       'transform', 'type', 'u1', 'u2', 'underline-position',
    111                       'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
    112                       'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
    113                       'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
    114                       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
    115                       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
    116                       'y1', 'y2', 'zoomAndPan']
    117 
    118     attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
    119                        'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
    120 
    121     svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
    122                                'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
    123                                'mask', 'stroke']
    124 
    125     svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
    126                             'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
    127                             'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
    128                             'set', 'use']
    129 
    130     acceptable_css_properties = ['azimuth', 'background-color',
    131                                  'border-bottom-color', 'border-collapse', 'border-color',
    132                                  'border-left-color', 'border-right-color', 'border-top-color', 'clear',
    133                                  'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
    134                                  'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
    135                                  'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
    136                                  'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
    137                                  'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
    138                                  'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
    139                                  'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
    140                                  'white-space', 'width']
    141 
    142     acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
    143                                'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
    144                                'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
    145                                'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
    146                                'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
    147                                'transparent', 'underline', 'white', 'yellow']
    148 
    149     acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
    150                                  'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    151                                  'stroke-opacity']
    152 
    153     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
    154                             'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
    155                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
    156                             'ssh', 'sftp', 'rtsp', 'afs', 'data']
    157 
    158     acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
    159 
    160     # subclasses may define their own versions of these constants
    161     allowed_elements = acceptable_elements + mathml_elements + svg_elements
    162     allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    163     allowed_css_properties = acceptable_css_properties
    164     allowed_css_keywords = acceptable_css_keywords
    165     allowed_svg_properties = acceptable_svg_properties
    166     allowed_protocols = acceptable_protocols
    167     allowed_content_types = acceptable_content_types
    168 
    169     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    170     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    171     # attributes are parsed, and a restricted set, # specified by
    172     # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    173     # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    174     # in ALLOWED_PROTOCOLS are allowed.
    175     #
    176     #   sanitize_html('<script> do_nasty_stuff() </script>')
    177     #    => &lt;script> do_nasty_stuff() &lt;/script>
    178     #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    179     #    => <a>Click here for $100</a>
    180     def sanitize_token(self, token):
    181 
    182         # accommodate filters which use token_type differently
    183         token_type = token["type"]
    184         if token_type in list(tokenTypes.keys()):
    185             token_type = tokenTypes[token_type]
    186 
    187         if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
    188                           tokenTypes["EmptyTag"]):
    189             if token["name"] in self.allowed_elements:
    190                 return self.allowed_token(token, token_type)
    191             else:
    192                 return self.disallowed_token(token, token_type)
    193         elif token_type == tokenTypes["Comment"]:
    194             pass
    195         else:
    196             return token
    197 
    198     def allowed_token(self, token, token_type):
    199         if "data" in token:
    200             attrs = dict([(name, val) for name, val in
    201                           token["data"][::-1]
    202                           if name in self.allowed_attributes])
    203             for attr in self.attr_val_is_uri:
    204                 if attr not in attrs:
    205                     continue
    206                 val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
    207                                        unescape(attrs[attr])).lower()
    208                 # remove replacement characters from unescaped characters
    209                 val_unescaped = val_unescaped.replace("\ufffd", "")
    210                 uri = urlparse.urlparse(val_unescaped)
    211                 if uri and uri.scheme:
    212                     if uri.scheme not in self.allowed_protocols:
    213                         del attrs[attr]
    214                     if uri.scheme == 'data':
    215                         m = content_type_rgx.match(uri.path)
    216                         if not m:
    217                             del attrs[attr]
    218                         elif m.group('content_type') not in self.allowed_content_types:
    219                             del attrs[attr]
    220 
    221             for attr in self.svg_attr_val_allows_ref:
    222                 if attr in attrs:
    223                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
    224                                          ' ',
    225                                          unescape(attrs[attr]))
    226             if (token["name"] in self.svg_allow_local_href and
    227                 'xlink:href' in attrs and re.search('^\s*[^#\s].*',
    228                                                     attrs['xlink:href'])):
    229                 del attrs['xlink:href']
    230             if 'style' in attrs:
    231                 attrs['style'] = self.sanitize_css(attrs['style'])
    232             token["data"] = [[name, val] for name, val in list(attrs.items())]
    233         return token
    234 
    235     def disallowed_token(self, token, token_type):
    236         if token_type == tokenTypes["EndTag"]:
    237             token["data"] = "</%s>" % token["name"]
    238         elif token["data"]:
    239             attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
    240             token["data"] = "<%s%s>" % (token["name"], attrs)
    241         else:
    242             token["data"] = "<%s>" % token["name"]
    243         if token.get("selfClosing"):
    244             token["data"] = token["data"][:-1] + "/>"
    245 
    246         if token["type"] in list(tokenTypes.keys()):
    247             token["type"] = "Characters"
    248         else:
    249             token["type"] = tokenTypes["Characters"]
    250 
    251         del token["name"]
    252         return token
    253 
    254     def sanitize_css(self, style):
    255         # disallow urls
    256         style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
    257 
    258         # gauntlet
    259         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
    260             return ''
    261         if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
    262             return ''
    263 
    264         clean = []
    265         for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
    266             if not value:
    267                 continue
    268             if prop.lower() in self.allowed_css_properties:
    269                 clean.append(prop + ': ' + value + ';')
    270             elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
    271                                                 'padding']:
    272                 for keyword in value.split():
    273                     if keyword not in self.acceptable_css_keywords and \
    274                             not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
    275                         break
    276                 else:
    277                     clean.append(prop + ': ' + value + ';')
    278             elif prop.lower() in self.allowed_svg_properties:
    279                 clean.append(prop + ': ' + value + ';')
    280 
    281         return ' '.join(clean)
    282 
    283 
    284 class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    285     def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
    286                  lowercaseElementName=False, lowercaseAttrName=False, parser=None):
    287         # Change case matching defaults as we only output lowercase html anyway
    288         # This solution doesn't seem ideal...
    289         HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
    290                                lowercaseElementName, lowercaseAttrName, parser=parser)
    291 
    292     def __iter__(self):
    293         for token in HTMLTokenizer.__iter__(self):
    294             token = self.sanitize_token(token)
    295             if token:
    296                 yield token
    297