Home | History | Annotate | Download | only in serializer
      1 from __future__ import absolute_import, division, unicode_literals
      2 from six import text_type
      3 
      4 try:
      5     from functools import reduce
      6 except ImportError:
      7     pass
      8 
      9 from ..constants import voidElements, booleanAttributes, spaceCharacters
     10 from ..constants import rcdataElements, entities, xmlEntities
     11 from .. import utils
     12 from xml.sax.saxutils import escape
     13 
     14 spaceCharacters = "".join(spaceCharacters)
     15 
     16 try:
     17     from codecs import register_error, xmlcharrefreplace_errors
     18 except ImportError:
     19     unicode_encode_errors = "strict"
     20 else:
     21     unicode_encode_errors = "htmlentityreplace"
     22 
     23     encode_entity_map = {}
     24     is_ucs4 = len("\U0010FFFF") == 1
     25     for k, v in list(entities.items()):
     26         # skip multi-character entities
     27         if ((is_ucs4 and len(v) > 1) or
     28                 (not is_ucs4 and len(v) > 2)):
     29             continue
     30         if v != "&":
     31             if len(v) == 2:
     32                 v = utils.surrogatePairToCodepoint(v)
     33             else:
     34                 v = ord(v)
     35             if v not in encode_entity_map or k.islower():
     36                 # prefer < over < and similarly for &, >, etc.
     37                 encode_entity_map[v] = k
     38 
     39     def htmlentityreplace_errors(exc):
     40         if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
     41             res = []
     42             codepoints = []
     43             skip = False
     44             for i, c in enumerate(exc.object[exc.start:exc.end]):
     45                 if skip:
     46                     skip = False
     47                     continue
     48                 index = i + exc.start
     49                 if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
     50                     codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
     51                     skip = True
     52                 else:
     53                     codepoint = ord(c)
     54                 codepoints.append(codepoint)
     55             for cp in codepoints:
     56                 e = encode_entity_map.get(cp)
     57                 if e:
     58                     res.append("&")
     59                     res.append(e)
     60                     if not e.endswith(";"):
     61                         res.append(";")
     62                 else:
     63                     res.append("&#x%s;" % (hex(cp)[2:]))
     64             return ("".join(res), exc.end)
     65         else:
     66             return xmlcharrefreplace_errors(exc)
     67 
     68     register_error(unicode_encode_errors, htmlentityreplace_errors)
     69 
     70     del register_error
     71 
     72 
     73 class HTMLSerializer(object):
     74 
     75     # attribute quoting options
     76     quote_attr_values = False
     77     quote_char = '"'
     78     use_best_quote_char = True
     79 
     80     # tag syntax options
     81     omit_optional_tags = True
     82     minimize_boolean_attributes = True
     83     use_trailing_solidus = False
     84     space_before_trailing_solidus = True
     85 
     86     # escaping options
     87     escape_lt_in_attrs = False
     88     escape_rcdata = False
     89     resolve_entities = True
     90 
     91     # miscellaneous options
     92     alphabetical_attributes = False
     93     inject_meta_charset = True
     94     strip_whitespace = False
     95     sanitize = False
     96 
     97     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
     98                "omit_optional_tags", "minimize_boolean_attributes",
     99                "use_trailing_solidus", "space_before_trailing_solidus",
    100                "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
    101                "alphabetical_attributes", "inject_meta_charset",
    102                "strip_whitespace", "sanitize")
    103 
    104     def __init__(self, **kwargs):
    105         """Initialize HTMLSerializer.
    106 
    107         Keyword options (default given first unless specified) include:
    108 
    109         inject_meta_charset=True|False
    110           Whether it insert a meta element to define the character set of the
    111           document.
    112         quote_attr_values=True|False
    113           Whether to quote attribute values that don't require quoting
    114           per HTML5 parsing rules.
    115         quote_char=u'"'|u"'"
    116           Use given quote character for attribute quoting. Default is to
    117           use double quote unless attribute value contains a double quote,
    118           in which case single quotes are used instead.
    119         escape_lt_in_attrs=False|True
    120           Whether to escape < in attribute values.
    121         escape_rcdata=False|True
    122           Whether to escape characters that need to be escaped within normal
    123           elements within rcdata elements such as style.
    124         resolve_entities=True|False
    125           Whether to resolve named character entities that appear in the
    126           source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
    127           are unaffected by this setting.
    128         strip_whitespace=False|True
    129           Whether to remove semantically meaningless whitespace. (This
    130           compresses all whitespace to a single space except within pre.)
    131         minimize_boolean_attributes=True|False
    132           Shortens boolean attributes to give just the attribute value,
    133           for example <input disabled="disabled"> becomes <input disabled>.
    134         use_trailing_solidus=False|True
    135           Includes a close-tag slash at the end of the start tag of void
    136           elements (empty elements whose end tag is forbidden). E.g. <hr/>.
    137         space_before_trailing_solidus=True|False
    138           Places a space immediately before the closing slash in a tag
    139           using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
    140         sanitize=False|True
    141           Strip all unsafe or unknown constructs from output.
    142           See `html5lib user documentation`_
    143         omit_optional_tags=True|False
    144           Omit start/end tags that are optional.
    145         alphabetical_attributes=False|True
    146           Reorder attributes to be in alphabetical order.
    147 
    148         .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
    149         """
    150         if 'quote_char' in kwargs:
    151             self.use_best_quote_char = False
    152         for attr in self.options:
    153             setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
    154         self.errors = []
    155         self.strict = False
    156 
    157     def encode(self, string):
    158         assert(isinstance(string, text_type))
    159         if self.encoding:
    160             return string.encode(self.encoding, unicode_encode_errors)
    161         else:
    162             return string
    163 
    164     def encodeStrict(self, string):
    165         assert(isinstance(string, text_type))
    166         if self.encoding:
    167             return string.encode(self.encoding, "strict")
    168         else:
    169             return string
    170 
    171     def serialize(self, treewalker, encoding=None):
    172         self.encoding = encoding
    173         in_cdata = False
    174         self.errors = []
    175 
    176         if encoding and self.inject_meta_charset:
    177             from ..filters.inject_meta_charset import Filter
    178             treewalker = Filter(treewalker, encoding)
    179         # WhitespaceFilter should be used before OptionalTagFilter
    180         # for maximum efficiently of this latter filter
    181         if self.strip_whitespace:
    182             from ..filters.whitespace import Filter
    183             treewalker = Filter(treewalker)
    184         if self.sanitize:
    185             from ..filters.sanitizer import Filter
    186             treewalker = Filter(treewalker)
    187         if self.omit_optional_tags:
    188             from ..filters.optionaltags import Filter
    189             treewalker = Filter(treewalker)
    190         # Alphabetical attributes must be last, as other filters
    191         # could add attributes and alter the order
    192         if self.alphabetical_attributes:
    193             from ..filters.alphabeticalattributes import Filter
    194             treewalker = Filter(treewalker)
    195 
    196         for token in treewalker:
    197             type = token["type"]
    198             if type == "Doctype":
    199                 doctype = "<!DOCTYPE %s" % token["name"]
    200 
    201                 if token["publicId"]:
    202                     doctype += ' PUBLIC "%s"' % token["publicId"]
    203                 elif token["systemId"]:
    204                     doctype += " SYSTEM"
    205                 if token["systemId"]:
    206                     if token["systemId"].find('"') >= 0:
    207                         if token["systemId"].find("'") >= 0:
    208                             self.serializeError("System identifer contains both single and double quote characters")
    209                         quote_char = "'"
    210                     else:
    211                         quote_char = '"'
    212                     doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
    213 
    214                 doctype += ">"
    215                 yield self.encodeStrict(doctype)
    216 
    217             elif type in ("Characters", "SpaceCharacters"):
    218                 if type == "SpaceCharacters" or in_cdata:
    219                     if in_cdata and token["data"].find("</") >= 0:
    220                         self.serializeError("Unexpected </ in CDATA")
    221                     yield self.encode(token["data"])
    222                 else:
    223                     yield self.encode(escape(token["data"]))
    224 
    225             elif type in ("StartTag", "EmptyTag"):
    226                 name = token["name"]
    227                 yield self.encodeStrict("<%s" % name)
    228                 if name in rcdataElements and not self.escape_rcdata:
    229                     in_cdata = True
    230                 elif in_cdata:
    231                     self.serializeError("Unexpected child element of a CDATA element")
    232                 for (attr_namespace, attr_name), attr_value in token["data"].items():
    233                     # TODO: Add namespace support here
    234                     k = attr_name
    235                     v = attr_value
    236                     yield self.encodeStrict(' ')
    237 
    238                     yield self.encodeStrict(k)
    239                     if not self.minimize_boolean_attributes or \
    240                         (k not in booleanAttributes.get(name, tuple())
    241                          and k not in booleanAttributes.get("", tuple())):
    242                         yield self.encodeStrict("=")
    243                         if self.quote_attr_values or not v:
    244                             quote_attr = True
    245                         else:
    246                             quote_attr = reduce(lambda x, y: x or (y in v),
    247                                                 spaceCharacters + ">\"'=", False)
    248                         v = v.replace("&", "&amp;")
    249                         if self.escape_lt_in_attrs:
    250                             v = v.replace("<", "&lt;")
    251                         if quote_attr:
    252                             quote_char = self.quote_char
    253                             if self.use_best_quote_char:
    254                                 if "'" in v and '"' not in v:
    255                                     quote_char = '"'
    256                                 elif '"' in v and "'" not in v:
    257                                     quote_char = "'"
    258                             if quote_char == "'":
    259                                 v = v.replace("'", "&#39;")
    260                             else:
    261                                 v = v.replace('"', "&quot;")
    262                             yield self.encodeStrict(quote_char)
    263                             yield self.encode(v)
    264                             yield self.encodeStrict(quote_char)
    265                         else:
    266                             yield self.encode(v)
    267                 if name in voidElements and self.use_trailing_solidus:
    268                     if self.space_before_trailing_solidus:
    269                         yield self.encodeStrict(" /")
    270                     else:
    271                         yield self.encodeStrict("/")
    272                 yield self.encode(">")
    273 
    274             elif type == "EndTag":
    275                 name = token["name"]
    276                 if name in rcdataElements:
    277                     in_cdata = False
    278                 elif in_cdata:
    279                     self.serializeError("Unexpected child element of a CDATA element")
    280                 yield self.encodeStrict("</%s>" % name)
    281 
    282             elif type == "Comment":
    283                 data = token["data"]
    284                 if data.find("--") >= 0:
    285                     self.serializeError("Comment contains --")
    286                 yield self.encodeStrict("<!--%s-->" % token["data"])
    287 
    288             elif type == "Entity":
    289                 name = token["name"]
    290                 key = name + ";"
    291                 if key not in entities:
    292                     self.serializeError("Entity %s not recognized" % name)
    293                 if self.resolve_entities and key not in xmlEntities:
    294                     data = entities[key]
    295                 else:
    296                     data = "&%s;" % name
    297                 yield self.encodeStrict(data)
    298 
    299             else:
    300                 self.serializeError(token["data"])
    301 
    302     def render(self, treewalker, encoding=None):
    303         if encoding:
    304             return b"".join(list(self.serialize(treewalker, encoding)))
    305         else:
    306             return "".join(list(self.serialize(treewalker)))
    307 
    308     def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
    309         # XXX The idea is to make data mandatory.
    310         self.errors.append(data)
    311         if self.strict:
    312             raise SerializeError
    313 
    314 
    315 def SerializeError(Exception):
    316     """Error in serialized tree"""
    317     pass
    318