1 from __future__ import absolute_import, division, unicode_literals 2 from six import text_type 3 4 try: 5 from functools import reduce 6 except ImportError: 7 pass 8 9 from ..constants import voidElements, booleanAttributes, spaceCharacters 10 from ..constants import rcdataElements, entities, xmlEntities 11 from .. import utils 12 from xml.sax.saxutils import escape 13 14 spaceCharacters = "".join(spaceCharacters) 15 16 try: 17 from codecs import register_error, xmlcharrefreplace_errors 18 except ImportError: 19 unicode_encode_errors = "strict" 20 else: 21 unicode_encode_errors = "htmlentityreplace" 22 23 encode_entity_map = {} 24 is_ucs4 = len("\U0010FFFF") == 1 25 for k, v in list(entities.items()): 26 # skip multi-character entities 27 if ((is_ucs4 and len(v) > 1) or 28 (not is_ucs4 and len(v) > 2)): 29 continue 30 if v != "&": 31 if len(v) == 2: 32 v = utils.surrogatePairToCodepoint(v) 33 else: 34 v = ord(v) 35 if v not in encode_entity_map or k.islower(): 36 # prefer < over < and similarly for &, >, etc. 37 encode_entity_map[v] = k 38 39 def htmlentityreplace_errors(exc): 40 if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): 41 res = [] 42 codepoints = [] 43 skip = False 44 for i, c in enumerate(exc.object[exc.start:exc.end]): 45 if skip: 46 skip = False 47 continue 48 index = i + exc.start 49 if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): 50 codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) 51 skip = True 52 else: 53 codepoint = ord(c) 54 codepoints.append(codepoint) 55 for cp in codepoints: 56 e = encode_entity_map.get(cp) 57 if e: 58 res.append("&") 59 res.append(e) 60 if not e.endswith(";"): 61 res.append(";") 62 else: 63 res.append("&#x%s;" % (hex(cp)[2:])) 64 return ("".join(res), exc.end) 65 else: 66 return xmlcharrefreplace_errors(exc) 67 68 register_error(unicode_encode_errors, htmlentityreplace_errors) 69 70 del register_error 71 72 73 class HTMLSerializer(object): 74 75 # attribute quoting options 76 quote_attr_values = False 77 quote_char = '"' 78 use_best_quote_char = True 79 80 # tag syntax options 81 omit_optional_tags = True 82 minimize_boolean_attributes = True 83 use_trailing_solidus = False 84 space_before_trailing_solidus = True 85 86 # escaping options 87 escape_lt_in_attrs = False 88 escape_rcdata = False 89 resolve_entities = True 90 91 # miscellaneous options 92 alphabetical_attributes = False 93 inject_meta_charset = True 94 strip_whitespace = False 95 sanitize = False 96 97 options = ("quote_attr_values", "quote_char", "use_best_quote_char", 98 "omit_optional_tags", "minimize_boolean_attributes", 99 "use_trailing_solidus", "space_before_trailing_solidus", 100 "escape_lt_in_attrs", "escape_rcdata", "resolve_entities", 101 "alphabetical_attributes", "inject_meta_charset", 102 "strip_whitespace", "sanitize") 103 104 def __init__(self, **kwargs): 105 """Initialize HTMLSerializer. 106 107 Keyword options (default given first unless specified) include: 108 109 inject_meta_charset=True|False 110 Whether it insert a meta element to define the character set of the 111 document. 112 quote_attr_values=True|False 113 Whether to quote attribute values that don't require quoting 114 per HTML5 parsing rules. 115 quote_char=u'"'|u"'" 116 Use given quote character for attribute quoting. Default is to 117 use double quote unless attribute value contains a double quote, 118 in which case single quotes are used instead. 119 escape_lt_in_attrs=False|True 120 Whether to escape < in attribute values. 121 escape_rcdata=False|True 122 Whether to escape characters that need to be escaped within normal 123 elements within rcdata elements such as style. 124 resolve_entities=True|False 125 Whether to resolve named character entities that appear in the 126 source tree. The XML predefined entities < > & " ' 127 are unaffected by this setting. 128 strip_whitespace=False|True 129 Whether to remove semantically meaningless whitespace. (This 130 compresses all whitespace to a single space except within pre.) 131 minimize_boolean_attributes=True|False 132 Shortens boolean attributes to give just the attribute value, 133 for example <input disabled="disabled"> becomes <input disabled>. 134 use_trailing_solidus=False|True 135 Includes a close-tag slash at the end of the start tag of void 136 elements (empty elements whose end tag is forbidden). E.g. <hr/>. 137 space_before_trailing_solidus=True|False 138 Places a space immediately before the closing slash in a tag 139 using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus. 140 sanitize=False|True 141 Strip all unsafe or unknown constructs from output. 142 See `html5lib user documentation`_ 143 omit_optional_tags=True|False 144 Omit start/end tags that are optional. 145 alphabetical_attributes=False|True 146 Reorder attributes to be in alphabetical order. 147 148 .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation 149 """ 150 if 'quote_char' in kwargs: 151 self.use_best_quote_char = False 152 for attr in self.options: 153 setattr(self, attr, kwargs.get(attr, getattr(self, attr))) 154 self.errors = [] 155 self.strict = False 156 157 def encode(self, string): 158 assert(isinstance(string, text_type)) 159 if self.encoding: 160 return string.encode(self.encoding, unicode_encode_errors) 161 else: 162 return string 163 164 def encodeStrict(self, string): 165 assert(isinstance(string, text_type)) 166 if self.encoding: 167 return string.encode(self.encoding, "strict") 168 else: 169 return string 170 171 def serialize(self, treewalker, encoding=None): 172 self.encoding = encoding 173 in_cdata = False 174 self.errors = [] 175 176 if encoding and self.inject_meta_charset: 177 from ..filters.inject_meta_charset import Filter 178 treewalker = Filter(treewalker, encoding) 179 # WhitespaceFilter should be used before OptionalTagFilter 180 # for maximum efficiently of this latter filter 181 if self.strip_whitespace: 182 from ..filters.whitespace import Filter 183 treewalker = Filter(treewalker) 184 if self.sanitize: 185 from ..filters.sanitizer import Filter 186 treewalker = Filter(treewalker) 187 if self.omit_optional_tags: 188 from ..filters.optionaltags import Filter 189 treewalker = Filter(treewalker) 190 # Alphabetical attributes must be last, as other filters 191 # could add attributes and alter the order 192 if self.alphabetical_attributes: 193 from ..filters.alphabeticalattributes import Filter 194 treewalker = Filter(treewalker) 195 196 for token in treewalker: 197 type = token["type"] 198 if type == "Doctype": 199 doctype = "<!DOCTYPE %s" % token["name"] 200 201 if token["publicId"]: 202 doctype += ' PUBLIC "%s"' % token["publicId"] 203 elif token["systemId"]: 204 doctype += " SYSTEM" 205 if token["systemId"]: 206 if token["systemId"].find('"') >= 0: 207 if token["systemId"].find("'") >= 0: 208 self.serializeError("System identifer contains both single and double quote characters") 209 quote_char = "'" 210 else: 211 quote_char = '"' 212 doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char) 213 214 doctype += ">" 215 yield self.encodeStrict(doctype) 216 217 elif type in ("Characters", "SpaceCharacters"): 218 if type == "SpaceCharacters" or in_cdata: 219 if in_cdata and token["data"].find("</") >= 0: 220 self.serializeError("Unexpected </ in CDATA") 221 yield self.encode(token["data"]) 222 else: 223 yield self.encode(escape(token["data"])) 224 225 elif type in ("StartTag", "EmptyTag"): 226 name = token["name"] 227 yield self.encodeStrict("<%s" % name) 228 if name in rcdataElements and not self.escape_rcdata: 229 in_cdata = True 230 elif in_cdata: 231 self.serializeError("Unexpected child element of a CDATA element") 232 for (attr_namespace, attr_name), attr_value in token["data"].items(): 233 # TODO: Add namespace support here 234 k = attr_name 235 v = attr_value 236 yield self.encodeStrict(' ') 237 238 yield self.encodeStrict(k) 239 if not self.minimize_boolean_attributes or \ 240 (k not in booleanAttributes.get(name, tuple()) 241 and k not in booleanAttributes.get("", tuple())): 242 yield self.encodeStrict("=") 243 if self.quote_attr_values or not v: 244 quote_attr = True 245 else: 246 quote_attr = reduce(lambda x, y: x or (y in v), 247 spaceCharacters + ">\"'=", False) 248 v = v.replace("&", "&") 249 if self.escape_lt_in_attrs: 250 v = v.replace("<", "<") 251 if quote_attr: 252 quote_char = self.quote_char 253 if self.use_best_quote_char: 254 if "'" in v and '"' not in v: 255 quote_char = '"' 256 elif '"' in v and "'" not in v: 257 quote_char = "'" 258 if quote_char == "'": 259 v = v.replace("'", "'") 260 else: 261 v = v.replace('"', """) 262 yield self.encodeStrict(quote_char) 263 yield self.encode(v) 264 yield self.encodeStrict(quote_char) 265 else: 266 yield self.encode(v) 267 if name in voidElements and self.use_trailing_solidus: 268 if self.space_before_trailing_solidus: 269 yield self.encodeStrict(" /") 270 else: 271 yield self.encodeStrict("/") 272 yield self.encode(">") 273 274 elif type == "EndTag": 275 name = token["name"] 276 if name in rcdataElements: 277 in_cdata = False 278 elif in_cdata: 279 self.serializeError("Unexpected child element of a CDATA element") 280 yield self.encodeStrict("</%s>" % name) 281 282 elif type == "Comment": 283 data = token["data"] 284 if data.find("--") >= 0: 285 self.serializeError("Comment contains --") 286 yield self.encodeStrict("<!--%s-->" % token["data"]) 287 288 elif type == "Entity": 289 name = token["name"] 290 key = name + ";" 291 if key not in entities: 292 self.serializeError("Entity %s not recognized" % name) 293 if self.resolve_entities and key not in xmlEntities: 294 data = entities[key] 295 else: 296 data = "&%s;" % name 297 yield self.encodeStrict(data) 298 299 else: 300 self.serializeError(token["data"]) 301 302 def render(self, treewalker, encoding=None): 303 if encoding: 304 return b"".join(list(self.serialize(treewalker, encoding))) 305 else: 306 return "".join(list(self.serialize(treewalker))) 307 308 def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): 309 # XXX The idea is to make data mandatory. 310 self.errors.append(data) 311 if self.strict: 312 raise SerializeError 313 314 315 def SerializeError(Exception): 316 """Error in serialized tree""" 317 pass 318