1 """Module for supporting the lxml.etree library. The idea here is to use as much 2 of the native library as possible, without using fragile hacks like custom element 3 names that break between releases. The downside of this is that we cannot represent 4 all possible trees; specifically the following are known to cause problems: 5 6 Text or comments as siblings of the root element 7 Docypes with no name 8 9 When any of these things occur, we emit a DataLossWarning 10 """ 11 12 from __future__ import absolute_import, division, unicode_literals 13 14 import warnings 15 import re 16 import sys 17 18 from . import _base 19 from ..constants import DataLossWarning 20 from .. import constants 21 from . import etree as etree_builders 22 from .. import ihatexml 23 24 import lxml.etree as etree 25 26 27 fullTree = True 28 tag_regexp = re.compile("{([^}]*)}(.*)") 29 30 comment_type = etree.Comment("asd").tag 31 32 33 class DocumentType(object): 34 def __init__(self, name, publicId, systemId): 35 self.name = name 36 self.publicId = publicId 37 self.systemId = systemId 38 39 40 class Document(object): 41 def __init__(self): 42 self._elementTree = None 43 self._childNodes = [] 44 45 def appendChild(self, element): 46 self._elementTree.getroot().addnext(element._element) 47 48 def _getChildNodes(self): 49 return self._childNodes 50 51 childNodes = property(_getChildNodes) 52 53 54 def testSerializer(element): 55 rv = [] 56 finalText = None 57 infosetFilter = ihatexml.InfosetFilter() 58 59 def serializeElement(element, indent=0): 60 if not hasattr(element, "tag"): 61 if hasattr(element, "getroot"): 62 # Full tree case 63 rv.append("#document") 64 if element.docinfo.internalDTD: 65 if not (element.docinfo.public_id or 66 element.docinfo.system_url): 67 dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name 68 else: 69 dtd_str = """<!DOCTYPE %s "%s" "%s">""" % ( 70 element.docinfo.root_name, 71 element.docinfo.public_id, 72 element.docinfo.system_url) 73 rv.append("|%s%s" % (' ' * (indent + 2), dtd_str)) 74 next_element = element.getroot() 75 while next_element.getprevious() is not None: 76 next_element = next_element.getprevious() 77 while next_element is not None: 78 serializeElement(next_element, indent + 2) 79 next_element = next_element.getnext() 80 elif isinstance(element, str) or isinstance(element, bytes): 81 # Text in a fragment 82 assert isinstance(element, str) or sys.version_info.major == 2 83 rv.append("|%s\"%s\"" % (' ' * indent, element)) 84 else: 85 # Fragment case 86 rv.append("#document-fragment") 87 for next_element in element: 88 serializeElement(next_element, indent + 2) 89 elif element.tag == comment_type: 90 rv.append("|%s<!-- %s -->" % (' ' * indent, element.text)) 91 if hasattr(element, "tail") and element.tail: 92 rv.append("|%s\"%s\"" % (' ' * indent, element.tail)) 93 else: 94 assert isinstance(element, etree._Element) 95 nsmatch = etree_builders.tag_regexp.match(element.tag) 96 if nsmatch is not None: 97 ns = nsmatch.group(1) 98 tag = nsmatch.group(2) 99 prefix = constants.prefixes[ns] 100 rv.append("|%s<%s %s>" % (' ' * indent, prefix, 101 infosetFilter.fromXmlName(tag))) 102 else: 103 rv.append("|%s<%s>" % (' ' * indent, 104 infosetFilter.fromXmlName(element.tag))) 105 106 if hasattr(element, "attrib"): 107 attributes = [] 108 for name, value in element.attrib.items(): 109 nsmatch = tag_regexp.match(name) 110 if nsmatch is not None: 111 ns, name = nsmatch.groups() 112 name = infosetFilter.fromXmlName(name) 113 prefix = constants.prefixes[ns] 114 attr_string = "%s %s" % (prefix, name) 115 else: 116 attr_string = infosetFilter.fromXmlName(name) 117 attributes.append((attr_string, value)) 118 119 for name, value in sorted(attributes): 120 rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) 121 122 if element.text: 123 rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text)) 124 indent += 2 125 for child in element: 126 serializeElement(child, indent) 127 if hasattr(element, "tail") and element.tail: 128 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) 129 serializeElement(element, 0) 130 131 if finalText is not None: 132 rv.append("|%s\"%s\"" % (' ' * 2, finalText)) 133 134 return "\n".join(rv) 135 136 137 def tostring(element): 138 """Serialize an element and its child nodes to a string""" 139 rv = [] 140 finalText = None 141 142 def serializeElement(element): 143 if not hasattr(element, "tag"): 144 if element.docinfo.internalDTD: 145 if element.docinfo.doctype: 146 dtd_str = element.docinfo.doctype 147 else: 148 dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name 149 rv.append(dtd_str) 150 serializeElement(element.getroot()) 151 152 elif element.tag == comment_type: 153 rv.append("<!--%s-->" % (element.text,)) 154 155 else: 156 # This is assumed to be an ordinary element 157 if not element.attrib: 158 rv.append("<%s>" % (element.tag,)) 159 else: 160 attr = " ".join(["%s=\"%s\"" % (name, value) 161 for name, value in element.attrib.items()]) 162 rv.append("<%s %s>" % (element.tag, attr)) 163 if element.text: 164 rv.append(element.text) 165 166 for child in element: 167 serializeElement(child) 168 169 rv.append("</%s>" % (element.tag,)) 170 171 if hasattr(element, "tail") and element.tail: 172 rv.append(element.tail) 173 174 serializeElement(element) 175 176 if finalText is not None: 177 rv.append("%s\"" % (' ' * 2, finalText)) 178 179 return "".join(rv) 180 181 182 class TreeBuilder(_base.TreeBuilder): 183 documentClass = Document 184 doctypeClass = DocumentType 185 elementClass = None 186 commentClass = None 187 fragmentClass = Document 188 implementation = etree 189 190 def __init__(self, namespaceHTMLElements, fullTree=False): 191 builder = etree_builders.getETreeModule(etree, fullTree=fullTree) 192 infosetFilter = self.infosetFilter = ihatexml.InfosetFilter() 193 self.namespaceHTMLElements = namespaceHTMLElements 194 195 class Attributes(dict): 196 def __init__(self, element, value={}): 197 self._element = element 198 dict.__init__(self, value) 199 for key, value in self.items(): 200 if isinstance(key, tuple): 201 name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) 202 else: 203 name = infosetFilter.coerceAttribute(key) 204 self._element._element.attrib[name] = value 205 206 def __setitem__(self, key, value): 207 dict.__setitem__(self, key, value) 208 if isinstance(key, tuple): 209 name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) 210 else: 211 name = infosetFilter.coerceAttribute(key) 212 self._element._element.attrib[name] = value 213 214 class Element(builder.Element): 215 def __init__(self, name, namespace): 216 name = infosetFilter.coerceElement(name) 217 builder.Element.__init__(self, name, namespace=namespace) 218 self._attributes = Attributes(self) 219 220 def _setName(self, name): 221 self._name = infosetFilter.coerceElement(name) 222 self._element.tag = self._getETreeTag( 223 self._name, self._namespace) 224 225 def _getName(self): 226 return infosetFilter.fromXmlName(self._name) 227 228 name = property(_getName, _setName) 229 230 def _getAttributes(self): 231 return self._attributes 232 233 def _setAttributes(self, attributes): 234 self._attributes = Attributes(self, attributes) 235 236 attributes = property(_getAttributes, _setAttributes) 237 238 def insertText(self, data, insertBefore=None): 239 data = infosetFilter.coerceCharacters(data) 240 builder.Element.insertText(self, data, insertBefore) 241 242 def appendChild(self, child): 243 builder.Element.appendChild(self, child) 244 245 class Comment(builder.Comment): 246 def __init__(self, data): 247 data = infosetFilter.coerceComment(data) 248 builder.Comment.__init__(self, data) 249 250 def _setData(self, data): 251 data = infosetFilter.coerceComment(data) 252 self._element.text = data 253 254 def _getData(self): 255 return self._element.text 256 257 data = property(_getData, _setData) 258 259 self.elementClass = Element 260 self.commentClass = builder.Comment 261 # self.fragmentClass = builder.DocumentFragment 262 _base.TreeBuilder.__init__(self, namespaceHTMLElements) 263 264 def reset(self): 265 _base.TreeBuilder.reset(self) 266 self.insertComment = self.insertCommentInitial 267 self.initial_comments = [] 268 self.doctype = None 269 270 def testSerializer(self, element): 271 return testSerializer(element) 272 273 def getDocument(self): 274 if fullTree: 275 return self.document._elementTree 276 else: 277 return self.document._elementTree.getroot() 278 279 def getFragment(self): 280 fragment = [] 281 element = self.openElements[0]._element 282 if element.text: 283 fragment.append(element.text) 284 fragment.extend(list(element)) 285 if element.tail: 286 fragment.append(element.tail) 287 return fragment 288 289 def insertDoctype(self, token): 290 name = token["name"] 291 publicId = token["publicId"] 292 systemId = token["systemId"] 293 294 if not name: 295 warnings.warn("lxml cannot represent empty doctype", DataLossWarning) 296 self.doctype = None 297 else: 298 coercedName = self.infosetFilter.coerceElement(name) 299 if coercedName != name: 300 warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning) 301 302 doctype = self.doctypeClass(coercedName, publicId, systemId) 303 self.doctype = doctype 304 305 def insertCommentInitial(self, data, parent=None): 306 self.initial_comments.append(data) 307 308 def insertCommentMain(self, data, parent=None): 309 if (parent == self.document and 310 self.document._elementTree.getroot()[-1].tag == comment_type): 311 warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) 312 super(TreeBuilder, self).insertComment(data, parent) 313 314 def insertRoot(self, token): 315 """Create the document root""" 316 # Because of the way libxml2 works, it doesn't seem to be possible to 317 # alter information like the doctype after the tree has been parsed. 318 # Therefore we need to use the built-in parser to create our iniial 319 # tree, after which we can add elements like normal 320 docStr = "" 321 if self.doctype: 322 assert self.doctype.name 323 docStr += "<!DOCTYPE %s" % self.doctype.name 324 if (self.doctype.publicId is not None or 325 self.doctype.systemId is not None): 326 docStr += (' PUBLIC "%s" ' % 327 (self.infosetFilter.coercePubid(self.doctype.publicId or ""))) 328 if self.doctype.systemId: 329 sysid = self.doctype.systemId 330 if sysid.find("'") >= 0 and sysid.find('"') >= 0: 331 warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning) 332 sysid = sysid.replace("'", 'U00027') 333 if sysid.find("'") >= 0: 334 docStr += '"%s"' % sysid 335 else: 336 docStr += "'%s'" % sysid 337 else: 338 docStr += "''" 339 docStr += ">" 340 if self.doctype.name != token["name"]: 341 warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning) 342 docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>" 343 root = etree.fromstring(docStr) 344 345 # Append the initial comments: 346 for comment_token in self.initial_comments: 347 root.addprevious(etree.Comment(comment_token["data"])) 348 349 # Create the root document and add the ElementTree to it 350 self.document = self.documentClass() 351 self.document._elementTree = root.getroottree() 352 353 # Give the root element the right name 354 name = token["name"] 355 namespace = token.get("namespace", self.defaultNamespace) 356 if namespace is None: 357 etree_tag = name 358 else: 359 etree_tag = "{%s}%s" % (namespace, name) 360 root.tag = etree_tag 361 362 # Add the root element to the internal child/open data structures 363 root_element = self.elementClass(name, namespace) 364 root_element._element = root 365 self.document._childNodes.append(root_element) 366 self.openElements.append(root_element) 367 368 # Reset to the default insert comment function 369 self.insertComment = self.insertCommentMain 370