Home | History | Annotate | Download | only in treebuilders
      1 """Module for supporting the lxml.etree library. The idea here is to use as much
      2 of the native library as possible, without using fragile hacks like custom element
      3 names that break between releases. The downside of this is that we cannot represent
      4 all possible trees; specifically the following are known to cause problems:
      5 
      6 Text or comments as siblings of the root element
Doctypes with no name
      8 
      9 When any of these things occur, we emit a DataLossWarning
     10 """
     11 
     12 from __future__ import absolute_import, division, unicode_literals
     13 
     14 import warnings
     15 import re
     16 import sys
     17 
     18 from . import _base
     19 from ..constants import DataLossWarning
     20 from .. import constants
     21 from . import etree as etree_builders
     22 from .. import ihatexml
     23 
     24 import lxml.etree as etree
     25 
     26 
# Module-level switch read by TreeBuilder.getDocument(): when true the whole
# ElementTree is returned, otherwise only its root element.
fullTree = True
# Splits a "{namespace}localname" string into (namespace, localname).
tag_regexp = re.compile("{([^}]*)}(.*)")

# The value lxml stores in ``.tag`` for comment nodes, taken from a throwaway
# comment; other code compares ``element.tag`` against it to detect comments.
comment_type = etree.Comment("asd").tag
     31 
     32 
     33 class DocumentType(object):
     34     def __init__(self, name, publicId, systemId):
     35         self.name = name
     36         self.publicId = publicId
     37         self.systemId = systemId
     38 
     39 
     40 class Document(object):
     41     def __init__(self):
     42         self._elementTree = None
     43         self._childNodes = []
     44 
     45     def appendChild(self, element):
     46         self._elementTree.getroot().addnext(element._element)
     47 
     48     def _getChildNodes(self):
     49         return self._childNodes
     50 
     51     childNodes = property(_getChildNodes)
     52 
     53 
     54 def testSerializer(element):
     55     rv = []
     56     finalText = None
     57     infosetFilter = ihatexml.InfosetFilter()
     58 
     59     def serializeElement(element, indent=0):
     60         if not hasattr(element, "tag"):
     61             if hasattr(element, "getroot"):
     62                 # Full tree case
     63                 rv.append("#document")
     64                 if element.docinfo.internalDTD:
     65                     if not (element.docinfo.public_id or
     66                             element.docinfo.system_url):
     67                         dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
     68                     else:
     69                         dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
     70                             element.docinfo.root_name,
     71                             element.docinfo.public_id,
     72                             element.docinfo.system_url)
     73                     rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
     74                 next_element = element.getroot()
     75                 while next_element.getprevious() is not None:
     76                     next_element = next_element.getprevious()
     77                 while next_element is not None:
     78                     serializeElement(next_element, indent + 2)
     79                     next_element = next_element.getnext()
     80             elif isinstance(element, str) or isinstance(element, bytes):
     81                 # Text in a fragment
     82                 assert isinstance(element, str) or sys.version_info.major == 2
     83                 rv.append("|%s\"%s\"" % (' ' * indent, element))
     84             else:
     85                 # Fragment case
     86                 rv.append("#document-fragment")
     87                 for next_element in element:
     88                     serializeElement(next_element, indent + 2)
     89         elif element.tag == comment_type:
     90             rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
     91             if hasattr(element, "tail") and element.tail:
     92                 rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
     93         else:
     94             assert isinstance(element, etree._Element)
     95             nsmatch = etree_builders.tag_regexp.match(element.tag)
     96             if nsmatch is not None:
     97                 ns = nsmatch.group(1)
     98                 tag = nsmatch.group(2)
     99                 prefix = constants.prefixes[ns]
    100                 rv.append("|%s<%s %s>" % (' ' * indent, prefix,
    101                                           infosetFilter.fromXmlName(tag)))
    102             else:
    103                 rv.append("|%s<%s>" % (' ' * indent,
    104                                        infosetFilter.fromXmlName(element.tag)))
    105 
    106             if hasattr(element, "attrib"):
    107                 attributes = []
    108                 for name, value in element.attrib.items():
    109                     nsmatch = tag_regexp.match(name)
    110                     if nsmatch is not None:
    111                         ns, name = nsmatch.groups()
    112                         name = infosetFilter.fromXmlName(name)
    113                         prefix = constants.prefixes[ns]
    114                         attr_string = "%s %s" % (prefix, name)
    115                     else:
    116                         attr_string = infosetFilter.fromXmlName(name)
    117                     attributes.append((attr_string, value))
    118 
    119                 for name, value in sorted(attributes):
    120                     rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
    121 
    122             if element.text:
    123                 rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
    124             indent += 2
    125             for child in element:
    126                 serializeElement(child, indent)
    127             if hasattr(element, "tail") and element.tail:
    128                 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
    129     serializeElement(element, 0)
    130 
    131     if finalText is not None:
    132         rv.append("|%s\"%s\"" % (' ' * 2, finalText))
    133 
    134     return "\n".join(rv)
    135 
    136 
    137 def tostring(element):
    138     """Serialize an element and its child nodes to a string"""
    139     rv = []
    140     finalText = None
    141 
    142     def serializeElement(element):
    143         if not hasattr(element, "tag"):
    144             if element.docinfo.internalDTD:
    145                 if element.docinfo.doctype:
    146                     dtd_str = element.docinfo.doctype
    147                 else:
    148                     dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
    149                 rv.append(dtd_str)
    150             serializeElement(element.getroot())
    151 
    152         elif element.tag == comment_type:
    153             rv.append("<!--%s-->" % (element.text,))
    154 
    155         else:
    156             # This is assumed to be an ordinary element
    157             if not element.attrib:
    158                 rv.append("<%s>" % (element.tag,))
    159             else:
    160                 attr = " ".join(["%s=\"%s\"" % (name, value)
    161                                  for name, value in element.attrib.items()])
    162                 rv.append("<%s %s>" % (element.tag, attr))
    163             if element.text:
    164                 rv.append(element.text)
    165 
    166             for child in element:
    167                 serializeElement(child)
    168 
    169             rv.append("</%s>" % (element.tag,))
    170 
    171         if hasattr(element, "tail") and element.tail:
    172             rv.append(element.tail)
    173 
    174     serializeElement(element)
    175 
    176     if finalText is not None:
    177         rv.append("%s\"" % (' ' * 2, finalText))
    178 
    179     return "".join(rv)
    180 
    181 
    182 class TreeBuilder(_base.TreeBuilder):
    183     documentClass = Document
    184     doctypeClass = DocumentType
    185     elementClass = None
    186     commentClass = None
    187     fragmentClass = Document
    188     implementation = etree
    189 
    190     def __init__(self, namespaceHTMLElements, fullTree=False):
    191         builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
    192         infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
    193         self.namespaceHTMLElements = namespaceHTMLElements
    194 
    195         class Attributes(dict):
    196             def __init__(self, element, value={}):
    197                 self._element = element
    198                 dict.__init__(self, value)
    199                 for key, value in self.items():
    200                     if isinstance(key, tuple):
    201                         name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
    202                     else:
    203                         name = infosetFilter.coerceAttribute(key)
    204                     self._element._element.attrib[name] = value
    205 
    206             def __setitem__(self, key, value):
    207                 dict.__setitem__(self, key, value)
    208                 if isinstance(key, tuple):
    209                     name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
    210                 else:
    211                     name = infosetFilter.coerceAttribute(key)
    212                 self._element._element.attrib[name] = value
    213 
    214         class Element(builder.Element):
    215             def __init__(self, name, namespace):
    216                 name = infosetFilter.coerceElement(name)
    217                 builder.Element.__init__(self, name, namespace=namespace)
    218                 self._attributes = Attributes(self)
    219 
    220             def _setName(self, name):
    221                 self._name = infosetFilter.coerceElement(name)
    222                 self._element.tag = self._getETreeTag(
    223                     self._name, self._namespace)
    224 
    225             def _getName(self):
    226                 return infosetFilter.fromXmlName(self._name)
    227 
    228             name = property(_getName, _setName)
    229 
    230             def _getAttributes(self):
    231                 return self._attributes
    232 
    233             def _setAttributes(self, attributes):
    234                 self._attributes = Attributes(self, attributes)
    235 
    236             attributes = property(_getAttributes, _setAttributes)
    237 
    238             def insertText(self, data, insertBefore=None):
    239                 data = infosetFilter.coerceCharacters(data)
    240                 builder.Element.insertText(self, data, insertBefore)
    241 
    242             def appendChild(self, child):
    243                 builder.Element.appendChild(self, child)
    244 
    245         class Comment(builder.Comment):
    246             def __init__(self, data):
    247                 data = infosetFilter.coerceComment(data)
    248                 builder.Comment.__init__(self, data)
    249 
    250             def _setData(self, data):
    251                 data = infosetFilter.coerceComment(data)
    252                 self._element.text = data
    253 
    254             def _getData(self):
    255                 return self._element.text
    256 
    257             data = property(_getData, _setData)
    258 
    259         self.elementClass = Element
    260         self.commentClass = builder.Comment
    261         # self.fragmentClass = builder.DocumentFragment
    262         _base.TreeBuilder.__init__(self, namespaceHTMLElements)
    263 
    264     def reset(self):
    265         _base.TreeBuilder.reset(self)
    266         self.insertComment = self.insertCommentInitial
    267         self.initial_comments = []
    268         self.doctype = None
    269 
    270     def testSerializer(self, element):
    271         return testSerializer(element)
    272 
    273     def getDocument(self):
    274         if fullTree:
    275             return self.document._elementTree
    276         else:
    277             return self.document._elementTree.getroot()
    278 
    279     def getFragment(self):
    280         fragment = []
    281         element = self.openElements[0]._element
    282         if element.text:
    283             fragment.append(element.text)
    284         fragment.extend(list(element))
    285         if element.tail:
    286             fragment.append(element.tail)
    287         return fragment
    288 
    289     def insertDoctype(self, token):
    290         name = token["name"]
    291         publicId = token["publicId"]
    292         systemId = token["systemId"]
    293 
    294         if not name:
    295             warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
    296             self.doctype = None
    297         else:
    298             coercedName = self.infosetFilter.coerceElement(name)
    299             if coercedName != name:
    300                 warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
    301 
    302             doctype = self.doctypeClass(coercedName, publicId, systemId)
    303             self.doctype = doctype
    304 
    305     def insertCommentInitial(self, data, parent=None):
    306         self.initial_comments.append(data)
    307 
    308     def insertCommentMain(self, data, parent=None):
    309         if (parent == self.document and
    310                 self.document._elementTree.getroot()[-1].tag == comment_type):
    311                 warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
    312         super(TreeBuilder, self).insertComment(data, parent)
    313 
    314     def insertRoot(self, token):
    315         """Create the document root"""
    316         # Because of the way libxml2 works, it doesn't seem to be possible to
    317         # alter information like the doctype after the tree has been parsed.
    318         # Therefore we need to use the built-in parser to create our iniial
    319         # tree, after which we can add elements like normal
    320         docStr = ""
    321         if self.doctype:
    322             assert self.doctype.name
    323             docStr += "<!DOCTYPE %s" % self.doctype.name
    324             if (self.doctype.publicId is not None or
    325                     self.doctype.systemId is not None):
    326                 docStr += (' PUBLIC "%s" ' %
    327                            (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
    328                 if self.doctype.systemId:
    329                     sysid = self.doctype.systemId
    330                     if sysid.find("'") >= 0 and sysid.find('"') >= 0:
    331                         warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
    332                         sysid = sysid.replace("'", 'U00027')
    333                     if sysid.find("'") >= 0:
    334                         docStr += '"%s"' % sysid
    335                     else:
    336                         docStr += "'%s'" % sysid
    337                 else:
    338                     docStr += "''"
    339             docStr += ">"
    340             if self.doctype.name != token["name"]:
    341                 warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
    342         docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
    343         root = etree.fromstring(docStr)
    344 
    345         # Append the initial comments:
    346         for comment_token in self.initial_comments:
    347             root.addprevious(etree.Comment(comment_token["data"]))
    348 
    349         # Create the root document and add the ElementTree to it
    350         self.document = self.documentClass()
    351         self.document._elementTree = root.getroottree()
    352 
    353         # Give the root element the right name
    354         name = token["name"]
    355         namespace = token.get("namespace", self.defaultNamespace)
    356         if namespace is None:
    357             etree_tag = name
    358         else:
    359             etree_tag = "{%s}%s" % (namespace, name)
    360         root.tag = etree_tag
    361 
    362         # Add the root element to the internal child/open data structures
    363         root_element = self.elementClass(name, namespace)
    364         root_element._element = root
    365         self.document._childNodes.append(root_element)
    366         self.openElements.append(root_element)
    367 
    368         # Reset to the default insert comment function
    369         self.insertComment = self.insertCommentMain
    370