Home | History | Annotate | Download | only in treebuilders
      1 from __future__ import absolute_import, division, unicode_literals
      2 from six import text_type
      3 
      4 from ..constants import scopingElements, tableInsertModeElements, namespaces
      5 
      6 # The scope markers are inserted when entering object elements,
      7 # marquees, table cells, and table captions, and are used to prevent formatting
      8 # from "leaking" into tables, object elements, and marquees.
      9 Marker = None
     10 
     11 listElementsMap = {
     12     None: (frozenset(scopingElements), False),
     13     "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
     14     "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
     15                                               (namespaces["html"], "ul")])), False),
     16     "table": (frozenset([(namespaces["html"], "html"),
     17                          (namespaces["html"], "table")]), False),
     18     "select": (frozenset([(namespaces["html"], "optgroup"),
     19                           (namespaces["html"], "option")]), True)
     20 }
     21 
     22 
     23 class Node(object):
     24     def __init__(self, name):
     25         """Node representing an item in the tree.
     26         name - The tag name associated with the node
     27         parent - The parent of the current node (or None for the document node)
     28         value - The value of the current node (applies to text nodes and
     29         comments
     30         attributes - a dict holding name, value pairs for attributes of the node
     31         childNodes - a list of child nodes of the current node. This must
     32         include all elements but not necessarily other node types
     33         _flags - A list of miscellaneous flags that can be set on the node
     34         """
     35         self.name = name
     36         self.parent = None
     37         self.value = None
     38         self.attributes = {}
     39         self.childNodes = []
     40         self._flags = []
     41 
     42     def __str__(self):
     43         attributesStr = " ".join(["%s=\"%s\"" % (name, value)
     44                                   for name, value in
     45                                   self.attributes.items()])
     46         if attributesStr:
     47             return "<%s %s>" % (self.name, attributesStr)
     48         else:
     49             return "<%s>" % (self.name)
     50 
     51     def __repr__(self):
     52         return "<%s>" % (self.name)
     53 
     54     def appendChild(self, node):
     55         """Insert node as a child of the current node
     56         """
     57         raise NotImplementedError
     58 
     59     def insertText(self, data, insertBefore=None):
     60         """Insert data as text in the current node, positioned before the
     61         start of node insertBefore or to the end of the node's text.
     62         """
     63         raise NotImplementedError
     64 
     65     def insertBefore(self, node, refNode):
     66         """Insert node as a child of the current node, before refNode in the
     67         list of child nodes. Raises ValueError if refNode is not a child of
     68         the current node"""
     69         raise NotImplementedError
     70 
     71     def removeChild(self, node):
     72         """Remove node from the children of the current node
     73         """
     74         raise NotImplementedError
     75 
     76     def reparentChildren(self, newParent):
     77         """Move all the children of the current node to newParent.
     78         This is needed so that trees that don't store text as nodes move the
     79         text in the correct way
     80         """
     81         # XXX - should this method be made more general?
     82         for child in self.childNodes:
     83             newParent.appendChild(child)
     84         self.childNodes = []
     85 
     86     def cloneNode(self):
     87         """Return a shallow copy of the current node i.e. a node with the same
     88         name and attributes but with no parent or child nodes
     89         """
     90         raise NotImplementedError
     91 
     92     def hasContent(self):
     93         """Return true if the node has children or text, false otherwise
     94         """
     95         raise NotImplementedError
     96 
     97 
     98 class ActiveFormattingElements(list):
     99     def append(self, node):
    100         equalCount = 0
    101         if node != Marker:
    102             for element in self[::-1]:
    103                 if element == Marker:
    104                     break
    105                 if self.nodesEqual(element, node):
    106                     equalCount += 1
    107                 if equalCount == 3:
    108                     self.remove(element)
    109                     break
    110         list.append(self, node)
    111 
    112     def nodesEqual(self, node1, node2):
    113         if not node1.nameTuple == node2.nameTuple:
    114             return False
    115 
    116         if not node1.attributes == node2.attributes:
    117             return False
    118 
    119         return True
    120 
    121 
    122 class TreeBuilder(object):
    123     """Base treebuilder implementation
    124     documentClass - the class to use for the bottommost node of a document
    125     elementClass - the class to use for HTML Elements
    126     commentClass - the class to use for comments
    127     doctypeClass - the class to use for doctypes
    128     """
    129 
    130     # Document class
    131     documentClass = None
    132 
    133     # The class to use for creating a node
    134     elementClass = None
    135 
    136     # The class to use for creating comments
    137     commentClass = None
    138 
    139     # The class to use for creating doctypes
    140     doctypeClass = None
    141 
    142     # Fragment class
    143     fragmentClass = None
    144 
    145     def __init__(self, namespaceHTMLElements):
    146         if namespaceHTMLElements:
    147             self.defaultNamespace = "http://www.w3.org/1999/xhtml"
    148         else:
    149             self.defaultNamespace = None
    150         self.reset()
    151 
    152     def reset(self):
    153         self.openElements = []
    154         self.activeFormattingElements = ActiveFormattingElements()
    155 
    156         # XXX - rename these to headElement, formElement
    157         self.headPointer = None
    158         self.formPointer = None
    159 
    160         self.insertFromTable = False
    161 
    162         self.document = self.documentClass()
    163 
    164     def elementInScope(self, target, variant=None):
    165 
    166         # If we pass a node in we match that. if we pass a string
    167         # match any node with that name
    168         exactNode = hasattr(target, "nameTuple")
    169 
    170         listElements, invert = listElementsMap[variant]
    171 
    172         for node in reversed(self.openElements):
    173             if (node.name == target and not exactNode or
    174                     node == target and exactNode):
    175                 return True
    176             elif (invert ^ (node.nameTuple in listElements)):
    177                 return False
    178 
    179         assert False  # We should never reach this point
    180 
    181     def reconstructActiveFormattingElements(self):
    182         # Within this algorithm the order of steps described in the
    183         # specification is not quite the same as the order of steps in the
    184         # code. It should still do the same though.
    185 
    186         # Step 1: stop the algorithm when there's nothing to do.
    187         if not self.activeFormattingElements:
    188             return
    189 
    190         # Step 2 and step 3: we start with the last element. So i is -1.
    191         i = len(self.activeFormattingElements) - 1
    192         entry = self.activeFormattingElements[i]
    193         if entry == Marker or entry in self.openElements:
    194             return
    195 
    196         # Step 6
    197         while entry != Marker and entry not in self.openElements:
    198             if i == 0:
    199                 # This will be reset to 0 below
    200                 i = -1
    201                 break
    202             i -= 1
    203             # Step 5: let entry be one earlier in the list.
    204             entry = self.activeFormattingElements[i]
    205 
    206         while True:
    207             # Step 7
    208             i += 1
    209 
    210             # Step 8
    211             entry = self.activeFormattingElements[i]
    212             clone = entry.cloneNode()  # Mainly to get a new copy of the attributes
    213 
    214             # Step 9
    215             element = self.insertElement({"type": "StartTag",
    216                                           "name": clone.name,
    217                                           "namespace": clone.namespace,
    218                                           "data": clone.attributes})
    219 
    220             # Step 10
    221             self.activeFormattingElements[i] = element
    222 
    223             # Step 11
    224             if element == self.activeFormattingElements[-1]:
    225                 break
    226 
    227     def clearActiveFormattingElements(self):
    228         entry = self.activeFormattingElements.pop()
    229         while self.activeFormattingElements and entry != Marker:
    230             entry = self.activeFormattingElements.pop()
    231 
    232     def elementInActiveFormattingElements(self, name):
    233         """Check if an element exists between the end of the active
    234         formatting elements and the last marker. If it does, return it, else
    235         return false"""
    236 
    237         for item in self.activeFormattingElements[::-1]:
    238             # Check for Marker first because if it's a Marker it doesn't have a
    239             # name attribute.
    240             if item == Marker:
    241                 break
    242             elif item.name == name:
    243                 return item
    244         return False
    245 
    246     def insertRoot(self, token):
    247         element = self.createElement(token)
    248         self.openElements.append(element)
    249         self.document.appendChild(element)
    250 
    251     def insertDoctype(self, token):
    252         name = token["name"]
    253         publicId = token["publicId"]
    254         systemId = token["systemId"]
    255 
    256         doctype = self.doctypeClass(name, publicId, systemId)
    257         self.document.appendChild(doctype)
    258 
    259     def insertComment(self, token, parent=None):
    260         if parent is None:
    261             parent = self.openElements[-1]
    262         parent.appendChild(self.commentClass(token["data"]))
    263 
    264     def createElement(self, token):
    265         """Create an element but don't insert it anywhere"""
    266         name = token["name"]
    267         namespace = token.get("namespace", self.defaultNamespace)
    268         element = self.elementClass(name, namespace)
    269         element.attributes = token["data"]
    270         return element
    271 
    272     def _getInsertFromTable(self):
    273         return self._insertFromTable
    274 
    275     def _setInsertFromTable(self, value):
    276         """Switch the function used to insert an element from the
    277         normal one to the misnested table one and back again"""
    278         self._insertFromTable = value
    279         if value:
    280             self.insertElement = self.insertElementTable
    281         else:
    282             self.insertElement = self.insertElementNormal
    283 
    284     insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
    285 
    286     def insertElementNormal(self, token):
    287         name = token["name"]
    288         assert isinstance(name, text_type), "Element %s not unicode" % name
    289         namespace = token.get("namespace", self.defaultNamespace)
    290         element = self.elementClass(name, namespace)
    291         element.attributes = token["data"]
    292         self.openElements[-1].appendChild(element)
    293         self.openElements.append(element)
    294         return element
    295 
    296     def insertElementTable(self, token):
    297         """Create an element and insert it into the tree"""
    298         element = self.createElement(token)
    299         if self.openElements[-1].name not in tableInsertModeElements:
    300             return self.insertElementNormal(token)
    301         else:
    302             # We should be in the InTable mode. This means we want to do
    303             # special magic element rearranging
    304             parent, insertBefore = self.getTableMisnestedNodePosition()
    305             if insertBefore is None:
    306                 parent.appendChild(element)
    307             else:
    308                 parent.insertBefore(element, insertBefore)
    309             self.openElements.append(element)
    310         return element
    311 
    312     def insertText(self, data, parent=None):
    313         """Insert text data."""
    314         if parent is None:
    315             parent = self.openElements[-1]
    316 
    317         if (not self.insertFromTable or (self.insertFromTable and
    318                                          self.openElements[-1].name
    319                                          not in tableInsertModeElements)):
    320             parent.insertText(data)
    321         else:
    322             # We should be in the InTable mode. This means we want to do
    323             # special magic element rearranging
    324             parent, insertBefore = self.getTableMisnestedNodePosition()
    325             parent.insertText(data, insertBefore)
    326 
    327     def getTableMisnestedNodePosition(self):
    328         """Get the foster parent element, and sibling to insert before
    329         (or None) when inserting a misnested table node"""
    330         # The foster parent element is the one which comes before the most
    331         # recently opened table element
    332         # XXX - this is really inelegant
    333         lastTable = None
    334         fosterParent = None
    335         insertBefore = None
    336         for elm in self.openElements[::-1]:
    337             if elm.name == "table":
    338                 lastTable = elm
    339                 break
    340         if lastTable:
    341             # XXX - we should really check that this parent is actually a
    342             # node here
    343             if lastTable.parent:
    344                 fosterParent = lastTable.parent
    345                 insertBefore = lastTable
    346             else:
    347                 fosterParent = self.openElements[
    348                     self.openElements.index(lastTable) - 1]
    349         else:
    350             fosterParent = self.openElements[0]
    351         return fosterParent, insertBefore
    352 
    353     def generateImpliedEndTags(self, exclude=None):
    354         name = self.openElements[-1].name
    355         # XXX td, th and tr are not actually needed
    356         if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
    357                 and name != exclude):
    358             self.openElements.pop()
    359             # XXX This is not entirely what the specification says. We should
    360             # investigate it more closely.
    361             self.generateImpliedEndTags(exclude)
    362 
    363     def getDocument(self):
    364         "Return the final tree"
    365         return self.document
    366 
    367     def getFragment(self):
    368         "Return the final fragment"
    369         # assert self.innerHTML
    370         fragment = self.fragmentClass()
    371         self.openElements[0].reparentChildren(fragment)
    372         return fragment
    373 
    374     def testSerializer(self, node):
    375         """Serialize the subtree of node in the format required by unit tests
    376         node - the node from which to start serializing"""
    377         raise NotImplementedError
    378