Home | History | Annotate | Download | only in dom
      1 import xml.sax
      2 import xml.sax.handler
      3 
# Event-type tokens.  Each event queued by PullDOM is a (token, node)
# pair whose first item is one of these strings.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
     12 
     13 class PullDOM(xml.sax.ContentHandler):
     14     _locator = None
     15     document = None
     16 
     17     def __init__(self, documentFactory=None):
     18         from xml.dom import XML_NAMESPACE
     19         self.documentFactory = documentFactory
     20         self.firstEvent = [None, None]
     21         self.lastEvent = self.firstEvent
     22         self.elementStack = []
     23         self.push = self.elementStack.append
     24         try:
     25             self.pop = self.elementStack.pop
     26         except AttributeError:
     27             # use class' pop instead
     28             pass
     29         self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
     30         self._current_context = self._ns_contexts[-1]
     31         self.pending_events = []
     32 
     33     def pop(self):
     34         result = self.elementStack[-1]
     35         del self.elementStack[-1]
     36         return result
     37 
     38     def setDocumentLocator(self, locator):
     39         self._locator = locator
     40 
     41     def startPrefixMapping(self, prefix, uri):
     42         if not hasattr(self, '_xmlns_attrs'):
     43             self._xmlns_attrs = []
     44         self._xmlns_attrs.append((prefix or 'xmlns', uri))
     45         self._ns_contexts.append(self._current_context.copy())
     46         self._current_context[uri] = prefix or None
     47 
     48     def endPrefixMapping(self, prefix):
     49         self._current_context = self._ns_contexts.pop()
     50 
     51     def startElementNS(self, name, tagName , attrs):
     52         # Retrieve xml namespace declaration attributes.
     53         xmlns_uri = 'http://www.w3.org/2000/xmlns/'
     54         xmlns_attrs = getattr(self, '_xmlns_attrs', None)
     55         if xmlns_attrs is not None:
     56             for aname, value in xmlns_attrs:
     57                 attrs._attrs[(xmlns_uri, aname)] = value
     58             self._xmlns_attrs = []
     59         uri, localname = name
     60         if uri:
     61             # When using namespaces, the reader may or may not
     62             # provide us with the original name. If not, create
     63             # *a* valid tagName from the current context.
     64             if tagName is None:
     65                 prefix = self._current_context[uri]
     66                 if prefix:
     67                     tagName = prefix + ":" + localname
     68                 else:
     69                     tagName = localname
     70             if self.document:
     71                 node = self.document.createElementNS(uri, tagName)
     72             else:
     73                 node = self.buildDocument(uri, tagName)
     74         else:
     75             # When the tagname is not prefixed, it just appears as
     76             # localname
     77             if self.document:
     78                 node = self.document.createElement(localname)
     79             else:
     80                 node = self.buildDocument(None, localname)
     81 
     82         for aname,value in attrs.items():
     83             a_uri, a_localname = aname
     84             if a_uri == xmlns_uri:
     85                 if a_localname == 'xmlns':
     86                     qname = a_localname
     87                 else:
     88                     qname = 'xmlns:' + a_localname
     89                 attr = self.document.createAttributeNS(a_uri, qname)
     90                 node.setAttributeNodeNS(attr)
     91             elif a_uri:
     92                 prefix = self._current_context[a_uri]
     93                 if prefix:
     94                     qname = prefix + ":" + a_localname
     95                 else:
     96                     qname = a_localname
     97                 attr = self.document.createAttributeNS(a_uri, qname)
     98                 node.setAttributeNodeNS(attr)
     99             else:
    100                 attr = self.document.createAttribute(a_localname)
    101                 node.setAttributeNode(attr)
    102             attr.value = value
    103 
    104         self.lastEvent[1] = [(START_ELEMENT, node), None]
    105         self.lastEvent = self.lastEvent[1]
    106         self.push(node)
    107 
    108     def endElementNS(self, name, tagName):
    109         self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    110         self.lastEvent = self.lastEvent[1]
    111 
    112     def startElement(self, name, attrs):
    113         if self.document:
    114             node = self.document.createElement(name)
    115         else:
    116             node = self.buildDocument(None, name)
    117 
    118         for aname,value in attrs.items():
    119             attr = self.document.createAttribute(aname)
    120             attr.value = value
    121             node.setAttributeNode(attr)
    122 
    123         self.lastEvent[1] = [(START_ELEMENT, node), None]
    124         self.lastEvent = self.lastEvent[1]
    125         self.push(node)
    126 
    127     def endElement(self, name):
    128         self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    129         self.lastEvent = self.lastEvent[1]
    130 
    131     def comment(self, s):
    132         if self.document:
    133             node = self.document.createComment(s)
    134             self.lastEvent[1] = [(COMMENT, node), None]
    135             self.lastEvent = self.lastEvent[1]
    136         else:
    137             event = [(COMMENT, s), None]
    138             self.pending_events.append(event)
    139 
    140     def processingInstruction(self, target, data):
    141         if self.document:
    142             node = self.document.createProcessingInstruction(target, data)
    143             self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
    144             self.lastEvent = self.lastEvent[1]
    145         else:
    146             event = [(PROCESSING_INSTRUCTION, target, data), None]
    147             self.pending_events.append(event)
    148 
    149     def ignorableWhitespace(self, chars):
    150         node = self.document.createTextNode(chars)
    151         self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
    152         self.lastEvent = self.lastEvent[1]
    153 
    154     def characters(self, chars):
    155         node = self.document.createTextNode(chars)
    156         self.lastEvent[1] = [(CHARACTERS, node), None]
    157         self.lastEvent = self.lastEvent[1]
    158 
    159     def startDocument(self):
    160         if self.documentFactory is None:
    161             import xml.dom.minidom
    162             self.documentFactory = xml.dom.minidom.Document.implementation
    163 
    164     def buildDocument(self, uri, tagname):
    165         # Can't do that in startDocument, since we need the tagname
    166         # XXX: obtain DocumentType
    167         node = self.documentFactory.createDocument(uri, tagname, None)
    168         self.document = node
    169         self.lastEvent[1] = [(START_DOCUMENT, node), None]
    170         self.lastEvent = self.lastEvent[1]
    171         self.push(node)
    172         # Put everything we have seen so far into the document
    173         for e in self.pending_events:
    174             if e[0][0] == PROCESSING_INSTRUCTION:
    175                 _,target,data = e[0]
    176                 n = self.document.createProcessingInstruction(target, data)
    177                 e[0] = (PROCESSING_INSTRUCTION, n)
    178             elif e[0][0] == COMMENT:
    179                 n = self.document.createComment(e[0][1])
    180                 e[0] = (COMMENT, n)
    181             else:
    182                 raise AssertionError("Unknown pending event ",e[0][0])
    183             self.lastEvent[1] = e
    184             self.lastEvent = e
    185         self.pending_events = None
    186         return node.firstChild
    187 
    188     def endDocument(self):
    189         self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
    190         self.pop()
    191 
    192     def clear(self):
    193         "clear(): Explicitly release parsing structures"
    194         self.document = None
    195 
    196 class ErrorHandler:
    197     def warning(self, exception):
    198         print(exception)
    199     def error(self, exception):
    200         raise exception
    201     def fatalError(self, exception):
    202         raise exception
    203 
    204 class DOMEventStream:
    205     def __init__(self, stream, parser, bufsize):
    206         self.stream = stream
    207         self.parser = parser
    208         self.bufsize = bufsize
    209         if not hasattr(self.parser, 'feed'):
    210             self.getEvent = self._slurp
    211         self.reset()
    212 
    213     def reset(self):
    214         self.pulldom = PullDOM()
    215         # This content handler relies on namespace support
    216         self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    217         self.parser.setContentHandler(self.pulldom)
    218 
    219     def __getitem__(self, pos):
    220         rc = self.getEvent()
    221         if rc:
    222             return rc
    223         raise IndexError
    224 
    225     def __next__(self):
    226         rc = self.getEvent()
    227         if rc:
    228             return rc
    229         raise StopIteration
    230 
    231     def __iter__(self):
    232         return self
    233 
    234     def expandNode(self, node):
    235         event = self.getEvent()
    236         parents = [node]
    237         while event:
    238             token, cur_node = event
    239             if cur_node is node:
    240                 return
    241             if token != END_ELEMENT:
    242                 parents[-1].appendChild(cur_node)
    243             if token == START_ELEMENT:
    244                 parents.append(cur_node)
    245             elif token == END_ELEMENT:
    246                 del parents[-1]
    247             event = self.getEvent()
    248 
    249     def getEvent(self):
    250         # use IncrementalParser interface, so we get the desired
    251         # pull effect
    252         if not self.pulldom.firstEvent[1]:
    253             self.pulldom.lastEvent = self.pulldom.firstEvent
    254         while not self.pulldom.firstEvent[1]:
    255             buf = self.stream.read(self.bufsize)
    256             if not buf:
    257                 self.parser.close()
    258                 return None
    259             self.parser.feed(buf)
    260         rc = self.pulldom.firstEvent[1][0]
    261         self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
    262         return rc
    263 
    264     def _slurp(self):
    265         """ Fallback replacement for getEvent() using the
    266             standard SAX2 interface, which means we slurp the
    267             SAX events into memory (no performance gain, but
    268             we are compatible to all SAX parsers).
    269         """
    270         self.parser.parse(self.stream)
    271         self.getEvent = self._emit
    272         return self._emit()
    273 
    274     def _emit(self):
    275         """ Fallback replacement for getEvent() that emits
    276             the events that _slurp() read previously.
    277         """
    278         rc = self.pulldom.firstEvent[1][0]
    279         self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
    280         return rc
    281 
    282     def clear(self):
    283         """clear(): Explicitly release parsing objects"""
    284         self.pulldom.clear()
    285         del self.pulldom
    286         self.parser = None
    287         self.stream = None
    288 
    289 class SAX2DOM(PullDOM):
    290 
    291     def startElementNS(self, name, tagName , attrs):
    292         PullDOM.startElementNS(self, name, tagName, attrs)
    293         curNode = self.elementStack[-1]
    294         parentNode = self.elementStack[-2]
    295         parentNode.appendChild(curNode)
    296 
    297     def startElement(self, name, attrs):
    298         PullDOM.startElement(self, name, attrs)
    299         curNode = self.elementStack[-1]
    300         parentNode = self.elementStack[-2]
    301         parentNode.appendChild(curNode)
    302 
    303     def processingInstruction(self, target, data):
    304         PullDOM.processingInstruction(self, target, data)
    305         node = self.lastEvent[0][1]
    306         parentNode = self.elementStack[-1]
    307         parentNode.appendChild(node)
    308 
    309     def ignorableWhitespace(self, chars):
    310         PullDOM.ignorableWhitespace(self, chars)
    311         node = self.lastEvent[0][1]
    312         parentNode = self.elementStack[-1]
    313         parentNode.appendChild(node)
    314 
    315     def characters(self, chars):
    316         PullDOM.characters(self, chars)
    317         node = self.lastEvent[0][1]
    318         parentNode = self.elementStack[-1]
    319         parentNode.appendChild(node)
    320 
    321 
# Buffer size parse() falls back to when the caller passes no bufsize.
default_bufsize = (2 ** 14) - 20
    323 
    324 def parse(stream_or_string, parser=None, bufsize=None):
    325     if bufsize is None:
    326         bufsize = default_bufsize
    327     if isinstance(stream_or_string, str):
    328         stream = open(stream_or_string, 'rb')
    329     else:
    330         stream = stream_or_string
    331     if not parser:
    332         parser = xml.sax.make_parser()
    333     return DOMEventStream(stream, parser, bufsize)
    334 
    335 def parseString(string, parser=None):
    336     from io import StringIO
    337 
    338     bufsize = len(string)
    339     buf = StringIO(string)
    340     if not parser:
    341         parser = xml.sax.make_parser()
    342     return DOMEventStream(buf, parser, bufsize)
    343