Home | History | Annotate | Download | only in dom
      1 import xml.sax
      2 import xml.sax.handler
      3 import types
      4 
      5 try:
      6     _StringTypes = [types.StringType, types.UnicodeType]
      7 except AttributeError:
      8     _StringTypes = [types.StringType]
      9 
     10 START_ELEMENT = "START_ELEMENT"
     11 END_ELEMENT = "END_ELEMENT"
     12 COMMENT = "COMMENT"
     13 START_DOCUMENT = "START_DOCUMENT"
     14 END_DOCUMENT = "END_DOCUMENT"
     15 PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
     16 IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
     17 CHARACTERS = "CHARACTERS"
     18 
     19 class PullDOM(xml.sax.ContentHandler):
     20     _locator = None
     21     document = None
     22 
     23     def __init__(self, documentFactory=None):
     24         from xml.dom import XML_NAMESPACE
     25         self.documentFactory = documentFactory
     26         self.firstEvent = [None, None]
     27         self.lastEvent = self.firstEvent
     28         self.elementStack = []
     29         self.push = self.elementStack.append
     30         try:
     31             self.pop = self.elementStack.pop
     32         except AttributeError:
     33             # use class' pop instead
     34             pass
     35         self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
     36         self._current_context = self._ns_contexts[-1]
     37         self.pending_events = []
     38 
     39     def pop(self):
     40         result = self.elementStack[-1]
     41         del self.elementStack[-1]
     42         return result
     43 
     44     def setDocumentLocator(self, locator):
     45         self._locator = locator
     46 
     47     def startPrefixMapping(self, prefix, uri):
     48         if not hasattr(self, '_xmlns_attrs'):
     49             self._xmlns_attrs = []
     50         self._xmlns_attrs.append((prefix or 'xmlns', uri))
     51         self._ns_contexts.append(self._current_context.copy())
     52         self._current_context[uri] = prefix or None
     53 
     54     def endPrefixMapping(self, prefix):
     55         self._current_context = self._ns_contexts.pop()
     56 
     57     def startElementNS(self, name, tagName , attrs):
     58         # Retrieve xml namespace declaration attributes.
     59         xmlns_uri = 'http://www.w3.org/2000/xmlns/'
     60         xmlns_attrs = getattr(self, '_xmlns_attrs', None)
     61         if xmlns_attrs is not None:
     62             for aname, value in xmlns_attrs:
     63                 attrs._attrs[(xmlns_uri, aname)] = value
     64             self._xmlns_attrs = []
     65         uri, localname = name
     66         if uri:
     67             # When using namespaces, the reader may or may not
     68             # provide us with the original name. If not, create
     69             # *a* valid tagName from the current context.
     70             if tagName is None:
     71                 prefix = self._current_context[uri]
     72                 if prefix:
     73                     tagName = prefix + ":" + localname
     74                 else:
     75                     tagName = localname
     76             if self.document:
     77                 node = self.document.createElementNS(uri, tagName)
     78             else:
     79                 node = self.buildDocument(uri, tagName)
     80         else:
     81             # When the tagname is not prefixed, it just appears as
     82             # localname
     83             if self.document:
     84                 node = self.document.createElement(localname)
     85             else:
     86                 node = self.buildDocument(None, localname)
     87 
     88         for aname,value in attrs.items():
     89             a_uri, a_localname = aname
     90             if a_uri == xmlns_uri:
     91                 if a_localname == 'xmlns':
     92                     qname = a_localname
     93                 else:
     94                     qname = 'xmlns:' + a_localname
     95                 attr = self.document.createAttributeNS(a_uri, qname)
     96                 node.setAttributeNodeNS(attr)
     97             elif a_uri:
     98                 prefix = self._current_context[a_uri]
     99                 if prefix:
    100                     qname = prefix + ":" + a_localname
    101                 else:
    102                     qname = a_localname
    103                 attr = self.document.createAttributeNS(a_uri, qname)
    104                 node.setAttributeNodeNS(attr)
    105             else:
    106                 attr = self.document.createAttribute(a_localname)
    107                 node.setAttributeNode(attr)
    108             attr.value = value
    109 
    110         self.lastEvent[1] = [(START_ELEMENT, node), None]
    111         self.lastEvent = self.lastEvent[1]
    112         self.push(node)
    113 
    114     def endElementNS(self, name, tagName):
    115         self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    116         self.lastEvent = self.lastEvent[1]
    117 
    118     def startElement(self, name, attrs):
    119         if self.document:
    120             node = self.document.createElement(name)
    121         else:
    122             node = self.buildDocument(None, name)
    123 
    124         for aname,value in attrs.items():
    125             attr = self.document.createAttribute(aname)
    126             attr.value = value
    127             node.setAttributeNode(attr)
    128 
    129         self.lastEvent[1] = [(START_ELEMENT, node), None]
    130         self.lastEvent = self.lastEvent[1]
    131         self.push(node)
    132 
    133     def endElement(self, name):
    134         self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
    135         self.lastEvent = self.lastEvent[1]
    136 
    137     def comment(self, s):
    138         if self.document:
    139             node = self.document.createComment(s)
    140             self.lastEvent[1] = [(COMMENT, node), None]
    141             self.lastEvent = self.lastEvent[1]
    142         else:
    143             event = [(COMMENT, s), None]
    144             self.pending_events.append(event)
    145 
    146     def processingInstruction(self, target, data):
    147         if self.document:
    148             node = self.document.createProcessingInstruction(target, data)
    149             self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
    150             self.lastEvent = self.lastEvent[1]
    151         else:
    152             event = [(PROCESSING_INSTRUCTION, target, data), None]
    153             self.pending_events.append(event)
    154 
    155     def ignorableWhitespace(self, chars):
    156         node = self.document.createTextNode(chars)
    157         self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
    158         self.lastEvent = self.lastEvent[1]
    159 
    160     def characters(self, chars):
    161         node = self.document.createTextNode(chars)
    162         self.lastEvent[1] = [(CHARACTERS, node), None]
    163         self.lastEvent = self.lastEvent[1]
    164 
    165     def startDocument(self):
    166         if self.documentFactory is None:
    167             import xml.dom.minidom
    168             self.documentFactory = xml.dom.minidom.Document.implementation
    169 
    170     def buildDocument(self, uri, tagname):
    171         # Can't do that in startDocument, since we need the tagname
    172         # XXX: obtain DocumentType
    173         node = self.documentFactory.createDocument(uri, tagname, None)
    174         self.document = node
    175         self.lastEvent[1] = [(START_DOCUMENT, node), None]
    176         self.lastEvent = self.lastEvent[1]
    177         self.push(node)
    178         # Put everything we have seen so far into the document
    179         for e in self.pending_events:
    180             if e[0][0] == PROCESSING_INSTRUCTION:
    181                 _,target,data = e[0]
    182                 n = self.document.createProcessingInstruction(target, data)
    183                 e[0] = (PROCESSING_INSTRUCTION, n)
    184             elif e[0][0] == COMMENT:
    185                 n = self.document.createComment(e[0][1])
    186                 e[0] = (COMMENT, n)
    187             else:
    188                 raise AssertionError("Unknown pending event ",e[0][0])
    189             self.lastEvent[1] = e
    190             self.lastEvent = e
    191         self.pending_events = None
    192         return node.firstChild
    193 
    194     def endDocument(self):
    195         self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
    196         self.pop()
    197 
    198     def clear(self):
    199         "clear(): Explicitly release parsing structures"
    200         self.document = None
    201 
    202 class ErrorHandler:
    203     def warning(self, exception):
    204         print exception
    205     def error(self, exception):
    206         raise exception
    207     def fatalError(self, exception):
    208         raise exception
    209 
    210 class DOMEventStream:
    211     def __init__(self, stream, parser, bufsize):
    212         self.stream = stream
    213         self.parser = parser
    214         self.bufsize = bufsize
    215         if not hasattr(self.parser, 'feed'):
    216             self.getEvent = self._slurp
    217         self.reset()
    218 
    219     def reset(self):
    220         self.pulldom = PullDOM()
    221         # This content handler relies on namespace support
    222         self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    223         self.parser.setContentHandler(self.pulldom)
    224 
    225     def __getitem__(self, pos):
    226         rc = self.getEvent()
    227         if rc:
    228             return rc
    229         raise IndexError
    230 
    231     def next(self):
    232         rc = self.getEvent()
    233         if rc:
    234             return rc
    235         raise StopIteration
    236 
    237     def __iter__(self):
    238         return self
    239 
    240     def expandNode(self, node):
    241         event = self.getEvent()
    242         parents = [node]
    243         while event:
    244             token, cur_node = event
    245             if cur_node is node:
    246                 return
    247             if token != END_ELEMENT:
    248                 parents[-1].appendChild(cur_node)
    249             if token == START_ELEMENT:
    250                 parents.append(cur_node)
    251             elif token == END_ELEMENT:
    252                 del parents[-1]
    253             event = self.getEvent()
    254 
    255     def getEvent(self):
    256         # use IncrementalParser interface, so we get the desired
    257         # pull effect
    258         if not self.pulldom.firstEvent[1]:
    259             self.pulldom.lastEvent = self.pulldom.firstEvent
    260         while not self.pulldom.firstEvent[1]:
    261             buf = self.stream.read(self.bufsize)
    262             if not buf:
    263                 self.parser.close()
    264                 return None
    265             self.parser.feed(buf)
    266         rc = self.pulldom.firstEvent[1][0]
    267         self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
    268         return rc
    269 
    270     def _slurp(self):
    271         """ Fallback replacement for getEvent() using the
    272             standard SAX2 interface, which means we slurp the
    273             SAX events into memory (no performance gain, but
    274             we are compatible to all SAX parsers).
    275         """
    276         self.parser.parse(self.stream)
    277         self.getEvent = self._emit
    278         return self._emit()
    279 
    280     def _emit(self):
    281         """ Fallback replacement for getEvent() that emits
    282             the events that _slurp() read previously.
    283         """
    284         rc = self.pulldom.firstEvent[1][0]
    285         self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
    286         return rc
    287 
    288     def clear(self):
    289         """clear(): Explicitly release parsing objects"""
    290         self.pulldom.clear()
    291         del self.pulldom
    292         self.parser = None
    293         self.stream = None
    294 
    295 class SAX2DOM(PullDOM):
    296 
    297     def startElementNS(self, name, tagName , attrs):
    298         PullDOM.startElementNS(self, name, tagName, attrs)
    299         curNode = self.elementStack[-1]
    300         parentNode = self.elementStack[-2]
    301         parentNode.appendChild(curNode)
    302 
    303     def startElement(self, name, attrs):
    304         PullDOM.startElement(self, name, attrs)
    305         curNode = self.elementStack[-1]
    306         parentNode = self.elementStack[-2]
    307         parentNode.appendChild(curNode)
    308 
    309     def processingInstruction(self, target, data):
    310         PullDOM.processingInstruction(self, target, data)
    311         node = self.lastEvent[0][1]
    312         parentNode = self.elementStack[-1]
    313         parentNode.appendChild(node)
    314 
    315     def ignorableWhitespace(self, chars):
    316         PullDOM.ignorableWhitespace(self, chars)
    317         node = self.lastEvent[0][1]
    318         parentNode = self.elementStack[-1]
    319         parentNode.appendChild(node)
    320 
    321     def characters(self, chars):
    322         PullDOM.characters(self, chars)
    323         node = self.lastEvent[0][1]
    324         parentNode = self.elementStack[-1]
    325         parentNode.appendChild(node)
    326 
    327 
    328 default_bufsize = (2 ** 14) - 20
    329 
    330 def parse(stream_or_string, parser=None, bufsize=None):
    331     if bufsize is None:
    332         bufsize = default_bufsize
    333     if type(stream_or_string) in _StringTypes:
    334         stream = open(stream_or_string)
    335     else:
    336         stream = stream_or_string
    337     if not parser:
    338         parser = xml.sax.make_parser()
    339     return DOMEventStream(stream, parser, bufsize)
    340 
    341 def parseString(string, parser=None):
    342     try:
    343         from cStringIO import StringIO
    344     except ImportError:
    345         from StringIO import StringIO
    346 
    347     bufsize = len(string)
    348     buf = StringIO(string)
    349     if not parser:
    350         parser = xml.sax.make_parser()
    351     return DOMEventStream(buf, parser, bufsize)
    352