"""Pull-style DOM building on top of SAX.

SAX callbacks are turned into a queue of ``(event_type, node)`` tuples that
a caller can pull one at a time from a :class:`DOMEventStream`.  Only the
sub-trees the caller asks for (via ``expandNode``) are linked into a DOM.
"""

import xml.sax
import xml.sax.handler
import types

try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    try:
        # Interpreter that has StringType but no UnicodeType.
        _StringTypes = [types.StringType]
    except AttributeError:
        # Interpreter whose ``types`` module has no legacy string aliases;
        # the builtin ``str`` is then the text type to test against.
        _StringTypes = [str]

START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records callbacks as a linked event queue.

    Events are stored as two-item lists ``[event, next_cell]`` chained from
    ``firstEvent``; ``lastEvent`` always refers to the tail cell so that new
    events can be appended in constant time.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create an empty handler.

        documentFactory -- object providing ``createDocument(uri, name,
        doctype)``; when None, one is chosen in ``startDocument``.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        # Sentinel head cell of the event list: [event, next_cell].
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        # Bind the list methods directly as shortcuts.
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # Comments / PIs seen before the document node exists.
        self.pending_events = []

    def pop(self):
        """Remove and return the top of ``elementStack``."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator object supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the next element."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by ``startPrefixMapping``."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node (namespace mode) and queue START_ELEMENT.

        name    -- ``(uri, localname)`` pair
        tagName -- qualified name as written, or None if the reader did
                   not supply it
        attrs   -- attribute mapping keyed by ``(uri, localname)``
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node (non-namespace mode) and queue START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document to create the node with; buildDocument() turns
            # this placeholder into a real node later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if needed."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Pick the minidom implementation if no factory was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document node and return its root element.

        Queues START_DOCUMENT, then converts and queues any comments and
        processing instructions that were seen before the root element.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document node off the stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None


class ErrorHandler:
    """Error handler that prints warnings and re-raises errors."""

    def warning(self, exception):
        # Call form prints the same text under the print statement and
        # also parses where ``print`` is a function.
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Iterator over the ``(event_type, node)`` tuples of a parse.

    If the parser has a ``feed`` method the input is read ``bufsize`` units
    at a time on demand; otherwise the whole stream is parsed on the first
    request and the buffered events are handed out afterwards.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: fall back to parsing in one go.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Attach a fresh PullDOM handler to the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Old-style iteration protocol; ``pos`` is ignored."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event or raise StopIteration when exhausted."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    # Same method under the name the newer iterator protocol looks up.
    __next__ = next

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events until ``node`` ends, linking children into it.

        Every node seen meanwhile is appended to its parent, so that on
        return ``node`` holds its complete sub-tree.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                # This is node's own END_ELEMENT: sub-tree complete.
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next queued event, feeding the parser as needed.

        Returns None once the stream yields no more data.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: make the tail pointer refer to the head again.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None when the buffered events are exhausted.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            # Nothing left; report exhaustion the same way getEvent() does
            # instead of failing on None[0].
            return None
        rc = cell[0]
        self.pulldom.firstEvent[1] = cell[1]
        return rc

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None


class SAX2DOM(PullDOM):
    """PullDOM variant that also appends each created node to its parent."""

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)


default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- a string is treated as a file name and opened;
                        anything else is used as the stream itself
    parser           -- SAX parser to use; defaults to xml.sax.make_parser()
    bufsize          -- read size per feed; defaults to ``default_bufsize``
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance also accepts subclasses of the string types.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode hands the parser the raw bytes, so the document's own
        # encoding declaration is what decides how they are decoded.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)


def parseString(string, parser=None):
    """Return a DOMEventStream over an in-memory XML string."""
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available; use the io module.
            from io import StringIO

    # The whole string is handed to the parser in a single read.
    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)