"""Pull-style DOM construction on top of SAX events.

A SAX parser drives a ``PullDOM`` content handler, which records every
callback as an ``(event_type, node)`` pair in a singly linked list.
``DOMEventStream`` hands those pairs out one at a time; ``parse()`` and
``parseString()`` are convenience constructors for it.
"""

import xml.sax
import xml.sax.handler

START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records events as a linked list of DOM nodes.

    Every event is a two-item list ``[(event_type, node), next_event]``.
    ``firstEvent`` is a sentinel head whose second slot points at the oldest
    unconsumed event; ``lastEvent`` is the event new ones are linked after.
    ``elementStack`` holds the document followed by the currently open
    elements.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create the handler.

        documentFactory -- object providing ``createDocument(uri, tagname,
        doctype)``; when None, ``startDocument()`` falls back to the minidom
        implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        # Bound-method shortcuts for the hot push/pop paths.
        self.push = self.elementStack.append
        self.pop = self.elementStack.pop
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # Comments / processing instructions seen before the document exists.
        self.pending_events = []

    def pop(self):
        """Remove and return the top of ``elementStack``.

        ``__init__`` shadows this with the list's own bound ``pop``; the
        method is kept so the attribute also exists on the class.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _add_event(self, token, node):
        """Link ``(token, node)`` after ``lastEvent`` and make it the tail."""
        event = [(token, node), None]
        self.lastEvent[1] = event
        self.lastEvent = event

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and open a new prefix context."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # Remembered so startElementNS can re-create the xmlns attribute.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node (building the document for the first
        element), copy its attributes and queue a START_ELEMENT event.

        name -- ``(uri, localname)`` pair; tagName -- qualified name or None;
        attrs -- namespace-aware attributes object (its ``_attrs`` mapping is
        extended with the pending xmlns declarations).
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
        self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._add_event(START_ELEMENT, node)
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._add_event(END_ELEMENT, self.pop())

    def startElement(self, name, attrs):
        """Non-namespace variant of startElementNS()."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._add_event(START_ELEMENT, node)
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._add_event(END_ELEMENT, self.pop())

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self._add_event(COMMENT, node)
        else:
            # No document to create the node with; buildDocument() converts
            # this raw entry into a real node later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if no document
        exists yet."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._add_event(PROCESSING_INSTRUCTION, node)
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a text node."""
        node = self.document.createTextNode(chars)
        self._add_event(IGNORABLE_WHITESPACE, node)

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a text node."""
        node = self.document.createTextNode(chars)
        self._add_event(CHARACTERS, node)

    def startDocument(self):
        """Select the minidom implementation when no factory was given."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for root element *tagname*, queue
        START_DOCUMENT plus any deferred events, and return the document's
        first child (the root element created by the factory)."""
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._add_event(START_DOCUMENT, node)
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the stack."""
        # The tail pointer is intentionally left where it was.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None


class ErrorHandler:
    """Error handler that prints warnings and raises errors."""

    def warning(self, exception):
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Iterator over the ``(event_type, node)`` pairs produced by PullDOM.

    If *parser* has a ``feed`` method the stream is read in *bufsize* chunks
    on demand; otherwise the whole input is parsed on the first request.
    """

    # Set to True by parse() when it opened the stream itself; clear() then
    # closes it.
    _owns_stream = False

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Legacy sequence-iteration protocol; *pos* is ignored."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def __next__(self):
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events until *node* shows up again, attaching every node
        seen on the way beneath *node* so that it ends up with its subtree."""
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next ``(event_type, node)`` pair, feeding the parser as
        needed, or None when the stream has no more data."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once every event has been handed out.
        """
        head = self.pulldom.firstEvent
        event = head[1]
        if event is None:
            # Exhausted: signal end of stream instead of subscripting None.
            return None
        head[1] = event[1]
        return event[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        if self._owns_stream and self.stream is not None:
            # parse() opened this file on the caller's behalf.
            self.stream.close()
        self.parser = None
        self.stream = None


class SAX2DOM(PullDOM):
    """PullDOM variant that also links the nodes it creates into the tree."""

    def buildDocument(self, uri, tagname):
        """Build the document and place processing instructions that arrived
        before the root element in front of it."""
        deferred = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        # The base class replaced the raw entries with real nodes in place.
        for event in deferred or ():
            token, node = event[0]
            if token == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        no_document_yet = not self.document
        PullDOM.processingInstruction(self, target, data)
        if no_document_yet:
            # Only queued so far; buildDocument() attaches it later.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)


default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- a ``str`` is opened as a binary file; anything else
    is used as the stream directly.
    parser -- SAX parser to use; ``xml.sax.make_parser()`` when falsy.
    bufsize -- read size per chunk; ``default_bufsize`` when None.

    A file opened here is closed by the returned object's ``clear()``;
    streams supplied by the caller are never closed.
    """
    if bufsize is None:
        bufsize = default_bufsize
    opened_here = isinstance(stream_or_string, str)
    if opened_here:
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    try:
        if not parser:
            parser = xml.sax.make_parser()
        events = DOMEventStream(stream, parser, bufsize)
    except BaseException:
        # Do not leak the file handle when setup fails.
        if opened_here:
            stream.close()
        raise
    events._owns_stream = opened_here
    return events


def parseString(string, parser=None):
    """Return a DOMEventStream over an in-memory document.

    string -- the document as ``str``, ``bytes`` or ``bytearray``; it is
    handed to the parser in a single chunk of ``len(string)``.
    parser -- SAX parser to use; ``xml.sax.make_parser()`` when falsy.
    """
    from io import BytesIO, StringIO

    bufsize = len(string)
    if isinstance(string, (bytes, bytearray)):
        buf = BytesIO(string)
    else:
        buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)