"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
from xml.dom import xmlbuilder, minidom, Node
from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
from xml.parsers import expat
from xml.dom.minidom import _append_child, _set_attribute_node
from xml.dom.NodeFilter import NodeFilter

TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = {
    "CDATA": minidom.TypeInfo(None, "cdata"),
    "ENUM": minidom.TypeInfo(None, "enumeration"),
    "ENTITY": minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID": minidom.TypeInfo(None, "id"),
    "IDREF": minidom.TypeInfo(None, "idref"),
    "IDREFS": minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }


class ElementInfo(object):
    """Declaration data collected for one element type.

    Instances are created and filled in by the element and
    attribute-list declaration handlers of ExpatBuilder.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        # One list per declared attribute, laid out as
        # [None, name, None, None, default, 0, type, required]
        # (see ExpatBuilder.attlist_decl_handler()).
        self._attr_info = []
        # Content model as passed to ExpatBuilder.element_decl_handler(),
        # or None when no element declaration has been seen.
        self._model = model

    def __getstate__(self):
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for the attribute declared as *aname*.

        A declared type string starting with "(" is reported as the
        "enumeration" type.  minidom._no_type is returned when no
        declaration for *aname* was recorded.
        """
        for info in self._attr_info:
            if info[1] == aname:
                type_name = info[-2]
                if type_name[0] == "(":
                    return _typeinfo_map["ENUM"]
                return _typeinfo_map[type_name]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Return minidom._no_type; no per-namespace type data is kept."""
        return minidom._no_type

    def isElementContent(self):
        """Return true if a content model is recorded and it is neither
        ANY nor MIXED."""
        if self._model:
            content_type = self._model[0]
            return content_type not in (expat.model.XML_CTYPE_ANY,
                                        expat.model.XML_CTYPE_MIXED)
        return False

    def isEmpty(self):
        """Return true if the recorded content model is EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        return False

    def isId(self, aname):
        """Return true if attribute *aname* was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))


def _intern(builder, s):
    """Return the copy of *s* held in the builder's intern table,
    adding *s* to the table if it is not there yet."""
    return builder._intern_setdefault(s, s)


def _parse_ns_name(builder, name):
    """Split a space-separated name into its namespace parts.

    *name* is either "uri localname" or "uri localname prefix".
    Returns (uri, localname, prefix, qname) with the strings passed
    through the builder's intern table.  Raises ValueError when *name*
    has more than three space-separated parts.
    """
    assert ' ' in name
    parts = name.split(' ')
    intern = builder._intern_setdefault
    if len(parts) == 3:
        uri, localname, prefix = parts
        prefix = intern(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = intern(qname, qname)
        localname = intern(localname, localname)
    elif len(parts) == 2:
        uri, localname = parts
        prefix = EMPTY_PREFIX
        qname = localname = intern(localname, localname)
    else:
        raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
    return intern(uri, uri), localname, prefix, qname


class ExpatBuilder:
    """Document builder that uses Expat to build a minidom document
    instance."""

    def __init__(self, options=None):
        """Create a builder.

        options -- an xmlbuilder.Options instance controlling what is
        kept in the tree; a default instance is used when omitted.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.  (The builtin
            # id() is used as a one-argument callable whose result is
            # ignored.)
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if self._parser is None:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        # The first start tag goes through first_element_handler(),
        # which then swaps in start_element_handler().
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, False)
                # The internal subset is only looked for in the first
                # chunk, and only once the root element has been seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # Parsing was cut short on purpose; return what was built.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # Parsing was cut short on purpose; return what was built.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the node it is detached again and the
        entity and notation declaration handlers are switched off.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            # Comments and PIs are not collected while inside the
            # internal subset; end_doctype_decl_handler() restores
            # these handlers.
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the handlers switched off for the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter
        rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data callback installed when CDATA sections are
        kept as separate nodes (options.cdata_sections)."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Further data for the CDATA section already started.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Continuing data is appended to the existing Text node.
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data callback installed when CDATA sections are not
        kept separately; all data ends up in Text nodes."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Continuing data is appended to the existing Text node.
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype unless the
        filter rejects it."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype unless the
        filter rejects it."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the node only when the filter rejects it, matching the
        # entity, comment and processing-instruction handlers.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Report the external entity reference as handled without
        reading anything."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the first start tag, then hand over to
        start_element_handler() for all later ones."""
        if self._filter is None and not self._elem_info:
            # Nothing for _finish_end_element() to do; use a cheap no-op.
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element for *name* and make it the current node.

        *attributes* is a flat sequence of alternating names and values.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Ask the filter whether the subtree starting at *node* should
        be kept, rejected or skipped."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Apply whitespace stripping and filtering to a finished
        element."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of *node* when *info*
        says it has element content and the options ask for that."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record one attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration values onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            if standalone:
                self.document.standalone = True
            else:
                self.document.standalone = False


# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Return the filter's verdict on starting container *node*.

        Nodes whose type is not selected by whatToShow are accepted
        without consulting the filter.  Raises ParseEscape when the
        filter returns FILTER_INTERRUPT and ValueError for any other
        value that is not an allowed verdict.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Return the filter's verdict on the completed *node*.

        FILTER_SKIP is handled here: the node's children are moved to
        its parent and FILTER_REJECT is returned so that the caller
        removes the node itself.  Raises ParseEscape on
        FILTER_INTERRUPT and ValueError on an illegal value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # Node type -> whatToShow bit
    _nodetype_mask = {
        Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
        }


class FilterCrutch(object):
    """Base for helpers that take over the parser's element handlers
    while a filtered subtree is being parsed.

    Subclasses provide start_element_handler() and
    end_element_handler().
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        # Nesting depth below the element that triggered the crutch.
        self._level = 0
        self._builder = builder
        parser = builder._parser
        # Remember the handlers being replaced so they can be restored.
        self._old_start = parser.StartElementHandler
        self._old_end = parser.EndElementHandler
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler


class Rejecter(FilterCrutch):
    """Ignore everything until the rejected element is closed."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        for name in ("ProcessingInstructionHandler",
                     "CommentHandler",
                     "CharacterDataHandler",
                     "StartCdataSectionHandler",
                     "EndCdataSectionHandler",
                     "ExternalEntityRefHandler",
                     ):
            setattr(parser, name, None)

    def start_element_handler(self, *args):
        self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # restore the old handlers
            parser = self._builder._parser
            self._builder.install(parser)
            parser.StartElementHandler = self._old_start
            parser.EndElementHandler = self._old_end
        else:
            self._level = self._level - 1


class Skipper(FilterCrutch):
    """Let the content of a skipped element be built as usual and
    restore the original handlers when the skipped element closes."""

    __slots__ = ()

    def start_element_handler(self, *args):
        node = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended
        # into it.
        if self._builder.curNode is not node:
            self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # We're popping back out of the node we're skipping, so we
            # shouldn't need to do anything but reset the handlers.
            self._builder._parser.StartElementHandler = self._old_start
            self._builder._parser.EndElementHandler = self._old_end
            self._builder = None
        else:
            self._level = self._level - 1
            self._old_end(*args)


# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
%%s [
<!ENTITY fragment-builder-internal
SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)


class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Create a fragment builder for *context*.

        context -- a Document, or a node whose ownerDocument is the
        document the fragment belongs to.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs()  # get ns decls from node's ancestors
        # The source text itself is pulled in through the external
        # entity declared in the template; see
        # external_entity_ref_handler().
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, True)
            fragment = self.fragment
        finally:
            # Builder state is reset whether or not parsing succeeded.
            self.reset()
        # NOTE(review): unlike ExpatBuilder.parseString(), the parser is
        # not discarded here (a `self._parser = None` statement was left
        # disabled in the original); confirm whether a builder is meant
        # to be reusable.
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: the
        # context may itself be the Document, whose ownerDocument is None.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    s = "%s NOTATION %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        """Return the namespace attribute string for the wrapper
        element; always empty without namespace support."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, True)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)


class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        parser = expat.ParserCreate(namespace_separator=" ")
        parser.namespace_prefixes = True
        return parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if self._options.namespace_declarations:
            parser.StartNamespaceDeclHandler = (
                self.start_namespace_decl_handler)

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element and make it the current
        node.

        *name* and the attribute names are either plain names or
        space-separated names as understood by _parse_ns_name().
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        if self._ns_ordered_prefixes:
            # Turn the pending namespace declarations into xmlns
            # attributes on this element.
            for prefix, uri in self._ns_ordered_prefixes:
                if prefix:
                    a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
                                     XMLNS_NAMESPACE, prefix, "xmlns")
                else:
                    a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                     "xmlns", EMPTY_PREFIX)
                a.value = uri
                a.ownerDocument = self.document
                _set_attribute_node(node, a)
            del self._ns_ordered_prefixes[:]

        if attributes:
            node._ensure_attributes()
            _attrs = node._attrs
            _attrsNS = node._attrsNS
            for i in range(0, len(attributes), 2):
                aname = attributes[i]
                value = attributes[i+1]
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    a = minidom.Attr(qname, uri, localname, prefix)
                    _attrs[qname] = a
                    _attrsNS[(uri, localname)] = a
                else:
                    a = minidom.Attr(aname, EMPTY_NAMESPACE,
                                     aname, EMPTY_PREFIX)
                    _attrs[aname] = a
                    _attrsNS[(EMPTY_NAMESPACE, aname)] = a
                a.ownerDocument = self.document
                a.value = value
                a.ownerElement = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            curNode = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (curNode.namespaceURI == uri
                        and curNode.localName == localname
                        and curNode.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert curNode.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert curNode.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = curNode.parentNode
            self._finish_end_element(curNode)


class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        ExpatBuilder.reset(self)
        self._initNamespaces()


class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        context = self.context
        seen = set()
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # add every new NS decl from context to seen and
                    # the attrs string; the nearest declaration wins
                    if prefix in seen:
                        continue
                    seen.add(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    if attrs:
                        attrs = "%s\n %s='%s'" % (attrs, declname, uri)
                    else:
                        attrs = " %s='%s'" % (declname, uri)
            context = context.parentNode
        return attrs


class ParseEscape(Exception):
    """Exception raised to short-circuit parsing.

    Raised by InternalSubsetExtractor once it has what it needs, and by
    FilterVisibilityController when a filter returns FILTER_INTERRUPT.
    """
    pass


class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    # None until an internal subset has been seen; a list of pieces
    # while it is being collected; the final string afterwards.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only the doctype and the first start tag are of interest.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        if has_internal_subset:
            parser = self.getParser()
            # Collect everything the parser reports through the default
            # handler until the doctype declaration ends.
            self.subset = []
            parser.DefaultHandler = self.subset.append
            parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
        else:
            raise ParseEscape()

    def end_doctype_decl_handler(self):
        # Join the collected pieces and normalize line endings to "\n".
        s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
        self.subset = s
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching the root element means there is nothing more to find.
        raise ParseEscape()


def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()

    if isinstance(file, str):
        with open(file, 'rb') as fp:
            result = builder.parseFile(fp)
    else:
        result = builder.parseFile(file)
    return result


def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()
    return builder.parseString(string)


def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if isinstance(file, str):
        with open(file, 'rb') as fp:
            result = builder.parseFile(fp)
    else:
        result = builder.parseFile(file)
    return result


def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)
    return builder.parseString(string)


def makeBuilder(options):
    """Create a builder based on an Options object."""
    if options.namespaces:
        return ExpatBuilderNS(options)
    else:
        return ExpatBuilder(options)