"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""

# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
from xml.dom import xmlbuilder, minidom, Node
from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
from xml.parsers import expat
from xml.dom.minidom import _append_child, _set_attribute_node
from xml.dom.NodeFilter import NodeFilter

from xml.dom.minicompat import *

TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = {
    "CDATA": minidom.TypeInfo(None, "cdata"),
    "ENUM": minidom.TypeInfo(None, "enumeration"),
    "ENTITY": minidom.TypeInfo(None, "entity"),
    "ENTITIES": minidom.TypeInfo(None, "entities"),
    "ID": minidom.TypeInfo(None, "id"),
    "IDREF": minidom.TypeInfo(None, "idref"),
    "IDREFS": minidom.TypeInfo(None, "idrefs"),
    "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
    "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
    }


class ElementInfo(object):
    """Per-element-type information collected from DTD declarations.

    `_model` is the content model handed to
    ExpatBuilder.element_decl_handler() (or None if no element
    declaration was seen).  `_attr_info` is a list of 8-item records
    appended by ExpatBuilder.attlist_decl_handler(); item 1 of a record
    is the attribute name and item -2 is its declared type string.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute `aname`.

        Returns minidom._no_type when no declaration for `aname` was
        recorded.  A declared type starting with "(" is reported as the
        enumeration type.
        """
        for info in self._attr_info:
            if info[1] == aname:
                declared = info[-2]
                if declared[0] == "(":
                    return _typeinfo_map["ENUM"]
                return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always returns minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return true if the content model is neither ANY nor MIXED.

        Returns False when no content model was recorded.
        """
        if self._model:
            content_type = self._model[0]
            return content_type not in (expat.model.XML_CTYPE_ANY,
                                        expat.model.XML_CTYPE_MIXED)
        return False

    def isEmpty(self):
        """Return true if the recorded content model is EMPTY."""
        if self._model:
            return self._model[0] == expat.model.XML_CTYPE_EMPTY
        return False

    def isId(self, aname):
        """Return true if attribute `aname` was declared with type ID."""
        for info in self._attr_info:
            if info[1] == aname:
                return info[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))


def _intern(builder, s):
    """Return the builder's shared instance of the string `s`."""
    return builder._intern_setdefault(s, s)


def _parse_ns_name(builder, name):
    """Split a space-separated namespaced name produced by the parser.

    `name` must contain either two parts ("uri localname") or three
    parts ("uri localname prefix").  Returns the tuple
    (uri, localname, prefix, qname) with the strings passed through the
    builder's intern table.
    """
    assert ' ' in name
    parts = name.split(' ')
    intern_string = builder._intern_setdefault
    if len(parts) == 3:
        uri, localname, prefix = parts
        prefix = intern_string(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = intern_string(qname, qname)
        localname = intern_string(localname, localname)
    else:
        uri, localname = parts
        prefix = EMPTY_PREFIX
        qname = localname = intern_string(localname, localname)
    return intern_string(uri, uri), localname, prefix, qname


class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance."""

    def __init__(self, options=None):
        """Set up the builder.

        `options` is an xmlbuilder.Options instance; a default one is
        created when it is omitted.  If the options carry a filter it is
        wrapped in a FilterVisibilityController.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # The internal subset is only looked for in the first
                # buffer, and only once the document element was seen.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # Raised when a filter interrupts the parse; whatever was
            # built so far is returned.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the doctype it is detached again and the
        entity/notation declaration handlers are switched off.  While an
        internal subset is being read, comments and processing
        instructions are not reported; end_doctype_decl_handler()
        re-enables them.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Restore the handlers disabled while reading the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # No element info and no filter: nothing to do at element
            # end, so swap in a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved.

        Inside a CDATA section the data goes into a CDATASection node
        (continuing the current one if possible); outside, it is merged
        into a trailing Text node or put in a new one.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            d = node.__dict__
            d['data'] = d['nodeValue'] = value
            return
        else:
            node = minidom.Text()
            d = node.__dict__
            d['data'] = d['nodeValue'] = data
            d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Append `data` to a trailing Text node, or create a new one."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            d = node.__dict__
            d['data'] = d['nodeValue'] = node.data + data
            return
        node = minidom.Text()
        d = node.__dict__
        d['data'] = d['nodeValue'] = node.data + data
        d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only when the filter rejects it,
        matching how entities, comments and processing instructions are
        filtered.
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Fixed: this used to compare against FILTER_ACCEPT, which threw
        # away exactly the notations the filter wanted to keep.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then switch to the regular handler."""
        if self._filter is None and not self._elem_info:
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element for `name` and make it the current node.

        `attributes` is a flat sequence alternating attribute names and
        values (the parser is configured with ordered_attributes).
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['value'] = d['nodeValue'] = value
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a freshly started element.

        On FILTER_REJECT or FILTER_SKIP a Rejecter/Skipper takes over
        the element handlers and `node` is removed from the tree.
        """
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element.

        Strips ignorable whitespace when element info is available and
        removes the element if the filter rejects it.  The document
        element is never offered to the filter.
        """
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of `node` when allowed."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; first identify the text nodes which contain only
        # whitespace, then remove them (two passes so that childNodes
        # is not modified while it is being iterated).
        ignorable = [child for child in node.childNodes
                     if child.nodeType == TEXT_NODE
                     and not child.data.strip()]
        for child in ignorable:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Record the content model declared for element type `name`."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Record an attribute declaration for element type `elem`."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration onto the document.

        A negative `standalone` leaves document.standalone untouched.
        """
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        if standalone >= 0:
            self.document.standalone = bool(standalone)


# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the filter about a container node that is just starting.

        Node types hidden by the filter's whatToShow mask are accepted
        without consulting the filter.  Raises ParseEscape when the
        filter returns FILTER_INTERRUPT and ValueError for any other
        value outside _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the filter about a completed node.

        FILTER_SKIP is handled here: the node's children are moved to
        its parent and FILTER_REJECT is returned so that the caller
        removes the node itself.  Raises ParseEscape on
        FILTER_INTERRUPT and ValueError on any other illegal value.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    _nodetype_mask = {
        Node.ELEMENT_NODE: NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE: NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE: NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE: NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE: NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE: NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE:
            NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE: NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE: NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE: NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE: NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE: NodeFilter.SHOW_NOTATION,
        }


class FilterCrutch(object):
    """Base for helpers that temporarily take over the element handlers.

    The builder's current start/end element handlers are saved and
    replaced by this object's own; subclasses restore them once the
    filtered element has been closed.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        self._level = 0
        self._builder = builder
        parser = builder._parser
        self._old_start = parser.StartElementHandler
        self._old_end = parser.EndElementHandler
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler


class Rejecter(FilterCrutch):
    """Discard a rejected element together with everything inside it."""

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        # Silence every content handler for the duration of the subtree.
        for name in ("ProcessingInstructionHandler",
                     "CommentHandler",
                     "CharacterDataHandler",
                     "StartCdataSectionHandler",
                     "EndCdataSectionHandler",
                     "ExternalEntityRefHandler",
                     ):
            setattr(parser, name, None)

    def start_element_handler(self, *args):
        self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # restore the old handlers
            parser = self._builder._parser
            self._builder.install(parser)
            parser.StartElementHandler = self._old_start
            parser.EndElementHandler = self._old_end
        else:
            self._level = self._level - 1


class Skipper(FilterCrutch):
    """Drop a skipped element but keep building its children."""

    __slots__ = ()

    def start_element_handler(self, *args):
        node = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended
        # into it (it may have been filtered out itself).
        if self._builder.curNode is not node:
            self._level = self._level + 1

    def end_element_handler(self, *args):
        if self._level == 0:
            # We're popping back out of the node we're skipping, so we
            # shouldn't need to do anything but reset the handlers.
            self._builder._parser.StartElementHandler = self._old_start
            self._builder._parser.EndElementHandler = self._old_end
            self._builder = None
        else:
            self._level = self._level - 1
            self._old_end(*args)


# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)


class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
            self.context = context
        else:
            self.originalDocument = context.ownerDocument
            self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs()  # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, 1)
            fragment = self.fragment
        finally:
            # Builder state is reset whether or not parsing succeeded.
            # Note that self._parser is not cleared here.
            self.reset()
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: the
        # two are the same for ordinary context nodes, but when the
        # context is the Document itself its ownerDocument is None.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    s = "%s NOTATION %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source in place of the wrapper's entity.

        The wrapper document refers to the fragment through an external
        entity with a private system id; when that reference is seen
        the real source is parsed into a new DocumentFragment owned by
        the original document.  Other references are delegated to
        ExpatBuilder.
        """
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)


class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        parser = expat.ParserCreate(namespace_separator=" ")
        parser.namespace_prefixes = True
        return parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if self._options.namespace_declarations:
            parser.StartNamespaceDeclHandler = (
                self.start_namespace_decl_handler)

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element and make it the current node.

        Namespace declarations collected since the previous element are
        turned into xmlns attributes first; then the regular attributes
        (a flat name/value sequence) are attached.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        if self._ns_ordered_prefixes:
            for prefix, uri in self._ns_ordered_prefixes:
                if prefix:
                    a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
                                     XMLNS_NAMESPACE, prefix, "xmlns")
                else:
                    a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                     "xmlns", EMPTY_PREFIX)
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = uri
                d = a.__dict__
                d['value'] = d['nodeValue'] = uri
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)
            del self._ns_ordered_prefixes[:]

        if attributes:
            _attrs = node._attrs
            _attrsNS = node._attrsNS
            for i in range(0, len(attributes), 2):
                aname = attributes[i]
                value = attributes[i+1]
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    a = minidom.Attr(qname, uri, localname, prefix)
                    _attrs[qname] = a
                    _attrsNS[(uri, localname)] = a
                else:
                    a = minidom.Attr(aname, EMPTY_NAMESPACE,
                                     aname, EMPTY_PREFIX)
                    _attrs[aname] = a
                    _attrsNS[(EMPTY_NAMESPACE, aname)] = a
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['ownerDocument'] = self.document
                d['value'] = d['nodeValue'] = value
                d['ownerElement'] = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            curNode = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (curNode.namespaceURI == uri
                        and curNode.localName == localname
                        and curNode.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert curNode.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert curNode.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = curNode.parentNode
            self._finish_end_element(curNode)


class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        ExpatBuilder.reset(self)
        self._initNamespaces()


class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        attrs = ""
        context = self.context
        seen = set()
        while context:
            if hasattr(context, '_ns_prefix_uri'):
                for prefix, uri in context._ns_prefix_uri.items():
                    # add every new NS decl from context to attrs string;
                    # the declaration nearest the context wins
                    if prefix in seen:
                        continue
                    seen.add(prefix)
                    if prefix:
                        declname = "xmlns:" + prefix
                    else:
                        declname = "xmlns"
                    if attrs:
                        attrs = "%s\n %s='%s'" % (attrs, declname, uri)
                    else:
                        attrs = " %s='%s'" % (declname, uri)
            context = context.parentNode
        return attrs


class ParseEscape(Exception):
    """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
    pass


class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only the doctype and the first element are of interest here.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    # NOTE: the second and third parameter names are in the opposite
    # order from ExpatBuilder.start_doctype_decl_handler(); neither is
    # used here, so only has_internal_subset matters.
    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        if has_internal_subset:
            parser = self.getParser()
            self.subset = []
            # Collect the raw text of the subset piece by piece.
            parser.DefaultHandler = self.subset.append
            parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
        else:
            raise ParseEscape()

    def end_doctype_decl_handler(self):
        # Join the collected pieces and normalize line endings to "\n".
        s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
        self.subset = s
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching the document element means there is nothing more to
        # extract.
        raise ParseEscape()


def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()

    if isinstance(file, StringTypes):
        fp = open(file, 'rb')
        try:
            result = builder.parseFile(fp)
        finally:
            fp.close()
    else:
        result = builder.parseFile(file)
    return result


def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    if namespaces:
        builder = ExpatBuilderNS()
    else:
        builder = ExpatBuilder()
    return builder.parseString(string)


def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if isinstance(file, StringTypes):
        fp = open(file, 'rb')
        try:
            result = builder.parseFile(fp)
        finally:
            fp.close()
    else:
        result = builder.parseFile(file)
    return result


def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)
    return builder.parseString(string)


def makeBuilder(options):
    """Create a builder based on an Options object."""
    if options.namespaces:
        return ExpatBuilderNS(options)
    else:
        return ExpatBuilder(options)