Home | History | Annotate | Download | only in dom
      1 """Facility to use the Expat parser to load a minidom instance
      2 from a string or file.
      3 
      4 This avoids all the overhead of SAX and pulldom to gain performance.
      5 """
      6 
      7 # Warning!
      8 #
      9 # This module is tightly bound to the implementation details of the
     10 # minidom DOM and can't be used with other DOM implementations.  This
     11 # is due, in part, to a lack of appropriate methods in the DOM (there is
     12 # no way to create Entity and Notation nodes via the DOM Level 2
     13 # interface), and for performance.  The latter is the cause of some fairly
     14 # cryptic code.
     15 #
     16 # Performance hacks:
     17 #
     18 #   -  .character_data_handler() has an extra case in which continuing
     19 #      data is appended to an existing Text node; this can be a
     20 #      speedup since pyexpat can break up character data into multiple
     21 #      callbacks even though we set the buffer_text attribute on the
     22 #      parser.  This also gives us the advantage that we don't need a
     23 #      separate normalization pass.
     24 #
     25 #   -  Determining that a node exists is done using an identity comparison
     26 #      with None rather than a truth test; this avoids searching for and
     27 #      calling any methods on the node object if it exists.  (A rather
     28 #      nice speedup is achieved this way as well!)
     29 
     30 from xml.dom import xmlbuilder, minidom, Node
     31 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
     32 from xml.parsers import expat
     33 from xml.dom.minidom import _append_child, _set_attribute_node
     34 from xml.dom.NodeFilter import NodeFilter
     35 
# Module-level aliases for the Node type codes referenced by the
# handlers in this module.
TEXT_NODE = Node.TEXT_NODE
CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
DOCUMENT_NODE = Node.DOCUMENT_NODE

# Module-level aliases for the verdicts a DOMBuilderFilter may return.
FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT

# The minidom DOM implementation; ExpatBuilder.reset() uses it to create
# each new, empty document.
theDOMImplementation = minidom.getDOMImplementation()
     46 
     47 # Expat typename -> TypeInfo
     48 _typeinfo_map = {
     49     "CDATA":    minidom.TypeInfo(None, "cdata"),
     50     "ENUM":     minidom.TypeInfo(None, "enumeration"),
     51     "ENTITY":   minidom.TypeInfo(None, "entity"),
     52     "ENTITIES": minidom.TypeInfo(None, "entities"),
     53     "ID":       minidom.TypeInfo(None, "id"),
     54     "IDREF":    minidom.TypeInfo(None, "idref"),
     55     "IDREFS":   minidom.TypeInfo(None, "idrefs"),
     56     "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
     57     "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
     58     }
     59 
     60 class ElementInfo(object):
     61     __slots__ = '_attr_info', '_model', 'tagName'
     62 
     63     def __init__(self, tagName, model=None):
     64         self.tagName = tagName
     65         self._attr_info = []
     66         self._model = model
     67 
     68     def __getstate__(self):
     69         return self._attr_info, self._model, self.tagName
     70 
     71     def __setstate__(self, state):
     72         self._attr_info, self._model, self.tagName = state
     73 
     74     def getAttributeType(self, aname):
     75         for info in self._attr_info:
     76             if info[1] == aname:
     77                 t = info[-2]
     78                 if t[0] == "(":
     79                     return _typeinfo_map["ENUM"]
     80                 else:
     81                     return _typeinfo_map[info[-2]]
     82         return minidom._no_type
     83 
     84     def getAttributeTypeNS(self, namespaceURI, localName):
     85         return minidom._no_type
     86 
     87     def isElementContent(self):
     88         if self._model:
     89             type = self._model[0]
     90             return type not in (expat.model.XML_CTYPE_ANY,
     91                                 expat.model.XML_CTYPE_MIXED)
     92         else:
     93             return False
     94 
     95     def isEmpty(self):
     96         if self._model:
     97             return self._model[0] == expat.model.XML_CTYPE_EMPTY
     98         else:
     99             return False
    100 
    101     def isId(self, aname):
    102         for info in self._attr_info:
    103             if info[1] == aname:
    104                 return info[-2] == "ID"
    105         return False
    106 
    107     def isIdNS(self, euri, ename, auri, aname):
    108         # not sure this is meaningful
    109         return self.isId((auri, aname))
    110 
    111 def _intern(builder, s):
    112     return builder._intern_setdefault(s, s)
    113 
    114 def _parse_ns_name(builder, name):
    115     assert ' ' in name
    116     parts = name.split(' ')
    117     intern = builder._intern_setdefault
    118     if len(parts) == 3:
    119         uri, localname, prefix = parts
    120         prefix = intern(prefix, prefix)
    121         qname = "%s:%s" % (prefix, localname)
    122         qname = intern(qname, qname)
    123         localname = intern(localname, localname)
    124     elif len(parts) == 2:
    125         uri, localname = parts
    126         prefix = EMPTY_PREFIX
    127         qname = localname = intern(localname, localname)
    128     else:
    129         raise ValueError("Unsupported syntax: spaces in URIs not supported: %r" % name)
    130     return intern(uri, uri), localname, prefix, qname
    131 
    132 
    133 class ExpatBuilder:
    134     """Document builder that uses Expat to build a ParsedXML.DOM document
    135     instance."""
    136 
    137     def __init__(self, options=None):
    138         if options is None:
    139             options = xmlbuilder.Options()
    140         self._options = options
    141         if self._options.filter is not None:
    142             self._filter = FilterVisibilityController(self._options.filter)
    143         else:
    144             self._filter = None
    145             # This *really* doesn't do anything in this case, so
    146             # override it with something fast & minimal.
    147             self._finish_start_element = id
    148         self._parser = None
    149         self.reset()
    150 
    151     def createParser(self):
    152         """Create a new parser object."""
    153         return expat.ParserCreate()
    154 
    155     def getParser(self):
    156         """Return the parser object, creating a new one if needed."""
    157         if not self._parser:
    158             self._parser = self.createParser()
    159             self._intern_setdefault = self._parser.intern.setdefault
    160             self._parser.buffer_text = True
    161             self._parser.ordered_attributes = True
    162             self._parser.specified_attributes = True
    163             self.install(self._parser)
    164         return self._parser
    165 
    166     def reset(self):
    167         """Free all data structures used during DOM construction."""
    168         self.document = theDOMImplementation.createDocument(
    169             EMPTY_NAMESPACE, None, None)
    170         self.curNode = self.document
    171         self._elem_info = self.document._elem_info
    172         self._cdata = False
    173 
    174     def install(self, parser):
    175         """Install the callbacks needed to build the DOM into the parser."""
    176         # This creates circular references!
    177         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    178         parser.StartElementHandler = self.first_element_handler
    179         parser.EndElementHandler = self.end_element_handler
    180         parser.ProcessingInstructionHandler = self.pi_handler
    181         if self._options.entities:
    182             parser.EntityDeclHandler = self.entity_decl_handler
    183         parser.NotationDeclHandler = self.notation_decl_handler
    184         if self._options.comments:
    185             parser.CommentHandler = self.comment_handler
    186         if self._options.cdata_sections:
    187             parser.StartCdataSectionHandler = self.start_cdata_section_handler
    188             parser.EndCdataSectionHandler = self.end_cdata_section_handler
    189             parser.CharacterDataHandler = self.character_data_handler_cdata
    190         else:
    191             parser.CharacterDataHandler = self.character_data_handler
    192         parser.ExternalEntityRefHandler = self.external_entity_ref_handler
    193         parser.XmlDeclHandler = self.xml_decl_handler
    194         parser.ElementDeclHandler = self.element_decl_handler
    195         parser.AttlistDeclHandler = self.attlist_decl_handler
    196 
    197     def parseFile(self, file):
    198         """Parse a document from a file object, returning the document
    199         node."""
    200         parser = self.getParser()
    201         first_buffer = True
    202         try:
    203             while 1:
    204                 buffer = file.read(16*1024)
    205                 if not buffer:
    206                     break
    207                 parser.Parse(buffer, 0)
    208                 if first_buffer and self.document.documentElement:
    209                     self._setup_subset(buffer)
    210                 first_buffer = False
    211             parser.Parse("", True)
    212         except ParseEscape:
    213             pass
    214         doc = self.document
    215         self.reset()
    216         self._parser = None
    217         return doc
    218 
    219     def parseString(self, string):
    220         """Parse a document from a string, returning the document node."""
    221         parser = self.getParser()
    222         try:
    223             parser.Parse(string, True)
    224             self._setup_subset(string)
    225         except ParseEscape:
    226             pass
    227         doc = self.document
    228         self.reset()
    229         self._parser = None
    230         return doc
    231 
    232     def _setup_subset(self, buffer):
    233         """Load the internal subset if there might be one."""
    234         if self.document.doctype:
    235             extractor = InternalSubsetExtractor()
    236             extractor.parseString(buffer)
    237             subset = extractor.getSubset()
    238             self.document.doctype.internalSubset = subset
    239 
    240     def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
    241                                    has_internal_subset):
    242         doctype = self.document.implementation.createDocumentType(
    243             doctypeName, publicId, systemId)
    244         doctype.ownerDocument = self.document
    245         _append_child(self.document, doctype)
    246         self.document.doctype = doctype
    247         if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
    248             self.document.doctype = None
    249             del self.document.childNodes[-1]
    250             doctype = None
    251             self._parser.EntityDeclHandler = None
    252             self._parser.NotationDeclHandler = None
    253         if has_internal_subset:
    254             if doctype is not None:
    255                 doctype.entities._seq = []
    256                 doctype.notations._seq = []
    257             self._parser.CommentHandler = None
    258             self._parser.ProcessingInstructionHandler = None
    259             self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
    260 
    261     def end_doctype_decl_handler(self):
    262         if self._options.comments:
    263             self._parser.CommentHandler = self.comment_handler
    264         self._parser.ProcessingInstructionHandler = self.pi_handler
    265         if not (self._elem_info or self._filter):
    266             self._finish_end_element = id
    267 
    268     def pi_handler(self, target, data):
    269         node = self.document.createProcessingInstruction(target, data)
    270         _append_child(self.curNode, node)
    271         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    272             self.curNode.removeChild(node)
    273 
    274     def character_data_handler_cdata(self, data):
    275         childNodes = self.curNode.childNodes
    276         if self._cdata:
    277             if (  self._cdata_continue
    278                   and childNodes[-1].nodeType == CDATA_SECTION_NODE):
    279                 childNodes[-1].appendData(data)
    280                 return
    281             node = self.document.createCDATASection(data)
    282             self._cdata_continue = True
    283         elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
    284             node = childNodes[-1]
    285             value = node.data + data
    286             node.data = value
    287             return
    288         else:
    289             node = minidom.Text()
    290             node.data = data
    291             node.ownerDocument = self.document
    292         _append_child(self.curNode, node)
    293 
    294     def character_data_handler(self, data):
    295         childNodes = self.curNode.childNodes
    296         if childNodes and childNodes[-1].nodeType == TEXT_NODE:
    297             node = childNodes[-1]
    298             node.data = node.data + data
    299             return
    300         node = minidom.Text()
    301         node.data = node.data + data
    302         node.ownerDocument = self.document
    303         _append_child(self.curNode, node)
    304 
    305     def entity_decl_handler(self, entityName, is_parameter_entity, value,
    306                             base, systemId, publicId, notationName):
    307         if is_parameter_entity:
    308             # we don't care about parameter entities for the DOM
    309             return
    310         if not self._options.entities:
    311             return
    312         node = self.document._create_entity(entityName, publicId,
    313                                             systemId, notationName)
    314         if value is not None:
    315             # internal entity
    316             # node *should* be readonly, but we'll cheat
    317             child = self.document.createTextNode(value)
    318             node.childNodes.append(child)
    319         self.document.doctype.entities._seq.append(node)
    320         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    321             del self.document.doctype.entities._seq[-1]
    322 
    323     def notation_decl_handler(self, notationName, base, systemId, publicId):
    324         node = self.document._create_notation(notationName, publicId, systemId)
    325         self.document.doctype.notations._seq.append(node)
    326         if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
    327             del self.document.doctype.notations._seq[-1]
    328 
    329     def comment_handler(self, data):
    330         node = self.document.createComment(data)
    331         _append_child(self.curNode, node)
    332         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    333             self.curNode.removeChild(node)
    334 
    335     def start_cdata_section_handler(self):
    336         self._cdata = True
    337         self._cdata_continue = False
    338 
    339     def end_cdata_section_handler(self):
    340         self._cdata = False
    341         self._cdata_continue = False
    342 
    343     def external_entity_ref_handler(self, context, base, systemId, publicId):
    344         return 1
    345 
    346     def first_element_handler(self, name, attributes):
    347         if self._filter is None and not self._elem_info:
    348             self._finish_end_element = id
    349         self.getParser().StartElementHandler = self.start_element_handler
    350         self.start_element_handler(name, attributes)
    351 
    352     def start_element_handler(self, name, attributes):
    353         node = self.document.createElement(name)
    354         _append_child(self.curNode, node)
    355         self.curNode = node
    356 
    357         if attributes:
    358             for i in range(0, len(attributes), 2):
    359                 a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
    360                                  None, EMPTY_PREFIX)
    361                 value = attributes[i+1]
    362                 a.value = value
    363                 a.ownerDocument = self.document
    364                 _set_attribute_node(node, a)
    365 
    366         if node is not self.document.documentElement:
    367             self._finish_start_element(node)
    368 
    369     def _finish_start_element(self, node):
    370         if self._filter:
    371             # To be general, we'd have to call isSameNode(), but this
    372             # is sufficient for minidom:
    373             if node is self.document.documentElement:
    374                 return
    375             filt = self._filter.startContainer(node)
    376             if filt == FILTER_REJECT:
    377                 # ignore this node & all descendents
    378                 Rejecter(self)
    379             elif filt == FILTER_SKIP:
    380                 # ignore this node, but make it's children become
    381                 # children of the parent node
    382                 Skipper(self)
    383             else:
    384                 return
    385             self.curNode = node.parentNode
    386             node.parentNode.removeChild(node)
    387             node.unlink()
    388 
    389     # If this ever changes, Namespaces.end_element_handler() needs to
    390     # be changed to match.
    391     #
    392     def end_element_handler(self, name):
    393         curNode = self.curNode
    394         self.curNode = curNode.parentNode
    395         self._finish_end_element(curNode)
    396 
    397     def _finish_end_element(self, curNode):
    398         info = self._elem_info.get(curNode.tagName)
    399         if info:
    400             self._handle_white_text_nodes(curNode, info)
    401         if self._filter:
    402             if curNode is self.document.documentElement:
    403                 return
    404             if self._filter.acceptNode(curNode) == FILTER_REJECT:
    405                 self.curNode.removeChild(curNode)
    406                 curNode.unlink()
    407 
    408     def _handle_white_text_nodes(self, node, info):
    409         if (self._options.whitespace_in_element_content
    410             or not info.isElementContent()):
    411             return
    412 
    413         # We have element type information and should remove ignorable
    414         # whitespace; identify for text nodes which contain only
    415         # whitespace.
    416         L = []
    417         for child in node.childNodes:
    418             if child.nodeType == TEXT_NODE and not child.data.strip():
    419                 L.append(child)
    420 
    421         # Remove ignorable whitespace from the tree.
    422         for child in L:
    423             node.removeChild(child)
    424 
    425     def element_decl_handler(self, name, model):
    426         info = self._elem_info.get(name)
    427         if info is None:
    428             self._elem_info[name] = ElementInfo(name, model)
    429         else:
    430             assert info._model is None
    431             info._model = model
    432 
    433     def attlist_decl_handler(self, elem, name, type, default, required):
    434         info = self._elem_info.get(elem)
    435         if info is None:
    436             info = ElementInfo(elem)
    437             self._elem_info[elem] = info
    438         info._attr_info.append(
    439             [None, name, None, None, default, 0, type, required])
    440 
    441     def xml_decl_handler(self, version, encoding, standalone):
    442         self.document.version = version
    443         self.document.encoding = encoding
    444         # This is still a little ugly, thanks to the pyexpat API. ;-(
    445         if standalone >= 0:
    446             if standalone:
    447                 self.document.standalone = True
    448             else:
    449                 self.document.standalone = False
    450 
    451 
# Verdicts a filter callback may legally return to
# FilterVisibilityController.
# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
    455 
    456 class FilterVisibilityController(object):
    457     """Wrapper around a DOMBuilderFilter which implements the checks
    458     to make the whatToShow filter attribute work."""
    459 
    460     __slots__ = 'filter',
    461 
    462     def __init__(self, filter):
    463         self.filter = filter
    464 
    465     def startContainer(self, node):
    466         mask = self._nodetype_mask[node.nodeType]
    467         if self.filter.whatToShow & mask:
    468             val = self.filter.startContainer(node)
    469             if val == FILTER_INTERRUPT:
    470                 raise ParseEscape
    471             if val not in _ALLOWED_FILTER_RETURNS:
    472                 raise ValueError(
    473                       "startContainer() returned illegal value: " + repr(val))
    474             return val
    475         else:
    476             return FILTER_ACCEPT
    477 
    478     def acceptNode(self, node):
    479         mask = self._nodetype_mask[node.nodeType]
    480         if self.filter.whatToShow & mask:
    481             val = self.filter.acceptNode(node)
    482             if val == FILTER_INTERRUPT:
    483                 raise ParseEscape
    484             if val == FILTER_SKIP:
    485                 # move all child nodes to the parent, and remove this node
    486                 parent = node.parentNode
    487                 for child in node.childNodes[:]:
    488                     parent.appendChild(child)
    489                 # node is handled by the caller
    490                 return FILTER_REJECT
    491             if val not in _ALLOWED_FILTER_RETURNS:
    492                 raise ValueError(
    493                       "acceptNode() returned illegal value: " + repr(val))
    494             return val
    495         else:
    496             return FILTER_ACCEPT
    497 
    498     _nodetype_mask = {
    499         Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
    500         Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
    501         Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
    502         Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
    503         Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
    504         Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
    505         Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
    506         Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
    507         Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
    508         Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
    509         Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
    510         Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
    511         }
    512 
    513 
    514 class FilterCrutch(object):
    515     __slots__ = '_builder', '_level', '_old_start', '_old_end'
    516 
    517     def __init__(self, builder):
    518         self._level = 0
    519         self._builder = builder
    520         parser = builder._parser
    521         self._old_start = parser.StartElementHandler
    522         self._old_end = parser.EndElementHandler
    523         parser.StartElementHandler = self.start_element_handler
    524         parser.EndElementHandler = self.end_element_handler
    525 
    526 class Rejecter(FilterCrutch):
    527     __slots__ = ()
    528 
    529     def __init__(self, builder):
    530         FilterCrutch.__init__(self, builder)
    531         parser = builder._parser
    532         for name in ("ProcessingInstructionHandler",
    533                      "CommentHandler",
    534                      "CharacterDataHandler",
    535                      "StartCdataSectionHandler",
    536                      "EndCdataSectionHandler",
    537                      "ExternalEntityRefHandler",
    538                      ):
    539             setattr(parser, name, None)
    540 
    541     def start_element_handler(self, *args):
    542         self._level = self._level + 1
    543 
    544     def end_element_handler(self, *args):
    545         if self._level == 0:
    546             # restore the old handlers
    547             parser = self._builder._parser
    548             self._builder.install(parser)
    549             parser.StartElementHandler = self._old_start
    550             parser.EndElementHandler = self._old_end
    551         else:
    552             self._level = self._level - 1
    553 
    554 class Skipper(FilterCrutch):
    555     __slots__ = ()
    556 
    557     def start_element_handler(self, *args):
    558         node = self._builder.curNode
    559         self._old_start(*args)
    560         if self._builder.curNode is not node:
    561             self._level = self._level + 1
    562 
    563     def end_element_handler(self, *args):
    564         if self._level == 0:
    565             # We're popping back out of the node we're skipping, so we
    566             # shouldn't need to do anything but reset the handlers.
    567             self._builder._parser.StartElementHandler = self._old_start
    568             self._builder._parser.EndElementHandler = self._old_end
    569             self._builder = None
    570         else:
    571             self._level = self._level - 1
    572             self._old_end(*args)
    573 
    574 
# framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

# System identifier of the external entity declared in the template
# below; FragmentBuilder.external_entity_ref_handler() compares against
# it to recognise that entity.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# The system identifier is substituted right here.  The doubled "%%s"
# markers come out of that formatting step as plain "%s" placeholders,
# which FragmentBuilder.parseString() later fills in.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
  %%s [
  <!ENTITY fragment-builder-internal
    SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
    592 
    593 
    594 class FragmentBuilder(ExpatBuilder):
    595     """Builder which constructs document fragments given XML source
    596     text and a context node.
    597 
    598     The context node is expected to provide information about the
    599     namespace declarations which are in scope at the start of the
    600     fragment.
    601     """
    602 
    603     def __init__(self, context, options=None):
    604         if context.nodeType == DOCUMENT_NODE:
    605             self.originalDocument = context
    606             self.context = context
    607         else:
    608             self.originalDocument = context.ownerDocument
    609             self.context = context
    610         ExpatBuilder.__init__(self, options)
    611 
    612     def reset(self):
    613         ExpatBuilder.reset(self)
    614         self.fragment = None
    615 
    616     def parseFile(self, file):
    617         """Parse a document fragment from a file object, returning the
    618         fragment node."""
    619         return self.parseString(file.read())
    620 
    621     def parseString(self, string):
    622         """Parse a document fragment from a string, returning the
    623         fragment node."""
    624         self._source = string
    625         parser = self.getParser()
    626         doctype = self.originalDocument.doctype
    627         ident = ""
    628         if doctype:
    629             subset = doctype.internalSubset or self._getDeclarations()
    630             if doctype.publicId:
    631                 ident = ('PUBLIC "%s" "%s"'
    632                          % (doctype.publicId, doctype.systemId))
    633             elif doctype.systemId:
    634                 ident = 'SYSTEM "%s"' % doctype.systemId
    635         else:
    636             subset = ""
    637         nsattrs = self._getNSattrs() # get ns decls from node's ancestors
    638         document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
    639         try:
    640             parser.Parse(document, 1)
    641         except:
    642             self.reset()
    643             raise
    644         fragment = self.fragment
    645         self.reset()
    646 ##         self._parser = None
    647         return fragment
    648 
    649     def _getDeclarations(self):
    650         """Re-create the internal subset from the DocumentType node.
    651 
    652         This is only needed if we don't already have the
    653         internalSubset as a string.
    654         """
    655         doctype = self.context.ownerDocument.doctype
    656         s = ""
    657         if doctype:
    658             for i in range(doctype.notations.length):
    659                 notation = doctype.notations.item(i)
    660                 if s:
    661                     s = s + "\n  "
    662                 s = "%s<!NOTATION %s" % (s, notation.nodeName)
    663                 if notation.publicId:
    664                     s = '%s PUBLIC "%s"\n             "%s">' \
    665                         % (s, notation.publicId, notation.systemId)
    666                 else:
    667                     s = '%s SYSTEM "%s">' % (s, notation.systemId)
    668             for i in range(doctype.entities.length):
    669                 entity = doctype.entities.item(i)
    670                 if s:
    671                     s = s + "\n  "
    672                 s = "%s<!ENTITY %s" % (s, entity.nodeName)
    673                 if entity.publicId:
    674                     s = '%s PUBLIC "%s"\n             "%s"' \
    675                         % (s, entity.publicId, entity.systemId)
    676                 elif entity.systemId:
    677                     s = '%s SYSTEM "%s"' % (s, entity.systemId)
    678                 else:
    679                     s = '%s "%s"' % (s, entity.firstChild.data)
    680                 if entity.notationName:
    681                     s = "%s NOTATION %s" % (s, entity.notationName)
    682                 s = s + ">"
    683         return s
    684 
    685     def _getNSattrs(self):
    686         return ""
    687 
    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Handle an external entity reference reported by the parser.

        A reference whose system identifier equals
        _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID is resolved by parsing the
        stored source text into a new document fragment created by the
        original document; -1 is returned in that case.  Any other
        reference is passed to ExpatBuilder.external_entity_ref_handler()
        and its result is returned.
        """
        if systemId != _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # Not the entity we created ourselves: defer to the base class.
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)

        # This is the entity reference we made to hold the subtree; all
        # of the input we were given is parsed in here.
        saved_document, saved_node = self.document, self.curNode
        subparser = self._parser.ExternalEntityParserCreate(context)
        # Put the real document back and collect the parsed nodes in the
        # fragment that will be returned.
        self.document = self.originalDocument
        self.fragment = self.document.createDocumentFragment()
        self.curNode = self.fragment
        try:
            subparser.Parse(self._source, 1)
        finally:
            # Restore the builder state even if parsing failed, and drop
            # the reference to the source text.
            self.curNode = saved_node
            self.document = saved_document
            self._source = None
        return -1
    709 
    710 
    711 class Namespaces:
    712     """Mix-in class for builders; adds support for namespaces."""
    713 
    714     def _initNamespaces(self):
    715         # list of (prefix, uri) ns declarations.  Namespace attrs are
    716         # constructed from this and added to the element's attrs.
    717         self._ns_ordered_prefixes = []
    718 
    719     def createParser(self):
    720         """Create a new namespace-handling parser."""
    721         parser = expat.ParserCreate(namespace_separator=" ")
    722         parser.namespace_prefixes = True
    723         return parser
    724 
    725     def install(self, parser):
    726         """Insert the namespace-handlers onto the parser."""
    727         ExpatBuilder.install(self, parser)
    728         if self._options.namespace_declarations:
    729             parser.StartNamespaceDeclHandler = (
    730                 self.start_namespace_decl_handler)
    731 
    732     def start_namespace_decl_handler(self, prefix, uri):
    733         """Push this namespace declaration on our storage."""
    734         self._ns_ordered_prefixes.append((prefix, uri))
    735 
    736     def start_element_handler(self, name, attributes):
    737         if ' ' in name:
    738             uri, localname, prefix, qname = _parse_ns_name(self, name)
    739         else:
    740             uri = EMPTY_NAMESPACE
    741             qname = name
    742             localname = None
    743             prefix = EMPTY_PREFIX
    744         node = minidom.Element(qname, uri, prefix, localname)
    745         node.ownerDocument = self.document
    746         _append_child(self.curNode, node)
    747         self.curNode = node
    748 
    749         if self._ns_ordered_prefixes:
    750             for prefix, uri in self._ns_ordered_prefixes:
    751                 if prefix:
    752                     a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
    753                                      XMLNS_NAMESPACE, prefix, "xmlns")
    754                 else:
    755                     a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
    756                                      "xmlns", EMPTY_PREFIX)
    757                 a.value = uri
    758                 a.ownerDocument = self.document
    759                 _set_attribute_node(node, a)
    760             del self._ns_ordered_prefixes[:]
    761 
    762         if attributes:
    763             node._ensure_attributes()
    764             _attrs = node._attrs
    765             _attrsNS = node._attrsNS
    766             for i in range(0, len(attributes), 2):
    767                 aname = attributes[i]
    768                 value = attributes[i+1]
    769                 if ' ' in aname:
    770                     uri, localname, prefix, qname = _parse_ns_name(self, aname)
    771                     a = minidom.Attr(qname, uri, localname, prefix)
    772                     _attrs[qname] = a
    773                     _attrsNS[(uri, localname)] = a
    774                 else:
    775                     a = minidom.Attr(aname, EMPTY_NAMESPACE,
    776                                      aname, EMPTY_PREFIX)
    777                     _attrs[aname] = a
    778                     _attrsNS[(EMPTY_NAMESPACE, aname)] = a
    779                 a.ownerDocument = self.document
    780                 a.value = value
    781                 a.ownerElement = node
    782 
    783     if __debug__:
    784         # This only adds some asserts to the original
    785         # end_element_handler(), so we only define this when -O is not
    786         # used.  If changing one, be sure to check the other to see if
    787         # it needs to be changed as well.
    788         #
    789         def end_element_handler(self, name):
    790             curNode = self.curNode
    791             if ' ' in name:
    792                 uri, localname, prefix, qname = _parse_ns_name(self, name)
    793                 assert (curNode.namespaceURI == uri
    794                         and curNode.localName == localname
    795                         and curNode.prefix == prefix), \
    796                         "element stack messed up! (namespace)"
    797             else:
    798                 assert curNode.nodeName == name, \
    799                        "element stack messed up - bad nodeName"
    800                 assert curNode.namespaceURI == EMPTY_NAMESPACE, \
    801                        "element stack messed up - bad namespaceURI"
    802             self.curNode = curNode.parentNode
    803             self._finish_end_element(curNode)
    804 
    805 
    806 class ExpatBuilderNS(Namespaces, ExpatBuilder):
    807     """Document builder that supports namespaces."""
    808 
    809     def reset(self):
    810         ExpatBuilder.reset(self)
    811         self._initNamespaces()
    812 
    813 
    814 class FragmentBuilderNS(Namespaces, FragmentBuilder):
    815     """Fragment builder that supports namespaces."""
    816 
    817     def reset(self):
    818         FragmentBuilder.reset(self)
    819         self._initNamespaces()
    820 
    821     def _getNSattrs(self):
    822         """Return string of namespace attributes from this element and
    823         ancestors."""
    824         # XXX This needs to be re-written to walk the ancestors of the
    825         # context to build up the namespace information from
    826         # declarations, elements, and attributes found in context.
    827         # Otherwise we have to store a bunch more data on the DOM
    828         # (though that *might* be more reliable -- not clear).
    829         attrs = ""
    830         context = self.context
    831         L = []
    832         while context:
    833             if hasattr(context, '_ns_prefix_uri'):
    834                 for prefix, uri in context._ns_prefix_uri.items():
    835                     # add every new NS decl from context to L and attrs string
    836                     if prefix in L:
    837                         continue
    838                     L.append(prefix)
    839                     if prefix:
    840                         declname = "xmlns:" + prefix
    841                     else:
    842                         declname = "xmlns"
    843                     if attrs:
    844                         attrs = "%s\n    %s='%s'" % (attrs, declname, uri)
    845                     else:
    846                         attrs = " %s='%s'" % (declname, uri)
    847             context = context.parentNode
    848         return attrs
    849 
    850 
    851 class ParseEscape(Exception):
    852     """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
    853     pass
    854 
    855 class InternalSubsetExtractor(ExpatBuilder):
    856     """XML processor which can rip out the internal document type subset."""
    857 
    858     subset = None
    859 
    860     def getSubset(self):
    861         """Return the internal subset as a string."""
    862         return self.subset
    863 
    864     def parseFile(self, file):
    865         try:
    866             ExpatBuilder.parseFile(self, file)
    867         except ParseEscape:
    868             pass
    869 
    870     def parseString(self, string):
    871         try:
    872             ExpatBuilder.parseString(self, string)
    873         except ParseEscape:
    874             pass
    875 
    876     def install(self, parser):
    877         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    878         parser.StartElementHandler = self.start_element_handler
    879 
    880     def start_doctype_decl_handler(self, name, publicId, systemId,
    881                                    has_internal_subset):
    882         if has_internal_subset:
    883             parser = self.getParser()
    884             self.subset = []
    885             parser.DefaultHandler = self.subset.append
    886             parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
    887         else:
    888             raise ParseEscape()
    889 
    890     def end_doctype_decl_handler(self):
    891         s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
    892         self.subset = s
    893         raise ParseEscape()
    894 
    895     def start_element_handler(self, name, attrs):
    896         raise ParseEscape()
    897 
    898 
    899 def parse(file, namespaces=True):
    900     """Parse a document, returning the resulting Document node.
    901 
    902     'file' may be either a file name or an open file object.
    903     """
    904     if namespaces:
    905         builder = ExpatBuilderNS()
    906     else:
    907         builder = ExpatBuilder()
    908 
    909     if isinstance(file, str):
    910         with open(file, 'rb') as fp:
    911             result = builder.parseFile(fp)
    912     else:
    913         result = builder.parseFile(file)
    914     return result
    915 
    916 
    917 def parseString(string, namespaces=True):
    918     """Parse a document from a string, returning the resulting
    919     Document node.
    920     """
    921     if namespaces:
    922         builder = ExpatBuilderNS()
    923     else:
    924         builder = ExpatBuilder()
    925     return builder.parseString(string)
    926 
    927 
    928 def parseFragment(file, context, namespaces=True):
    929     """Parse a fragment of a document, given the context from which it
    930     was originally extracted.  context should be the parent of the
    931     node(s) which are in the fragment.
    932 
    933     'file' may be either a file name or an open file object.
    934     """
    935     if namespaces:
    936         builder = FragmentBuilderNS(context)
    937     else:
    938         builder = FragmentBuilder(context)
    939 
    940     if isinstance(file, str):
    941         with open(file, 'rb') as fp:
    942             result = builder.parseFile(fp)
    943     else:
    944         result = builder.parseFile(file)
    945     return result
    946 
    947 
    948 def parseFragmentString(string, context, namespaces=True):
    949     """Parse a fragment of a document from a string, given the context
    950     from which it was originally extracted.  context should be the
    951     parent of the node(s) which are in the fragment.
    952     """
    953     if namespaces:
    954         builder = FragmentBuilderNS(context)
    955     else:
    956         builder = FragmentBuilder(context)
    957     return builder.parseString(string)
    958 
    959 
    960 def makeBuilder(options):
    961     """Create a builder based on an Options object."""
    962     if options.namespaces:
    963         return ExpatBuilderNS(options)
    964     else:
    965         return ExpatBuilder(options)
    966