Home | History | Annotate | Download | only in dom
      1 """Facility to use the Expat parser to load a minidom instance
      2 from a string or file.
      3 
      4 This avoids all the overhead of SAX and pulldom to gain performance.
      5 """
      6 
      7 # Warning!
      8 #
      9 # This module is tightly bound to the implementation details of the
     10 # minidom DOM and can't be used with other DOM implementations.  This
     11 # is due, in part, to a lack of appropriate methods in the DOM (there is
     12 # no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
     14 # cryptic code.
     15 #
     16 # Performance hacks:
     17 #
     18 #   -  .character_data_handler() has an extra case in which continuing
     19 #      data is appended to an existing Text node; this can be a
     20 #      speedup since pyexpat can break up character data into multiple
     21 #      callbacks even though we set the buffer_text attribute on the
     22 #      parser.  This also gives us the advantage that we don't need a
     23 #      separate normalization pass.
     24 #
     25 #   -  Determining that a node exists is done using an identity comparison
     26 #      with None rather than a truth test; this avoids searching for and
     27 #      calling any methods on the node object if it exists.  (A rather
     28 #      nice speedup is achieved this way as well!)
     29 
     30 from xml.dom import xmlbuilder, minidom, Node
     31 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
     32 from xml.parsers import expat
     33 from xml.dom.minidom import _append_child, _set_attribute_node
     34 from xml.dom.NodeFilter import NodeFilter
     35 
     36 from xml.dom.minicompat import *
     37 
     38 TEXT_NODE = Node.TEXT_NODE
     39 CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE
     40 DOCUMENT_NODE = Node.DOCUMENT_NODE
     41 
     42 FILTER_ACCEPT = xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT
     43 FILTER_REJECT = xmlbuilder.DOMBuilderFilter.FILTER_REJECT
     44 FILTER_SKIP = xmlbuilder.DOMBuilderFilter.FILTER_SKIP
     45 FILTER_INTERRUPT = xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT
     46 
     47 theDOMImplementation = minidom.getDOMImplementation()
     48 
     49 # Expat typename -> TypeInfo
     50 _typeinfo_map = {
     51     "CDATA":    minidom.TypeInfo(None, "cdata"),
     52     "ENUM":     minidom.TypeInfo(None, "enumeration"),
     53     "ENTITY":   minidom.TypeInfo(None, "entity"),
     54     "ENTITIES": minidom.TypeInfo(None, "entities"),
     55     "ID":       minidom.TypeInfo(None, "id"),
     56     "IDREF":    minidom.TypeInfo(None, "idref"),
     57     "IDREFS":   minidom.TypeInfo(None, "idrefs"),
     58     "NMTOKEN":  minidom.TypeInfo(None, "nmtoken"),
     59     "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
     60     }
     61 
     62 class ElementInfo(object):
     63     __slots__ = '_attr_info', '_model', 'tagName'
     64 
     65     def __init__(self, tagName, model=None):
     66         self.tagName = tagName
     67         self._attr_info = []
     68         self._model = model
     69 
     70     def __getstate__(self):
     71         return self._attr_info, self._model, self.tagName
     72 
     73     def __setstate__(self, state):
     74         self._attr_info, self._model, self.tagName = state
     75 
     76     def getAttributeType(self, aname):
     77         for info in self._attr_info:
     78             if info[1] == aname:
     79                 t = info[-2]
     80                 if t[0] == "(":
     81                     return _typeinfo_map["ENUM"]
     82                 else:
     83                     return _typeinfo_map[info[-2]]
     84         return minidom._no_type
     85 
     86     def getAttributeTypeNS(self, namespaceURI, localName):
     87         return minidom._no_type
     88 
     89     def isElementContent(self):
     90         if self._model:
     91             type = self._model[0]
     92             return type not in (expat.model.XML_CTYPE_ANY,
     93                                 expat.model.XML_CTYPE_MIXED)
     94         else:
     95             return False
     96 
     97     def isEmpty(self):
     98         if self._model:
     99             return self._model[0] == expat.model.XML_CTYPE_EMPTY
    100         else:
    101             return False
    102 
    103     def isId(self, aname):
    104         for info in self._attr_info:
    105             if info[1] == aname:
    106                 return info[-2] == "ID"
    107         return False
    108 
    109     def isIdNS(self, euri, ename, auri, aname):
    110         # not sure this is meaningful
    111         return self.isId((auri, aname))
    112 
    113 def _intern(builder, s):
    114     return builder._intern_setdefault(s, s)
    115 
    116 def _parse_ns_name(builder, name):
    117     assert ' ' in name
    118     parts = name.split(' ')
    119     intern = builder._intern_setdefault
    120     if len(parts) == 3:
    121         uri, localname, prefix = parts
    122         prefix = intern(prefix, prefix)
    123         qname = "%s:%s" % (prefix, localname)
    124         qname = intern(qname, qname)
    125         localname = intern(localname, localname)
    126     else:
    127         uri, localname = parts
    128         prefix = EMPTY_PREFIX
    129         qname = localname = intern(localname, localname)
    130     return intern(uri, uri), localname, prefix, qname
    131 
    132 
    133 class ExpatBuilder:
    134     """Document builder that uses Expat to build a ParsedXML.DOM document
    135     instance."""
    136 
    137     def __init__(self, options=None):
    138         if options is None:
    139             options = xmlbuilder.Options()
    140         self._options = options
    141         if self._options.filter is not None:
    142             self._filter = FilterVisibilityController(self._options.filter)
    143         else:
    144             self._filter = None
    145             # This *really* doesn't do anything in this case, so
    146             # override it with something fast & minimal.
    147             self._finish_start_element = id
    148         self._parser = None
    149         self.reset()
    150 
    151     def createParser(self):
    152         """Create a new parser object."""
    153         return expat.ParserCreate()
    154 
    155     def getParser(self):
    156         """Return the parser object, creating a new one if needed."""
    157         if not self._parser:
    158             self._parser = self.createParser()
    159             self._intern_setdefault = self._parser.intern.setdefault
    160             self._parser.buffer_text = True
    161             self._parser.ordered_attributes = True
    162             self._parser.specified_attributes = True
    163             self.install(self._parser)
    164         return self._parser
    165 
    166     def reset(self):
    167         """Free all data structures used during DOM construction."""
    168         self.document = theDOMImplementation.createDocument(
    169             EMPTY_NAMESPACE, None, None)
    170         self.curNode = self.document
    171         self._elem_info = self.document._elem_info
    172         self._cdata = False
    173 
    174     def install(self, parser):
    175         """Install the callbacks needed to build the DOM into the parser."""
    176         # This creates circular references!
    177         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    178         parser.StartElementHandler = self.first_element_handler
    179         parser.EndElementHandler = self.end_element_handler
    180         parser.ProcessingInstructionHandler = self.pi_handler
    181         if self._options.entities:
    182             parser.EntityDeclHandler = self.entity_decl_handler
    183         parser.NotationDeclHandler = self.notation_decl_handler
    184         if self._options.comments:
    185             parser.CommentHandler = self.comment_handler
    186         if self._options.cdata_sections:
    187             parser.StartCdataSectionHandler = self.start_cdata_section_handler
    188             parser.EndCdataSectionHandler = self.end_cdata_section_handler
    189             parser.CharacterDataHandler = self.character_data_handler_cdata
    190         else:
    191             parser.CharacterDataHandler = self.character_data_handler
    192         parser.ExternalEntityRefHandler = self.external_entity_ref_handler
    193         parser.XmlDeclHandler = self.xml_decl_handler
    194         parser.ElementDeclHandler = self.element_decl_handler
    195         parser.AttlistDeclHandler = self.attlist_decl_handler
    196 
    197     def parseFile(self, file):
    198         """Parse a document from a file object, returning the document
    199         node."""
    200         parser = self.getParser()
    201         first_buffer = True
    202         try:
    203             while 1:
    204                 buffer = file.read(16*1024)
    205                 if not buffer:
    206                     break
    207                 parser.Parse(buffer, 0)
    208                 if first_buffer and self.document.documentElement:
    209                     self._setup_subset(buffer)
    210                 first_buffer = False
    211             parser.Parse("", True)
    212         except ParseEscape:
    213             pass
    214         doc = self.document
    215         self.reset()
    216         self._parser = None
    217         return doc
    218 
    219     def parseString(self, string):
    220         """Parse a document from a string, returning the document node."""
    221         parser = self.getParser()
    222         try:
    223             parser.Parse(string, True)
    224             self._setup_subset(string)
    225         except ParseEscape:
    226             pass
    227         doc = self.document
    228         self.reset()
    229         self._parser = None
    230         return doc
    231 
    232     def _setup_subset(self, buffer):
    233         """Load the internal subset if there might be one."""
    234         if self.document.doctype:
    235             extractor = InternalSubsetExtractor()
    236             extractor.parseString(buffer)
    237             subset = extractor.getSubset()
    238             self.document.doctype.internalSubset = subset
    239 
    240     def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
    241                                    has_internal_subset):
    242         doctype = self.document.implementation.createDocumentType(
    243             doctypeName, publicId, systemId)
    244         doctype.ownerDocument = self.document
    245         _append_child(self.document, doctype)
    246         self.document.doctype = doctype
    247         if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
    248             self.document.doctype = None
    249             del self.document.childNodes[-1]
    250             doctype = None
    251             self._parser.EntityDeclHandler = None
    252             self._parser.NotationDeclHandler = None
    253         if has_internal_subset:
    254             if doctype is not None:
    255                 doctype.entities._seq = []
    256                 doctype.notations._seq = []
    257             self._parser.CommentHandler = None
    258             self._parser.ProcessingInstructionHandler = None
    259             self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
    260 
    261     def end_doctype_decl_handler(self):
    262         if self._options.comments:
    263             self._parser.CommentHandler = self.comment_handler
    264         self._parser.ProcessingInstructionHandler = self.pi_handler
    265         if not (self._elem_info or self._filter):
    266             self._finish_end_element = id
    267 
    268     def pi_handler(self, target, data):
    269         node = self.document.createProcessingInstruction(target, data)
    270         _append_child(self.curNode, node)
    271         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    272             self.curNode.removeChild(node)
    273 
    274     def character_data_handler_cdata(self, data):
    275         childNodes = self.curNode.childNodes
    276         if self._cdata:
    277             if (  self._cdata_continue
    278                   and childNodes[-1].nodeType == CDATA_SECTION_NODE):
    279                 childNodes[-1].appendData(data)
    280                 return
    281             node = self.document.createCDATASection(data)
    282             self._cdata_continue = True
    283         elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
    284             node = childNodes[-1]
    285             value = node.data + data
    286             d = node.__dict__
    287             d['data'] = d['nodeValue'] = value
    288             return
    289         else:
    290             node = minidom.Text()
    291             d = node.__dict__
    292             d['data'] = d['nodeValue'] = data
    293             d['ownerDocument'] = self.document
    294         _append_child(self.curNode, node)
    295 
    296     def character_data_handler(self, data):
    297         childNodes = self.curNode.childNodes
    298         if childNodes and childNodes[-1].nodeType == TEXT_NODE:
    299             node = childNodes[-1]
    300             d = node.__dict__
    301             d['data'] = d['nodeValue'] = node.data + data
    302             return
    303         node = minidom.Text()
    304         d = node.__dict__
    305         d['data'] = d['nodeValue'] = node.data + data
    306         d['ownerDocument'] = self.document
    307         _append_child(self.curNode, node)
    308 
    309     def entity_decl_handler(self, entityName, is_parameter_entity, value,
    310                             base, systemId, publicId, notationName):
    311         if is_parameter_entity:
    312             # we don't care about parameter entities for the DOM
    313             return
    314         if not self._options.entities:
    315             return
    316         node = self.document._create_entity(entityName, publicId,
    317                                             systemId, notationName)
    318         if value is not None:
    319             # internal entity
    320             # node *should* be readonly, but we'll cheat
    321             child = self.document.createTextNode(value)
    322             node.childNodes.append(child)
    323         self.document.doctype.entities._seq.append(node)
    324         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    325             del self.document.doctype.entities._seq[-1]
    326 
    327     def notation_decl_handler(self, notationName, base, systemId, publicId):
    328         node = self.document._create_notation(notationName, publicId, systemId)
    329         self.document.doctype.notations._seq.append(node)
    330         if self._filter and self._filter.acceptNode(node) == FILTER_ACCEPT:
    331             del self.document.doctype.notations._seq[-1]
    332 
    333     def comment_handler(self, data):
    334         node = self.document.createComment(data)
    335         _append_child(self.curNode, node)
    336         if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
    337             self.curNode.removeChild(node)
    338 
    339     def start_cdata_section_handler(self):
    340         self._cdata = True
    341         self._cdata_continue = False
    342 
    343     def end_cdata_section_handler(self):
    344         self._cdata = False
    345         self._cdata_continue = False
    346 
    347     def external_entity_ref_handler(self, context, base, systemId, publicId):
    348         return 1
    349 
    350     def first_element_handler(self, name, attributes):
    351         if self._filter is None and not self._elem_info:
    352             self._finish_end_element = id
    353         self.getParser().StartElementHandler = self.start_element_handler
    354         self.start_element_handler(name, attributes)
    355 
    356     def start_element_handler(self, name, attributes):
    357         node = self.document.createElement(name)
    358         _append_child(self.curNode, node)
    359         self.curNode = node
    360 
    361         if attributes:
    362             for i in range(0, len(attributes), 2):
    363                 a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
    364                                  None, EMPTY_PREFIX)
    365                 value = attributes[i+1]
    366                 d = a.childNodes[0].__dict__
    367                 d['data'] = d['nodeValue'] = value
    368                 d = a.__dict__
    369                 d['value'] = d['nodeValue'] = value
    370                 d['ownerDocument'] = self.document
    371                 _set_attribute_node(node, a)
    372 
    373         if node is not self.document.documentElement:
    374             self._finish_start_element(node)
    375 
    376     def _finish_start_element(self, node):
    377         if self._filter:
    378             # To be general, we'd have to call isSameNode(), but this
    379             # is sufficient for minidom:
    380             if node is self.document.documentElement:
    381                 return
    382             filt = self._filter.startContainer(node)
    383             if filt == FILTER_REJECT:
    384                 # ignore this node & all descendents
    385                 Rejecter(self)
    386             elif filt == FILTER_SKIP:
    387                 # ignore this node, but make it's children become
    388                 # children of the parent node
    389                 Skipper(self)
    390             else:
    391                 return
    392             self.curNode = node.parentNode
    393             node.parentNode.removeChild(node)
    394             node.unlink()
    395 
    396     # If this ever changes, Namespaces.end_element_handler() needs to
    397     # be changed to match.
    398     #
    399     def end_element_handler(self, name):
    400         curNode = self.curNode
    401         self.curNode = curNode.parentNode
    402         self._finish_end_element(curNode)
    403 
    404     def _finish_end_element(self, curNode):
    405         info = self._elem_info.get(curNode.tagName)
    406         if info:
    407             self._handle_white_text_nodes(curNode, info)
    408         if self._filter:
    409             if curNode is self.document.documentElement:
    410                 return
    411             if self._filter.acceptNode(curNode) == FILTER_REJECT:
    412                 self.curNode.removeChild(curNode)
    413                 curNode.unlink()
    414 
    415     def _handle_white_text_nodes(self, node, info):
    416         if (self._options.whitespace_in_element_content
    417             or not info.isElementContent()):
    418             return
    419 
    420         # We have element type information and should remove ignorable
    421         # whitespace; identify for text nodes which contain only
    422         # whitespace.
    423         L = []
    424         for child in node.childNodes:
    425             if child.nodeType == TEXT_NODE and not child.data.strip():
    426                 L.append(child)
    427 
    428         # Remove ignorable whitespace from the tree.
    429         for child in L:
    430             node.removeChild(child)
    431 
    432     def element_decl_handler(self, name, model):
    433         info = self._elem_info.get(name)
    434         if info is None:
    435             self._elem_info[name] = ElementInfo(name, model)
    436         else:
    437             assert info._model is None
    438             info._model = model
    439 
    440     def attlist_decl_handler(self, elem, name, type, default, required):
    441         info = self._elem_info.get(elem)
    442         if info is None:
    443             info = ElementInfo(elem)
    444             self._elem_info[elem] = info
    445         info._attr_info.append(
    446             [None, name, None, None, default, 0, type, required])
    447 
    448     def xml_decl_handler(self, version, encoding, standalone):
    449         self.document.version = version
    450         self.document.encoding = encoding
    451         # This is still a little ugly, thanks to the pyexpat API. ;-(
    452         if standalone >= 0:
    453             if standalone:
    454                 self.document.standalone = True
    455             else:
    456                 self.document.standalone = False
    457 
    458 
    459 # Don't include FILTER_INTERRUPT, since that's checked separately
    460 # where allowed.
    461 _ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
    462 
    463 class FilterVisibilityController(object):
    464     """Wrapper around a DOMBuilderFilter which implements the checks
    465     to make the whatToShow filter attribute work."""
    466 
    467     __slots__ = 'filter',
    468 
    469     def __init__(self, filter):
    470         self.filter = filter
    471 
    472     def startContainer(self, node):
    473         mask = self._nodetype_mask[node.nodeType]
    474         if self.filter.whatToShow & mask:
    475             val = self.filter.startContainer(node)
    476             if val == FILTER_INTERRUPT:
    477                 raise ParseEscape
    478             if val not in _ALLOWED_FILTER_RETURNS:
    479                 raise ValueError, \
    480                       "startContainer() returned illegal value: " + repr(val)
    481             return val
    482         else:
    483             return FILTER_ACCEPT
    484 
    485     def acceptNode(self, node):
    486         mask = self._nodetype_mask[node.nodeType]
    487         if self.filter.whatToShow & mask:
    488             val = self.filter.acceptNode(node)
    489             if val == FILTER_INTERRUPT:
    490                 raise ParseEscape
    491             if val == FILTER_SKIP:
    492                 # move all child nodes to the parent, and remove this node
    493                 parent = node.parentNode
    494                 for child in node.childNodes[:]:
    495                     parent.appendChild(child)
    496                 # node is handled by the caller
    497                 return FILTER_REJECT
    498             if val not in _ALLOWED_FILTER_RETURNS:
    499                 raise ValueError, \
    500                       "acceptNode() returned illegal value: " + repr(val)
    501             return val
    502         else:
    503             return FILTER_ACCEPT
    504 
    505     _nodetype_mask = {
    506         Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
    507         Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
    508         Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
    509         Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
    510         Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
    511         Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
    512         Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
    513         Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
    514         Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
    515         Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
    516         Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
    517         Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
    518         }
    519 
    520 
    521 class FilterCrutch(object):
    522     __slots__ = '_builder', '_level', '_old_start', '_old_end'
    523 
    524     def __init__(self, builder):
    525         self._level = 0
    526         self._builder = builder
    527         parser = builder._parser
    528         self._old_start = parser.StartElementHandler
    529         self._old_end = parser.EndElementHandler
    530         parser.StartElementHandler = self.start_element_handler
    531         parser.EndElementHandler = self.end_element_handler
    532 
    533 class Rejecter(FilterCrutch):
    534     __slots__ = ()
    535 
    536     def __init__(self, builder):
    537         FilterCrutch.__init__(self, builder)
    538         parser = builder._parser
    539         for name in ("ProcessingInstructionHandler",
    540                      "CommentHandler",
    541                      "CharacterDataHandler",
    542                      "StartCdataSectionHandler",
    543                      "EndCdataSectionHandler",
    544                      "ExternalEntityRefHandler",
    545                      ):
    546             setattr(parser, name, None)
    547 
    548     def start_element_handler(self, *args):
    549         self._level = self._level + 1
    550 
    551     def end_element_handler(self, *args):
    552         if self._level == 0:
    553             # restore the old handlers
    554             parser = self._builder._parser
    555             self._builder.install(parser)
    556             parser.StartElementHandler = self._old_start
    557             parser.EndElementHandler = self._old_end
    558         else:
    559             self._level = self._level - 1
    560 
    561 class Skipper(FilterCrutch):
    562     __slots__ = ()
    563 
    564     def start_element_handler(self, *args):
    565         node = self._builder.curNode
    566         self._old_start(*args)
    567         if self._builder.curNode is not node:
    568             self._level = self._level + 1
    569 
    570     def end_element_handler(self, *args):
    571         if self._level == 0:
    572             # We're popping back out of the node we're skipping, so we
    573             # shouldn't need to do anything but reset the handlers.
    574             self._builder._parser.StartElementHandler = self._old_start
    575             self._builder._parser.EndElementHandler = self._old_end
    576             self._builder = None
    577         else:
    578             self._level = self._level - 1
    579             self._old_end(*args)
    580 
    581 
    582 # framework document used by the fragment builder.
    583 # Takes a string for the doctype, subset string, and namespace attrs string.
    584 
    585 _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    586     "http://xml.python.org/entities/fragment-builder/internal"
    587 
    588 _FRAGMENT_BUILDER_TEMPLATE = (
    589     '''\
    590 <!DOCTYPE wrapper
    591   %%s [
    592   <!ENTITY fragment-builder-internal
    593     SYSTEM "%s">
    594 %%s
    595 ]>
    596 <wrapper %%s
    597 >&fragment-builder-internal;</wrapper>'''
    598     % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
    599 
    600 
    601 class FragmentBuilder(ExpatBuilder):
    602     """Builder which constructs document fragments given XML source
    603     text and a context node.
    604 
    605     The context node is expected to provide information about the
    606     namespace declarations which are in scope at the start of the
    607     fragment.
    608     """
    609 
    610     def __init__(self, context, options=None):
    611         if context.nodeType == DOCUMENT_NODE:
    612             self.originalDocument = context
    613             self.context = context
    614         else:
    615             self.originalDocument = context.ownerDocument
    616             self.context = context
    617         ExpatBuilder.__init__(self, options)
    618 
    619     def reset(self):
    620         ExpatBuilder.reset(self)
    621         self.fragment = None
    622 
    623     def parseFile(self, file):
    624         """Parse a document fragment from a file object, returning the
    625         fragment node."""
    626         return self.parseString(file.read())
    627 
    628     def parseString(self, string):
    629         """Parse a document fragment from a string, returning the
    630         fragment node."""
    631         self._source = string
    632         parser = self.getParser()
    633         doctype = self.originalDocument.doctype
    634         ident = ""
    635         if doctype:
    636             subset = doctype.internalSubset or self._getDeclarations()
    637             if doctype.publicId:
    638                 ident = ('PUBLIC "%s" "%s"'
    639                          % (doctype.publicId, doctype.systemId))
    640             elif doctype.systemId:
    641                 ident = 'SYSTEM "%s"' % doctype.systemId
    642         else:
    643             subset = ""
    644         nsattrs = self._getNSattrs() # get ns decls from node's ancestors
    645         document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
    646         try:
    647             parser.Parse(document, 1)
    648         except:
    649             self.reset()
    650             raise
    651         fragment = self.fragment
    652         self.reset()
    653 ##         self._parser = None
    654         return fragment
    655 
    656     def _getDeclarations(self):
    657         """Re-create the internal subset from the DocumentType node.
    658 
    659         This is only needed if we don't already have the
    660         internalSubset as a string.
    661         """
    662         doctype = self.context.ownerDocument.doctype
    663         s = ""
    664         if doctype:
    665             for i in range(doctype.notations.length):
    666                 notation = doctype.notations.item(i)
    667                 if s:
    668                     s = s + "\n  "
    669                 s = "%s<!NOTATION %s" % (s, notation.nodeName)
    670                 if notation.publicId:
    671                     s = '%s PUBLIC "%s"\n             "%s">' \
    672                         % (s, notation.publicId, notation.systemId)
    673                 else:
    674                     s = '%s SYSTEM "%s">' % (s, notation.systemId)
    675             for i in range(doctype.entities.length):
    676                 entity = doctype.entities.item(i)
    677                 if s:
    678                     s = s + "\n  "
    679                 s = "%s<!ENTITY %s" % (s, entity.nodeName)
    680                 if entity.publicId:
    681                     s = '%s PUBLIC "%s"\n             "%s"' \
    682                         % (s, entity.publicId, entity.systemId)
    683                 elif entity.systemId:
    684                     s = '%s SYSTEM "%s"' % (s, entity.systemId)
    685                 else:
    686                     s = '%s "%s"' % (s, entity.firstChild.data)
    687                 if entity.notationName:
    688                     s = "%s NOTATION %s" % (s, entity.notationName)
    689                 s = s + ">"
    690         return s
    691 
    692     def _getNSattrs(self):
    693         return ""
    694 
    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Handle an external entity reference reported by the parser.

        A reference whose system identifier equals
        _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID is the entity that stands
        for the fragment source: the stored source is parsed with a
        sub-parser into a newly created document fragment and -1 is
        returned.  Every other reference is delegated to
        ExpatBuilder.external_entity_ref_handler().
        """
        if systemId != _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)

        # This is the entity reference that was made to hold the
        # subtree; all of the given input is parsed inside of it.
        saved_document = self.document
        saved_cur_node = self.curNode
        subparser = self._parser.ExternalEntityParserCreate(context)
        # Switch back to the real document and build into the fragment
        # that is going to be returned.
        self.document = self.originalDocument
        self.fragment = self.document.createDocumentFragment()
        self.curNode = self.fragment
        try:
            subparser.Parse(self._source, 1)
        finally:
            # Restore the builder state even when parsing fails, and
            # drop the reference to the source text.
            self.curNode = saved_cur_node
            self.document = saved_document
            self._source = None
        return -1
    716 
    717 
    718 class Namespaces:
    719     """Mix-in class for builders; adds support for namespaces."""
    720 
    721     def _initNamespaces(self):
    722         # list of (prefix, uri) ns declarations.  Namespace attrs are
    723         # constructed from this and added to the element's attrs.
    724         self._ns_ordered_prefixes = []
    725 
    726     def createParser(self):
    727         """Create a new namespace-handling parser."""
    728         parser = expat.ParserCreate(namespace_separator=" ")
    729         parser.namespace_prefixes = True
    730         return parser
    731 
    732     def install(self, parser):
    733         """Insert the namespace-handlers onto the parser."""
    734         ExpatBuilder.install(self, parser)
    735         if self._options.namespace_declarations:
    736             parser.StartNamespaceDeclHandler = (
    737                 self.start_namespace_decl_handler)
    738 
    739     def start_namespace_decl_handler(self, prefix, uri):
    740         """Push this namespace declaration on our storage."""
    741         self._ns_ordered_prefixes.append((prefix, uri))
    742 
    743     def start_element_handler(self, name, attributes):
    744         if ' ' in name:
    745             uri, localname, prefix, qname = _parse_ns_name(self, name)
    746         else:
    747             uri = EMPTY_NAMESPACE
    748             qname = name
    749             localname = None
    750             prefix = EMPTY_PREFIX
    751         node = minidom.Element(qname, uri, prefix, localname)
    752         node.ownerDocument = self.document
    753         _append_child(self.curNode, node)
    754         self.curNode = node
    755 
    756         if self._ns_ordered_prefixes:
    757             for prefix, uri in self._ns_ordered_prefixes:
    758                 if prefix:
    759                     a = minidom.Attr(_intern(self, 'xmlns:' + prefix),
    760                                      XMLNS_NAMESPACE, prefix, "xmlns")
    761                 else:
    762                     a = minidom.Attr("xmlns", XMLNS_NAMESPACE,
    763                                      "xmlns", EMPTY_PREFIX)
    764                 d = a.childNodes[0].__dict__
    765                 d['data'] = d['nodeValue'] = uri
    766                 d = a.__dict__
    767                 d['value'] = d['nodeValue'] = uri
    768                 d['ownerDocument'] = self.document
    769                 _set_attribute_node(node, a)
    770             del self._ns_ordered_prefixes[:]
    771 
    772         if attributes:
    773             _attrs = node._attrs
    774             _attrsNS = node._attrsNS
    775             for i in range(0, len(attributes), 2):
    776                 aname = attributes[i]
    777                 value = attributes[i+1]
    778                 if ' ' in aname:
    779                     uri, localname, prefix, qname = _parse_ns_name(self, aname)
    780                     a = minidom.Attr(qname, uri, localname, prefix)
    781                     _attrs[qname] = a
    782                     _attrsNS[(uri, localname)] = a
    783                 else:
    784                     a = minidom.Attr(aname, EMPTY_NAMESPACE,
    785                                      aname, EMPTY_PREFIX)
    786                     _attrs[aname] = a
    787                     _attrsNS[(EMPTY_NAMESPACE, aname)] = a
    788                 d = a.childNodes[0].__dict__
    789                 d['data'] = d['nodeValue'] = value
    790                 d = a.__dict__
    791                 d['ownerDocument'] = self.document
    792                 d['value'] = d['nodeValue'] = value
    793                 d['ownerElement'] = node
    794 
    795     if __debug__:
    796         # This only adds some asserts to the original
    797         # end_element_handler(), so we only define this when -O is not
    798         # used.  If changing one, be sure to check the other to see if
    799         # it needs to be changed as well.
    800         #
    801         def end_element_handler(self, name):
    802             curNode = self.curNode
    803             if ' ' in name:
    804                 uri, localname, prefix, qname = _parse_ns_name(self, name)
    805                 assert (curNode.namespaceURI == uri
    806                         and curNode.localName == localname
    807                         and curNode.prefix == prefix), \
    808                         "element stack messed up! (namespace)"
    809             else:
    810                 assert curNode.nodeName == name, \
    811                        "element stack messed up - bad nodeName"
    812                 assert curNode.namespaceURI == EMPTY_NAMESPACE, \
    813                        "element stack messed up - bad namespaceURI"
    814             self.curNode = curNode.parentNode
    815             self._finish_end_element(curNode)
    816 
    817 
    818 class ExpatBuilderNS(Namespaces, ExpatBuilder):
    819     """Document builder that supports namespaces."""
    820 
    821     def reset(self):
    822         ExpatBuilder.reset(self)
    823         self._initNamespaces()
    824 
    825 
    826 class FragmentBuilderNS(Namespaces, FragmentBuilder):
    827     """Fragment builder that supports namespaces."""
    828 
    829     def reset(self):
    830         FragmentBuilder.reset(self)
    831         self._initNamespaces()
    832 
    833     def _getNSattrs(self):
    834         """Return string of namespace attributes from this element and
    835         ancestors."""
    836         # XXX This needs to be re-written to walk the ancestors of the
    837         # context to build up the namespace information from
    838         # declarations, elements, and attributes found in context.
    839         # Otherwise we have to store a bunch more data on the DOM
    840         # (though that *might* be more reliable -- not clear).
    841         attrs = ""
    842         context = self.context
    843         L = []
    844         while context:
    845             if hasattr(context, '_ns_prefix_uri'):
    846                 for prefix, uri in context._ns_prefix_uri.items():
    847                     # add every new NS decl from context to L and attrs string
    848                     if prefix in L:
    849                         continue
    850                     L.append(prefix)
    851                     if prefix:
    852                         declname = "xmlns:" + prefix
    853                     else:
    854                         declname = "xmlns"
    855                     if attrs:
    856                         attrs = "%s\n    %s='%s'" % (attrs, declname, uri)
    857                     else:
    858                         attrs = " %s='%s'" % (declname, uri)
    859             context = context.parentNode
    860         return attrs
    861 
    862 
    863 class ParseEscape(Exception):
    864     """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
    865     pass
    866 
    867 class InternalSubsetExtractor(ExpatBuilder):
    868     """XML processor which can rip out the internal document type subset."""
    869 
    870     subset = None
    871 
    872     def getSubset(self):
    873         """Return the internal subset as a string."""
    874         return self.subset
    875 
    876     def parseFile(self, file):
    877         try:
    878             ExpatBuilder.parseFile(self, file)
    879         except ParseEscape:
    880             pass
    881 
    882     def parseString(self, string):
    883         try:
    884             ExpatBuilder.parseString(self, string)
    885         except ParseEscape:
    886             pass
    887 
    888     def install(self, parser):
    889         parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
    890         parser.StartElementHandler = self.start_element_handler
    891 
    892     def start_doctype_decl_handler(self, name, publicId, systemId,
    893                                    has_internal_subset):
    894         if has_internal_subset:
    895             parser = self.getParser()
    896             self.subset = []
    897             parser.DefaultHandler = self.subset.append
    898             parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler
    899         else:
    900             raise ParseEscape()
    901 
    902     def end_doctype_decl_handler(self):
    903         s = ''.join(self.subset).replace('\r\n', '\n').replace('\r', '\n')
    904         self.subset = s
    905         raise ParseEscape()
    906 
    907     def start_element_handler(self, name, attrs):
    908         raise ParseEscape()
    909 
    910 
    911 def parse(file, namespaces=True):
    912     """Parse a document, returning the resulting Document node.
    913 
    914     'file' may be either a file name or an open file object.
    915     """
    916     if namespaces:
    917         builder = ExpatBuilderNS()
    918     else:
    919         builder = ExpatBuilder()
    920 
    921     if isinstance(file, StringTypes):
    922         fp = open(file, 'rb')
    923         try:
    924             result = builder.parseFile(fp)
    925         finally:
    926             fp.close()
    927     else:
    928         result = builder.parseFile(file)
    929     return result
    930 
    931 
    932 def parseString(string, namespaces=True):
    933     """Parse a document from a string, returning the resulting
    934     Document node.
    935     """
    936     if namespaces:
    937         builder = ExpatBuilderNS()
    938     else:
    939         builder = ExpatBuilder()
    940     return builder.parseString(string)
    941 
    942 
    943 def parseFragment(file, context, namespaces=True):
    944     """Parse a fragment of a document, given the context from which it
    945     was originally extracted.  context should be the parent of the
    946     node(s) which are in the fragment.
    947 
    948     'file' may be either a file name or an open file object.
    949     """
    950     if namespaces:
    951         builder = FragmentBuilderNS(context)
    952     else:
    953         builder = FragmentBuilder(context)
    954 
    955     if isinstance(file, StringTypes):
    956         fp = open(file, 'rb')
    957         try:
    958             result = builder.parseFile(fp)
    959         finally:
    960             fp.close()
    961     else:
    962         result = builder.parseFile(file)
    963     return result
    964 
    965 
    966 def parseFragmentString(string, context, namespaces=True):
    967     """Parse a fragment of a document from a string, given the context
    968     from which it was originally extracted.  context should be the
    969     parent of the node(s) which are in the fragment.
    970     """
    971     if namespaces:
    972         builder = FragmentBuilderNS(context)
    973     else:
    974         builder = FragmentBuilder(context)
    975     return builder.parseString(string)
    976 
    977 
    978 def makeBuilder(options):
    979     """Create a builder based on an Options object."""
    980     if options.namespaces:
    981         return ExpatBuilderNS(options)
    982     else:
    983         return ExpatBuilder(options)
    984