Home | History | Annotate | Download | only in sax
      1 """
      2 SAX driver for the pyexpat C module.  This driver works with
      3 pyexpat.__version__ == '2.22'.
      4 """
      5 
      6 version = "0.20"
      7 
      8 from xml.sax._exceptions import *
      9 from xml.sax.handler import feature_validation, feature_namespaces
     10 from xml.sax.handler import feature_namespace_prefixes
     11 from xml.sax.handler import feature_external_ges, feature_external_pes
     12 from xml.sax.handler import feature_string_interning
     13 from xml.sax.handler import property_xml_string, property_interning_dict
     14 
     15 # xml.parsers.expat does not raise ImportError in Jython
     16 import sys
     17 if sys.platform[:4] == "java":
     18     raise SAXReaderNotAvailable("expat not available in Java", None)
     19 del sys
     20 
     21 try:
     22     from xml.parsers import expat
     23 except ImportError:
     24     raise SAXReaderNotAvailable("expat not supported", None)
     25 else:
     26     if not hasattr(expat, "ParserCreate"):
     27         raise SAXReaderNotAvailable("expat not supported", None)
     28 from xml.sax import xmlreader, saxutils, handler
     29 
     30 AttributesImpl = xmlreader.AttributesImpl
     31 AttributesNSImpl = xmlreader.AttributesNSImpl
     32 
# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler; otherwise we'll just have to pretend.
#
# _mkproxy(o) is what ExpatLocator uses to hold on to its parser:
#   * when the _weakref module imports, it is weakref.proxy;
#   * otherwise it is an identity function that returns o itself.
try:
    # Only used as a probe for weak-reference support.
    import _weakref
except ImportError:
    def _mkproxy(o):
        # Fallback: no weak references, hand back the object unchanged.
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    # Remove the temporary module names from this module's namespace.
    del weakref, _weakref
     45 
class _ClosedParser:
    """Placeholder stored in ExpatParser._parser after a failed close().

    ExpatParser.close() copies ErrorColumnNumber and ErrorLineNumber from
    the expat parser onto an instance of this class so that the locator
    methods keep working once the real parser has been dropped.
    """
    pass
     48 
     49 # --- ExpatLocator
     50 
     51 class ExpatLocator(xmlreader.Locator):
     52     """Locator for use with the ExpatParser class.
     53 
     54     This uses a weak reference to the parser object to avoid creating
     55     a circular reference between the parser and the content handler.
     56     """
     57     def __init__(self, parser):
     58         self._ref = _mkproxy(parser)
     59 
     60     def getColumnNumber(self):
     61         parser = self._ref
     62         if parser._parser is None:
     63             return None
     64         return parser._parser.ErrorColumnNumber
     65 
     66     def getLineNumber(self):
     67         parser = self._ref
     68         if parser._parser is None:
     69             return 1
     70         return parser._parser.ErrorLineNumber
     71 
     72     def getPublicId(self):
     73         parser = self._ref
     74         if parser is None:
     75             return None
     76         return parser._source.getPublicId()
     77 
     78     def getSystemId(self):
     79         parser = self._ref
     80         if parser is None:
     81             return None
     82         return parser._source.getSystemId()
     83 
     84 
     85 # --- ExpatParser
     86 
     87 class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
     88     """SAX driver for the pyexpat C module."""
     89 
     90     def __init__(self, namespaceHandling=0, bufsize=2**16-20):
     91         xmlreader.IncrementalParser.__init__(self, bufsize)
     92         self._source = xmlreader.InputSource()
     93         self._parser = None
     94         self._namespaces = namespaceHandling
     95         self._lex_handler_prop = None
     96         self._parsing = 0
     97         self._entity_stack = []
     98         self._external_ges = 1
     99         self._interning = None
    100 
    101     # XMLReader methods
    102 
    103     def parse(self, source):
    104         "Parse an XML document from a URL or an InputSource."
    105         source = saxutils.prepare_input_source(source)
    106 
    107         self._source = source
    108         try:
    109             self.reset()
    110             self._cont_handler.setDocumentLocator(ExpatLocator(self))
    111             xmlreader.IncrementalParser.parse(self, source)
    112         except:
    113             # bpo-30264: Close the source on error to not leak resources:
    114             # xml.sax.parse() doesn't give access to the underlying parser
    115             # to the caller
    116             self._close_source()
    117             raise
    118 
    119     def prepareParser(self, source):
    120         if source.getSystemId() is not None:
    121             base = source.getSystemId()
    122             if isinstance(base, unicode):
    123                 base = base.encode('utf-8')
    124             self._parser.SetBase(base)
    125 
    126     # Redefined setContentHandler to allow changing handlers during parsing
    127 
    128     def setContentHandler(self, handler):
    129         xmlreader.IncrementalParser.setContentHandler(self, handler)
    130         if self._parsing:
    131             self._reset_cont_handler()
    132 
    133     def getFeature(self, name):
    134         if name == feature_namespaces:
    135             return self._namespaces
    136         elif name == feature_string_interning:
    137             return self._interning is not None
    138         elif name in (feature_validation, feature_external_pes,
    139                       feature_namespace_prefixes):
    140             return 0
    141         elif name == feature_external_ges:
    142             return self._external_ges
    143         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
    144 
    145     def setFeature(self, name, state):
    146         if self._parsing:
    147             raise SAXNotSupportedException("Cannot set features while parsing")
    148 
    149         if name == feature_namespaces:
    150             self._namespaces = state
    151         elif name == feature_external_ges:
    152             self._external_ges = state
    153         elif name == feature_string_interning:
    154             if state:
    155                 if self._interning is None:
    156                     self._interning = {}
    157             else:
    158                 self._interning = None
    159         elif name == feature_validation:
    160             if state:
    161                 raise SAXNotSupportedException(
    162                     "expat does not support validation")
    163         elif name == feature_external_pes:
    164             if state:
    165                 raise SAXNotSupportedException(
    166                     "expat does not read external parameter entities")
    167         elif name == feature_namespace_prefixes:
    168             if state:
    169                 raise SAXNotSupportedException(
    170                     "expat does not report namespace prefixes")
    171         else:
    172             raise SAXNotRecognizedException(
    173                 "Feature '%s' not recognized" % name)
    174 
    175     def getProperty(self, name):
    176         if name == handler.property_lexical_handler:
    177             return self._lex_handler_prop
    178         elif name == property_interning_dict:
    179             return self._interning
    180         elif name == property_xml_string:
    181             if self._parser:
    182                 if hasattr(self._parser, "GetInputContext"):
    183                     return self._parser.GetInputContext()
    184                 else:
    185                     raise SAXNotRecognizedException(
    186                         "This version of expat does not support getting"
    187                         " the XML string")
    188             else:
    189                 raise SAXNotSupportedException(
    190                     "XML string cannot be returned when not parsing")
    191         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
    192 
    193     def setProperty(self, name, value):
    194         if name == handler.property_lexical_handler:
    195             self._lex_handler_prop = value
    196             if self._parsing:
    197                 self._reset_lex_handler_prop()
    198         elif name == property_interning_dict:
    199             self._interning = value
    200         elif name == property_xml_string:
    201             raise SAXNotSupportedException("Property '%s' cannot be set" %
    202                                            name)
    203         else:
    204             raise SAXNotRecognizedException("Property '%s' not recognized" %
    205                                             name)
    206 
    207     # IncrementalParser methods
    208 
    209     def feed(self, data, isFinal = 0):
    210         if not self._parsing:
    211             self.reset()
    212             self._parsing = 1
    213             self._cont_handler.startDocument()
    214 
    215         try:
    216             # The isFinal parameter is internal to the expat reader.
    217             # If it is set to true, expat will check validity of the entire
    218             # document. When feeding chunks, they are not normally final -
    219             # except when invoked from close.
    220             self._parser.Parse(data, isFinal)
    221         except expat.error, e:
    222             exc = SAXParseException(expat.ErrorString(e.code), e, self)
    223             # FIXME: when to invoke error()?
    224             self._err_handler.fatalError(exc)
    225 
    226     def _close_source(self):
    227         source = self._source
    228         try:
    229             file = source.getCharacterStream()
    230             if file is not None:
    231                 file.close()
    232         finally:
    233             file = source.getByteStream()
    234             if file is not None:
    235                 file.close()
    236 
    237     def close(self):
    238         if (self._entity_stack or self._parser is None or
    239             isinstance(self._parser, _ClosedParser)):
    240             # If we are completing an external entity, do nothing here
    241             return
    242         try:
    243             self.feed("", isFinal = 1)
    244             self._cont_handler.endDocument()
    245             self._parsing = 0
    246             # break cycle created by expat handlers pointing to our methods
    247             self._parser = None
    248         finally:
    249             self._parsing = 0
    250             if self._parser is not None:
    251                 # Keep ErrorColumnNumber and ErrorLineNumber after closing.
    252                 parser = _ClosedParser()
    253                 parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
    254                 parser.ErrorLineNumber = self._parser.ErrorLineNumber
    255                 self._parser = parser
    256             self._close_source()
    257 
    258     def _reset_cont_handler(self):
    259         self._parser.ProcessingInstructionHandler = \
    260                                     self._cont_handler.processingInstruction
    261         self._parser.CharacterDataHandler = self._cont_handler.characters
    262 
    263     def _reset_lex_handler_prop(self):
    264         lex = self._lex_handler_prop
    265         parser = self._parser
    266         if lex is None:
    267             parser.CommentHandler = None
    268             parser.StartCdataSectionHandler = None
    269             parser.EndCdataSectionHandler = None
    270             parser.StartDoctypeDeclHandler = None
    271             parser.EndDoctypeDeclHandler = None
    272         else:
    273             parser.CommentHandler = lex.comment
    274             parser.StartCdataSectionHandler = lex.startCDATA
    275             parser.EndCdataSectionHandler = lex.endCDATA
    276             parser.StartDoctypeDeclHandler = self.start_doctype_decl
    277             parser.EndDoctypeDeclHandler = lex.endDTD
    278 
    279     def reset(self):
    280         if self._namespaces:
    281             self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
    282                                               intern=self._interning)
    283             self._parser.namespace_prefixes = 1
    284             self._parser.StartElementHandler = self.start_element_ns
    285             self._parser.EndElementHandler = self.end_element_ns
    286         else:
    287             self._parser = expat.ParserCreate(self._source.getEncoding(),
    288                                               intern = self._interning)
    289             self._parser.StartElementHandler = self.start_element
    290             self._parser.EndElementHandler = self.end_element
    291 
    292         self._reset_cont_handler()
    293         self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
    294         self._parser.NotationDeclHandler = self.notation_decl
    295         self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
    296         self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
    297 
    298         self._decl_handler_prop = None
    299         if self._lex_handler_prop:
    300             self._reset_lex_handler_prop()
    301 #         self._parser.DefaultHandler =
    302 #         self._parser.DefaultHandlerExpand =
    303 #         self._parser.NotStandaloneHandler =
    304         self._parser.ExternalEntityRefHandler = self.external_entity_ref
    305         try:
    306             self._parser.SkippedEntityHandler = self.skipped_entity_handler
    307         except AttributeError:
    308             # This pyexpat does not support SkippedEntity
    309             pass
    310         self._parser.SetParamEntityParsing(
    311             expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
    312 
    313         self._parsing = 0
    314         self._entity_stack = []
    315 
    316     # Locator methods
    317 
    318     def getColumnNumber(self):
    319         if self._parser is None:
    320             return None
    321         return self._parser.ErrorColumnNumber
    322 
    323     def getLineNumber(self):
    324         if self._parser is None:
    325             return 1
    326         return self._parser.ErrorLineNumber
    327 
    328     def getPublicId(self):
    329         return self._source.getPublicId()
    330 
    331     def getSystemId(self):
    332         return self._source.getSystemId()
    333 
    334     # event handlers
    335     def start_element(self, name, attrs):
    336         self._cont_handler.startElement(name, AttributesImpl(attrs))
    337 
    338     def end_element(self, name):
    339         self._cont_handler.endElement(name)
    340 
    341     def start_element_ns(self, name, attrs):
    342         pair = name.split()
    343         if len(pair) == 1:
    344             # no namespace
    345             pair = (None, name)
    346         elif len(pair) == 3:
    347             pair = pair[0], pair[1]
    348         else:
    349             # default namespace
    350             pair = tuple(pair)
    351 
    352         newattrs = {}
    353         qnames = {}
    354         for (aname, value) in attrs.items():
    355             parts = aname.split()
    356             length = len(parts)
    357             if length == 1:
    358                 # no namespace
    359                 qname = aname
    360                 apair = (None, aname)
    361             elif length == 3:
    362                 qname = "%s:%s" % (parts[2], parts[1])
    363                 apair = parts[0], parts[1]
    364             else:
    365                 # default namespace
    366                 qname = parts[1]
    367                 apair = tuple(parts)
    368 
    369             newattrs[apair] = value
    370             qnames[apair] = qname
    371 
    372         self._cont_handler.startElementNS(pair, None,
    373                                           AttributesNSImpl(newattrs, qnames))
    374 
    375     def end_element_ns(self, name):
    376         pair = name.split()
    377         if len(pair) == 1:
    378             pair = (None, name)
    379         elif len(pair) == 3:
    380             pair = pair[0], pair[1]
    381         else:
    382             pair = tuple(pair)
    383 
    384         self._cont_handler.endElementNS(pair, None)
    385 
    386     # this is not used (call directly to ContentHandler)
    387     def processing_instruction(self, target, data):
    388         self._cont_handler.processingInstruction(target, data)
    389 
    390     # this is not used (call directly to ContentHandler)
    391     def character_data(self, data):
    392         self._cont_handler.characters(data)
    393 
    394     def start_namespace_decl(self, prefix, uri):
    395         self._cont_handler.startPrefixMapping(prefix, uri)
    396 
    397     def end_namespace_decl(self, prefix):
    398         self._cont_handler.endPrefixMapping(prefix)
    399 
    400     def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
    401         self._lex_handler_prop.startDTD(name, pubid, sysid)
    402 
    403     def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
    404         self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
    405 
    406     def notation_decl(self, name, base, sysid, pubid):
    407         self._dtd_handler.notationDecl(name, pubid, sysid)
    408 
    409     def external_entity_ref(self, context, base, sysid, pubid):
    410         if not self._external_ges:
    411             return 1
    412 
    413         source = self._ent_handler.resolveEntity(pubid, sysid)
    414         source = saxutils.prepare_input_source(source,
    415                                                self._source.getSystemId() or
    416                                                "")
    417 
    418         self._entity_stack.append((self._parser, self._source))
    419         self._parser = self._parser.ExternalEntityParserCreate(context)
    420         self._source = source
    421 
    422         try:
    423             xmlreader.IncrementalParser.parse(self, source)
    424         except:
    425             return 0  # FIXME: save error info here?
    426 
    427         (self._parser, self._source) = self._entity_stack[-1]
    428         del self._entity_stack[-1]
    429         return 1
    430 
    431     def skipped_entity_handler(self, name, is_pe):
    432         if is_pe:
    433             # The SAX spec requires to report skipped PEs with a '%'
    434             name = '%'+name
    435         self._cont_handler.skippedEntity(name)
    436 
    437 # ---
    438 
def create_parser(*args, **kwargs):
    """Return a new ExpatParser, passing all arguments to its constructor."""
    return ExpatParser(*args, **kwargs)
    441 
    442 # ---
    443 
if __name__ == "__main__":
    # Manual smoke test: parse the document at the URL below and pass the
    # resulting SAX events to an XMLGenerator.
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
    450