Home | History | Annotate | Download | only in sax
      1 """
      2 SAX driver for the pyexpat C module.  This driver works with
      3 pyexpat.__version__ == '2.22'.
      4 """
      5 
      6 version = "0.20"
      7 
      8 from xml.sax._exceptions import *
      9 from xml.sax.handler import feature_validation, feature_namespaces
     10 from xml.sax.handler import feature_namespace_prefixes
     11 from xml.sax.handler import feature_external_ges, feature_external_pes
     12 from xml.sax.handler import feature_string_interning
     13 from xml.sax.handler import property_xml_string, property_interning_dict
     14 
     15 # xml.parsers.expat does not raise ImportError in Jython
     16 import sys
     17 if sys.platform[:4] == "java":
     18     raise SAXReaderNotAvailable("expat not available in Java", None)
     19 del sys
     20 
     21 try:
     22     from xml.parsers import expat
     23 except ImportError:
     24     raise SAXReaderNotAvailable("expat not supported", None)
     25 else:
     26     if not hasattr(expat, "ParserCreate"):
     27         raise SAXReaderNotAvailable("expat not supported", None)
     28 from xml.sax import xmlreader, saxutils, handler
     29 
     30 AttributesImpl = xmlreader.AttributesImpl
     31 AttributesNSImpl = xmlreader.AttributesNSImpl
     32 
     33 # If we're using a sufficiently recent version of Python, we can use
     34 # weak references to avoid cycles between the parser and content
     35 # handler, otherwise we'll just have to pretend.
     36 try:
     37     import _weakref
     38 except ImportError:
     39     def _mkproxy(o):
     40         return o
     41 else:
     42     import weakref
     43     _mkproxy = weakref.proxy
     44     del weakref, _weakref
     45 
     46 # --- ExpatLocator
     47 
     48 class ExpatLocator(xmlreader.Locator):
     49     """Locator for use with the ExpatParser class.
     50 
     51     This uses a weak reference to the parser object to avoid creating
     52     a circular reference between the parser and the content handler.
     53     """
     54     def __init__(self, parser):
     55         self._ref = _mkproxy(parser)
     56 
     57     def getColumnNumber(self):
     58         parser = self._ref
     59         if parser._parser is None:
     60             return None
     61         return parser._parser.ErrorColumnNumber
     62 
     63     def getLineNumber(self):
     64         parser = self._ref
     65         if parser._parser is None:
     66             return 1
     67         return parser._parser.ErrorLineNumber
     68 
     69     def getPublicId(self):
     70         parser = self._ref
     71         if parser is None:
     72             return None
     73         return parser._source.getPublicId()
     74 
     75     def getSystemId(self):
     76         parser = self._ref
     77         if parser is None:
     78             return None
     79         return parser._source.getSystemId()
     80 
     81 
     82 # --- ExpatParser
     83 
     84 class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
     85     """SAX driver for the pyexpat C module."""
     86 
     87     def __init__(self, namespaceHandling=0, bufsize=2**16-20):
     88         xmlreader.IncrementalParser.__init__(self, bufsize)
     89         self._source = xmlreader.InputSource()
     90         self._parser = None
     91         self._namespaces = namespaceHandling
     92         self._lex_handler_prop = None
     93         self._parsing = 0
     94         self._entity_stack = []
     95         self._external_ges = 1
     96         self._interning = None
     97 
     98     # XMLReader methods
     99 
    100     def parse(self, source):
    101         "Parse an XML document from a URL or an InputSource."
    102         source = saxutils.prepare_input_source(source)
    103 
    104         self._source = source
    105         self.reset()
    106         self._cont_handler.setDocumentLocator(ExpatLocator(self))
    107         xmlreader.IncrementalParser.parse(self, source)
    108 
    109     def prepareParser(self, source):
    110         if source.getSystemId() is not None:
    111             base = source.getSystemId()
    112             if isinstance(base, unicode):
    113                 base = base.encode('utf-8')
    114             self._parser.SetBase(base)
    115 
    116     # Redefined setContentHandler to allow changing handlers during parsing
    117 
    118     def setContentHandler(self, handler):
    119         xmlreader.IncrementalParser.setContentHandler(self, handler)
    120         if self._parsing:
    121             self._reset_cont_handler()
    122 
    123     def getFeature(self, name):
    124         if name == feature_namespaces:
    125             return self._namespaces
    126         elif name == feature_string_interning:
    127             return self._interning is not None
    128         elif name in (feature_validation, feature_external_pes,
    129                       feature_namespace_prefixes):
    130             return 0
    131         elif name == feature_external_ges:
    132             return self._external_ges
    133         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
    134 
    135     def setFeature(self, name, state):
    136         if self._parsing:
    137             raise SAXNotSupportedException("Cannot set features while parsing")
    138 
    139         if name == feature_namespaces:
    140             self._namespaces = state
    141         elif name == feature_external_ges:
    142             self._external_ges = state
    143         elif name == feature_string_interning:
    144             if state:
    145                 if self._interning is None:
    146                     self._interning = {}
    147             else:
    148                 self._interning = None
    149         elif name == feature_validation:
    150             if state:
    151                 raise SAXNotSupportedException(
    152                     "expat does not support validation")
    153         elif name == feature_external_pes:
    154             if state:
    155                 raise SAXNotSupportedException(
    156                     "expat does not read external parameter entities")
    157         elif name == feature_namespace_prefixes:
    158             if state:
    159                 raise SAXNotSupportedException(
    160                     "expat does not report namespace prefixes")
    161         else:
    162             raise SAXNotRecognizedException(
    163                 "Feature '%s' not recognized" % name)
    164 
    165     def getProperty(self, name):
    166         if name == handler.property_lexical_handler:
    167             return self._lex_handler_prop
    168         elif name == property_interning_dict:
    169             return self._interning
    170         elif name == property_xml_string:
    171             if self._parser:
    172                 if hasattr(self._parser, "GetInputContext"):
    173                     return self._parser.GetInputContext()
    174                 else:
    175                     raise SAXNotRecognizedException(
    176                         "This version of expat does not support getting"
    177                         " the XML string")
    178             else:
    179                 raise SAXNotSupportedException(
    180                     "XML string cannot be returned when not parsing")
    181         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
    182 
    183     def setProperty(self, name, value):
    184         if name == handler.property_lexical_handler:
    185             self._lex_handler_prop = value
    186             if self._parsing:
    187                 self._reset_lex_handler_prop()
    188         elif name == property_interning_dict:
    189             self._interning = value
    190         elif name == property_xml_string:
    191             raise SAXNotSupportedException("Property '%s' cannot be set" %
    192                                            name)
    193         else:
    194             raise SAXNotRecognizedException("Property '%s' not recognized" %
    195                                             name)
    196 
    197     # IncrementalParser methods
    198 
    199     def feed(self, data, isFinal = 0):
    200         if not self._parsing:
    201             self.reset()
    202             self._parsing = 1
    203             self._cont_handler.startDocument()
    204 
    205         try:
    206             # The isFinal parameter is internal to the expat reader.
    207             # If it is set to true, expat will check validity of the entire
    208             # document. When feeding chunks, they are not normally final -
    209             # except when invoked from close.
    210             self._parser.Parse(data, isFinal)
    211         except expat.error, e:
    212             exc = SAXParseException(expat.ErrorString(e.code), e, self)
    213             # FIXME: when to invoke error()?
    214             self._err_handler.fatalError(exc)
    215 
    216     def close(self):
    217         if self._entity_stack:
    218             # If we are completing an external entity, do nothing here
    219             return
    220         self.feed("", isFinal = 1)
    221         self._cont_handler.endDocument()
    222         self._parsing = 0
    223         # break cycle created by expat handlers pointing to our methods
    224         self._parser = None
    225 
    226     def _reset_cont_handler(self):
    227         self._parser.ProcessingInstructionHandler = \
    228                                     self._cont_handler.processingInstruction
    229         self._parser.CharacterDataHandler = self._cont_handler.characters
    230 
    231     def _reset_lex_handler_prop(self):
    232         lex = self._lex_handler_prop
    233         parser = self._parser
    234         if lex is None:
    235             parser.CommentHandler = None
    236             parser.StartCdataSectionHandler = None
    237             parser.EndCdataSectionHandler = None
    238             parser.StartDoctypeDeclHandler = None
    239             parser.EndDoctypeDeclHandler = None
    240         else:
    241             parser.CommentHandler = lex.comment
    242             parser.StartCdataSectionHandler = lex.startCDATA
    243             parser.EndCdataSectionHandler = lex.endCDATA
    244             parser.StartDoctypeDeclHandler = self.start_doctype_decl
    245             parser.EndDoctypeDeclHandler = lex.endDTD
    246 
    247     def reset(self):
    248         if self._namespaces:
    249             self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
    250                                               intern=self._interning)
    251             self._parser.namespace_prefixes = 1
    252             self._parser.StartElementHandler = self.start_element_ns
    253             self._parser.EndElementHandler = self.end_element_ns
    254         else:
    255             self._parser = expat.ParserCreate(self._source.getEncoding(),
    256                                               intern = self._interning)
    257             self._parser.StartElementHandler = self.start_element
    258             self._parser.EndElementHandler = self.end_element
    259 
    260         self._reset_cont_handler()
    261         self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
    262         self._parser.NotationDeclHandler = self.notation_decl
    263         self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
    264         self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
    265 
    266         self._decl_handler_prop = None
    267         if self._lex_handler_prop:
    268             self._reset_lex_handler_prop()
    269 #         self._parser.DefaultHandler =
    270 #         self._parser.DefaultHandlerExpand =
    271 #         self._parser.NotStandaloneHandler =
    272         self._parser.ExternalEntityRefHandler = self.external_entity_ref
    273         try:
    274             self._parser.SkippedEntityHandler = self.skipped_entity_handler
    275         except AttributeError:
    276             # This pyexpat does not support SkippedEntity
    277             pass
    278         self._parser.SetParamEntityParsing(
    279             expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
    280 
    281         self._parsing = 0
    282         self._entity_stack = []
    283 
    284     # Locator methods
    285 
    286     def getColumnNumber(self):
    287         if self._parser is None:
    288             return None
    289         return self._parser.ErrorColumnNumber
    290 
    291     def getLineNumber(self):
    292         if self._parser is None:
    293             return 1
    294         return self._parser.ErrorLineNumber
    295 
    296     def getPublicId(self):
    297         return self._source.getPublicId()
    298 
    299     def getSystemId(self):
    300         return self._source.getSystemId()
    301 
    302     # event handlers
    303     def start_element(self, name, attrs):
    304         self._cont_handler.startElement(name, AttributesImpl(attrs))
    305 
    306     def end_element(self, name):
    307         self._cont_handler.endElement(name)
    308 
    309     def start_element_ns(self, name, attrs):
    310         pair = name.split()
    311         if len(pair) == 1:
    312             # no namespace
    313             pair = (None, name)
    314         elif len(pair) == 3:
    315             pair = pair[0], pair[1]
    316         else:
    317             # default namespace
    318             pair = tuple(pair)
    319 
    320         newattrs = {}
    321         qnames = {}
    322         for (aname, value) in attrs.items():
    323             parts = aname.split()
    324             length = len(parts)
    325             if length == 1:
    326                 # no namespace
    327                 qname = aname
    328                 apair = (None, aname)
    329             elif length == 3:
    330                 qname = "%s:%s" % (parts[2], parts[1])
    331                 apair = parts[0], parts[1]
    332             else:
    333                 # default namespace
    334                 qname = parts[1]
    335                 apair = tuple(parts)
    336 
    337             newattrs[apair] = value
    338             qnames[apair] = qname
    339 
    340         self._cont_handler.startElementNS(pair, None,
    341                                           AttributesNSImpl(newattrs, qnames))
    342 
    343     def end_element_ns(self, name):
    344         pair = name.split()
    345         if len(pair) == 1:
    346             pair = (None, name)
    347         elif len(pair) == 3:
    348             pair = pair[0], pair[1]
    349         else:
    350             pair = tuple(pair)
    351 
    352         self._cont_handler.endElementNS(pair, None)
    353 
    354     # this is not used (call directly to ContentHandler)
    355     def processing_instruction(self, target, data):
    356         self._cont_handler.processingInstruction(target, data)
    357 
    358     # this is not used (call directly to ContentHandler)
    359     def character_data(self, data):
    360         self._cont_handler.characters(data)
    361 
    362     def start_namespace_decl(self, prefix, uri):
    363         self._cont_handler.startPrefixMapping(prefix, uri)
    364 
    365     def end_namespace_decl(self, prefix):
    366         self._cont_handler.endPrefixMapping(prefix)
    367 
    368     def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
    369         self._lex_handler_prop.startDTD(name, pubid, sysid)
    370 
    371     def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
    372         self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
    373 
    374     def notation_decl(self, name, base, sysid, pubid):
    375         self._dtd_handler.notationDecl(name, pubid, sysid)
    376 
    377     def external_entity_ref(self, context, base, sysid, pubid):
    378         if not self._external_ges:
    379             return 1
    380 
    381         source = self._ent_handler.resolveEntity(pubid, sysid)
    382         source = saxutils.prepare_input_source(source,
    383                                                self._source.getSystemId() or
    384                                                "")
    385 
    386         self._entity_stack.append((self._parser, self._source))
    387         self._parser = self._parser.ExternalEntityParserCreate(context)
    388         self._source = source
    389 
    390         try:
    391             xmlreader.IncrementalParser.parse(self, source)
    392         except:
    393             return 0  # FIXME: save error info here?
    394 
    395         (self._parser, self._source) = self._entity_stack[-1]
    396         del self._entity_stack[-1]
    397         return 1
    398 
    399     def skipped_entity_handler(self, name, is_pe):
    400         if is_pe:
    401             # The SAX spec requires to report skipped PEs with a '%'
    402             name = '%'+name
    403         self._cont_handler.skippedEntity(name)
    404 
    405 # ---
    406 
    407 def create_parser(*args, **kwargs):
    408     return ExpatParser(*args, **kwargs)
    409 
    410 # ---
    411 
    412 if __name__ == "__main__":
    413     import xml.sax.saxutils
    414     p = create_parser()
    415     p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    416     p.setErrorHandler(xml.sax.ErrorHandler())
    417     p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
    418