# (code-browser navigation residue, not part of the module:
#  Home | History | Annotate | Download | only in sax)
      1 """
      2 SAX driver for the pyexpat C module.  This driver works with
      3 pyexpat.__version__ == '2.22'.
      4 """
      5 
      6 version = "0.20"
      7 
      8 from xml.sax._exceptions import *
      9 from xml.sax.handler import feature_validation, feature_namespaces
     10 from xml.sax.handler import feature_namespace_prefixes
     11 from xml.sax.handler import feature_external_ges, feature_external_pes
     12 from xml.sax.handler import feature_string_interning
     13 from xml.sax.handler import property_xml_string, property_interning_dict
     14 
     15 # xml.parsers.expat does not raise ImportError in Jython

     16 import sys
     17 if sys.platform[:4] == "java":
     18     raise SAXReaderNotAvailable("expat not available in Java", None)
     19 del sys
     20 
     21 try:
     22     from xml.parsers import expat
     23 except ImportError:
     24     raise SAXReaderNotAvailable("expat not supported", None)
     25 else:
     26     if not hasattr(expat, "ParserCreate"):
     27         raise SAXReaderNotAvailable("expat not supported", None)
     28 from xml.sax import xmlreader, saxutils, handler
     29 
     30 AttributesImpl = xmlreader.AttributesImpl
     31 AttributesNSImpl = xmlreader.AttributesNSImpl
     32 
     33 # If we're using a sufficiently recent version of Python, we can use

     34 # weak references to avoid cycles between the parser and content

     35 # handler, otherwise we'll just have to pretend.

     36 try:
     37     import _weakref
     38 except ImportError:
     39     def _mkproxy(o):
     40         return o
     41 else:
     42     import weakref
     43     _mkproxy = weakref.proxy
     44     del weakref, _weakref
     45 
     46 # --- ExpatLocator

     47 
     48 class ExpatLocator(xmlreader.Locator):
     49     """Locator for use with the ExpatParser class.
     50 
     51     This uses a weak reference to the parser object to avoid creating
     52     a circular reference between the parser and the content handler.
     53     """
     54     def __init__(self, parser):
     55         self._ref = _mkproxy(parser)
     56 
     57     def getColumnNumber(self):
     58         parser = self._ref
     59         if parser._parser is None:
     60             return None
     61         return parser._parser.ErrorColumnNumber
     62 
     63     def getLineNumber(self):
     64         parser = self._ref
     65         if parser._parser is None:
     66             return 1
     67         return parser._parser.ErrorLineNumber
     68 
     69     def getPublicId(self):
     70         parser = self._ref
     71         if parser is None:
     72             return None
     73         return parser._source.getPublicId()
     74 
     75     def getSystemId(self):
     76         parser = self._ref
     77         if parser is None:
     78             return None
     79         return parser._source.getSystemId()
     80 
     81 
     82 # --- ExpatParser

     83 
     84 class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
     85     """SAX driver for the pyexpat C module."""
     86 
     87     def __init__(self, namespaceHandling=0, bufsize=2**16-20):
     88         xmlreader.IncrementalParser.__init__(self, bufsize)
     89         self._source = xmlreader.InputSource()
     90         self._parser = None
     91         self._namespaces = namespaceHandling
     92         self._lex_handler_prop = None
     93         self._parsing = 0
     94         self._entity_stack = []
     95         self._external_ges = 1
     96         self._interning = None
     97 
     98     # XMLReader methods

     99 
    100     def parse(self, source):
    101         "Parse an XML document from a URL or an InputSource."
    102         source = saxutils.prepare_input_source(source)
    103 
    104         self._source = source
    105         self.reset()
    106         self._cont_handler.setDocumentLocator(ExpatLocator(self))
    107         xmlreader.IncrementalParser.parse(self, source)
    108 
    109     def prepareParser(self, source):
    110         if source.getSystemId() is not None:
    111             self._parser.SetBase(source.getSystemId())
    112 
    113     # Redefined setContentHandler to allow changing handlers during parsing

    114 
    115     def setContentHandler(self, handler):
    116         xmlreader.IncrementalParser.setContentHandler(self, handler)
    117         if self._parsing:
    118             self._reset_cont_handler()
    119 
    120     def getFeature(self, name):
    121         if name == feature_namespaces:
    122             return self._namespaces
    123         elif name == feature_string_interning:
    124             return self._interning is not None
    125         elif name in (feature_validation, feature_external_pes,
    126                       feature_namespace_prefixes):
    127             return 0
    128         elif name == feature_external_ges:
    129             return self._external_ges
    130         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
    131 
    132     def setFeature(self, name, state):
    133         if self._parsing:
    134             raise SAXNotSupportedException("Cannot set features while parsing")
    135 
    136         if name == feature_namespaces:
    137             self._namespaces = state
    138         elif name == feature_external_ges:
    139             self._external_ges = state
    140         elif name == feature_string_interning:
    141             if state:
    142                 if self._interning is None:
    143                     self._interning = {}
    144             else:
    145                 self._interning = None
    146         elif name == feature_validation:
    147             if state:
    148                 raise SAXNotSupportedException(
    149                     "expat does not support validation")
    150         elif name == feature_external_pes:
    151             if state:
    152                 raise SAXNotSupportedException(
    153                     "expat does not read external parameter entities")
    154         elif name == feature_namespace_prefixes:
    155             if state:
    156                 raise SAXNotSupportedException(
    157                     "expat does not report namespace prefixes")
    158         else:
    159             raise SAXNotRecognizedException(
    160                 "Feature '%s' not recognized" % name)
    161 
    162     def getProperty(self, name):
    163         if name == handler.property_lexical_handler:
    164             return self._lex_handler_prop
    165         elif name == property_interning_dict:
    166             return self._interning
    167         elif name == property_xml_string:
    168             if self._parser:
    169                 if hasattr(self._parser, "GetInputContext"):
    170                     return self._parser.GetInputContext()
    171                 else:
    172                     raise SAXNotRecognizedException(
    173                         "This version of expat does not support getting"
    174                         " the XML string")
    175             else:
    176                 raise SAXNotSupportedException(
    177                     "XML string cannot be returned when not parsing")
    178         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
    179 
    180     def setProperty(self, name, value):
    181         if name == handler.property_lexical_handler:
    182             self._lex_handler_prop = value
    183             if self._parsing:
    184                 self._reset_lex_handler_prop()
    185         elif name == property_interning_dict:
    186             self._interning = value
    187         elif name == property_xml_string:
    188             raise SAXNotSupportedException("Property '%s' cannot be set" %
    189                                            name)
    190         else:
    191             raise SAXNotRecognizedException("Property '%s' not recognized" %
    192                                             name)
    193 
    194     # IncrementalParser methods

    195 
    196     def feed(self, data, isFinal = 0):
    197         if not self._parsing:
    198             self.reset()
    199             self._parsing = 1
    200             self._cont_handler.startDocument()
    201 
    202         try:
    203             # The isFinal parameter is internal to the expat reader.

    204             # If it is set to true, expat will check validity of the entire

    205             # document. When feeding chunks, they are not normally final -

    206             # except when invoked from close.

    207             self._parser.Parse(data, isFinal)
    208         except expat.error, e:
    209             exc = SAXParseException(expat.ErrorString(e.code), e, self)
    210             # FIXME: when to invoke error()?

    211             self._err_handler.fatalError(exc)
    212 
    213     def close(self):
    214         if self._entity_stack:
    215             # If we are completing an external entity, do nothing here

    216             return
    217         self.feed("", isFinal = 1)
    218         self._cont_handler.endDocument()
    219         self._parsing = 0
    220         # break cycle created by expat handlers pointing to our methods

    221         self._parser = None
    222 
    223     def _reset_cont_handler(self):
    224         self._parser.ProcessingInstructionHandler = \
    225                                     self._cont_handler.processingInstruction
    226         self._parser.CharacterDataHandler = self._cont_handler.characters
    227 
    228     def _reset_lex_handler_prop(self):
    229         lex = self._lex_handler_prop
    230         parser = self._parser
    231         if lex is None:
    232             parser.CommentHandler = None
    233             parser.StartCdataSectionHandler = None
    234             parser.EndCdataSectionHandler = None
    235             parser.StartDoctypeDeclHandler = None
    236             parser.EndDoctypeDeclHandler = None
    237         else:
    238             parser.CommentHandler = lex.comment
    239             parser.StartCdataSectionHandler = lex.startCDATA
    240             parser.EndCdataSectionHandler = lex.endCDATA
    241             parser.StartDoctypeDeclHandler = self.start_doctype_decl
    242             parser.EndDoctypeDeclHandler = lex.endDTD
    243 
    244     def reset(self):
    245         if self._namespaces:
    246             self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
    247                                               intern=self._interning)
    248             self._parser.namespace_prefixes = 1
    249             self._parser.StartElementHandler = self.start_element_ns
    250             self._parser.EndElementHandler = self.end_element_ns
    251         else:
    252             self._parser = expat.ParserCreate(self._source.getEncoding(),
    253                                               intern = self._interning)
    254             self._parser.StartElementHandler = self.start_element
    255             self._parser.EndElementHandler = self.end_element
    256 
    257         self._reset_cont_handler()
    258         self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
    259         self._parser.NotationDeclHandler = self.notation_decl
    260         self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
    261         self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
    262 
    263         self._decl_handler_prop = None
    264         if self._lex_handler_prop:
    265             self._reset_lex_handler_prop()
    266 #         self._parser.DefaultHandler =

    267 #         self._parser.DefaultHandlerExpand =

    268 #         self._parser.NotStandaloneHandler =

    269         self._parser.ExternalEntityRefHandler = self.external_entity_ref
    270         try:
    271             self._parser.SkippedEntityHandler = self.skipped_entity_handler
    272         except AttributeError:
    273             # This pyexpat does not support SkippedEntity

    274             pass
    275         self._parser.SetParamEntityParsing(
    276             expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
    277 
    278         self._parsing = 0
    279         self._entity_stack = []
    280 
    281     # Locator methods

    282 
    283     def getColumnNumber(self):
    284         if self._parser is None:
    285             return None
    286         return self._parser.ErrorColumnNumber
    287 
    288     def getLineNumber(self):
    289         if self._parser is None:
    290             return 1
    291         return self._parser.ErrorLineNumber
    292 
    293     def getPublicId(self):
    294         return self._source.getPublicId()
    295 
    296     def getSystemId(self):
    297         return self._source.getSystemId()
    298 
    299     # event handlers

    300     def start_element(self, name, attrs):
    301         self._cont_handler.startElement(name, AttributesImpl(attrs))
    302 
    303     def end_element(self, name):
    304         self._cont_handler.endElement(name)
    305 
    306     def start_element_ns(self, name, attrs):
    307         pair = name.split()
    308         if len(pair) == 1:
    309             # no namespace

    310             pair = (None, name)
    311         elif len(pair) == 3:
    312             pair = pair[0], pair[1]
    313         else:
    314             # default namespace

    315             pair = tuple(pair)
    316 
    317         newattrs = {}
    318         qnames = {}
    319         for (aname, value) in attrs.items():
    320             parts = aname.split()
    321             length = len(parts)
    322             if length == 1:
    323                 # no namespace

    324                 qname = aname
    325                 apair = (None, aname)
    326             elif length == 3:
    327                 qname = "%s:%s" % (parts[2], parts[1])
    328                 apair = parts[0], parts[1]
    329             else:
    330                 # default namespace

    331                 qname = parts[1]
    332                 apair = tuple(parts)
    333 
    334             newattrs[apair] = value
    335             qnames[apair] = qname
    336 
    337         self._cont_handler.startElementNS(pair, None,
    338                                           AttributesNSImpl(newattrs, qnames))
    339 
    340     def end_element_ns(self, name):
    341         pair = name.split()
    342         if len(pair) == 1:
    343             pair = (None, name)
    344         elif len(pair) == 3:
    345             pair = pair[0], pair[1]
    346         else:
    347             pair = tuple(pair)
    348 
    349         self._cont_handler.endElementNS(pair, None)
    350 
    351     # this is not used (call directly to ContentHandler)

    352     def processing_instruction(self, target, data):
    353         self._cont_handler.processingInstruction(target, data)
    354 
    355     # this is not used (call directly to ContentHandler)

    356     def character_data(self, data):
    357         self._cont_handler.characters(data)
    358 
    359     def start_namespace_decl(self, prefix, uri):
    360         self._cont_handler.startPrefixMapping(prefix, uri)
    361 
    362     def end_namespace_decl(self, prefix):
    363         self._cont_handler.endPrefixMapping(prefix)
    364 
    365     def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
    366         self._lex_handler_prop.startDTD(name, pubid, sysid)
    367 
    368     def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
    369         self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
    370 
    371     def notation_decl(self, name, base, sysid, pubid):
    372         self._dtd_handler.notationDecl(name, pubid, sysid)
    373 
    374     def external_entity_ref(self, context, base, sysid, pubid):
    375         if not self._external_ges:
    376             return 1
    377 
    378         source = self._ent_handler.resolveEntity(pubid, sysid)
    379         source = saxutils.prepare_input_source(source,
    380                                                self._source.getSystemId() or
    381                                                "")
    382 
    383         self._entity_stack.append((self._parser, self._source))
    384         self._parser = self._parser.ExternalEntityParserCreate(context)
    385         self._source = source
    386 
    387         try:
    388             xmlreader.IncrementalParser.parse(self, source)
    389         except:
    390             return 0  # FIXME: save error info here?

    391 
    392         (self._parser, self._source) = self._entity_stack[-1]
    393         del self._entity_stack[-1]
    394         return 1
    395 
    396     def skipped_entity_handler(self, name, is_pe):
    397         if is_pe:
    398             # The SAX spec requires to report skipped PEs with a '%'

    399             name = '%'+name
    400         self._cont_handler.skippedEntity(name)
    401 
    402 # ---

    403 
    404 def create_parser(*args, **kwargs):
    405     return ExpatParser(*args, **kwargs)
    406 
    407 # ---

    408 
    409 if __name__ == "__main__":
    410     import xml.sax.saxutils
    411     p = create_parser()
    412     p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    413     p.setErrorHandler(xml.sax.ErrorHandler())
    414     p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
    415