Home | History | Annotate | Download | only in sax
      1 """
      2 SAX driver for the pyexpat C module.  This driver works with
      3 pyexpat.__version__ == '2.22'.
      4 """
      5 
      6 version = "0.20"
      7 
      8 from xml.sax._exceptions import *
      9 from xml.sax.handler import feature_validation, feature_namespaces
     10 from xml.sax.handler import feature_namespace_prefixes
     11 from xml.sax.handler import feature_external_ges, feature_external_pes
     12 from xml.sax.handler import feature_string_interning
     13 from xml.sax.handler import property_xml_string, property_interning_dict
     14 
     15 # xml.parsers.expat does not raise ImportError in Jython
     16 import sys
     17 if sys.platform[:4] == "java":
     18     raise SAXReaderNotAvailable("expat not available in Java", None)
     19 del sys
     20 
     21 try:
     22     from xml.parsers import expat
     23 except ImportError:
     24     raise SAXReaderNotAvailable("expat not supported", None)
     25 else:
     26     if not hasattr(expat, "ParserCreate"):
     27         raise SAXReaderNotAvailable("expat not supported", None)
     28 from xml.sax import xmlreader, saxutils, handler
     29 
     30 AttributesImpl = xmlreader.AttributesImpl
     31 AttributesNSImpl = xmlreader.AttributesNSImpl
     32 
     33 # If we're using a sufficiently recent version of Python, we can use
     34 # weak references to avoid cycles between the parser and content
     35 # handler, otherwise we'll just have to pretend.
     36 try:
     37     import _weakref
     38 except ImportError:
     39     def _mkproxy(o):
     40         return o
     41 else:
     42     import weakref
     43     _mkproxy = weakref.proxy
     44     del weakref, _weakref
     45 
     46 class _ClosedParser:
     47     pass
     48 
     49 # --- ExpatLocator
     50 
     51 class ExpatLocator(xmlreader.Locator):
     52     """Locator for use with the ExpatParser class.
     53 
     54     This uses a weak reference to the parser object to avoid creating
     55     a circular reference between the parser and the content handler.
     56     """
     57     def __init__(self, parser):
     58         self._ref = _mkproxy(parser)
     59 
     60     def getColumnNumber(self):
     61         parser = self._ref
     62         if parser._parser is None:
     63             return None
     64         return parser._parser.ErrorColumnNumber
     65 
     66     def getLineNumber(self):
     67         parser = self._ref
     68         if parser._parser is None:
     69             return 1
     70         return parser._parser.ErrorLineNumber
     71 
     72     def getPublicId(self):
     73         parser = self._ref
     74         if parser is None:
     75             return None
     76         return parser._source.getPublicId()
     77 
     78     def getSystemId(self):
     79         parser = self._ref
     80         if parser is None:
     81             return None
     82         return parser._source.getSystemId()
     83 
     84 
     85 # --- ExpatParser
     86 
     87 class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
     88     """SAX driver for the pyexpat C module."""
     89 
     90     def __init__(self, namespaceHandling=0, bufsize=2**16-20):
     91         xmlreader.IncrementalParser.__init__(self, bufsize)
     92         self._source = xmlreader.InputSource()
     93         self._parser = None
     94         self._namespaces = namespaceHandling
     95         self._lex_handler_prop = None
     96         self._parsing = 0
     97         self._entity_stack = []
     98         self._external_ges = 1
     99         self._interning = None
    100 
    101     # XMLReader methods
    102 
    103     def parse(self, source):
    104         "Parse an XML document from a URL or an InputSource."
    105         source = saxutils.prepare_input_source(source)
    106 
    107         self._source = source
    108         self.reset()
    109         self._cont_handler.setDocumentLocator(ExpatLocator(self))
    110         xmlreader.IncrementalParser.parse(self, source)
    111 
    112     def prepareParser(self, source):
    113         if source.getSystemId() is not None:
    114             base = source.getSystemId()
    115             if isinstance(base, unicode):
    116                 base = base.encode('utf-8')
    117             self._parser.SetBase(base)
    118 
    119     # Redefined setContentHandler to allow changing handlers during parsing
    120 
    121     def setContentHandler(self, handler):
    122         xmlreader.IncrementalParser.setContentHandler(self, handler)
    123         if self._parsing:
    124             self._reset_cont_handler()
    125 
    126     def getFeature(self, name):
    127         if name == feature_namespaces:
    128             return self._namespaces
    129         elif name == feature_string_interning:
    130             return self._interning is not None
    131         elif name in (feature_validation, feature_external_pes,
    132                       feature_namespace_prefixes):
    133             return 0
    134         elif name == feature_external_ges:
    135             return self._external_ges
    136         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
    137 
    138     def setFeature(self, name, state):
    139         if self._parsing:
    140             raise SAXNotSupportedException("Cannot set features while parsing")
    141 
    142         if name == feature_namespaces:
    143             self._namespaces = state
    144         elif name == feature_external_ges:
    145             self._external_ges = state
    146         elif name == feature_string_interning:
    147             if state:
    148                 if self._interning is None:
    149                     self._interning = {}
    150             else:
    151                 self._interning = None
    152         elif name == feature_validation:
    153             if state:
    154                 raise SAXNotSupportedException(
    155                     "expat does not support validation")
    156         elif name == feature_external_pes:
    157             if state:
    158                 raise SAXNotSupportedException(
    159                     "expat does not read external parameter entities")
    160         elif name == feature_namespace_prefixes:
    161             if state:
    162                 raise SAXNotSupportedException(
    163                     "expat does not report namespace prefixes")
    164         else:
    165             raise SAXNotRecognizedException(
    166                 "Feature '%s' not recognized" % name)
    167 
    168     def getProperty(self, name):
    169         if name == handler.property_lexical_handler:
    170             return self._lex_handler_prop
    171         elif name == property_interning_dict:
    172             return self._interning
    173         elif name == property_xml_string:
    174             if self._parser:
    175                 if hasattr(self._parser, "GetInputContext"):
    176                     return self._parser.GetInputContext()
    177                 else:
    178                     raise SAXNotRecognizedException(
    179                         "This version of expat does not support getting"
    180                         " the XML string")
    181             else:
    182                 raise SAXNotSupportedException(
    183                     "XML string cannot be returned when not parsing")
    184         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
    185 
    186     def setProperty(self, name, value):
    187         if name == handler.property_lexical_handler:
    188             self._lex_handler_prop = value
    189             if self._parsing:
    190                 self._reset_lex_handler_prop()
    191         elif name == property_interning_dict:
    192             self._interning = value
    193         elif name == property_xml_string:
    194             raise SAXNotSupportedException("Property '%s' cannot be set" %
    195                                            name)
    196         else:
    197             raise SAXNotRecognizedException("Property '%s' not recognized" %
    198                                             name)
    199 
    200     # IncrementalParser methods
    201 
    202     def feed(self, data, isFinal = 0):
    203         if not self._parsing:
    204             self.reset()
    205             self._parsing = 1
    206             self._cont_handler.startDocument()
    207 
    208         try:
    209             # The isFinal parameter is internal to the expat reader.
    210             # If it is set to true, expat will check validity of the entire
    211             # document. When feeding chunks, they are not normally final -
    212             # except when invoked from close.
    213             self._parser.Parse(data, isFinal)
    214         except expat.error, e:
    215             exc = SAXParseException(expat.ErrorString(e.code), e, self)
    216             # FIXME: when to invoke error()?
    217             self._err_handler.fatalError(exc)
    218 
    219     def close(self):
    220         if (self._entity_stack or self._parser is None or
    221             isinstance(self._parser, _ClosedParser)):
    222             # If we are completing an external entity, do nothing here
    223             return
    224         try:
    225             self.feed("", isFinal = 1)
    226             self._cont_handler.endDocument()
    227             self._parsing = 0
    228             # break cycle created by expat handlers pointing to our methods
    229             self._parser = None
    230         finally:
    231             self._parsing = 0
    232             if self._parser is not None:
    233                 # Keep ErrorColumnNumber and ErrorLineNumber after closing.
    234                 parser = _ClosedParser()
    235                 parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
    236                 parser.ErrorLineNumber = self._parser.ErrorLineNumber
    237                 self._parser = parser
    238 
    239     def _reset_cont_handler(self):
    240         self._parser.ProcessingInstructionHandler = \
    241                                     self._cont_handler.processingInstruction
    242         self._parser.CharacterDataHandler = self._cont_handler.characters
    243 
    244     def _reset_lex_handler_prop(self):
    245         lex = self._lex_handler_prop
    246         parser = self._parser
    247         if lex is None:
    248             parser.CommentHandler = None
    249             parser.StartCdataSectionHandler = None
    250             parser.EndCdataSectionHandler = None
    251             parser.StartDoctypeDeclHandler = None
    252             parser.EndDoctypeDeclHandler = None
    253         else:
    254             parser.CommentHandler = lex.comment
    255             parser.StartCdataSectionHandler = lex.startCDATA
    256             parser.EndCdataSectionHandler = lex.endCDATA
    257             parser.StartDoctypeDeclHandler = self.start_doctype_decl
    258             parser.EndDoctypeDeclHandler = lex.endDTD
    259 
    260     def reset(self):
    261         if self._namespaces:
    262             self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
    263                                               intern=self._interning)
    264             self._parser.namespace_prefixes = 1
    265             self._parser.StartElementHandler = self.start_element_ns
    266             self._parser.EndElementHandler = self.end_element_ns
    267         else:
    268             self._parser = expat.ParserCreate(self._source.getEncoding(),
    269                                               intern = self._interning)
    270             self._parser.StartElementHandler = self.start_element
    271             self._parser.EndElementHandler = self.end_element
    272 
    273         self._reset_cont_handler()
    274         self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
    275         self._parser.NotationDeclHandler = self.notation_decl
    276         self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
    277         self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
    278 
    279         self._decl_handler_prop = None
    280         if self._lex_handler_prop:
    281             self._reset_lex_handler_prop()
    282 #         self._parser.DefaultHandler =
    283 #         self._parser.DefaultHandlerExpand =
    284 #         self._parser.NotStandaloneHandler =
    285         self._parser.ExternalEntityRefHandler = self.external_entity_ref
    286         try:
    287             self._parser.SkippedEntityHandler = self.skipped_entity_handler
    288         except AttributeError:
    289             # This pyexpat does not support SkippedEntity
    290             pass
    291         self._parser.SetParamEntityParsing(
    292             expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
    293 
    294         self._parsing = 0
    295         self._entity_stack = []
    296 
    297     # Locator methods
    298 
    299     def getColumnNumber(self):
    300         if self._parser is None:
    301             return None
    302         return self._parser.ErrorColumnNumber
    303 
    304     def getLineNumber(self):
    305         if self._parser is None:
    306             return 1
    307         return self._parser.ErrorLineNumber
    308 
    309     def getPublicId(self):
    310         return self._source.getPublicId()
    311 
    312     def getSystemId(self):
    313         return self._source.getSystemId()
    314 
    315     # event handlers
    316     def start_element(self, name, attrs):
    317         self._cont_handler.startElement(name, AttributesImpl(attrs))
    318 
    319     def end_element(self, name):
    320         self._cont_handler.endElement(name)
    321 
    322     def start_element_ns(self, name, attrs):
    323         pair = name.split()
    324         if len(pair) == 1:
    325             # no namespace
    326             pair = (None, name)
    327         elif len(pair) == 3:
    328             pair = pair[0], pair[1]
    329         else:
    330             # default namespace
    331             pair = tuple(pair)
    332 
    333         newattrs = {}
    334         qnames = {}
    335         for (aname, value) in attrs.items():
    336             parts = aname.split()
    337             length = len(parts)
    338             if length == 1:
    339                 # no namespace
    340                 qname = aname
    341                 apair = (None, aname)
    342             elif length == 3:
    343                 qname = "%s:%s" % (parts[2], parts[1])
    344                 apair = parts[0], parts[1]
    345             else:
    346                 # default namespace
    347                 qname = parts[1]
    348                 apair = tuple(parts)
    349 
    350             newattrs[apair] = value
    351             qnames[apair] = qname
    352 
    353         self._cont_handler.startElementNS(pair, None,
    354                                           AttributesNSImpl(newattrs, qnames))
    355 
    356     def end_element_ns(self, name):
    357         pair = name.split()
    358         if len(pair) == 1:
    359             pair = (None, name)
    360         elif len(pair) == 3:
    361             pair = pair[0], pair[1]
    362         else:
    363             pair = tuple(pair)
    364 
    365         self._cont_handler.endElementNS(pair, None)
    366 
    367     # this is not used (call directly to ContentHandler)
    368     def processing_instruction(self, target, data):
    369         self._cont_handler.processingInstruction(target, data)
    370 
    371     # this is not used (call directly to ContentHandler)
    372     def character_data(self, data):
    373         self._cont_handler.characters(data)
    374 
    375     def start_namespace_decl(self, prefix, uri):
    376         self._cont_handler.startPrefixMapping(prefix, uri)
    377 
    378     def end_namespace_decl(self, prefix):
    379         self._cont_handler.endPrefixMapping(prefix)
    380 
    381     def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
    382         self._lex_handler_prop.startDTD(name, pubid, sysid)
    383 
    384     def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
    385         self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
    386 
    387     def notation_decl(self, name, base, sysid, pubid):
    388         self._dtd_handler.notationDecl(name, pubid, sysid)
    389 
    390     def external_entity_ref(self, context, base, sysid, pubid):
    391         if not self._external_ges:
    392             return 1
    393 
    394         source = self._ent_handler.resolveEntity(pubid, sysid)
    395         source = saxutils.prepare_input_source(source,
    396                                                self._source.getSystemId() or
    397                                                "")
    398 
    399         self._entity_stack.append((self._parser, self._source))
    400         self._parser = self._parser.ExternalEntityParserCreate(context)
    401         self._source = source
    402 
    403         try:
    404             xmlreader.IncrementalParser.parse(self, source)
    405         except:
    406             return 0  # FIXME: save error info here?
    407 
    408         (self._parser, self._source) = self._entity_stack[-1]
    409         del self._entity_stack[-1]
    410         return 1
    411 
    412     def skipped_entity_handler(self, name, is_pe):
    413         if is_pe:
    414             # The SAX spec requires to report skipped PEs with a '%'
    415             name = '%'+name
    416         self._cont_handler.skippedEntity(name)
    417 
    418 # ---
    419 
    420 def create_parser(*args, **kwargs):
    421     return ExpatParser(*args, **kwargs)
    422 
    423 # ---
    424 
    425 if __name__ == "__main__":
    426     import xml.sax.saxutils
    427     p = create_parser()
    428     p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    429     p.setErrorHandler(xml.sax.ErrorHandler())
    430     p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
    431