Home | History | Annotate | Download | only in sax
      1 """
      2 SAX driver for the pyexpat C module.  This driver works with
      3 pyexpat.__version__ == '2.22'.
      4 """
      5 
      6 version = "0.20"
      7 
      8 from xml.sax._exceptions import *
      9 from xml.sax.handler import feature_validation, feature_namespaces
     10 from xml.sax.handler import feature_namespace_prefixes
     11 from xml.sax.handler import feature_external_ges, feature_external_pes
     12 from xml.sax.handler import feature_string_interning
     13 from xml.sax.handler import property_xml_string, property_interning_dict
     14 
     15 # xml.parsers.expat does not raise ImportError in Jython
     16 import sys
     17 if sys.platform[:4] == "java":
     18     raise SAXReaderNotAvailable("expat not available in Java", None)
     19 del sys
     20 
     21 try:
     22     from xml.parsers import expat
     23 except ImportError:
     24     raise SAXReaderNotAvailable("expat not supported", None)
     25 else:
     26     if not hasattr(expat, "ParserCreate"):
     27         raise SAXReaderNotAvailable("expat not supported", None)
     28 from xml.sax import xmlreader, saxutils, handler
     29 
     30 AttributesImpl = xmlreader.AttributesImpl
     31 AttributesNSImpl = xmlreader.AttributesNSImpl
     32 
     33 # If we're using a sufficiently recent version of Python, we can use
     34 # weak references to avoid cycles between the parser and content
     35 # handler, otherwise we'll just have to pretend.
     36 try:
     37     import _weakref
     38 except ImportError:
     39     def _mkproxy(o):
     40         return o
     41 else:
     42     import weakref
     43     _mkproxy = weakref.proxy
     44     del weakref, _weakref
     45 
     46 class _ClosedParser:
     47     pass
     48 
     49 # --- ExpatLocator
     50 
     51 class ExpatLocator(xmlreader.Locator):
     52     """Locator for use with the ExpatParser class.
     53 
     54     This uses a weak reference to the parser object to avoid creating
     55     a circular reference between the parser and the content handler.
     56     """
     57     def __init__(self, parser):
     58         self._ref = _mkproxy(parser)
     59 
     60     def getColumnNumber(self):
     61         parser = self._ref
     62         if parser._parser is None:
     63             return None
     64         return parser._parser.ErrorColumnNumber
     65 
     66     def getLineNumber(self):
     67         parser = self._ref
     68         if parser._parser is None:
     69             return 1
     70         return parser._parser.ErrorLineNumber
     71 
     72     def getPublicId(self):
     73         parser = self._ref
     74         if parser is None:
     75             return None
     76         return parser._source.getPublicId()
     77 
     78     def getSystemId(self):
     79         parser = self._ref
     80         if parser is None:
     81             return None
     82         return parser._source.getSystemId()
     83 
     84 
     85 # --- ExpatParser
     86 
     87 class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
     88     """SAX driver for the pyexpat C module."""
     89 
     90     def __init__(self, namespaceHandling=0, bufsize=2**16-20):
     91         xmlreader.IncrementalParser.__init__(self, bufsize)
     92         self._source = xmlreader.InputSource()
     93         self._parser = None
     94         self._namespaces = namespaceHandling
     95         self._lex_handler_prop = None
     96         self._parsing = 0
     97         self._entity_stack = []
     98         self._external_ges = 1
     99         self._interning = None
    100 
    101     # XMLReader methods
    102 
    103     def parse(self, source):
    104         "Parse an XML document from a URL or an InputSource."
    105         source = saxutils.prepare_input_source(source)
    106 
    107         self._source = source
    108         self.reset()
    109         self._cont_handler.setDocumentLocator(ExpatLocator(self))
    110         xmlreader.IncrementalParser.parse(self, source)
    111 
    112     def prepareParser(self, source):
    113         if source.getSystemId() is not None:
    114             self._parser.SetBase(source.getSystemId())
    115 
    116     # Redefined setContentHandler to allow changing handlers during parsing
    117 
    118     def setContentHandler(self, handler):
    119         xmlreader.IncrementalParser.setContentHandler(self, handler)
    120         if self._parsing:
    121             self._reset_cont_handler()
    122 
    123     def getFeature(self, name):
    124         if name == feature_namespaces:
    125             return self._namespaces
    126         elif name == feature_string_interning:
    127             return self._interning is not None
    128         elif name in (feature_validation, feature_external_pes,
    129                       feature_namespace_prefixes):
    130             return 0
    131         elif name == feature_external_ges:
    132             return self._external_ges
    133         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
    134 
    135     def setFeature(self, name, state):
    136         if self._parsing:
    137             raise SAXNotSupportedException("Cannot set features while parsing")
    138 
    139         if name == feature_namespaces:
    140             self._namespaces = state
    141         elif name == feature_external_ges:
    142             self._external_ges = state
    143         elif name == feature_string_interning:
    144             if state:
    145                 if self._interning is None:
    146                     self._interning = {}
    147             else:
    148                 self._interning = None
    149         elif name == feature_validation:
    150             if state:
    151                 raise SAXNotSupportedException(
    152                     "expat does not support validation")
    153         elif name == feature_external_pes:
    154             if state:
    155                 raise SAXNotSupportedException(
    156                     "expat does not read external parameter entities")
    157         elif name == feature_namespace_prefixes:
    158             if state:
    159                 raise SAXNotSupportedException(
    160                     "expat does not report namespace prefixes")
    161         else:
    162             raise SAXNotRecognizedException(
    163                 "Feature '%s' not recognized" % name)
    164 
    165     def getProperty(self, name):
    166         if name == handler.property_lexical_handler:
    167             return self._lex_handler_prop
    168         elif name == property_interning_dict:
    169             return self._interning
    170         elif name == property_xml_string:
    171             if self._parser:
    172                 if hasattr(self._parser, "GetInputContext"):
    173                     return self._parser.GetInputContext()
    174                 else:
    175                     raise SAXNotRecognizedException(
    176                         "This version of expat does not support getting"
    177                         " the XML string")
    178             else:
    179                 raise SAXNotSupportedException(
    180                     "XML string cannot be returned when not parsing")
    181         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
    182 
    183     def setProperty(self, name, value):
    184         if name == handler.property_lexical_handler:
    185             self._lex_handler_prop = value
    186             if self._parsing:
    187                 self._reset_lex_handler_prop()
    188         elif name == property_interning_dict:
    189             self._interning = value
    190         elif name == property_xml_string:
    191             raise SAXNotSupportedException("Property '%s' cannot be set" %
    192                                            name)
    193         else:
    194             raise SAXNotRecognizedException("Property '%s' not recognized" %
    195                                             name)
    196 
    197     # IncrementalParser methods
    198 
    199     def feed(self, data, isFinal = 0):
    200         if not self._parsing:
    201             self.reset()
    202             self._parsing = 1
    203             self._cont_handler.startDocument()
    204 
    205         try:
    206             # The isFinal parameter is internal to the expat reader.
    207             # If it is set to true, expat will check validity of the entire
    208             # document. When feeding chunks, they are not normally final -
    209             # except when invoked from close.
    210             self._parser.Parse(data, isFinal)
    211         except expat.error as e:
    212             exc = SAXParseException(expat.ErrorString(e.code), e, self)
    213             # FIXME: when to invoke error()?
    214             self._err_handler.fatalError(exc)
    215 
    216     def close(self):
    217         if (self._entity_stack or self._parser is None or
    218             isinstance(self._parser, _ClosedParser)):
    219             # If we are completing an external entity, do nothing here
    220             return
    221         try:
    222             self.feed("", isFinal = 1)
    223             self._cont_handler.endDocument()
    224             self._parsing = 0
    225             # break cycle created by expat handlers pointing to our methods
    226             self._parser = None
    227         finally:
    228             self._parsing = 0
    229             if self._parser is not None:
    230                 # Keep ErrorColumnNumber and ErrorLineNumber after closing.
    231                 parser = _ClosedParser()
    232                 parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
    233                 parser.ErrorLineNumber = self._parser.ErrorLineNumber
    234                 self._parser = parser
    235             try:
    236                 file = self._source.getCharacterStream()
    237                 if file is not None:
    238                     file.close()
    239             finally:
    240                 file = self._source.getByteStream()
    241                 if file is not None:
    242                     file.close()
    243 
    244     def _reset_cont_handler(self):
    245         self._parser.ProcessingInstructionHandler = \
    246                                     self._cont_handler.processingInstruction
    247         self._parser.CharacterDataHandler = self._cont_handler.characters
    248 
    249     def _reset_lex_handler_prop(self):
    250         lex = self._lex_handler_prop
    251         parser = self._parser
    252         if lex is None:
    253             parser.CommentHandler = None
    254             parser.StartCdataSectionHandler = None
    255             parser.EndCdataSectionHandler = None
    256             parser.StartDoctypeDeclHandler = None
    257             parser.EndDoctypeDeclHandler = None
    258         else:
    259             parser.CommentHandler = lex.comment
    260             parser.StartCdataSectionHandler = lex.startCDATA
    261             parser.EndCdataSectionHandler = lex.endCDATA
    262             parser.StartDoctypeDeclHandler = self.start_doctype_decl
    263             parser.EndDoctypeDeclHandler = lex.endDTD
    264 
    265     def reset(self):
    266         if self._namespaces:
    267             self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
    268                                               intern=self._interning)
    269             self._parser.namespace_prefixes = 1
    270             self._parser.StartElementHandler = self.start_element_ns
    271             self._parser.EndElementHandler = self.end_element_ns
    272         else:
    273             self._parser = expat.ParserCreate(self._source.getEncoding(),
    274                                               intern = self._interning)
    275             self._parser.StartElementHandler = self.start_element
    276             self._parser.EndElementHandler = self.end_element
    277 
    278         self._reset_cont_handler()
    279         self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
    280         self._parser.NotationDeclHandler = self.notation_decl
    281         self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
    282         self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
    283 
    284         self._decl_handler_prop = None
    285         if self._lex_handler_prop:
    286             self._reset_lex_handler_prop()
    287 #         self._parser.DefaultHandler =
    288 #         self._parser.DefaultHandlerExpand =
    289 #         self._parser.NotStandaloneHandler =
    290         self._parser.ExternalEntityRefHandler = self.external_entity_ref
    291         try:
    292             self._parser.SkippedEntityHandler = self.skipped_entity_handler
    293         except AttributeError:
    294             # This pyexpat does not support SkippedEntity
    295             pass
    296         self._parser.SetParamEntityParsing(
    297             expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
    298 
    299         self._parsing = 0
    300         self._entity_stack = []
    301 
    302     # Locator methods
    303 
    304     def getColumnNumber(self):
    305         if self._parser is None:
    306             return None
    307         return self._parser.ErrorColumnNumber
    308 
    309     def getLineNumber(self):
    310         if self._parser is None:
    311             return 1
    312         return self._parser.ErrorLineNumber
    313 
    314     def getPublicId(self):
    315         return self._source.getPublicId()
    316 
    317     def getSystemId(self):
    318         return self._source.getSystemId()
    319 
    320     # event handlers
    321     def start_element(self, name, attrs):
    322         self._cont_handler.startElement(name, AttributesImpl(attrs))
    323 
    324     def end_element(self, name):
    325         self._cont_handler.endElement(name)
    326 
    327     def start_element_ns(self, name, attrs):
    328         pair = name.split()
    329         if len(pair) == 1:
    330             # no namespace
    331             pair = (None, name)
    332         elif len(pair) == 3:
    333             pair = pair[0], pair[1]
    334         else:
    335             # default namespace
    336             pair = tuple(pair)
    337 
    338         newattrs = {}
    339         qnames = {}
    340         for (aname, value) in attrs.items():
    341             parts = aname.split()
    342             length = len(parts)
    343             if length == 1:
    344                 # no namespace
    345                 qname = aname
    346                 apair = (None, aname)
    347             elif length == 3:
    348                 qname = "%s:%s" % (parts[2], parts[1])
    349                 apair = parts[0], parts[1]
    350             else:
    351                 # default namespace
    352                 qname = parts[1]
    353                 apair = tuple(parts)
    354 
    355             newattrs[apair] = value
    356             qnames[apair] = qname
    357 
    358         self._cont_handler.startElementNS(pair, None,
    359                                           AttributesNSImpl(newattrs, qnames))
    360 
    361     def end_element_ns(self, name):
    362         pair = name.split()
    363         if len(pair) == 1:
    364             pair = (None, name)
    365         elif len(pair) == 3:
    366             pair = pair[0], pair[1]
    367         else:
    368             pair = tuple(pair)
    369 
    370         self._cont_handler.endElementNS(pair, None)
    371 
    372     # this is not used (call directly to ContentHandler)
    373     def processing_instruction(self, target, data):
    374         self._cont_handler.processingInstruction(target, data)
    375 
    376     # this is not used (call directly to ContentHandler)
    377     def character_data(self, data):
    378         self._cont_handler.characters(data)
    379 
    380     def start_namespace_decl(self, prefix, uri):
    381         self._cont_handler.startPrefixMapping(prefix, uri)
    382 
    383     def end_namespace_decl(self, prefix):
    384         self._cont_handler.endPrefixMapping(prefix)
    385 
    386     def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
    387         self._lex_handler_prop.startDTD(name, pubid, sysid)
    388 
    389     def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
    390         self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
    391 
    392     def notation_decl(self, name, base, sysid, pubid):
    393         self._dtd_handler.notationDecl(name, pubid, sysid)
    394 
    395     def external_entity_ref(self, context, base, sysid, pubid):
    396         if not self._external_ges:
    397             return 1
    398 
    399         source = self._ent_handler.resolveEntity(pubid, sysid)
    400         source = saxutils.prepare_input_source(source,
    401                                                self._source.getSystemId() or
    402                                                "")
    403 
    404         self._entity_stack.append((self._parser, self._source))
    405         self._parser = self._parser.ExternalEntityParserCreate(context)
    406         self._source = source
    407 
    408         try:
    409             xmlreader.IncrementalParser.parse(self, source)
    410         except:
    411             return 0  # FIXME: save error info here?
    412 
    413         (self._parser, self._source) = self._entity_stack[-1]
    414         del self._entity_stack[-1]
    415         return 1
    416 
    417     def skipped_entity_handler(self, name, is_pe):
    418         if is_pe:
    419             # The SAX spec requires to report skipped PEs with a '%'
    420             name = '%'+name
    421         self._cont_handler.skippedEntity(name)
    422 
    423 # ---
    424 
    425 def create_parser(*args, **kwargs):
    426     return ExpatParser(*args, **kwargs)
    427 
    428 # ---
    429 
    430 if __name__ == "__main__":
    431     import xml.sax.saxutils
    432     p = create_parser()
    433     p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    434     p.setErrorHandler(xml.sax.ErrorHandler())
    435     p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
    436