Home | History | Annotate | Download | only in python
      1 # -*- coding: iso-8859-1 -*-
      2 """ A SAX2 driver for libxml2, on top of its XmlReader API
      3 
      4 USAGE
      5     # put this file (drv_libxml2.py) in PYTHONPATH
      6     import xml.sax
      7     reader = xml.sax.make_parser(["drv_libxml2"])
      8     # ...and the rest is standard python sax.
      9 
     10 CAVEATS
     11     - Lexical handlers are supported, except for start/endEntity
     12       (waiting for XmlReader.ResolveEntity) and start/endDTD
     13     - Error callbacks are not exactly synchronous, they tend
     14       to be invoked before the corresponding content callback,
     15       because the underlying reader interface parses
     16       data by chunks of 512 bytes
     17     
     18 TODO
     19     - search for TODO
     20     - some ErrorHandler events (warning)
     21     - some ContentHandler events (setDocumentLocator, skippedEntity)
     22     - EntityResolver (using libxml2.?)
     23     - DTDHandler (if/when libxml2 exposes such node types)
     24     - DeclHandler (if/when libxml2 exposes such node types)
     25     - property_xml_string?
     26     - feature_string_interning?
     27     - Incremental parser
     28     - additional performance tuning:
     29       - one might cache callbacks to avoid some name lookups
     30       - one might implement a smarter way to pass attributes to startElement
     31         (some kind of lazy evaluation?)
     32       - there might be room for improvement in start/endPrefixMapping
     33       - other?
     34 
     35 """
     36 
     37 __author__  = "Stphane Bidoul <sbi (at] skynet.be>"
     38 __version__ = "0.3"
     39 
     40 import sys
     41 import codecs
     42 
     43 if sys.version_info[0] < 3:
     44     __author__  = codecs.unicode_escape_decode(__author__)[0]
     45 
     46     StringTypes = (str, unicode)
     47 else:
     48     StringTypes = str
     49 
     50 from xml.sax._exceptions import *
     51 from xml.sax import xmlreader, saxutils
     52 from xml.sax.handler import \
     53      feature_namespaces, \
     54      feature_namespace_prefixes, \
     55      feature_string_interning, \
     56      feature_validation, \
     57      feature_external_ges, \
     58      feature_external_pes, \
     59      property_lexical_handler, \
     60      property_declaration_handler, \
     61      property_dom_node, \
     62      property_xml_string
     63 
     64 # libxml2 returns strings as UTF8
     65 _decoder = codecs.lookup("utf8")[1]
     66 def _d(s):
     67     if s is None:
     68         return s
     69     else:
     70         return _decoder(s)[0]
     71 
     72 try:
     73     import libxml2
     74 except ImportError:
     75     raise SAXReaderNotAvailable("libxml2 not available: " \
     76                                 "import error was: %s" % sys.exc_info()[1])
     77 
     78 class Locator(xmlreader.Locator):
     79     """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
     80 
     81     def __init__(self,locator):
     82         self.__locator = locator
     83 
     84     def getColumnNumber(self):
     85         "Return the column number where the current event ends."
     86         return -1
     87 
     88     def getLineNumber(self):
     89         "Return the line number where the current event ends."
     90         return self.__locator.LineNumber()
     91 
     92     def getPublicId(self):
     93         "Return the public identifier for the current event."
     94         return None
     95 
     96     def getSystemId(self):
     97         "Return the system identifier for the current event."
     98         return self.__locator.BaseURI()
     99 
    100 class LibXml2Reader(xmlreader.XMLReader):
    101 
    102     def __init__(self):
    103         xmlreader.XMLReader.__init__(self)
    104         # features
    105         self.__ns = 0
    106         self.__nspfx = 0
    107         self.__validate = 0
    108         self.__extparams = 1
    109         # parsing flag
    110         self.__parsing = 0
    111         # additional handlers
    112         self.__lex_handler = None
    113         self.__decl_handler = None
    114         # error messages accumulator
    115         self.__errors = None
    116 
    117     def _errorHandler(self,arg,msg,severity,locator):
    118         if self.__errors is None:
    119             self.__errors = []
    120         self.__errors.append((severity,
    121                               SAXParseException(msg,None,
    122                                                 Locator(locator))))
    123 
    124     def _reportErrors(self,fatal):
    125         for severity,exception in self.__errors:
    126             if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
    127                             libxml2.PARSER_SEVERITY_WARNING):
    128                 self._err_handler.warning(exception)
    129             else:
    130                 # when fatal is set, the parse will stop;
    131                 # we consider that the last error reported
    132                 # is the fatal one.
    133                 if fatal and exception is self.__errors[-1][1]:
    134                     self._err_handler.fatalError(exception)
    135                 else:
    136                     self._err_handler.error(exception)
    137         self.__errors = None
    138 
    139     def parse(self, source):
    140         self.__parsing = 1
    141         try:
    142             # prepare source and create reader
    143             if isinstance(source, StringTypes):
    144                 reader = libxml2.newTextReaderFilename(source)
    145             else:
    146                 source = saxutils.prepare_input_source(source)
    147                 input = libxml2.inputBuffer(source.getByteStream())
    148                 reader = input.newTextReader(source.getSystemId())
    149             reader.SetErrorHandler(self._errorHandler,None)
    150             # configure reader
    151             if self.__extparams:
    152                 reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
    153                 reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
    154                 reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
    155                 reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
    156             else:
    157                 reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
    158             # we reuse attribute maps (for a slight performance gain)
    159             if self.__ns:
    160                 attributesNSImpl = xmlreader.AttributesNSImpl({},{})
    161             else:
    162                 attributesImpl = xmlreader.AttributesImpl({})
    163             # prefixes to pop (for endPrefixMapping)
    164             prefixes = []
    165             # start loop
    166             self._cont_handler.startDocument()
    167             while 1:
    168                 r = reader.Read()
    169                 # check for errors
    170                 if r == 1:
    171                     if not self.__errors is None:
    172                         self._reportErrors(0)
    173                 elif r == 0:
    174                     if not self.__errors is None:
    175                         self._reportErrors(0)
    176                     break # end of parse
    177                 else:
    178                     if not self.__errors is None:
    179                         self._reportErrors(1)
    180                     else:
    181                         self._err_handler.fatalError(\
    182                             SAXException("Read failed (no details available)"))
    183                     break # fatal parse error
    184                 # get node type
    185                 nodeType = reader.NodeType()
    186                 # Element
    187                 if nodeType == 1: 
    188                     if self.__ns:
    189                         eltName = (_d(reader.NamespaceUri()),\
    190                                    _d(reader.LocalName()))
    191                         eltQName = _d(reader.Name())
    192                         attributesNSImpl._attrs = attrs = {}
    193                         attributesNSImpl._qnames = qnames = {}
    194                         newPrefixes = []
    195                         while reader.MoveToNextAttribute():
    196                             qname = _d(reader.Name())
    197                             value = _d(reader.Value())
    198                             if qname.startswith("xmlns"):
    199                                 if len(qname) > 5:
    200                                     newPrefix = qname[6:]
    201                                 else:
    202                                     newPrefix = None
    203                                 newPrefixes.append(newPrefix)
    204                                 self._cont_handler.startPrefixMapping(\
    205                                     newPrefix,value)
    206                                 if not self.__nspfx:
    207                                     continue # don't report xmlns attribute
    208                             attName = (_d(reader.NamespaceUri()),
    209                                        _d(reader.LocalName()))
    210                             qnames[attName] = qname
    211                             attrs[attName] = value
    212                         reader.MoveToElement()
    213                         self._cont_handler.startElementNS( \
    214                             eltName,eltQName,attributesNSImpl) 
    215                         if reader.IsEmptyElement():
    216                             self._cont_handler.endElementNS(eltName,eltQName)
    217                             for newPrefix in newPrefixes:
    218                                 self._cont_handler.endPrefixMapping(newPrefix)
    219                         else:
    220                             prefixes.append(newPrefixes)
    221                     else:
    222                         eltName = _d(reader.Name())
    223                         attributesImpl._attrs = attrs = {}
    224                         while reader.MoveToNextAttribute():
    225                             attName = _d(reader.Name())
    226                             attrs[attName] = _d(reader.Value())
    227                         reader.MoveToElement()
    228                         self._cont_handler.startElement( \
    229                             eltName,attributesImpl)
    230                         if reader.IsEmptyElement():
    231                             self._cont_handler.endElement(eltName)
    232                 # EndElement
    233                 elif nodeType == 15: 
    234                     if self.__ns:
    235                         self._cont_handler.endElementNS( \
    236                              (_d(reader.NamespaceUri()),_d(reader.LocalName())),
    237                              _d(reader.Name()))
    238                         for prefix in prefixes.pop():
    239                             self._cont_handler.endPrefixMapping(prefix)
    240                     else:
    241                         self._cont_handler.endElement(_d(reader.Name()))
    242                 # Text
    243                 elif nodeType == 3: 
    244                     self._cont_handler.characters(_d(reader.Value()))
    245                 # Whitespace
    246                 elif nodeType == 13: 
    247                     self._cont_handler.ignorableWhitespace(_d(reader.Value()))
    248                 # SignificantWhitespace
    249                 elif nodeType == 14:
    250                     self._cont_handler.characters(_d(reader.Value()))
    251                 # CDATA
    252                 elif nodeType == 4:
    253                     if not self.__lex_handler is None:
    254                         self.__lex_handler.startCDATA()
    255                     self._cont_handler.characters(_d(reader.Value()))
    256                     if not self.__lex_handler is None:
    257                         self.__lex_handler.endCDATA()
    258                 # EntityReference
    259                 elif nodeType == 5:
    260                     if not self.__lex_handler is None:
    261                         self.startEntity(_d(reader.Name()))
    262                     reader.ResolveEntity()
    263                 # EndEntity
    264                 elif nodeType == 16:
    265                     if not self.__lex_handler is None:
    266                         self.endEntity(_d(reader.Name()))
    267                 # ProcessingInstruction
    268                 elif nodeType == 7: 
    269                     self._cont_handler.processingInstruction( \
    270                         _d(reader.Name()),_d(reader.Value()))
    271                 # Comment
    272                 elif nodeType == 8:
    273                     if not self.__lex_handler is None:
    274                         self.__lex_handler.comment(_d(reader.Value()))
    275                 # DocumentType
    276                 elif nodeType == 10:
    277                     #if not self.__lex_handler is None:
    278                     #    self.__lex_handler.startDTD()
    279                     pass # TODO (how to detect endDTD? on first non-dtd event?)
    280                 # XmlDeclaration
    281                 elif nodeType == 17:
    282                     pass # TODO
    283                 # Entity
    284                 elif nodeType == 6:
    285                     pass # TODO (entity decl)
    286                 # Notation (decl)
    287                 elif nodeType == 12:
    288                     pass # TODO
    289                 # Attribute (never in this loop)
    290                 #elif nodeType == 2: 
    291                 #    pass
    292                 # Document (not exposed)
    293                 #elif nodeType == 9: 
    294                 #    pass
    295                 # DocumentFragment (never returned by XmlReader)
    296                 #elif nodeType == 11:
    297                 #    pass
    298                 # None
    299                 #elif nodeType == 0:
    300                 #    pass
    301                 # -
    302                 else:
    303                     raise SAXException("Unexpected node type %d" % nodeType)
    304             if r == 0:
    305                 self._cont_handler.endDocument()
    306             reader.Close()
    307         finally:
    308             self.__parsing = 0
    309 
    310     def setDTDHandler(self, handler):
    311         # TODO (when supported, the inherited method works just fine)
    312         raise SAXNotSupportedException("DTDHandler not supported")
    313 
    314     def setEntityResolver(self, resolver):
    315         # TODO (when supported, the inherited method works just fine)
    316         raise SAXNotSupportedException("EntityResolver not supported")
    317 
    318     def getFeature(self, name):
    319         if name == feature_namespaces:
    320             return self.__ns
    321         elif name == feature_namespace_prefixes:
    322             return self.__nspfx
    323         elif name == feature_validation:
    324             return self.__validate
    325         elif name == feature_external_ges:
    326             return 1 # TODO (does that relate to PARSER_LOADDTD)?
    327         elif name == feature_external_pes:
    328             return self.__extparams
    329         else:
    330             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
    331                                             name)
    332 
    333     def setFeature(self, name, state):
    334         if self.__parsing:
    335             raise SAXNotSupportedException("Cannot set feature %s " \
    336                                            "while parsing" % name)
    337         if name == feature_namespaces:
    338             self.__ns = state
    339         elif name == feature_namespace_prefixes:
    340             self.__nspfx = state
    341         elif name == feature_validation:
    342             self.__validate = state
    343         elif name == feature_external_ges:
    344             if state == 0:
    345                 # TODO (does that relate to PARSER_LOADDTD)?
    346                 raise SAXNotSupportedException("Feature '%s' not supported" % \
    347                                                name)
    348         elif name == feature_external_pes:
    349             self.__extparams = state
    350         else:
    351             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
    352                                             name)
    353 
    354     def getProperty(self, name):
    355         if name == property_lexical_handler:
    356             return self.__lex_handler
    357         elif name == property_declaration_handler:
    358             return self.__decl_handler
    359         else:
    360             raise SAXNotRecognizedException("Property '%s' not recognized" % \
    361                                             name)
    362 
    363     def setProperty(self, name, value):     
    364         if name == property_lexical_handler:
    365             self.__lex_handler = value
    366         elif name == property_declaration_handler:
    367             # TODO: remove if/when libxml2 supports dtd events
    368             raise SAXNotSupportedException("Property '%s' not supported" % \
    369                                            name)
    370             self.__decl_handler = value
    371         else:
    372             raise SAXNotRecognizedException("Property '%s' not recognized" % \
    373                                             name)
    374 
    375 def create_parser():
    376     return LibXml2Reader()
    377 
    378