Home | History | Annotate | Download | only in sax
      1 """An XML Reader is the SAX 2 name for an XML parser. XML Parsers
      2 should be based on this code. """
      3 
      4 import handler
      5 
      6 from _exceptions import SAXNotSupportedException, SAXNotRecognizedException
      7 
      8 
      9 # ===== XMLREADER =====
     10 
     11 class XMLReader:
     12     """Interface for reading an XML document using callbacks.
     13 
     14     XMLReader is the interface that an XML parser's SAX2 driver must
     15     implement. This interface allows an application to set and query
     16     features and properties in the parser, to register event handlers
     17     for document processing, and to initiate a document parse.
     18 
     19     All SAX interfaces are assumed to be synchronous: the parse
     20     methods must not return until parsing is complete, and readers
     21     must wait for an event-handler callback to return before reporting
     22     the next event."""
     23 
     24     def __init__(self):
     25         self._cont_handler = handler.ContentHandler()
     26         self._dtd_handler = handler.DTDHandler()
     27         self._ent_handler = handler.EntityResolver()
     28         self._err_handler = handler.ErrorHandler()
     29 
     30     def parse(self, source):
     31         "Parse an XML document from a system identifier or an InputSource."
     32         raise NotImplementedError("This method must be implemented!")
     33 
     34     def getContentHandler(self):
     35         "Returns the current ContentHandler."
     36         return self._cont_handler
     37 
     38     def setContentHandler(self, handler):
     39         "Registers a new object to receive document content events."
     40         self._cont_handler = handler
     41 
     42     def getDTDHandler(self):
     43         "Returns the current DTD handler."
     44         return self._dtd_handler
     45 
     46     def setDTDHandler(self, handler):
     47         "Register an object to receive basic DTD-related events."
     48         self._dtd_handler = handler
     49 
     50     def getEntityResolver(self):
     51         "Returns the current EntityResolver."
     52         return self._ent_handler
     53 
     54     def setEntityResolver(self, resolver):
     55         "Register an object to resolve external entities."
     56         self._ent_handler = resolver
     57 
     58     def getErrorHandler(self):
     59         "Returns the current ErrorHandler."
     60         return self._err_handler
     61 
     62     def setErrorHandler(self, handler):
     63         "Register an object to receive error-message events."
     64         self._err_handler = handler
     65 
     66     def setLocale(self, locale):
     67         """Allow an application to set the locale for errors and warnings.
     68 
     69         SAX parsers are not required to provide localization for errors
     70         and warnings; if they cannot support the requested locale,
     71         however, they must raise a SAX exception. Applications may
     72         request a locale change in the middle of a parse."""
     73         raise SAXNotSupportedException("Locale support not implemented")
     74 
     75     def getFeature(self, name):
     76         "Looks up and returns the state of a SAX2 feature."
     77         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
     78 
     79     def setFeature(self, name, state):
     80         "Sets the state of a SAX2 feature."
     81         raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
     82 
     83     def getProperty(self, name):
     84         "Looks up and returns the value of a SAX2 property."
     85         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
     86 
     87     def setProperty(self, name, value):
     88         "Sets the value of a SAX2 property."
     89         raise SAXNotRecognizedException("Property '%s' not recognized" % name)
     90 
     91 class IncrementalParser(XMLReader):
     92     """This interface adds three extra methods to the XMLReader
     93     interface that allow XML parsers to support incremental
     94     parsing. Support for this interface is optional, since not all
     95     underlying XML parsers support this functionality.
     96 
     97     When the parser is instantiated it is ready to begin accepting
     98     data from the feed method immediately. After parsing has been
     99     finished with a call to close the reset method must be called to
    100     make the parser ready to accept new data, either from feed or
    101     using the parse method.
    102 
    103     Note that these methods must _not_ be called during parsing, that
    104     is, after parse has been called and before it returns.
    105 
    106     By default, the class also implements the parse method of the XMLReader
    107     interface using the feed, close and reset methods of the
    108     IncrementalParser interface as a convenience to SAX 2.0 driver
    109     writers."""
    110 
    111     def __init__(self, bufsize=2**16):
    112         self._bufsize = bufsize
    113         XMLReader.__init__(self)
    114 
    115     def parse(self, source):
    116         import saxutils
    117         source = saxutils.prepare_input_source(source)
    118 
    119         self.prepareParser(source)
    120         file = source.getByteStream()
    121         buffer = file.read(self._bufsize)
    122         while buffer != "":
    123             self.feed(buffer)
    124             buffer = file.read(self._bufsize)
    125         self.close()
    126 
    127     def feed(self, data):
    128         """This method gives the raw XML data in the data parameter to
    129         the parser and makes it parse the data, emitting the
    130         corresponding events. It is allowed for XML constructs to be
    131         split across several calls to feed.
    132 
    133         feed may raise SAXException."""
    134         raise NotImplementedError("This method must be implemented!")
    135 
    136     def prepareParser(self, source):
    137         """This method is called by the parse implementation to allow
    138         the SAX 2.0 driver to prepare itself for parsing."""
    139         raise NotImplementedError("prepareParser must be overridden!")
    140 
    141     def close(self):
    142         """This method is called when the entire XML document has been
    143         passed to the parser through the feed method, to notify the
    144         parser that there are no more data. This allows the parser to
    145         do the final checks on the document and empty the internal
    146         data buffer.
    147 
    148         The parser will not be ready to parse another document until
    149         the reset method has been called.
    150 
    151         close may raise SAXException."""
    152         raise NotImplementedError("This method must be implemented!")
    153 
    154     def reset(self):
    155         """This method is called after close has been called to reset
    156         the parser so that it is ready to parse new documents. The
    157         results of calling parse or feed after close without calling
    158         reset are undefined."""
    159         raise NotImplementedError("This method must be implemented!")
    160 
    161 # ===== LOCATOR =====
    162 
    163 class Locator:
    164     """Interface for associating a SAX event with a document
    165     location. A locator object will return valid results only during
    166     calls to DocumentHandler methods; at any other time, the
    167     results are unpredictable."""
    168 
    169     def getColumnNumber(self):
    170         "Return the column number where the current event ends."
    171         return -1
    172 
    173     def getLineNumber(self):
    174         "Return the line number where the current event ends."
    175         return -1
    176 
    177     def getPublicId(self):
    178         "Return the public identifier for the current event."
    179         return None
    180 
    181     def getSystemId(self):
    182         "Return the system identifier for the current event."
    183         return None
    184 
    185 # ===== INPUTSOURCE =====
    186 
    187 class InputSource:
    188     """Encapsulation of the information needed by the XMLReader to
    189     read entities.
    190 
    191     This class may include information about the public identifier,
    192     system identifier, byte stream (possibly with character encoding
    193     information) and/or the character stream of an entity.
    194 
    195     Applications will create objects of this class for use in the
    196     XMLReader.parse method and for returning from
    197     EntityResolver.resolveEntity.
    198 
    199     An InputSource belongs to the application, the XMLReader is not
    200     allowed to modify InputSource objects passed to it from the
    201     application, although it may make copies and modify those."""
    202 
    203     def __init__(self, system_id = None):
    204         self.__system_id = system_id
    205         self.__public_id = None
    206         self.__encoding  = None
    207         self.__bytefile  = None
    208         self.__charfile  = None
    209 
    210     def setPublicId(self, public_id):
    211         "Sets the public identifier of this InputSource."
    212         self.__public_id = public_id
    213 
    214     def getPublicId(self):
    215         "Returns the public identifier of this InputSource."
    216         return self.__public_id
    217 
    218     def setSystemId(self, system_id):
    219         "Sets the system identifier of this InputSource."
    220         self.__system_id = system_id
    221 
    222     def getSystemId(self):
    223         "Returns the system identifier of this InputSource."
    224         return self.__system_id
    225 
    226     def setEncoding(self, encoding):
    227         """Sets the character encoding of this InputSource.
    228 
    229         The encoding must be a string acceptable for an XML encoding
    230         declaration (see section 4.3.3 of the XML recommendation).
    231 
    232         The encoding attribute of the InputSource is ignored if the
    233         InputSource also contains a character stream."""
    234         self.__encoding = encoding
    235 
    236     def getEncoding(self):
    237         "Get the character encoding of this InputSource."
    238         return self.__encoding
    239 
    240     def setByteStream(self, bytefile):
    241         """Set the byte stream (a Python file-like object which does
    242         not perform byte-to-character conversion) for this input
    243         source.
    244 
    245         The SAX parser will ignore this if there is also a character
    246         stream specified, but it will use a byte stream in preference
    247         to opening a URI connection itself.
    248 
    249         If the application knows the character encoding of the byte
    250         stream, it should set it with the setEncoding method."""
    251         self.__bytefile = bytefile
    252 
    253     def getByteStream(self):
    254         """Get the byte stream for this input source.
    255 
    256         The getEncoding method will return the character encoding for
    257         this byte stream, or None if unknown."""
    258         return self.__bytefile
    259 
    260     def setCharacterStream(self, charfile):
    261         """Set the character stream for this input source. (The stream
    262         must be a Python 2.0 Unicode-wrapped file-like that performs
    263         conversion to Unicode strings.)
    264 
    265         If there is a character stream specified, the SAX parser will
    266         ignore any byte stream and will not attempt to open a URI
    267         connection to the system identifier."""
    268         self.__charfile = charfile
    269 
    270     def getCharacterStream(self):
    271         "Get the character stream for this input source."
    272         return self.__charfile
    273 
    274 # ===== ATTRIBUTESIMPL =====
    275 
    276 class AttributesImpl:
    277 
    278     def __init__(self, attrs):
    279         """Non-NS-aware implementation.
    280 
    281         attrs should be of the form {name : value}."""
    282         self._attrs = attrs
    283 
    284     def getLength(self):
    285         return len(self._attrs)
    286 
    287     def getType(self, name):
    288         return "CDATA"
    289 
    290     def getValue(self, name):
    291         return self._attrs[name]
    292 
    293     def getValueByQName(self, name):
    294         return self._attrs[name]
    295 
    296     def getNameByQName(self, name):
    297         if not name in self._attrs:
    298             raise KeyError, name
    299         return name
    300 
    301     def getQNameByName(self, name):
    302         if not name in self._attrs:
    303             raise KeyError, name
    304         return name
    305 
    306     def getNames(self):
    307         return self._attrs.keys()
    308 
    309     def getQNames(self):
    310         return self._attrs.keys()
    311 
    312     def __len__(self):
    313         return len(self._attrs)
    314 
    315     def __getitem__(self, name):
    316         return self._attrs[name]
    317 
    318     def keys(self):
    319         return self._attrs.keys()
    320 
    321     def has_key(self, name):
    322         return name in self._attrs
    323 
    324     def __contains__(self, name):
    325         return name in self._attrs
    326 
    327     def get(self, name, alternative=None):
    328         return self._attrs.get(name, alternative)
    329 
    330     def copy(self):
    331         return self.__class__(self._attrs)
    332 
    333     def items(self):
    334         return self._attrs.items()
    335 
    336     def values(self):
    337         return self._attrs.values()
    338 
    339 # ===== ATTRIBUTESNSIMPL =====
    340 
    341 class AttributesNSImpl(AttributesImpl):
    342 
    343     def __init__(self, attrs, qnames):
    344         """NS-aware implementation.
    345 
    346         attrs should be of the form {(ns_uri, lname): value, ...}.
    347         qnames of the form {(ns_uri, lname): qname, ...}."""
    348         self._attrs = attrs
    349         self._qnames = qnames
    350 
    351     def getValueByQName(self, name):
    352         for (nsname, qname) in self._qnames.items():
    353             if qname == name:
    354                 return self._attrs[nsname]
    355 
    356         raise KeyError, name
    357 
    358     def getNameByQName(self, name):
    359         for (nsname, qname) in self._qnames.items():
    360             if qname == name:
    361                 return nsname
    362 
    363         raise KeyError, name
    364 
    365     def getQNameByName(self, name):
    366         return self._qnames[name]
    367 
    368     def getQNames(self):
    369         return self._qnames.values()
    370 
    371     def copy(self):
    372         return self.__class__(self._attrs, self._qnames)
    373 
    374 
    375 def _test():
    376     XMLReader()
    377     IncrementalParser()
    378     Locator()
    379 
    380 if __name__ == "__main__":
    381     _test()
    382