"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """

import handler

from _exceptions import SAXNotSupportedException, SAXNotRecognizedException


# ===== XMLREADER =====

class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event."""

    def __init__(self):
        """Install a default handler object for each of the four
        handler slots (content, DTD, entity resolver, error)."""
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        "Parse an XML document from a system identifier or an InputSource."
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        "Returns the current ContentHandler."
        return self._cont_handler

    def setContentHandler(self, handler):
        "Registers a new object to receive document content events."
        # NOTE: the parameter name shadows the module-level ``handler``
        # import inside this method only; it is part of the public
        # signature and therefore kept.
        self._cont_handler = handler

    def getDTDHandler(self):
        "Returns the current DTD handler."
        return self._dtd_handler

    def setDTDHandler(self, handler):
        "Register an object to receive basic DTD-related events."
        self._dtd_handler = handler

    def getEntityResolver(self):
        "Returns the current EntityResolver."
        return self._ent_handler

    def setEntityResolver(self, resolver):
        "Register an object to resolve external entities."
        self._ent_handler = resolver

    def getErrorHandler(self):
        "Returns the current ErrorHandler."
        return self._err_handler

    def setErrorHandler(self, handler):
        "Register an object to receive error-message events."
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must raise a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This base implementation always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Looks up and returns the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Sets the state of a SAX2 feature.

        This base implementation recognizes no features and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Looks up and returns the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Sets the value of a SAX2 property.

        This base implementation recognizes no properties and always
        raises SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)


class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        """Remember bufsize, the size passed to each read() call made
        by parse, then run the XMLReader initialisation."""
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse source by reading it in chunks and feeding each chunk
        to the feed method.

        source is passed through saxutils.prepare_input_source; the
        object that call returns must provide getByteStream().
        prepareParser is called once before the first read, and close
        once after the stream is exhausted."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Stop on any empty read.  Comparing against the literal ""
        # never matches the b"" that a byte stream returns at end of
        # input, which would make this loop spin forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined."""
        raise NotImplementedError("This method must be implemented!")


# ===== LOCATOR =====

class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to DocumentHandler methods; at any other time, the
    results are unpredictable.

    The methods of this base class return placeholder values: -1 for
    the numeric positions and None for the identifiers."""

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return -1

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return None


# ===== INPUTSOURCE =====

class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those."""

    def __init__(self, system_id = None):
        """Create an InputSource with the given system identifier;
        every other field starts out as None."""
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    def setPublicId(self, public_id):
        "Sets the public identifier of this InputSource."
        self.__public_id = public_id

    def getPublicId(self):
        "Returns the public identifier of this InputSource."
        return self.__public_id

    def setSystemId(self, system_id):
        "Sets the system identifier of this InputSource."
        self.__system_id = system_id

    def getSystemId(self):
        "Returns the system identifier of this InputSource."
        return self.__system_id

    def setEncoding(self, encoding):
        """Sets the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        "Get the character encoding of this InputSource."
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Get the byte stream for this input source.

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source. (The stream
        must be a Python 2.0 Unicode-wrapped file-like that performs
        conversion to Unicode strings.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        "Get the character stream for this input source."
        return self.__charfile


# ===== ATTRIBUTESIMPL =====

class AttributesImpl:
    """Non-NS-aware attribute collection backed by a single mapping.

    Names and qualified names are the same thing in this class, so
    the *ByQName/*ByName methods are identity lookups that only check
    that the name is present."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        "Return the number of attributes."
        return len(self._attrs)

    def getType(self, name):
        "Return the attribute type, which is always 'CDATA' here."
        return "CDATA"

    def getValue(self, name):
        "Return the value stored under name; raises KeyError if absent."
        return self._attrs[name]

    def getValueByQName(self, name):
        "Return the value stored under name; raises KeyError if absent."
        return self._attrs[name]

    def getNameByQName(self, name):
        "Return name unchanged if it is present, else raise KeyError."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        "Return name unchanged if it is present, else raise KeyError."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        "Return the keys of the underlying attribute mapping."
        return self._attrs.keys()

    def getQNames(self):
        "Return the keys of the underlying attribute mapping."
        return self._attrs.keys()

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        "Return the keys of the underlying attribute mapping."
        return self._attrs.keys()

    def has_key(self, name):
        "Return whether name is a key of the attribute mapping."
        return name in self._attrs

    def __contains__(self, name):
        return name in self._attrs

    def get(self, name, alternative=None):
        "Return the value for name, or alternative if name is absent."
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class wrapping the same
        underlying mapping object (the mapping itself is not copied)."""
        return self.__class__(self._attrs)

    def items(self):
        "Return the items of the underlying attribute mapping."
        return self._attrs.items()

    def values(self):
        "Return the values of the underlying attribute mapping."
        return self._attrs.values()


# ===== ATTRIBUTESNSIMPL =====

class AttributesNSImpl(AttributesImpl):
    """NS-aware attribute collection: values are keyed by
    (ns_uri, lname) pairs and a second mapping supplies the qualified
    name for each pair."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name
        equals name; raises KeyError if no qualified name matches."""
        # The qname mapping is keyed the other way round, so this is a
        # linear scan over its items.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name equals
        name; raises KeyError if no qualified name matches."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        "Return the qualified name stored for the (ns_uri, lname) key."
        return self._qnames[name]

    def getQNames(self):
        "Return the values of the qualified-name mapping."
        return self._qnames.values()

    def copy(self):
        """Return a new instance of the same class wrapping the same
        two underlying mapping objects (neither mapping is copied)."""
        return self.__class__(self._attrs, self._qnames)


def _test():
    "Smoke test: check that the interface classes can be instantiated."
    XMLReader()
    IncrementalParser()
    Locator()

if __name__ == "__main__":
    _test()