"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        """Fallback used when weak references are unavailable: return *o*."""
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref

# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        """Remember *parser* (an ExpatParser) through _mkproxy()."""
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's current column, or None if no expat parser exists."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 if no expat parser exists."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create the driver; the expat parser itself is built by reset().

        namespaceHandling -- initial value of the namespaces feature
        bufsize -- passed through to xmlreader.IncrementalParser
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs suspended while an external entity is read
        self._entity_stack = []
        self._external_ges = 1
        # dict handed to expat as ``intern``; None disables interning
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Tell expat the base URI, taken from the source's system id."""
        if source.getSystemId() is not None:
            base = source.getSystemId()
            # unicode system ids are UTF-8 encoded before reaching SetBase
            if isinstance(base, unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install *handler*; rebind expat callbacks if a parse is running."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes; raises SAXNotRecognizedException for unknown names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # keep an existing interning dict rather than replacing it
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property *name*.

        The XML string property is only available while an expat parser
        exists (SAXNotSupportedException otherwise) and only if that
        parser offers GetInputContext (SAXNotRecognizedException
        otherwise).  Unknown names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        The XML string property is read-only (SAXNotSupportedException);
        unknown names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Hand a chunk of the document to expat.

        The first chunk of a document resets the parser and fires
        startDocument().  An expat error is wrapped in a
        SAXParseException and passed to the error handler's
        fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument(), drop the parser.

        Does nothing while an external entity is being completed, and
        does nothing when there is no expat parser (already closed, or
        never fed/reset).
        """
        if self._entity_stack or self._parser is None:
            # If we are completing an external entity, do nothing here.
            # With no parser there is nothing to finish either: falling
            # through would make feed() build a fresh parser, fire a
            # spurious startDocument() and report an empty document.
            return
        self.feed("", isFinal = 1)
        self._cont_handler.endDocument()
        self._parsing = 0
        # break cycle created by expat handlers pointing to our methods
        self._parser = None

    def _reset_cont_handler(self):
        """Bind the expat callbacks that go straight to the content handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or clear) the expat callbacks used by the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # routed through self to reorder expat's arguments for startDTD
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire every callback to it."""
        if self._namespaces:
            # " " is the separator expat puts between the parts of a name;
            # start_element_ns/end_element_ns split on it again.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's current column, or None if no expat parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 if no expat parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        """expat callback (namespaces off): forward as startElement()."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """expat callback (namespaces off): forward as endElement()."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """expat callback (namespaces on): forward as startElementNS().

        Names are split on whitespace into one, two or three parts.
        The element is reported as a (first, second) pair, or
        (None, name) for a one-part name, with a qname of None.
        Attributes get the same pair as key; their qname is
        "third:second" for three parts, the second part for two parts,
        and the name itself for one part.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """expat callback (namespaces on): forward as endElementNS().

        The name is split exactly as in start_element_ns(); the qname
        passed on is None.
        """
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        """Forward a processing instruction to the content handler."""
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        """Forward character data to the content handler."""
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """expat callback: forward as startPrefixMapping()."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """expat callback: forward as endPrefixMapping()."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """expat callback: forward as startDTD(name, pubid, sysid)."""
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """expat callback: forward to the DTD handler (base is dropped)."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """expat callback: forward to the DTD handler (base is dropped)."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """expat callback: resolve and parse an external general entity.

        Returns 1 when the entity was parsed, or skipped because the
        external-general-entities feature is off; returns 0 when parsing
        the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Park the current parser/source while a sub-parser reads the entity.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): the bare except catches every exception, and
            # the pair pushed on _entity_stack is not restored on this path.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """expat callback: forward as skippedEntity()."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built from the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")