"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref


class _ClosedParser:
    """Placeholder stored in ``ExpatParser._parser`` after a failed close().

    It only carries the ``ErrorColumnNumber`` and ``ErrorLineNumber``
    attributes copied from the real expat parser, so the locator methods
    keep working once the expat parser object has been dropped.
    """
    pass


# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the expat error column, or None if no parser exists."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat error line, or 1 if no parser exists."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the parser's current source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a driver; no expat parser is built until reset().

        namespaceHandling -- initial value of the namespaces feature.
        bufsize -- read size handed to xmlreader.IncrementalParser.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        # (parser, source) pairs of the documents suspended while an
        # external entity is being parsed; see external_entity_ref().
        self._entity_stack = []
        self._external_ges = 0
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # bpo-30264: Close the source on error to not leak resources:
            # xml.sax.parse() doesn't give access to the underlying parser
            # to the caller
            try:
                self._close_source()
            finally:
                # A failed external entity leaves the enclosing documents
                # on the entity stack (external_entity_ref() does not pop
                # it on failure), so self._source is the entity's source
                # here.  Close the suspended sources too, innermost first.
                for _, outer_source in reversed(self._entity_stack):
                    self._close_input_source(outer_source)
            raise

    def prepareParser(self, source):
        """Set the expat base to the source's system id, when it has one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install a content handler, rebinding expat callbacks mid-parse."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable a feature this driver cannot provide, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property `name`.

        The XML string property is only available while an expat parser
        exists; otherwise SAXNotSupportedException is raised.  Unknown
        property names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        The XML string property is read-only (SAXNotSupportedException);
        unknown property names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Hand a chunk of the document to expat.

        The first call of a parse resets the driver and reports
        startDocument().  expat errors are wrapped in a SAXParseException
        and passed to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    @staticmethod
    def _close_input_source(source):
        """Close the character stream and the byte stream of `source`."""
        try:
            file = source.getCharacterStream()
            if file is not None:
                file.close()
        finally:
            # Still try the byte stream if closing the character stream
            # raised.
            file = source.getByteStream()
            if file is not None:
                file.close()

    def _close_source(self):
        """Close the streams of the source currently being parsed."""
        self._close_input_source(self._source)

    def close(self):
        """Finish the document: final expat pass, endDocument(), cleanup."""
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal=1)
            self._cont_handler.endDocument()
            self._parsing = 0
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Bind the expat callbacks that go straight to the content handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or clear) the expat callbacks served by the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD takes its arguments in a different order than expat
            # supplies them, hence the adapter method.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and install all callbacks on it."""
        if self._namespaces:
            # " " is the separator expat puts between the URI, the local
            # name and the prefix; the *_ns handlers split names on it.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the expat error column, or None if no parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat error line, or 1 if no parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report an element start as startElementNS().

        Names arrive as one, two or three space-separated parts; they are
        turned into (uri, localname) pairs, with uri None for a
        single-part name.  The qname passed for the element is always None.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report an element end as endElementNS(); see start_element_ns()."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # Reorders the identifiers: startDTD() is called with pubid first.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external entity with a sub-parser.

        Returns 1 when the entity was skipped (external general entities
        disabled) or parsed successfully, and 0 when parsing it failed.
        On failure the entity parser and source stay current and the
        enclosing document stays on the entity stack.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except Exception:
            # Only ordinary errors become an entity failure;
            # KeyboardInterrupt, SystemExit and the like must propagate.
            return 0  # FIXME: save error info here?

        try:
            # close() is a no-op while an entity is on the stack, so the
            # entity's source has to be closed here or it is leaked.
            self._close_source()
        finally:
            (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")