"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref

# The builtin ``unicode`` type where it exists, otherwise None.  Looking it
# up once here keeps prepareParser() from raising NameError on interpreters
# that do not define the name.
try:
    _unicode = unicode
except NameError:
    _unicode = None


def _split_ns_name(name):
    """Return the SAX name pair for a space-separated expat name.

    One part means no namespace, three parts are (uri, localname, prefix)
    and anything else is returned as a tuple of the parts unchanged.
    """
    parts = name.split()
    if len(parts) == 1:
        # no namespace
        return (None, name)
    if len(parts) == 3:
        # the trailing prefix is not part of the name pair
        return parts[0], parts[1]
    # default namespace
    return tuple(parts)


class _ClosedParser:
    """Placeholder left in ExpatParser._parser by close().

    close() copies ErrorColumnNumber and ErrorLineNumber onto an instance
    so the locator methods keep working after the expat parser is dropped.
    """
    pass

# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the expat ErrorColumnNumber, or None without a parser."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat ErrorLineNumber, or 1 without a parser."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a driver.

        namespaceHandling is the initial value of the namespaces feature;
        bufsize is passed on to IncrementalParser.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        self._entity_stack = []
        self._external_ges = 1
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # bpo-30264: Close the source on error to not leak resources:
            # xml.sax.parse() doesn't give access to the underlying parser
            # to the caller
            self._close_source()
            raise

    def prepareParser(self, source):
        """Set the expat base from the system id of source, if it has one."""
        base = source.getSystemId()
        if base is not None:
            # Only objects of the ``unicode`` builtin are encoded; where
            # that builtin does not exist the id is handed over unchanged.
            if _unicode is not None and isinstance(base, _unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install handler, rebinding the expat callbacks if parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of the feature called name.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the feature called name to state.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable validation, external parameter entities or namespace
        prefixes, and SAXNotRecognizedException for an unknown name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # keep an interning dictionary that is already installed
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the property called name.

        The XML string property is only available while an expat parser
        offering GetInputContext is attached; otherwise a SAX exception is
        raised, as it is for an unknown property name.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the property called name to value.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for an unknown name.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Pass data to expat, starting a new document if necessary.

        An expat error is wrapped in a SAXParseException and handed to the
        error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def _close_source(self):
        """Close the character and byte streams of the current source."""
        source = self._source
        try:
            stream = source.getCharacterStream()
            if stream is not None:
                stream.close()
        finally:
            # the byte stream is closed even if the code above raised
            stream = source.getByteStream()
            if stream is not None:
                stream.close()

    def close(self):
        """Finish the document, drop the expat parser and close the source."""
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal=1)
            self._cont_handler.endDocument()
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Bind the expat callbacks served directly by the content handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind or clear the expat callbacks used by the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # routed through this class so the arguments can be reordered
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and install all callbacks on it."""
        if self._namespaces:
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the expat ErrorColumnNumber, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat ErrorLineNumber, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        """Forward an element start to the content handler."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """Forward an element end to the content handler."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Forward an element start in namespace mode.

        The element and attribute names are split into name pairs; the
        qualified element name is always reported as None.
        """
        pair = _split_ns_name(name)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Forward an element end in namespace mode."""
        self._cont_handler.endElementNS(_split_ns_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """Forward the start of a prefix mapping to the content handler."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """Forward the end of a prefix mapping to the content handler."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """Report the DTD start to the lexical handler (pubid before sysid)."""
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """Forward an unparsed entity declaration to the DTD handler."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """Forward a notation declaration to the DTD handler."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Resolve and parse an external entity with a child expat parser.

        Returns 1 when the entity was parsed or external general entities
        are disabled, and 0 when parsing the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Park the outer parser and source while the entity is read.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): on this path the outer parser and source stay
            # parked on the entity stack and are not restored.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """Report a skipped entity to the content handler."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built with the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")