"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref


class _ClosedParser:
    """Placeholder stored in ``ExpatParser._parser`` after ``close()``.

    ``ExpatParser.close()`` copies ``ErrorColumnNumber`` and
    ``ErrorLineNumber`` onto an instance of this class so the locator
    methods keep working once the real expat parser has been dropped.
    """
    pass


# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        # _mkproxy is weakref.proxy when available, identity otherwise.
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return the expat error column, or None if no parser exists."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat error line, or 1 if no parser exists."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a reader.

        namespaceHandling -- initial value of the namespaces feature.
        bufsize -- passed to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        self._entity_stack = []
        # NOTE(review): external general entities are resolved by default.
        # When parsing untrusted documents, callers should disable this via
        # setFeature(feature_external_ges, False).  The default is kept
        # unchanged here to preserve existing behaviour.
        self._external_ges = 1
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        try:
            self.reset()
            self._cont_handler.setDocumentLocator(ExpatLocator(self))
            xmlreader.IncrementalParser.parse(self, source)
        except BaseException:
            # close() is not reached when parsing fails part-way, so the
            # streams of the input source would otherwise stay open.
            # Release them here, then let the original error propagate.
            self._close_source()
            raise

    def prepareParser(self, source):
        """Set the expat base URI from the source's system id, if any."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install a content handler, rebinding expat callbacks if parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable a feature expat cannot provide; raises
        SAXNotRecognizedException for unknown feature names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dict; only create one if absent.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for unknown properties (and when
        the expat parser lacks GetInputContext), and
        SAXNotSupportedException when the XML string is requested while
        no parser exists.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for unknown properties.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of *data* to expat.

        On the first chunk the parser is reset and startDocument() is
        reported.  expat errors are wrapped in SAXParseException and
        handed to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def _close_source(self):
        """Close the character and byte streams of the current source."""
        source = self._source
        try:
            file = source.getCharacterStream()
            if file is not None:
                file.close()
        finally:
            # Close the byte stream even if closing the character stream
            # raised.
            file = source.getByteStream()
            if file is not None:
                file.close()

    def close(self):
        """Finish the document, report endDocument() and release resources."""
        if (self._entity_stack or self._parser is None or
            isinstance(self._parser, _ClosedParser)):
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
            self._parsing = 0
            # break cycle created by expat handlers pointing to our methods
            self._parser = None
        finally:
            self._parsing = 0
            if self._parser is not None:
                # Keep ErrorColumnNumber and ErrorLineNumber after closing.
                parser = _ClosedParser()
                parser.ErrorColumnNumber = self._parser.ErrorColumnNumber
                parser.ErrorLineNumber = self._parser.ErrorLineNumber
                self._parser = parser
            self._close_source()

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks at the handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Install or clear expat's lexical callbacks."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # expat passes (name, sysid, pubid, has_internal_subset);
            # start_doctype_decl reorders that for startDTD().
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire up all callbacks."""
        if self._namespaces:
            # " " is the separator expat puts between URI, local name and
            # prefix; start_element_ns/end_element_ns split on whitespace.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return the expat error column, or None if no parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return the expat error line, or 1 if no parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a namespace-aware start tag.

        Names arrive as whitespace-separated parts: one part means no
        namespace, two mean (uri, localname), three mean
        (uri, localname, prefix).
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report a namespace-aware end tag (see start_element_ns)."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # startDTD takes (name, publicId, systemId): swap the id order.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external entity with a child expat parser.

        Returns 1 on success (or when external general entities are
        disabled) and 0 if parsing the entity raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Save the outer parser/source; they are restored after the entity
        # has been parsed successfully.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): this bare except also traps KeyboardInterrupt
            # and SystemExit, and the entity stack is not restored on this
            # path.  Kept as-is to preserve existing behaviour.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built from the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")