"""
SAX driver for the pyexpat C module.  This driver works with
pyexpat.__version__ == '2.22'.
"""

version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import feature_validation, feature_namespaces
from xml.sax.handler import feature_namespace_prefixes
from xml.sax.handler import feature_external_ges, feature_external_pes
from xml.sax.handler import feature_string_interning
from xml.sax.handler import property_xml_string, property_interning_dict

# xml.parsers.expat does not raise ImportError in Jython
import sys
if sys.platform[:4] == "java":
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
else:
    if not hasattr(expat, "ParserCreate"):
        raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        """Fallback used when weak references are unavailable: return o."""
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    del weakref, _weakref

# --- ExpatLocator

class ExpatLocator(xmlreader.Locator):
    """Locator for use with the ExpatParser class.

    This uses a weak reference to the parser object to avoid creating
    a circular reference between the parser and the content handler.
    """
    def __init__(self, parser):
        # A proxy (or the parser itself when weakrefs are unavailable).
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without a parser."""
        parser = self._ref
        if parser._parser is None:
            return None
        return parser._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without a parser."""
        parser = self._ref
        if parser._parser is None:
            return 1
        return parser._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the parser's current input source."""
        parser = self._ref
        if parser is None:
            return None
        return parser._source.getSystemId()


# --- ExpatParser

class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module."""

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a driver.

        namespaceHandling -- initial value of the namespaces feature.
        bufsize -- passed through to xmlreader.IncrementalParser.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None                # expat parser, created by reset()
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        self._entity_stack = []            # (parser, source) of outer entities
        self._external_ges = 1
        self._interning = None             # dict handed to expat, or None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Give expat the source's system id as its base, when there is one."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install the content handler, rewiring expat if already parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of the feature called name.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the feature called name to state.

        Raises SAXNotSupportedException while parsing, or when a feature
        that expat cannot provide is switched on, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the property called name.

        The XML string property is only available while an expat parser
        exists; otherwise SAXNotSupportedException is raised.  Unknown
        property names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the property called name to value.

        The XML string property is read-only (SAXNotSupportedException);
        unknown property names raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Pass a chunk of data to expat.

        The first call after a reset creates a fresh expat parser and
        reports startDocument().  An expat error is wrapped in a
        SAXParseException and handed to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            # "as" form: valid on Python 2.6+ and required on Python 3.
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument(), drop the parser."""
        if self._entity_stack:
            # If we are completing an external entity, do nothing here
            return
        self.feed("", isFinal=1)
        self._cont_handler.endDocument()
        self._parsing = 0
        # break cycle created by expat handlers pointing to our methods
        self._parser = None

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks at the handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Install, or clear, the lexical-handler callbacks on expat."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD takes its arguments in a different order than expat
            # supplies them, so it goes through start_doctype_decl.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a new expat parser and attach all callbacks to it."""
        if self._namespaces:
            # " " is the separator expat puts between the parts of a name.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without a parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without a parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        """Report a start tag (namespaces feature off)."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """Report an end tag (namespaces feature off)."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a start tag (namespaces feature on).

        Names arrive as one, two or three space-separated parts; the
        first two parts of a multi-part name form the name pair and a
        third part is used as the prefix of an attribute's qname.  The
        element's own qname is always reported as None.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report an end tag (namespaces feature on); the qname is None."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """Report the start of a prefix mapping."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """Report the end of a prefix mapping."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """Forward a DOCTYPE start to the lexical handler's startDTD()."""
        # expat supplies sysid before pubid; startDTD is called with
        # pubid first.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """Forward an unparsed entity declaration to the DTD handler."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """Forward a notation declaration to the DTD handler."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Resolve and parse an external entity with a child expat parser.

        Returns 1 when the entity was skipped or parsed, 0 when parsing
        it raised.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Remember the outer parser and source while the entity is parsed.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # NOTE(review): the outer parser/source are not restored on
            # this path -- confirm that is intended before changing it.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack[-1]
        del self._entity_stack[-1]
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """Report a skipped entity to the content handler."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)

# ---

def create_parser(*args, **kwargs):
    """Return a new ExpatParser built from the given arguments."""
    return ExpatParser(*args, **kwargs)

# ---

if __name__ == "__main__":
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")