"""A parser for XML, using the derived class as static DTD."""

# Author: Sjoerd Mullender.

import re
import string

import warnings
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",
              DeprecationWarning, 2)
del warnings

version = '0.3'

class Error(RuntimeError):
    """Exception raised by the parser for fatal and (by default) syntax errors."""
    pass

# Regular expressions used for parsing
#
# Note: backslashes that are meant for the regular expression engine
# (rather than for the Python string literal) are written either in
# raw strings or doubled, so that no literal relies on Python passing
# an unknown escape sequence through unchanged.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\\(\\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                           '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (the templates below take the group name through %-formatting, which
# is why a literal percent sign is written as %%)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')

# Translation table that turns every white space character in an
# attribute value into a plain space.  Both arguments of maketrans
# must have the same length (one replacement per source character).
# string.maketrans is used where it exists; str.maketrans is the
# fallback for interpreters that no longer provide it.
try:
    _maketrans = string.maketrans
except AttributeError:
    _maketrans = str.maketrans
attrtrans = _maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')

# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).
90 91 class XMLParser: 92 attributes = {} # default, to be overridden 93 elements = {} # default, to be overridden 94 95 # parsing options, settable using keyword args in __init__ 96 __accept_unquoted_attributes = 0 97 __accept_missing_endtag_name = 0 98 __map_case = 0 99 __accept_utf8 = 0 100 __translate_attribute_references = 1 101 102 # Interface -- initialize and reset this instance 103 def __init__(self, **kw): 104 self.__fixed = 0 105 if 'accept_unquoted_attributes' in kw: 106 self.__accept_unquoted_attributes = kw['accept_unquoted_attributes'] 107 if 'accept_missing_endtag_name' in kw: 108 self.__accept_missing_endtag_name = kw['accept_missing_endtag_name'] 109 if 'map_case' in kw: 110 self.__map_case = kw['map_case'] 111 if 'accept_utf8' in kw: 112 self.__accept_utf8 = kw['accept_utf8'] 113 if 'translate_attribute_references' in kw: 114 self.__translate_attribute_references = kw['translate_attribute_references'] 115 self.reset() 116 117 def __fixelements(self): 118 self.__fixed = 1 119 self.elements = {} 120 self.__fixdict(self.__dict__) 121 self.__fixclass(self.__class__) 122 123 def __fixclass(self, kl): 124 self.__fixdict(kl.__dict__) 125 for k in kl.__bases__: 126 self.__fixclass(k) 127 128 def __fixdict(self, dict): 129 for key in dict.keys(): 130 if key[:6] == 'start_': 131 tag = key[6:] 132 start, end = self.elements.get(tag, (None, None)) 133 if start is None: 134 self.elements[tag] = getattr(self, key), end 135 elif key[:4] == 'end_': 136 tag = key[4:] 137 start, end = self.elements.get(tag, (None, None)) 138 if end is None: 139 self.elements[tag] = start, getattr(self, key) 140 141 # Interface -- reset this instance. 
Loses all unprocessed data 142 def reset(self): 143 self.rawdata = '' 144 self.stack = [] 145 self.nomoretags = 0 146 self.literal = 0 147 self.lineno = 1 148 self.__at_start = 1 149 self.__seen_doctype = None 150 self.__seen_starttag = 0 151 self.__use_namespaces = 0 152 self.__namespaces = {'xml':None} # xml is implicitly declared 153 # backward compatibility hack: if elements not overridden, 154 # fill it in ourselves 155 if self.elements is XMLParser.elements: 156 self.__fixelements() 157 158 # For derived classes only -- enter literal mode (CDATA) till EOF 159 def setnomoretags(self): 160 self.nomoretags = self.literal = 1 161 162 # For derived classes only -- enter literal mode (CDATA) 163 def setliteral(self, *args): 164 self.literal = 1 165 166 # Interface -- feed some data to the parser. Call this as 167 # often as you want, with as little or as much text as you 168 # want (may include '\n'). (This just saves the text, all the 169 # processing is done by goahead().) 170 def feed(self, data): 171 self.rawdata = self.rawdata + data 172 self.goahead(0) 173 174 # Interface -- handle the remaining data 175 def close(self): 176 self.goahead(1) 177 if self.__fixed: 178 self.__fixed = 0 179 # remove self.elements so that we don't leak 180 del self.elements 181 182 # Interface -- translate references 183 def translate_references(self, data, all = 1): 184 if not self.__translate_attribute_references: 185 return data 186 i = 0 187 while 1: 188 res = amp.search(data, i) 189 if res is None: 190 return data 191 s = res.start(0) 192 res = ref.match(data, s) 193 if res is None: 194 self.syntax_error("bogus `&'") 195 i = s+1 196 continue 197 i = res.end(0) 198 str = res.group(1) 199 rescan = 0 200 if str[0] == '#': 201 if str[1] == 'x': 202 str = chr(int(str[2:], 16)) 203 else: 204 str = chr(int(str[1:])) 205 if data[i - 1] != ';': 206 self.syntax_error("`;' missing after char reference") 207 i = i-1 208 elif all: 209 if str in self.entitydefs: 210 str = 
self.entitydefs[str] 211 rescan = 1 212 elif data[i - 1] != ';': 213 self.syntax_error("bogus `&'") 214 i = s + 1 # just past the & 215 continue 216 else: 217 self.syntax_error("reference to unknown entity `&%s;'" % str) 218 str = '&' + str + ';' 219 elif data[i - 1] != ';': 220 self.syntax_error("bogus `&'") 221 i = s + 1 # just past the & 222 continue 223 224 # when we get here, str contains the translated text and i points 225 # to the end of the string that is to be replaced 226 data = data[:s] + str + data[i:] 227 if rescan: 228 i = s 229 else: 230 i = s + len(str) 231 232 # Interface - return a dictionary of all namespaces currently valid 233 def getnamespace(self): 234 nsdict = {} 235 for t, d, nst in self.stack: 236 nsdict.update(d) 237 return nsdict 238 239 # Internal -- handle data as far as reasonable. May leave state 240 # and data to be processed by a subsequent call. If 'end' is 241 # true, force handling all data as if followed by EOF marker. 242 def goahead(self, end): 243 rawdata = self.rawdata 244 i = 0 245 n = len(rawdata) 246 while i < n: 247 if i > 0: 248 self.__at_start = 0 249 if self.nomoretags: 250 data = rawdata[i:n] 251 self.handle_data(data) 252 self.lineno = self.lineno + data.count('\n') 253 i = n 254 break 255 res = interesting.search(rawdata, i) 256 if res: 257 j = res.start(0) 258 else: 259 j = n 260 if i < j: 261 data = rawdata[i:j] 262 if self.__at_start and space.match(data) is None: 263 self.syntax_error('illegal data at start of file') 264 self.__at_start = 0 265 if not self.stack and space.match(data) is None: 266 self.syntax_error('data not in content') 267 if not self.__accept_utf8 and illegal.search(data): 268 self.syntax_error('illegal character in content') 269 self.handle_data(data) 270 self.lineno = self.lineno + data.count('\n') 271 i = j 272 if i == n: break 273 if rawdata[i] == '<': 274 if starttagopen.match(rawdata, i): 275 if self.literal: 276 data = rawdata[i] 277 self.handle_data(data) 278 self.lineno = 
self.lineno + data.count('\n') 279 i = i+1 280 continue 281 k = self.parse_starttag(i) 282 if k < 0: break 283 self.__seen_starttag = 1 284 self.lineno = self.lineno + rawdata[i:k].count('\n') 285 i = k 286 continue 287 if endtagopen.match(rawdata, i): 288 k = self.parse_endtag(i) 289 if k < 0: break 290 self.lineno = self.lineno + rawdata[i:k].count('\n') 291 i = k 292 continue 293 if commentopen.match(rawdata, i): 294 if self.literal: 295 data = rawdata[i] 296 self.handle_data(data) 297 self.lineno = self.lineno + data.count('\n') 298 i = i+1 299 continue 300 k = self.parse_comment(i) 301 if k < 0: break 302 self.lineno = self.lineno + rawdata[i:k].count('\n') 303 i = k 304 continue 305 if cdataopen.match(rawdata, i): 306 k = self.parse_cdata(i) 307 if k < 0: break 308 self.lineno = self.lineno + rawdata[i:k].count('\n') 309 i = k 310 continue 311 res = xmldecl.match(rawdata, i) 312 if res: 313 if not self.__at_start: 314 self.syntax_error("<?xml?> declaration not at start of document") 315 version, encoding, standalone = res.group('version', 316 'encoding', 317 'standalone') 318 if version[1:-1] != '1.0': 319 raise Error('only XML version 1.0 supported') 320 if encoding: encoding = encoding[1:-1] 321 if standalone: standalone = standalone[1:-1] 322 self.handle_xml(encoding, standalone) 323 i = res.end(0) 324 continue 325 res = procopen.match(rawdata, i) 326 if res: 327 k = self.parse_proc(i) 328 if k < 0: break 329 self.lineno = self.lineno + rawdata[i:k].count('\n') 330 i = k 331 continue 332 res = doctype.match(rawdata, i) 333 if res: 334 if self.literal: 335 data = rawdata[i] 336 self.handle_data(data) 337 self.lineno = self.lineno + data.count('\n') 338 i = i+1 339 continue 340 if self.__seen_doctype: 341 self.syntax_error('multiple DOCTYPE elements') 342 if self.__seen_starttag: 343 self.syntax_error('DOCTYPE not at beginning of document') 344 k = self.parse_doctype(res) 345 if k < 0: break 346 self.__seen_doctype = res.group('name') 347 if self.__map_case: 
348 self.__seen_doctype = self.__seen_doctype.lower() 349 self.lineno = self.lineno + rawdata[i:k].count('\n') 350 i = k 351 continue 352 elif rawdata[i] == '&': 353 if self.literal: 354 data = rawdata[i] 355 self.handle_data(data) 356 i = i+1 357 continue 358 res = charref.match(rawdata, i) 359 if res is not None: 360 i = res.end(0) 361 if rawdata[i-1] != ';': 362 self.syntax_error("`;' missing in charref") 363 i = i-1 364 if not self.stack: 365 self.syntax_error('data not in content') 366 self.handle_charref(res.group('char')[:-1]) 367 self.lineno = self.lineno + res.group(0).count('\n') 368 continue 369 res = entityref.match(rawdata, i) 370 if res is not None: 371 i = res.end(0) 372 if rawdata[i-1] != ';': 373 self.syntax_error("`;' missing in entityref") 374 i = i-1 375 name = res.group('name') 376 if self.__map_case: 377 name = name.lower() 378 if name in self.entitydefs: 379 self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] 380 n = len(rawdata) 381 i = res.start(0) 382 else: 383 self.unknown_entityref(name) 384 self.lineno = self.lineno + res.group(0).count('\n') 385 continue 386 elif rawdata[i] == ']': 387 if self.literal: 388 data = rawdata[i] 389 self.handle_data(data) 390 i = i+1 391 continue 392 if n-i < 3: 393 break 394 if cdataclose.match(rawdata, i): 395 self.syntax_error("bogus `]]>'") 396 self.handle_data(rawdata[i]) 397 i = i+1 398 continue 399 else: 400 raise Error('neither < nor & ??') 401 # We get here only if incomplete matches but 402 # nothing else 403 break 404 # end while 405 if i > 0: 406 self.__at_start = 0 407 if end and i < n: 408 data = rawdata[i] 409 self.syntax_error("bogus `%s'" % data) 410 if not self.__accept_utf8 and illegal.search(data): 411 self.syntax_error('illegal character in content') 412 self.handle_data(data) 413 self.lineno = self.lineno + data.count('\n') 414 self.rawdata = rawdata[i+1:] 415 return self.goahead(end) 416 self.rawdata = rawdata[i:] 417 if end: 418 if not 
self.__seen_starttag: 419 self.syntax_error('no elements in file') 420 if self.stack: 421 self.syntax_error('missing end tags') 422 while self.stack: 423 self.finish_endtag(self.stack[-1][0]) 424 425 # Internal -- parse comment, return length or -1 if not terminated 426 def parse_comment(self, i): 427 rawdata = self.rawdata 428 if rawdata[i:i+4] != '<!--': 429 raise Error('unexpected call to handle_comment') 430 res = commentclose.search(rawdata, i+4) 431 if res is None: 432 return -1 433 if doubledash.search(rawdata, i+4, res.start(0)): 434 self.syntax_error("`--' inside comment") 435 if rawdata[res.start(0)-1] == '-': 436 self.syntax_error('comment cannot end in three dashes') 437 if not self.__accept_utf8 and \ 438 illegal.search(rawdata, i+4, res.start(0)): 439 self.syntax_error('illegal character in comment') 440 self.handle_comment(rawdata[i+4: res.start(0)]) 441 return res.end(0) 442 443 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated 444 def parse_doctype(self, res): 445 rawdata = self.rawdata 446 n = len(rawdata) 447 name = res.group('name') 448 if self.__map_case: 449 name = name.lower() 450 pubid, syslit = res.group('pubid', 'syslit') 451 if pubid is not None: 452 pubid = pubid[1:-1] # remove quotes 453 pubid = ' '.join(pubid.split()) # normalize 454 if syslit is not None: syslit = syslit[1:-1] # remove quotes 455 j = k = res.end(0) 456 if k >= n: 457 return -1 458 if rawdata[k] == '[': 459 level = 0 460 k = k+1 461 dq = sq = 0 462 while k < n: 463 c = rawdata[k] 464 if not sq and c == '"': 465 dq = not dq 466 elif not dq and c == "'": 467 sq = not sq 468 elif sq or dq: 469 pass 470 elif level <= 0 and c == ']': 471 res = endbracket.match(rawdata, k+1) 472 if res is None: 473 return -1 474 self.handle_doctype(name, pubid, syslit, rawdata[j+1:k]) 475 return res.end(0) 476 elif c == '<': 477 level = level + 1 478 elif c == '>': 479 level = level - 1 480 if level < 0: 481 self.syntax_error("bogus `>' in DOCTYPE") 482 k = k+1 483 res 
= endbracketfind.match(rawdata, k) 484 if res is None: 485 return -1 486 if endbracket.match(rawdata, k) is None: 487 self.syntax_error('garbage in DOCTYPE') 488 self.handle_doctype(name, pubid, syslit, None) 489 return res.end(0) 490 491 # Internal -- handle CDATA tag, return length or -1 if not terminated 492 def parse_cdata(self, i): 493 rawdata = self.rawdata 494 if rawdata[i:i+9] != '<![CDATA[': 495 raise Error('unexpected call to parse_cdata') 496 res = cdataclose.search(rawdata, i+9) 497 if res is None: 498 return -1 499 if not self.__accept_utf8 and \ 500 illegal.search(rawdata, i+9, res.start(0)): 501 self.syntax_error('illegal character in CDATA') 502 if not self.stack: 503 self.syntax_error('CDATA not in content') 504 self.handle_cdata(rawdata[i+9:res.start(0)]) 505 return res.end(0) 506 507 __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} 508 # Internal -- handle a processing instruction tag 509 def parse_proc(self, i): 510 rawdata = self.rawdata 511 end = procclose.search(rawdata, i) 512 if end is None: 513 return -1 514 j = end.start(0) 515 if not self.__accept_utf8 and illegal.search(rawdata, i+2, j): 516 self.syntax_error('illegal character in processing instruction') 517 res = tagfind.match(rawdata, i+2) 518 if res is None: 519 raise Error('unexpected call to parse_proc') 520 k = res.end(0) 521 name = res.group(0) 522 if self.__map_case: 523 name = name.lower() 524 if name == 'xml:namespace': 525 self.syntax_error('old-fashioned namespace declaration') 526 self.__use_namespaces = -1 527 # namespace declaration 528 # this must come after the <?xml?> declaration (if any) 529 # and before the <!DOCTYPE> (if any). 
530 if self.__seen_doctype or self.__seen_starttag: 531 self.syntax_error('xml:namespace declaration too late in document') 532 attrdict, namespace, k = self.parse_attributes(name, k, j) 533 if namespace: 534 self.syntax_error('namespace declaration inside namespace declaration') 535 for attrname in attrdict.keys(): 536 if not attrname in self.__xml_namespace_attributes: 537 self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) 538 if not 'ns' in attrdict or not 'prefix' in attrdict: 539 self.syntax_error('xml:namespace without required attributes') 540 prefix = attrdict.get('prefix') 541 if ncname.match(prefix) is None: 542 self.syntax_error('xml:namespace illegal prefix value') 543 return end.end(0) 544 if prefix in self.__namespaces: 545 self.syntax_error('xml:namespace prefix not unique') 546 self.__namespaces[prefix] = attrdict['ns'] 547 else: 548 if name.lower() == 'xml': 549 self.syntax_error('illegal processing instruction target name') 550 self.handle_proc(name, rawdata[k:j]) 551 return end.end(0) 552 553 # Internal -- parse attributes between i and j 554 def parse_attributes(self, tag, i, j): 555 rawdata = self.rawdata 556 attrdict = {} 557 namespace = {} 558 while i < j: 559 res = attrfind.match(rawdata, i) 560 if res is None: 561 break 562 attrname, attrvalue = res.group('name', 'value') 563 if self.__map_case: 564 attrname = attrname.lower() 565 i = res.end(0) 566 if attrvalue is None: 567 self.syntax_error("no value specified for attribute `%s'" % attrname) 568 attrvalue = attrname 569 elif attrvalue[:1] == "'" == attrvalue[-1:] or \ 570 attrvalue[:1] == '"' == attrvalue[-1:]: 571 attrvalue = attrvalue[1:-1] 572 elif not self.__accept_unquoted_attributes: 573 self.syntax_error("attribute `%s' value not quoted" % attrname) 574 res = xmlns.match(attrname) 575 if res is not None: 576 # namespace declaration 577 ncname = res.group('ncname') 578 namespace[ncname or ''] = attrvalue or None 579 if not self.__use_namespaces: 580 
self.__use_namespaces = len(self.stack)+1 581 continue 582 if '<' in attrvalue: 583 self.syntax_error("`<' illegal in attribute value") 584 if attrname in attrdict: 585 self.syntax_error("attribute `%s' specified twice" % attrname) 586 attrvalue = attrvalue.translate(attrtrans) 587 attrdict[attrname] = self.translate_references(attrvalue) 588 return attrdict, namespace, i 589 590 # Internal -- handle starttag, return length or -1 if not terminated 591 def parse_starttag(self, i): 592 rawdata = self.rawdata 593 # i points to start of tag 594 end = endbracketfind.match(rawdata, i+1) 595 if end is None: 596 return -1 597 tag = starttagmatch.match(rawdata, i) 598 if tag is None or tag.end(0) != end.end(0): 599 self.syntax_error('garbage in starttag') 600 return end.end(0) 601 nstag = tagname = tag.group('tagname') 602 if self.__map_case: 603 nstag = tagname = nstag.lower() 604 if not self.__seen_starttag and self.__seen_doctype and \ 605 tagname != self.__seen_doctype: 606 self.syntax_error('starttag does not match DOCTYPE') 607 if self.__seen_starttag and not self.stack: 608 self.syntax_error('multiple elements on top level') 609 k, j = tag.span('attrs') 610 attrdict, nsdict, k = self.parse_attributes(tagname, k, j) 611 self.stack.append((tagname, nsdict, nstag)) 612 if self.__use_namespaces: 613 res = qname.match(tagname) 614 else: 615 res = None 616 if res is not None: 617 prefix, nstag = res.group('prefix', 'local') 618 if prefix is None: 619 prefix = '' 620 ns = None 621 for t, d, nst in self.stack: 622 if prefix in d: 623 ns = d[prefix] 624 if ns is None and prefix != '': 625 ns = self.__namespaces.get(prefix) 626 if ns is not None: 627 nstag = ns + ' ' + nstag 628 elif prefix != '': 629 nstag = prefix + ':' + nstag # undo split 630 self.stack[-1] = tagname, nsdict, nstag 631 # translate namespace of attributes 632 attrnamemap = {} # map from new name to old name (used for error reporting) 633 for key in attrdict.keys(): 634 attrnamemap[key] = key 635 if 
self.__use_namespaces: 636 nattrdict = {} 637 for key, val in attrdict.items(): 638 okey = key 639 res = qname.match(key) 640 if res is not None: 641 aprefix, key = res.group('prefix', 'local') 642 if self.__map_case: 643 key = key.lower() 644 if aprefix is not None: 645 ans = None 646 for t, d, nst in self.stack: 647 if aprefix in d: 648 ans = d[aprefix] 649 if ans is None: 650 ans = self.__namespaces.get(aprefix) 651 if ans is not None: 652 key = ans + ' ' + key 653 else: 654 key = aprefix + ':' + key 655 nattrdict[key] = val 656 attrnamemap[key] = okey 657 attrdict = nattrdict 658 attributes = self.attributes.get(nstag) 659 if attributes is not None: 660 for key in attrdict.keys(): 661 if not key in attributes: 662 self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname)) 663 for key, val in attributes.items(): 664 if val is not None and not key in attrdict: 665 attrdict[key] = val 666 method = self.elements.get(nstag, (None, None))[0] 667 self.finish_starttag(nstag, attrdict, method) 668 if tag.group('slash') == '/': 669 self.finish_endtag(tagname) 670 return tag.end(0) 671 672 # Internal -- parse endtag 673 def parse_endtag(self, i): 674 rawdata = self.rawdata 675 end = endbracketfind.match(rawdata, i+1) 676 if end is None: 677 return -1 678 res = tagfind.match(rawdata, i+2) 679 if res is None: 680 if self.literal: 681 self.handle_data(rawdata[i]) 682 return i+1 683 if not self.__accept_missing_endtag_name: 684 self.syntax_error('no name specified in end tag') 685 tag = self.stack[-1][0] 686 k = i+2 687 else: 688 tag = res.group(0) 689 if self.__map_case: 690 tag = tag.lower() 691 if self.literal: 692 if not self.stack or tag != self.stack[-1][0]: 693 self.handle_data(rawdata[i]) 694 return i+1 695 k = res.end(0) 696 if endbracket.match(rawdata, k) is None: 697 self.syntax_error('garbage in end tag') 698 self.finish_endtag(tag) 699 return end.end(0) 700 701 # Internal -- finish processing of start tag 702 def finish_starttag(self, 
tagname, attrdict, method): 703 if method is not None: 704 self.handle_starttag(tagname, method, attrdict) 705 else: 706 self.unknown_starttag(tagname, attrdict) 707 708 # Internal -- finish processing of end tag 709 def finish_endtag(self, tag): 710 self.literal = 0 711 if not tag: 712 self.syntax_error('name-less end tag') 713 found = len(self.stack) - 1 714 if found < 0: 715 self.unknown_endtag(tag) 716 return 717 else: 718 found = -1 719 for i in range(len(self.stack)): 720 if tag == self.stack[i][0]: 721 found = i 722 if found == -1: 723 self.syntax_error('unopened end tag') 724 return 725 while len(self.stack) > found: 726 if found < len(self.stack) - 1: 727 self.syntax_error('missing close tag for %s' % self.stack[-1][2]) 728 nstag = self.stack[-1][2] 729 method = self.elements.get(nstag, (None, None))[1] 730 if method is not None: 731 self.handle_endtag(nstag, method) 732 else: 733 self.unknown_endtag(nstag) 734 if self.__use_namespaces == len(self.stack): 735 self.__use_namespaces = 0 736 del self.stack[-1] 737 738 # Overridable -- handle xml processing instruction 739 def handle_xml(self, encoding, standalone): 740 pass 741 742 # Overridable -- handle DOCTYPE 743 def handle_doctype(self, tag, pubid, syslit, data): 744 pass 745 746 # Overridable -- handle start tag 747 def handle_starttag(self, tag, method, attrs): 748 method(attrs) 749 750 # Overridable -- handle end tag 751 def handle_endtag(self, tag, method): 752 method() 753 754 # Example -- handle character reference, no need to override 755 def handle_charref(self, name): 756 try: 757 if name[0] == 'x': 758 n = int(name[1:], 16) 759 else: 760 n = int(name) 761 except ValueError: 762 self.unknown_charref(name) 763 return 764 if not 0 <= n <= 255: 765 self.unknown_charref(name) 766 return 767 self.handle_data(chr(n)) 768 769 # Definition of entities -- derived classes may override 770 entitydefs = {'lt': '<', # must use charref 771 'gt': '>', 772 'amp': '&', # must use charref 773 'quot': '"', 774 
'apos': ''', 775 } 776 777 # Example -- handle data, should be overridden 778 def handle_data(self, data): 779 pass 780 781 # Example -- handle cdata, could be overridden 782 def handle_cdata(self, data): 783 pass 784 785 # Example -- handle comment, could be overridden 786 def handle_comment(self, data): 787 pass 788 789 # Example -- handle processing instructions, could be overridden 790 def handle_proc(self, name, data): 791 pass 792 793 # Example -- handle relatively harmless syntax errors, could be overridden 794 def syntax_error(self, message): 795 raise Error('Syntax error at line %d: %s' % (self.lineno, message)) 796 797 # To be overridden -- handlers for unknown objects 798 def unknown_starttag(self, tag, attrs): pass 799 def unknown_endtag(self, tag): pass 800 def unknown_charref(self, ref): pass 801 def unknown_entityref(self, name): 802 self.syntax_error("reference to unknown entity `&%s;'" % name) 803 804 805 class TestXMLParser(XMLParser): 806 807 def __init__(self, **kw): 808 self.testdata = "" 809 XMLParser.__init__(self, **kw) 810 811 def handle_xml(self, encoding, standalone): 812 self.flush() 813 print 'xml: encoding =',encoding,'standalone =',standalone 814 815 def handle_doctype(self, tag, pubid, syslit, data): 816 self.flush() 817 print 'DOCTYPE:',tag, repr(data) 818 819 def handle_data(self, data): 820 self.testdata = self.testdata + data 821 if len(repr(self.testdata)) >= 70: 822 self.flush() 823 824 def flush(self): 825 data = self.testdata 826 if data: 827 self.testdata = "" 828 print 'data:', repr(data) 829 830 def handle_cdata(self, data): 831 self.flush() 832 print 'cdata:', repr(data) 833 834 def handle_proc(self, name, data): 835 self.flush() 836 print 'processing:',name,repr(data) 837 838 def handle_comment(self, data): 839 self.flush() 840 r = repr(data) 841 if len(r) > 68: 842 r = r[:32] + '...' 
+ r[-32:] 843 print 'comment:', r 844 845 def syntax_error(self, message): 846 print 'error at line %d:' % self.lineno, message 847 848 def unknown_starttag(self, tag, attrs): 849 self.flush() 850 if not attrs: 851 print 'start tag: <' + tag + '>' 852 else: 853 print 'start tag: <' + tag, 854 for name, value in attrs.items(): 855 print name + '=' + '"' + value + '"', 856 print '>' 857 858 def unknown_endtag(self, tag): 859 self.flush() 860 print 'end tag: </' + tag + '>' 861 862 def unknown_entityref(self, ref): 863 self.flush() 864 print '*** unknown entity ref: &' + ref + ';' 865 866 def unknown_charref(self, ref): 867 self.flush() 868 print '*** unknown char ref: &#' + ref + ';' 869 870 def close(self): 871 XMLParser.close(self) 872 self.flush() 873 874 def test(args = None): 875 import sys, getopt 876 from time import time 877 878 if not args: 879 args = sys.argv[1:] 880 881 opts, args = getopt.getopt(args, 'st') 882 klass = TestXMLParser 883 do_time = 0 884 for o, a in opts: 885 if o == '-s': 886 klass = XMLParser 887 elif o == '-t': 888 do_time = 1 889 890 if args: 891 file = args[0] 892 else: 893 file = 'test.xml' 894 895 if file == '-': 896 f = sys.stdin 897 else: 898 try: 899 f = open(file, 'r') 900 except IOError, msg: 901 print file, ":", msg 902 sys.exit(1) 903 904 data = f.read() 905 if f is not sys.stdin: 906 f.close() 907 908 x = klass() 909 t0 = time() 910 try: 911 if do_time: 912 x.feed(data) 913 x.close() 914 else: 915 for c in data: 916 x.feed(c) 917 x.close() 918 except Error, msg: 919 t1 = time() 920 print msg 921 if do_time: 922 print 'total time: %g' % (t1-t0) 923 sys.exit(1) 924 t1 = time() 925 if do_time: 926 print 'total time: %g' % (t1-t0) 927 928 929 if __name__ == '__main__': 930 test() 931