"""A parser for SGML, using the derived class as a static DTD.

The parser is driven incrementally: feed() appends text to an internal
buffer and goahead() consumes as much of it as can be parsed, calling
handler methods (handle_data(), start_<tag>(), end_<tag>(), do_<tag>(),
...) on the instance.  Whatever cannot be parsed yet stays in the buffer
until more data arrives or close() is called.
"""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


# Importing this module emits a Python 3 deprecation warning; the helper
# name is deleted afterwards so it is not left in the module namespace.
from warnings import warnpy3k
warnpy3k("the sgmllib module has been removed in Python 3.0",
         stacklevel=2)
del warnpy3k

import markupbase
import re

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Next character that may start markup: '&' (reference) or '<' (tag).
interesting = re.compile('[&<]')
# Prefix of a reference, tag, end tag or declaration that may still be
# completed by later input; used by goahead() to decide whether to wait.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Both reference patterns require one terminating character after the
# name/number, so a reference at the very end of the buffer does not match.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# Group 1: attribute name; group 2: the optional "= value" part;
# group 3: the value itself (single-quoted, double-quoted or bare).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
    pass


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(markupbase.ParserBase):
    # Pattern for an entity or character reference inside an attribute
    # value (applied by parse_starttag() through _convert_ref()) --
    # derived classes may override.
    # Group 1: entity name; group 2: decimal character number;
    # group 3: the optional trailing ';'.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        A true 'verbose' makes report_unbalanced() print diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet consumed
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # most recent start tag name, used for '<>'
        self.nomoretags = 0     # true: everything up to EOF is plain data
        self.literal = 0        # true: only an end tag is treated as markup
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.  goahead()
        leaves this mode again after the next end tag it parses.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (it may include newlines).  (This just saves the
        text, all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0                   # index of the first unconsumed character
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # After setnomoretags() everything left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the plain text that precedes the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # A negative result means the construct is not yet
                    # complete: stop and wait for more input.
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any parsed end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                # NOTE(review): parse_comment() and parse_declaration()
                # are not defined in this file -- presumably inherited
                # from markupbase.ParserBase; confirm there.
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for charref above.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # Not even a possible prefix of markup: plain character.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The prefix is followed by something that rules out markup,
            # so pass it through as data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever is left over is delivered as plain data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        # The handler receives the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded by the most recent parse_starttag() call.

        The value is None after reset() and while no start tag has been
        completely parsed.
        """
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # (not a length), or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Set before the handlers run so that get_starttag_text()
            # is usable inside them.
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute given without a value: the name is the value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                # Expand entity and character references in the value.
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may have been ended by '<' (start of the next tag)
        # instead of '>'; only a '>' is consumed.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    # 'match' is a match of entity_or_charref.  A reference whose
    # conversion yields a false value (None or '') is left as written.
    def _convert_ref(self, match):
        if match.group(2):
            # Character reference; groups()[1:] is (number, ';' or '').
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            # Entity reference terminated by ';'.
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without ';' is never converted.
            return '&%s' % match.group(1)

    # Internal -- parse endtag, return the index just past the tag,
    # or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Text after '</' up to the bracket; empty for '</>'.
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    # Only tags with a start_<tag> method are pushed on self.stack;
    # do_<tag> tags are handled without being pushed.
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    # An empty tag closes the innermost open element.  A named tag
    # closes its innermost occurrence on the stack together with every
    # element opened after it.
    def finish_endtag(self, tag):
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                # Nothing is open.
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # The getattr() only tests whether an end_<tag> handler
                # exists; the handler itself is not called here.
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            found = len(self.stack)
            # range() is evaluated once, so the whole stack is scanned
            # and 'found' ends up as the index of the last occurrence.
            for i in range(found):
                if self.stack[i] == tag: found = i
        # Pop from the top of the stack down to and including 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    # 'method' is the start_<tag> or do_<tag> handler found by
    # finish_starttag(); 'attrs' is a list of (name, value) pairs.
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    # 'method' is the end_<tag> handler found by finish_endtag().
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    # Prints only when self.verbose is true.
    def report_unbalanced(self, tag):
        if self.verbose:
            print '*** Unbalanced </' + tag + '>'
            print '*** Stack:', self.stack

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        'name' is the decimal number as a string.  Returns None when it
        is not an integer or lies outside the range 0..127; otherwise
        returns the result of convert_codepoint().
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for an integer code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        The converted character is passed to handle_data(); if
        convert_charref() returns None, unknown_charref() is called.
        """
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method, one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for a name that is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override.

        The replacement text is passed to handle_data(); if
        convert_entityref() returns None, unknown_entityref() is called.
        """
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):
    """Demonstration parser that prints every event it receives.

    Character data is buffered in self.testdata and printed by flush(),
    which every other handler calls before printing its own line.
    """

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__() runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; flush once its repr() reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered data, if there is any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print 'data:', repr(data)

    def handle_comment(self, data):
        """Print the comment, abbreviating a long repr() in the middle."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print 'comment:', r

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print 'start tag: <' + tag + '>'
        else:
            # Trailing commas keep the pieces on one output line.
            print 'start tag: <' + tag,
            for name, value in attrs:
                print name + '=' + '"' + value + '"',
            print '>'

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print 'end tag: </' + tag + '>'

    def unknown_entityref(self, ref):
        """Print a note about an unknown entity reference."""
        self.flush()
        print '*** unknown entity ref: &' + ref + ';'

    def unknown_charref(self, ref):
        """Print a note about an unknown character reference."""
        self.flush()
        print '*** unknown char ref: &#' + ref + ';'

    def unknown_decl(self, data):
        """Print a note about an unknown declaration."""
        self.flush()
        print '*** unknown decl: [' + data + ']'

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    """Parse a file and print the events; command-line test driver.

    'args' defaults to sys.argv[1:].  A leading '-s' selects the plain
    SGMLParser (which prints nothing) instead of TestSGMLParser.  The
    next argument is the file to read: '-' means standard input, and
    'test.html' is used when no file is given.  Exits with status 1 if
    the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError, msg:
            print file, ":", msg
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()

    x = klass()
    # Feed one character at a time, so every construct reaches the
    # parser split across feed() calls.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()