"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""

from warnings import warnpy3k
warnpy3k("the htmllib module has been removed in Python 3.0",
         stacklevel=2)
del warnpy3k

import sgmllib

from formatter import AS_IS

__all__ = ["HTMLParser", "HTMLParseError"]


class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed."""


class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tag handlers follow the sgmllib naming scheme: start_<tag>/end_<tag>
    for container elements and do_<tag> for empty elements.

    """

    # Importing inside the class body binds 'entitydefs' as a class
    # attribute.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse error by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific state."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None    # None means "not saving"; else a string
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0         # nesting depth of literal (<pre>-like) text
        self.list_stack = []    # entries are [kind, label, counter] lists

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving (see save_bgn()), data is appended to the buffer.
        Otherwise it goes to the formatter, as literal data when the
        nofill flag is set and as flowing data when it is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method finds the buffer set to None: with the nofill
        flag false this raises an AttributeError exception (None has no
        split() method); with the nofill flag true, None is returned.

        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start buffering the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the buffered text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of <BASE> in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Set the isindex flag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    #
    # Each heading ends the current paragraph and pushes a font whose
    # first field is the heading's tag name; see the formatter module
    # for the layout of the font specification tuple.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter literal mode: push a font and bump the nofill counter."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill += 1

    def end_pre(self):
        """Leave literal mode; the nofill counter never drops below 0."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list, labelled '*'.

        The argument to end_paragraph() is true only for an outermost
        list (empty list stack).

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list entry and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list, the innermost entry's counter is incremented and
        passed on with that entry's label; outside any list the label
        is '*' with counter 0.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] += 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label defaults to '1.'; a TYPE attribute replaces it, with
        a '.' appended when the value is a single character.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list entry and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close a pending 'dd' entry, if any.

        bl is passed through to the formatter's end_paragraph().  The
        margin is popped only when the innermost list entry is a 'dd'.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect HREF, NAME and TYPE and hand them to anchor_bgn().

        Every attribute value is stripped of surrounding whitespace;
        TYPE is additionally lower-cased.  Missing attributes are passed
        as empty strings.

        """
        href = ''
        name = ''
        type_ = ''      # trailing underscore: don't shadow the builtin
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and call handle_image().

        Defaults: alt is '(image)', align/ismap/src are empty strings,
        width and height are 0.  A width or height that is not a valid
        integer is ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass    # keep the default for a malformed size
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass    # keep the default for a malformed size
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass


def test(args=None):
    """Parse one HTML file and render it through a formatter.

    args is a list of command-line style arguments; when empty or
    omitted, sys.argv[1:] is used.  A leading '-s' selects a
    NullFormatter (silent run) instead of an AbstractFormatter over a
    DumbWriter.  The next argument names the file to parse ('-' reads
    standard input); it defaults to 'test.html'.  If the file cannot be
    opened, a message is printed and the process exits with status 1.

    """
    import sys
    import formatter

    # Work on a copy so that stripping '-s' never mutates the caller's list.
    args = list(args) if args else sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    filename = args[0] if args else 'test.html'

    if filename == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            sys.stdout.write("%s : %s\n" % (filename, msg))
            sys.exit(1)
        # Close the file even if reading it fails.
        with f:
            data = f.read()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()


if __name__ == '__main__':
    test()