Home | History | Annotate | Download | only in python2.7
      1 """HTML 2.0 parser.
      2 
      3 See the HTML 2.0 specification:
      4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
      5 """
      6 
      7 from warnings import warnpy3k
      8 warnpy3k("the htmllib module has been removed in Python 3.0",
      9          stacklevel=2)
     10 del warnpy3k
     11 
     12 import sgmllib
     13 
     14 from formatter import AS_IS
     15 
     16 __all__ = ["HTMLParser", "HTMLParseError"]
     17 
     18 
     19 class HTMLParseError(sgmllib.SGMLParseError):
     20     """Error raised when an HTML document can't be parsed."""
     21 
     22 
     23 class HTMLParser(sgmllib.SGMLParser):
     24     """This is the basic HTML parser class.
     25 
     26     It supports all entity names required by the XHTML 1.0 Recommendation.
     27     It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
     28     elements.
     29 
     30     """
     31 
     32     from htmlentitydefs import entitydefs
     33 
     34     def __init__(self, formatter, verbose=0):
     35         """Creates an instance of the HTMLParser class.
     36 
     37         The formatter parameter is the formatter instance associated with
     38         the parser.
     39 
     40         """
     41         sgmllib.SGMLParser.__init__(self, verbose)
     42         self.formatter = formatter
     43 
     44     def error(self, message):
     45         raise HTMLParseError(message)
     46 
     47     def reset(self):
     48         sgmllib.SGMLParser.reset(self)
     49         self.savedata = None
     50         self.isindex = 0
     51         self.title = None
     52         self.base = None
     53         self.anchor = None
     54         self.anchorlist = []
     55         self.nofill = 0
     56         self.list_stack = []
     57 
     58     # ------ Methods used internally; some may be overridden
     59 
     60     # --- Formatter interface, taking care of 'savedata' mode;
     61     # shouldn't need to be overridden
     62 
     63     def handle_data(self, data):
     64         if self.savedata is not None:
     65             self.savedata = self.savedata + data
     66         else:
     67             if self.nofill:
     68                 self.formatter.add_literal_data(data)
     69             else:
     70                 self.formatter.add_flowing_data(data)
     71 
     72     # --- Hooks to save data; shouldn't need to be overridden
     73 
     74     def save_bgn(self):
     75         """Begins saving character data in a buffer instead of sending it
     76         to the formatter object.
     77 
     78         Retrieve the stored data via the save_end() method.  Use of the
     79         save_bgn() / save_end() pair may not be nested.
     80 
     81         """
     82         self.savedata = ''
     83 
     84     def save_end(self):
     85         """Ends buffering character data and returns all data saved since
     86         the preceding call to the save_bgn() method.
     87 
     88         If the nofill flag is false, whitespace is collapsed to single
     89         spaces.  A call to this method without a preceding call to the
     90         save_bgn() method will raise a TypeError exception.
     91 
     92         """
     93         data = self.savedata
     94         self.savedata = None
     95         if not self.nofill:
     96             data = ' '.join(data.split())
     97         return data
     98 
     99     # --- Hooks for anchors; should probably be overridden
    100 
    101     def anchor_bgn(self, href, name, type):
    102         """This method is called at the start of an anchor region.
    103 
    104         The arguments correspond to the attributes of the <A> tag with
    105         the same names.  The default implementation maintains a list of
    106         hyperlinks (defined by the HREF attribute for <A> tags) within
    107         the document.  The list of hyperlinks is available as the data
    108         attribute anchorlist.
    109 
    110         """
    111         self.anchor = href
    112         if self.anchor:
    113             self.anchorlist.append(href)
    114 
    115     def anchor_end(self):
    116         """This method is called at the end of an anchor region.
    117 
    118         The default implementation adds a textual footnote marker using an
    119         index into the list of hyperlinks created by the anchor_bgn()method.
    120 
    121         """
    122         if self.anchor:
    123             self.handle_data("[%d]" % len(self.anchorlist))
    124             self.anchor = None
    125 
    126     # --- Hook for images; should probably be overridden
    127 
    128     def handle_image(self, src, alt, *args):
    129         """This method is called to handle images.
    130 
    131         The default implementation simply passes the alt value to the
    132         handle_data() method.
    133 
    134         """
    135         self.handle_data(alt)
    136 
    137     # --------- Top level elememts
    138 
    139     def start_html(self, attrs): pass
    140     def end_html(self): pass
    141 
    142     def start_head(self, attrs): pass
    143     def end_head(self): pass
    144 
    145     def start_body(self, attrs): pass
    146     def end_body(self): pass
    147 
    148     # ------ Head elements
    149 
    150     def start_title(self, attrs):
    151         self.save_bgn()
    152 
    153     def end_title(self):
    154         self.title = self.save_end()
    155 
    156     def do_base(self, attrs):
    157         for a, v in attrs:
    158             if a == 'href':
    159                 self.base = v
    160 
    161     def do_isindex(self, attrs):
    162         self.isindex = 1
    163 
    164     def do_link(self, attrs):
    165         pass
    166 
    167     def do_meta(self, attrs):
    168         pass
    169 
    170     def do_nextid(self, attrs): # Deprecated
    171         pass
    172 
    173     # ------ Body elements
    174 
    175     # --- Headings
    176 
    177     def start_h1(self, attrs):
    178         self.formatter.end_paragraph(1)
    179         self.formatter.push_font(('h1', 0, 1, 0))
    180 
    181     def end_h1(self):
    182         self.formatter.end_paragraph(1)
    183         self.formatter.pop_font()
    184 
    185     def start_h2(self, attrs):
    186         self.formatter.end_paragraph(1)
    187         self.formatter.push_font(('h2', 0, 1, 0))
    188 
    189     def end_h2(self):
    190         self.formatter.end_paragraph(1)
    191         self.formatter.pop_font()
    192 
    193     def start_h3(self, attrs):
    194         self.formatter.end_paragraph(1)
    195         self.formatter.push_font(('h3', 0, 1, 0))
    196 
    197     def end_h3(self):
    198         self.formatter.end_paragraph(1)
    199         self.formatter.pop_font()
    200 
    201     def start_h4(self, attrs):
    202         self.formatter.end_paragraph(1)
    203         self.formatter.push_font(('h4', 0, 1, 0))
    204 
    205     def end_h4(self):
    206         self.formatter.end_paragraph(1)
    207         self.formatter.pop_font()
    208 
    209     def start_h5(self, attrs):
    210         self.formatter.end_paragraph(1)
    211         self.formatter.push_font(('h5', 0, 1, 0))
    212 
    213     def end_h5(self):
    214         self.formatter.end_paragraph(1)
    215         self.formatter.pop_font()
    216 
    217     def start_h6(self, attrs):
    218         self.formatter.end_paragraph(1)
    219         self.formatter.push_font(('h6', 0, 1, 0))
    220 
    221     def end_h6(self):
    222         self.formatter.end_paragraph(1)
    223         self.formatter.pop_font()
    224 
    225     # --- Block Structuring Elements
    226 
    227     def do_p(self, attrs):
    228         self.formatter.end_paragraph(1)
    229 
    230     def start_pre(self, attrs):
    231         self.formatter.end_paragraph(1)
    232         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    233         self.nofill = self.nofill + 1
    234 
    235     def end_pre(self):
    236         self.formatter.end_paragraph(1)
    237         self.formatter.pop_font()
    238         self.nofill = max(0, self.nofill - 1)
    239 
    240     def start_xmp(self, attrs):
    241         self.start_pre(attrs)
    242         self.setliteral('xmp') # Tell SGML parser
    243 
    244     def end_xmp(self):
    245         self.end_pre()
    246 
    247     def start_listing(self, attrs):
    248         self.start_pre(attrs)
    249         self.setliteral('listing') # Tell SGML parser
    250 
    251     def end_listing(self):
    252         self.end_pre()
    253 
    254     def start_address(self, attrs):
    255         self.formatter.end_paragraph(0)
    256         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    257 
    258     def end_address(self):
    259         self.formatter.end_paragraph(0)
    260         self.formatter.pop_font()
    261 
    262     def start_blockquote(self, attrs):
    263         self.formatter.end_paragraph(1)
    264         self.formatter.push_margin('blockquote')
    265 
    266     def end_blockquote(self):
    267         self.formatter.end_paragraph(1)
    268         self.formatter.pop_margin()
    269 
    270     # --- List Elements
    271 
    272     def start_ul(self, attrs):
    273         self.formatter.end_paragraph(not self.list_stack)
    274         self.formatter.push_margin('ul')
    275         self.list_stack.append(['ul', '*', 0])
    276 
    277     def end_ul(self):
    278         if self.list_stack: del self.list_stack[-1]
    279         self.formatter.end_paragraph(not self.list_stack)
    280         self.formatter.pop_margin()
    281 
    282     def do_li(self, attrs):
    283         self.formatter.end_paragraph(0)
    284         if self.list_stack:
    285             [dummy, label, counter] = top = self.list_stack[-1]
    286             top[2] = counter = counter+1
    287         else:
    288             label, counter = '*', 0
    289         self.formatter.add_label_data(label, counter)
    290 
    291     def start_ol(self, attrs):
    292         self.formatter.end_paragraph(not self.list_stack)
    293         self.formatter.push_margin('ol')
    294         label = '1.'
    295         for a, v in attrs:
    296             if a == 'type':
    297                 if len(v) == 1: v = v + '.'
    298                 label = v
    299         self.list_stack.append(['ol', label, 0])
    300 
    301     def end_ol(self):
    302         if self.list_stack: del self.list_stack[-1]
    303         self.formatter.end_paragraph(not self.list_stack)
    304         self.formatter.pop_margin()
    305 
    306     def start_menu(self, attrs):
    307         self.start_ul(attrs)
    308 
    309     def end_menu(self):
    310         self.end_ul()
    311 
    312     def start_dir(self, attrs):
    313         self.start_ul(attrs)
    314 
    315     def end_dir(self):
    316         self.end_ul()
    317 
    318     def start_dl(self, attrs):
    319         self.formatter.end_paragraph(1)
    320         self.list_stack.append(['dl', '', 0])
    321 
    322     def end_dl(self):
    323         self.ddpop(1)
    324         if self.list_stack: del self.list_stack[-1]
    325 
    326     def do_dt(self, attrs):
    327         self.ddpop()
    328 
    329     def do_dd(self, attrs):
    330         self.ddpop()
    331         self.formatter.push_margin('dd')
    332         self.list_stack.append(['dd', '', 0])
    333 
    334     def ddpop(self, bl=0):
    335         self.formatter.end_paragraph(bl)
    336         if self.list_stack:
    337             if self.list_stack[-1][0] == 'dd':
    338                 del self.list_stack[-1]
    339                 self.formatter.pop_margin()
    340 
    341     # --- Phrase Markup
    342 
    343     # Idiomatic Elements
    344 
    345     def start_cite(self, attrs): self.start_i(attrs)
    346     def end_cite(self): self.end_i()
    347 
    348     def start_code(self, attrs): self.start_tt(attrs)
    349     def end_code(self): self.end_tt()
    350 
    351     def start_em(self, attrs): self.start_i(attrs)
    352     def end_em(self): self.end_i()
    353 
    354     def start_kbd(self, attrs): self.start_tt(attrs)
    355     def end_kbd(self): self.end_tt()
    356 
    357     def start_samp(self, attrs): self.start_tt(attrs)
    358     def end_samp(self): self.end_tt()
    359 
    360     def start_strong(self, attrs): self.start_b(attrs)
    361     def end_strong(self): self.end_b()
    362 
    363     def start_var(self, attrs): self.start_i(attrs)
    364     def end_var(self): self.end_i()
    365 
    366     # Typographic Elements
    367 
    368     def start_i(self, attrs):
    369         self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    370     def end_i(self):
    371         self.formatter.pop_font()
    372 
    373     def start_b(self, attrs):
    374         self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    375     def end_b(self):
    376         self.formatter.pop_font()
    377 
    378     def start_tt(self, attrs):
    379         self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    380     def end_tt(self):
    381         self.formatter.pop_font()
    382 
    383     def start_a(self, attrs):
    384         href = ''
    385         name = ''
    386         type = ''
    387         for attrname, value in attrs:
    388             value = value.strip()
    389             if attrname == 'href':
    390                 href = value
    391             if attrname == 'name':
    392                 name = value
    393             if attrname == 'type':
    394                 type = value.lower()
    395         self.anchor_bgn(href, name, type)
    396 
    397     def end_a(self):
    398         self.anchor_end()
    399 
    400     # --- Line Break
    401 
    402     def do_br(self, attrs):
    403         self.formatter.add_line_break()
    404 
    405     # --- Horizontal Rule
    406 
    407     def do_hr(self, attrs):
    408         self.formatter.add_hor_rule()
    409 
    410     # --- Image
    411 
    412     def do_img(self, attrs):
    413         align = ''
    414         alt = '(image)'
    415         ismap = ''
    416         src = ''
    417         width = 0
    418         height = 0
    419         for attrname, value in attrs:
    420             if attrname == 'align':
    421                 align = value
    422             if attrname == 'alt':
    423                 alt = value
    424             if attrname == 'ismap':
    425                 ismap = value
    426             if attrname == 'src':
    427                 src = value
    428             if attrname == 'width':
    429                 try: width = int(value)
    430                 except ValueError: pass
    431             if attrname == 'height':
    432                 try: height = int(value)
    433                 except ValueError: pass
    434         self.handle_image(src, alt, ismap, align, width, height)
    435 
    436     # --- Really Old Unofficial Deprecated Stuff
    437 
    438     def do_plaintext(self, attrs):
    439         self.start_pre(attrs)
    440         self.setnomoretags() # Tell SGML parser
    441 
    442     # --- Unhandled tags
    443 
    444     def unknown_starttag(self, tag, attrs):
    445         pass
    446 
    447     def unknown_endtag(self, tag):
    448         pass
    449 
    450 
    451 def test(args = None):
    452     import sys, formatter
    453 
    454     if not args:
    455         args = sys.argv[1:]
    456 
    457     silent = args and args[0] == '-s'
    458     if silent:
    459         del args[0]
    460 
    461     if args:
    462         file = args[0]
    463     else:
    464         file = 'test.html'
    465 
    466     if file == '-':
    467         f = sys.stdin
    468     else:
    469         try:
    470             f = open(file, 'r')
    471         except IOError, msg:
    472             print file, ":", msg
    473             sys.exit(1)
    474 
    475     data = f.read()
    476 
    477     if f is not sys.stdin:
    478         f.close()
    479 
    480     if silent:
    481         f = formatter.NullFormatter()
    482     else:
    483         f = formatter.AbstractFormatter(formatter.DumbWriter())
    484 
    485     p = HTMLParser(f)
    486     p.feed(data)
    487     p.close()
    488 
    489 
    490 if __name__ == '__main__':
    491     test()
    492