# Code-browser navigation residue (not part of the module): Home | History | Annotate | Download | only in bs4
      1 """Diagnostic functions, mainly for use when doing tech support."""
      2 import cProfile
      3 from StringIO import StringIO
      4 from HTMLParser import HTMLParser
      5 import bs4
      6 from bs4 import BeautifulSoup, __version__
      7 from bs4.builder import builder_registry
      8 
      9 import os
     10 import pstats
     11 import random
     12 import tempfile
     13 import time
     14 import traceback
     15 import sys
     16 import cProfile
     17 
     18 def diagnose(data):
     19     """Diagnostic suite for isolating common problems."""
     20     print "Diagnostic running on Beautiful Soup %s" % __version__
     21     print "Python version %s" % sys.version
     22 
     23     basic_parsers = ["html.parser", "html5lib", "lxml"]
     24     for name in basic_parsers:
     25         for builder in builder_registry.builders:
     26             if name in builder.features:
     27                 break
     28         else:
     29             basic_parsers.remove(name)
     30             print (
     31                 "I noticed that %s is not installed. Installing it may help." %
     32                 name)
     33 
     34     if 'lxml' in basic_parsers:
     35         basic_parsers.append(["lxml", "xml"])
     36         from lxml import etree
     37         print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
     38 
     39     if 'html5lib' in basic_parsers:
     40         import html5lib
     41         print "Found html5lib version %s" % html5lib.__version__
     42 
     43     if hasattr(data, 'read'):
     44         data = data.read()
     45     elif os.path.exists(data):
     46         print '"%s" looks like a filename. Reading data from the file.' % data
     47         data = open(data).read()
     48     elif data.startswith("http:") or data.startswith("https:"):
     49         print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
     50         print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
     51         return
     52     print
     53 
     54     for parser in basic_parsers:
     55         print "Trying to parse your markup with %s" % parser
     56         success = False
     57         try:
     58             soup = BeautifulSoup(data, parser)
     59             success = True
     60         except Exception, e:
     61             print "%s could not parse the markup." % parser
     62             traceback.print_exc()
     63         if success:
     64             print "Here's what %s did with the markup:" % parser
     65             print soup.prettify()
     66 
     67         print "-" * 80
     68 
     69 def lxml_trace(data, html=True, **kwargs):
     70     """Print out the lxml events that occur during parsing.
     71 
     72     This lets you see how lxml parses a document when no Beautiful
     73     Soup code is running.
     74     """
     75     from lxml import etree
     76     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
     77         print("%s, %4s, %s" % (event, element.tag, element.text))
     78 
     79 class AnnouncingParser(HTMLParser):
     80     """Announces HTMLParser parse events, without doing anything else."""
     81 
     82     def _p(self, s):
     83         print(s)
     84 
     85     def handle_starttag(self, name, attrs):
     86         self._p("%s START" % name)
     87 
     88     def handle_endtag(self, name):
     89         self._p("%s END" % name)
     90 
     91     def handle_data(self, data):
     92         self._p("%s DATA" % data)
     93 
     94     def handle_charref(self, name):
     95         self._p("%s CHARREF" % name)
     96 
     97     def handle_entityref(self, name):
     98         self._p("%s ENTITYREF" % name)
     99 
    100     def handle_comment(self, data):
    101         self._p("%s COMMENT" % data)
    102 
    103     def handle_decl(self, data):
    104         self._p("%s DECL" % data)
    105 
    106     def unknown_decl(self, data):
    107         self._p("%s UNKNOWN-DECL" % data)
    108 
    109     def handle_pi(self, data):
    110         self._p("%s PI" % data)
    111 
    112 def htmlparser_trace(data):
    113     """Print out the HTMLParser events that occur during parsing.
    114 
    115     This lets you see how HTMLParser parses a document when no
    116     Beautiful Soup code is running.
    117     """
    118     parser = AnnouncingParser()
    119     parser.feed(data)
    120 
    121 _vowels = "aeiou"
    122 _consonants = "bcdfghjklmnpqrstvwxyz"
    123 
    124 def rword(length=5):
    125     "Generate a random word-like string."
    126     s = ''
    127     for i in range(length):
    128         if i % 2 == 0:
    129             t = _consonants
    130         else:
    131             t = _vowels
    132         s += random.choice(t)
    133     return s
    134 
    135 def rsentence(length=4):
    136     "Generate a random sentence-like string."
    137     return " ".join(rword(random.randint(4,9)) for i in range(length))
    138         
    139 def rdoc(num_elements=1000):
    140     """Randomly generate an invalid HTML document."""
    141     tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    142     elements = []
    143     for i in range(num_elements):
    144         choice = random.randint(0,3)
    145         if choice == 0:
    146             # New tag.
    147             tag_name = random.choice(tag_names)
    148             elements.append("<%s>" % tag_name)
    149         elif choice == 1:
    150             elements.append(rsentence(random.randint(1,4)))
    151         elif choice == 2:
    152             # Close a tag.
    153             tag_name = random.choice(tag_names)
    154             elements.append("</%s>" % tag_name)
    155     return "<html>" + "\n".join(elements) + "</html>"
    156 
    157 def benchmark_parsers(num_elements=100000):
    158     """Very basic head-to-head performance benchmark."""
    159     print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    160     data = rdoc(num_elements)
    161     print "Generated a large invalid HTML document (%d bytes)." % len(data)
    162     
    163     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
    164         success = False
    165         try:
    166             a = time.time()
    167             soup = BeautifulSoup(data, parser)
    168             b = time.time()
    169             success = True
    170         except Exception, e:
    171             print "%s could not parse the markup." % parser
    172             traceback.print_exc()
    173         if success:
    174             print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
    175 
    176     from lxml import etree
    177     a = time.time()
    178     etree.HTML(data)
    179     b = time.time()
    180     print "Raw lxml parsed the markup in %.2fs." % (b-a)
    181 
    182     import html5lib
    183     parser = html5lib.HTMLParser()
    184     a = time.time()
    185     parser.parse(data)
    186     b = time.time()
    187     print "Raw html5lib parsed the markup in %.2fs." % (b-a)
    188 
    189 def profile(num_elements=100000, parser="lxml"):
    190 
    191     filehandle = tempfile.NamedTemporaryFile()
    192     filename = filehandle.name
    193 
    194     data = rdoc(num_elements)
    195     vars = dict(bs4=bs4, data=data, parser=parser)
    196     cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
    197 
    198     stats = pstats.Stats(filename)
    199     # stats.strip_dirs()
    200     stats.sort_stats("cumulative")
    201     stats.print_stats('_html5lib|bs4', 50)
    202 
    203 if __name__ == '__main__':
    204     diagnose(sys.stdin.read())
    205