"""Diagnostic functions, mainly for use when doing tech support."""
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

import os
import pstats
import random
import tempfile
import time
import traceback
import sys


def _parser_is_installed(name):
    """Return True if any registered tree builder lists `name` as a feature."""
    return any(name in builder.features
               for builder in builder_registry.builders)


def _looks_like_filename(data):
    """Return True if `data` names an existing filesystem path."""
    try:
        return os.path.exists(data)
    except (TypeError, ValueError):
        # Markup that cannot be used as a path at all (for example a string
        # containing NUL bytes) can make the underlying stat call raise
        # instead of returning False. Such input is not a filename.
        return False


def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers are missing, then parses the markup with every available
    parser and prints either the prettified result or the traceback.

    :param data: The markup to diagnose. May be an object with a `read()`
        method, the name of an existing file, or a string of markup. A
        string starting with "http:" or "https:" is rejected with an
        explanation, since nothing here fetches URLs.
    :return: None. All results are printed to standard output.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Build a fresh list rather than removing entries from the list being
    # iterated over; removing in place skips the entry that follows a
    # missing parser, so it would never be checked.
    basic_parsers = []
    for name in ("html.parser", "html5lib", "lxml"):
        if _parser_is_installed(name):
            basic_parsers.append(name)
        else:
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # Also exercise lxml's XML mode.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif _looks_like_filename(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith(("http:", "https:")):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
        except Exception:
            # This is a diagnostic tool: report the failure and move on to
            # the next parser instead of aborting the whole run.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)


def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: The markup to parse; it is wrapped in a StringIO.
    :param html: Passed to `etree.iterparse` as its `html` argument.
    :param kwargs: Additional keyword arguments for `etree.iterparse`.
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
        print("%s, %4s, %s" % (event, element.tag, element.text))


class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        # Single output hook used by every handler below.
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)


def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: The markup to feed to an `AnnouncingParser`.
    """
    parser = AnnouncingParser()
    parser.feed(data)


_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    "Generate a random word-like string."
    # Even positions get a consonant and odd positions a vowel, so the
    # result alternates starting with a consonant.
    return ''.join(
        random.choice(_consonants if i % 2 == 0 else _vowels)
        for i in range(length))


def rsentence(length=4):
    "Generate a random sentence-like string."
    # `length` words of 4 to 9 letters each, separated by single spaces.
    return " ".join(rword(random.randint(4, 9)) for i in range(length))


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of the `num_elements` iterations randomly emits an opening tag,
    a run of text, a closing tag, or nothing at all, so the tags are not
    balanced and the document may hold fewer than `num_elements` pieces.

    :param num_elements: Number of iterations used to build the document.
    :return: The generated markup, wrapped in <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3 adds nothing for this iteration.
    return "<html>" + "\n".join(elements) + "</html>"


def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random document with `rdoc`, times Beautiful Soup on it
    with each parser, then times lxml and html5lib on their own. Parsers
    that fail or are not installed are reported and skipped.

    :param num_elements: Passed to `rdoc` to size the generated document.
    :return: None. Timings are printed to standard output.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            a = time.time()
            BeautifulSoup(data, parser)
            b = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    # The raw-library timings are optional: a missing library should not
    # abort the benchmark after the Beautiful Soup results were printed.
    try:
        from lxml import etree
    except ImportError:
        print("lxml is not installed; skipping the raw lxml benchmark.")
    else:
        a = time.time()
        etree.HTML(data)
        b = time.time()
        print("Raw lxml parsed the markup in %.2fs." % (b - a))

    try:
        import html5lib
    except ImportError:
        print("html5lib is not installed; skipping the raw html5lib benchmark.")
    else:
        # The parser object is built before the clock starts, so only the
        # parse itself is timed.
        parser = html5lib.HTMLParser()
        a = time.time()
        parser.parse(data)
        b = time.time()
        print("Raw html5lib parsed the markup in %.2fs." % (b - a))


def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Runs `bs4.BeautifulSoup(data, parser)` under cProfile, writes the
    profile to a temporary file, and prints the statistics sorted by
    cumulative time.

    :param num_elements: Passed to `rdoc` to size the generated document.
    :param parser: The parser name handed to the BeautifulSoup constructor.
    :return: None. The report is printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # The context manager closes (and thereby removes) the temporary file
    # once the report has been printed, even if profiling raises.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        # NOTE: call stats.strip_dirs() here to drop directory names from
        # the report.
        stats.sort_stats("cumulative")
        # Restrict the report to entries matching the pattern, then to 50.
        stats.print_stats('_html5lib|bs4', 50)


if __name__ == '__main__':
    diagnose(sys.stdin.read())