"""Demonstrate how different parsers parse the same markup.

Beautiful Soup can use any of a number of different parsers. Every
parser should behave more or less the same on valid markup, and
Beautiful Soup's unit tests make sure this is the case. But every
parser handles invalid markup differently. Even different versions of
the same parser handle invalid markup differently. So instead of unit
tests I've created this educational demonstration script.

The file demonstration_markup.txt contains many lines of HTML. This
script tests each line of markup against every parser you have
installed, and prints out how each parser sees that markup. This may
help you choose a parser, or understand why Beautiful Soup presents
your document the way it does.
"""

import os
import sys
from bs4 import BeautifulSoup

# Names of the parsers to exercise. 'html.parser' is always listed; the
# others are added only if their builder module can be imported.
parsers = ['html.parser']

try:
    from bs4.builder import _lxml
    parsers.append('lxml')
except ImportError:
    # The lxml builder is unavailable; run the demonstration without it.
    pass

try:
    from bs4.builder import _html5lib
    parsers.append('html5lib')
except ImportError:
    # The html5lib builder is unavailable; run the demonstration without it.
    pass


def _printable(value):
    """Return the UTF-8 rendering of `value` as the native `str` type.

    `value` must provide ``encode("utf8")``; this script calls it on the
    markup string, on parsed output, and on "[EXCEPTION] ..." strings.
    On Python 2 the encoded byte string is returned unchanged. On
    Python 3 it is decoded back to text so that ``print`` shows the
    markup itself rather than a ``b'...'`` repr.
    """
    encoded = value.encode("utf8")
    if isinstance(encoded, str):
        # Python 2: bytes and str are the same type.
        return encoded
    return encoded.decode("utf8")


class Demonstration(object):
    """One piece of markup and the output each parser produced for it."""

    def __init__(self, markup):
        # Maps parser name -> parsed output, or an "[EXCEPTION] ..."
        # string if that parser raised.
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with every named parser and record the output.

        Each result is stored in ``self.results`` under the parser name.

        Returns True if no parser's output compared unequal to the first
        parser's output (including when no parsers were given), False
        otherwise.
        """
        outputs = []
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                if self.markup.startswith("<div>"):
                    # Extract the interesting part
                    output = soup.div
                else:
                    output = soup
            except Exception as e:
                # A parser blowing up is itself a result worth showing,
                # so record it instead of aborting the run.
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            outputs.append(output)
        # Compare everything against the first output. Keeping the
        # outputs in a list avoids using None as a "nothing seen yet"
        # marker, which would be ambiguous when soup.div is None.
        return not any(outputs[0] != other for other in outputs[1:])

    def dump(self):
        """Print the markup, then one line per parser with its output."""
        print("%s: %s" % ("Markup".rjust(13), _printable(self.markup)))
        for parser, output in self.results.items():
            print("%s: %s" % (parser.rjust(13), _printable(output)))


different_results = []
uniform_results = []

print("= Testing the following parsers: %s =" % ", ".join(parsers))
print("")

# When stdin is a terminal (nothing piped in), look for the markup file
# on disk. Only the path is chosen here so the file is opened exactly
# once; if both candidates exist, the later entry wins.
markup_path = None
if sys.stdin.isatty():
    for filename in [
            "demonstration_markup.txt",
            os.path.join("scripts", "demonstration_markup.txt")]:
        if os.path.exists(filename):
            markup_path = filename

if markup_path is None:
    # Prefer the binary layer of stdin where there is one (Python 3) so
    # input is decoded as UTF-8 below rather than with the locale codec.
    input_file = getattr(sys.stdin, "buffer", sys.stdin)
else:
    input_file = open(markup_path, "rb")

try:
    for markup in input_file:
        if isinstance(markup, bytes):
            markup = markup.decode("utf8")
        # A literal backslash-n in the file stands for a real newline
        # inside the markup.
        demo = Demonstration(markup.strip().replace("\\n", "\n"))
        is_uniform = demo.run_against(*parsers)
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)
finally:
    # Close only what this script opened; leave stdin alone.
    if markup_path is not None:
        input_file.close()

print("== Markup that's handled the same in every parser ==")
print("")
for demo in uniform_results:
    demo.dump()
    print("")
print("== Markup that's not handled the same in every parser ==")
print("")
for demo in different_results:
    demo.dump()
    print("")