Home | History | Annotate | Download | only in scripts
      1 """Demonstrate how different parsers parse the same markup.
      2 
      3 Beautiful Soup can use any of a number of different parsers. Every
      4 parser should behave more or less the same on valid markup, and
      5 Beautiful Soup's unit tests make sure this is the case. But every
      6 parser handles invalid markup differently. Even different versions of
      7 the same parser handle invalid markup differently. So instead of unit
      8 tests I've created this educational demonstration script.
      9 
     10 The file demonstration_markup.txt contains many lines of HTML. This
     11 script tests each line of markup against every parser you have
     12 installed, and prints out how each parser sees that markup. This may
     13 help you choose a parser, or understand why Beautiful Soup presents
     14 your document the way it does.
     15 """
     16 
     17 import os
     18 import sys
     19 from bs4 import BeautifulSoup
     20 parsers = ['html.parser']
     21 
     22 try:
     23     from bs4.builder import _lxml
     24     parsers.append('lxml')
     25 except ImportError, e:
     26     pass
     27 
     28 try:
     29     from bs4.builder import _html5lib
     30     parsers.append('html5lib')
     31 except ImportError, e:
     32     pass
     33 
     34 class Demonstration(object):
     35     def __init__(self, markup):
     36         self.results = {}
     37         self.markup = markup
     38 
     39     def run_against(self, *parser_names):
     40         uniform_results = True
     41         previous_output = None
     42         for parser in parser_names:
     43             try:
     44                 soup = BeautifulSoup(self.markup, parser)
     45                 if markup.startswith("<div>"):
     46                     # Extract the interesting part
     47                     output = soup.div
     48                 else:
     49                     output = soup
     50             except Exception, e:
     51                 output = "[EXCEPTION] %s" % str(e)
     52             self.results[parser] = output
     53             if previous_output is None:
     54                 previous_output = output
     55             elif previous_output != output:
     56                 uniform_results = False
     57         return uniform_results
     58 
     59     def dump(self):
     60         print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
     61         for parser, output in self.results.items():
     62             print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
     63 
     64 different_results = []
     65 uniform_results = []
     66 
     67 print "= Testing the following parsers: %s =" % ", ".join(parsers)
     68 print
     69 
     70 input_file = sys.stdin
     71 if sys.stdin.isatty():
     72     for filename in [
     73         "demonstration_markup.txt",
     74         os.path.join("scripts", "demonstration_markup.txt")]:
     75         if os.path.exists(filename):
     76             input_file = open(filename)
     77 
     78 for markup in input_file:
     79     demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
     80     is_uniform = demo.run_against(*parsers)
     81     if is_uniform:
     82         uniform_results.append(demo)
     83     else:
     84         different_results.append(demo)
     85 
     86 print "== Markup that's handled the same in every parser =="
     87 print
     88 for demo in uniform_results:
     89     demo.dump()
     90     print
     91 print "== Markup that's not handled the same in every parser =="
     92 print
     93 for demo in different_results:
     94     demo.dump()
     95     print
     96