Home | History | Annotate | Download | only in html5lib-python
      1 #!/usr/bin/env python
      2 """usage: %prog [options] filename
      3 
      4 Parse a document to a tree, with optional profiling
      5 """
      6 
      7 import sys
      8 import os
      9 import traceback
     10 from optparse import OptionParser
     11 
     12 from html5lib import html5parser, sanitizer
     13 from html5lib.tokenizer import HTMLTokenizer
     14 from html5lib import treebuilders, serializer, treewalkers
     15 from html5lib import constants
     16 from html5lib import utils
     17 
     18 def parse():
     19     optParser = getOptParser()
     20     opts,args = optParser.parse_args()
     21     encoding = "utf8"
     22 
     23     try:
     24         f = args[-1]
     25         # Try opening from the internet
     26         if f.startswith('http://'):
     27             try:
     28                 import urllib.request, urllib.parse, urllib.error, cgi
     29                 f = urllib.request.urlopen(f)
     30                 contentType = f.headers.get('content-type')
     31                 if contentType:
     32                     (mediaType, params) = cgi.parse_header(contentType)
     33                     encoding = params.get('charset')
     34             except:
     35                 pass
     36         elif f == '-':
     37             f = sys.stdin
     38             if sys.version_info[0] >= 3:
     39                 encoding = None
     40         else:
     41             try:
     42                 # Try opening from file system
     43                 f = open(f, "rb")
     44             except IOError as e:                
     45                 sys.stderr.write("Unable to open file: %s\n" % e)
     46                 sys.exit(1)
     47     except IndexError:
     48         sys.stderr.write("No filename provided. Use -h for help\n")
     49         sys.exit(1)
     50 
     51     treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
     52 
     53     if opts.sanitize:
     54         tokenizer = sanitizer.HTMLSanitizer
     55     else:
     56         tokenizer = HTMLTokenizer
     57 
     58     p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log)
     59 
     60     if opts.fragment:
     61         parseMethod = p.parseFragment
     62     else:
     63         parseMethod = p.parse
     64 
     65     if opts.profile:
     66         import cProfile
     67         import pstats
     68         cProfile.runctx("run(parseMethod, f, encoding)", None,
     69                         {"run": run,
     70                          "parseMethod": parseMethod,
     71                          "f": f,
     72                          "encoding": encoding},
     73                         "stats.prof")
     74         # XXX - We should use a temp file here
     75         stats = pstats.Stats('stats.prof')
     76         stats.strip_dirs()
     77         stats.sort_stats('time')
     78         stats.print_stats()
     79     elif opts.time:
     80         import time
     81         t0 = time.time()
     82         document = run(parseMethod, f, encoding)
     83         t1 = time.time()
     84         if document:
     85             printOutput(p, document, opts)
     86             t2 = time.time()
     87             sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
     88         else:
     89             sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
     90     else:
     91         document = run(parseMethod, f, encoding)
     92         if document:
     93             printOutput(p, document, opts)
     94 
     95 def run(parseMethod, f, encoding):
     96     try:
     97         document = parseMethod(f, encoding=encoding)
     98     except:
     99         document = None
    100         traceback.print_exc()
    101     return document
    102 
    103 def printOutput(parser, document, opts):
    104     if opts.encoding:
    105         print("Encoding:", parser.tokenizer.stream.charEncoding)
    106 
    107     for item in parser.log:
    108         print(item)
    109 
    110     if document is not None:
    111         if opts.xml:
    112             tb = opts.treebuilder.lower()
    113             if tb == "dom":
    114                 document.writexml(sys.stdout, encoding="utf-8")
    115             elif tb == "lxml":
    116                 import lxml.etree
    117                 sys.stdout.write(lxml.etree.tostring(document))
    118             elif tb == "etree":
    119                 sys.stdout.write(utils.default_etree.tostring(document))
    120         elif opts.tree:
    121             if not hasattr(document,'__getitem__'):
    122                 document = [document]
    123             for fragment in document:
    124                 print(parser.tree.testSerializer(fragment))
    125         elif opts.hilite:
    126             sys.stdout.write(document.hilite("utf-8"))
    127         elif opts.html:
    128             kwargs = {}
    129             for opt in serializer.HTMLSerializer.options:
    130                 try:
    131                     kwargs[opt] = getattr(opts,opt)
    132                 except:
    133                     pass
    134             if not kwargs['quote_char']:
    135                 del kwargs['quote_char']
    136 
    137             tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
    138             if sys.version_info[0] >= 3:
    139                 encoding = None
    140             else:
    141                 encoding = "utf-8"
    142             for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
    143                 sys.stdout.write(text)
    144             if not text.endswith('\n'): sys.stdout.write('\n')
    145     if opts.error:
    146         errList=[]
    147         for pos, errorcode, datavars in parser.errors:
    148             errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
    149         sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
    150 
    151 def getOptParser():
    152     parser = OptionParser(usage=__doc__)
    153 
    154     parser.add_option("-p", "--profile", action="store_true", default=False,
    155                       dest="profile", help="Use the hotshot profiler to "
    156                       "produce a detailed log of the run")
    157 
    158     parser.add_option("-t", "--time",
    159                       action="store_true", default=False, dest="time",
    160                       help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
    161 
    162     parser.add_option("-b", "--treebuilder", action="store", type="string",
    163                       dest="treebuilder", default="etree")
    164 
    165     parser.add_option("-e", "--error", action="store_true", default=False,
    166                       dest="error", help="Print a list of parse errors")
    167 
    168     parser.add_option("-f", "--fragment", action="store_true", default=False,
    169                       dest="fragment", help="Parse as a fragment")
    170 
    171     parser.add_option("", "--tree", action="store_true", default=False,
    172                       dest="tree", help="Output as debug tree")
    173 
    174     parser.add_option("-x", "--xml", action="store_true", default=False,
    175                       dest="xml", help="Output as xml")
    176 
    177     parser.add_option("", "--no-html", action="store_false", default=True,
    178                       dest="html", help="Don't output html")
    179 
    180     parser.add_option("", "--hilite", action="store_true", default=False,
    181                       dest="hilite", help="Output as formatted highlighted code.")
    182 
    183     parser.add_option("-c", "--encoding", action="store_true", default=False,
    184                       dest="encoding", help="Print character encoding used")
    185 
    186     parser.add_option("", "--inject-meta-charset", action="store_true",
    187                       default=False, dest="inject_meta_charset",
    188                       help="inject <meta charset>")
    189 
    190     parser.add_option("", "--strip-whitespace", action="store_true",
    191                       default=False, dest="strip_whitespace",
    192                       help="strip whitespace")
    193 
    194     parser.add_option("", "--omit-optional-tags", action="store_true",
    195                       default=False, dest="omit_optional_tags",
    196                       help="omit optional tags")
    197 
    198     parser.add_option("", "--quote-attr-values", action="store_true",
    199                       default=False, dest="quote_attr_values",
    200                       help="quote attribute values")
    201 
    202     parser.add_option("", "--use-best-quote-char", action="store_true",
    203                       default=False, dest="use_best_quote_char",
    204                       help="use best quote character")
    205 
    206     parser.add_option("", "--quote-char", action="store",
    207                       default=None, dest="quote_char",
    208                       help="quote character")
    209 
    210     parser.add_option("", "--no-minimize-boolean-attributes",
    211                       action="store_false", default=True,
    212                       dest="minimize_boolean_attributes",
    213                       help="minimize boolean attributes")
    214 
    215     parser.add_option("", "--use-trailing-solidus", action="store_true",
    216                       default=False, dest="use_trailing_solidus",
    217                       help="use trailing solidus")
    218 
    219     parser.add_option("", "--space-before-trailing-solidus",
    220                       action="store_true", default=False,
    221                       dest="space_before_trailing_solidus",
    222                       help="add space before trailing solidus")
    223 
    224     parser.add_option("", "--escape-lt-in-attrs", action="store_true",
    225                       default=False, dest="escape_lt_in_attrs",
    226                       help="escape less than signs in attribute values")
    227 
    228     parser.add_option("", "--escape-rcdata", action="store_true",
    229                       default=False, dest="escape_rcdata",
    230                       help="escape rcdata element values")
    231 
    232     parser.add_option("", "--sanitize", action="store_true", default=False,
    233                       dest="sanitize", help="sanitize")
    234 
    235     parser.add_option("-l", "--log", action="store_true", default=False,
    236                       dest="log", help="log state transitions")
    237 
    238     return parser
    239 
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    parse()
    242