Home | History | Annotate | Download | only in tests
      1 from __future__ import absolute_import, division, unicode_literals
      2 
import io
import json
import os
import re
import sys
      7 
      8 import html5lib
      9 from . import support
     10 from . import test_tokenizer
     11 
     12 p = html5lib.HTMLParser()
     13 
     14 unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub
     15 
     16 
     17 def main(out_path):
     18     if not os.path.exists(out_path):
     19         sys.stderr.write("Path %s does not exist" % out_path)
     20         sys.exit(1)
     21 
     22     for filename in support.get_data_files('tokenizer', '*.test'):
     23         run_file(filename, out_path)
     24 
     25 
     26 def run_file(filename, out_path):
     27     try:
     28         tests_data = json.load(open(filename, "r"))
     29     except ValueError:
     30         sys.stderr.write("Failed to load %s\n" % filename)
     31         return
     32     name = os.path.splitext(os.path.split(filename)[1])[0]
     33     output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")
     34 
     35     if 'tests' in tests_data:
     36         for test_data in tests_data['tests']:
     37             if 'initialStates' not in test_data:
     38                 test_data["initialStates"] = ["Data state"]
     39 
     40             for initial_state in test_data["initialStates"]:
     41                 if initial_state != "Data state":
     42                     # don't support this yet
     43                     continue
     44                 test = make_test(test_data)
     45                 output_file.write(test)
     46 
     47     output_file.close()
     48 
     49 
     50 def make_test(test_data):
     51     if 'doubleEscaped' in test_data:
     52         test_data = test_tokenizer.unescape_test(test_data)
     53 
     54     rv = []
     55     rv.append("#data")
     56     rv.append(test_data["input"].encode("utf8"))
     57     rv.append("#errors")
     58     tree = p.parse(test_data["input"])
     59     output = p.tree.testSerializer(tree)
     60     output = "\n".join(("| " + line[3:]) if line.startswith("|  ") else line
     61                        for line in output.split("\n"))
     62     output = unnamespaceExpected(r"\1<\2>", output)
     63     rv.append(output.encode("utf8"))
     64     rv.append("")
     65     return "\n".join(rv)
     66 
     67 if __name__ == "__main__":
     68     main(sys.argv[1])
     69