1 from __future__ import absolute_import, division, unicode_literals 2 3 import sys 4 import os 5 import json 6 import re 7 8 import html5lib 9 from . import support 10 from . import test_tokenizer 11 12 p = html5lib.HTMLParser() 13 14 unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub 15 16 17 def main(out_path): 18 if not os.path.exists(out_path): 19 sys.stderr.write("Path %s does not exist" % out_path) 20 sys.exit(1) 21 22 for filename in support.get_data_files('tokenizer', '*.test'): 23 run_file(filename, out_path) 24 25 26 def run_file(filename, out_path): 27 try: 28 tests_data = json.load(open(filename, "r")) 29 except ValueError: 30 sys.stderr.write("Failed to load %s\n" % filename) 31 return 32 name = os.path.splitext(os.path.split(filename)[1])[0] 33 output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w") 34 35 if 'tests' in tests_data: 36 for test_data in tests_data['tests']: 37 if 'initialStates' not in test_data: 38 test_data["initialStates"] = ["Data state"] 39 40 for initial_state in test_data["initialStates"]: 41 if initial_state != "Data state": 42 # don't support this yet 43 continue 44 test = make_test(test_data) 45 output_file.write(test) 46 47 output_file.close() 48 49 50 def make_test(test_data): 51 if 'doubleEscaped' in test_data: 52 test_data = test_tokenizer.unescape_test(test_data) 53 54 rv = [] 55 rv.append("#data") 56 rv.append(test_data["input"].encode("utf8")) 57 rv.append("#errors") 58 tree = p.parse(test_data["input"]) 59 output = p.tree.testSerializer(tree) 60 output = "\n".join(("| " + line[3:]) if line.startswith("| ") else line 61 for line in output.split("\n")) 62 output = unnamespaceExpected(r"\1<\2>", output) 63 rv.append(output.encode("utf8")) 64 rv.append("") 65 return "\n".join(rv) 66 67 if __name__ == "__main__": 68 main(sys.argv[1]) 69