#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    http://dev.w3.org/html5/spec/entities.json

Usage:

    without arguments  print a diff between html.entities.html5 and the
                       freshly downloaded definitions
    --create           print the newly generated html5 dict
    --patch            rewrite the html5 dict inside Lib/html/entities.py

Written by Ezio Melotti and Iuliia Proskurnia.

"""

import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

# NOTE(review): confirm this URL is still served; the entity list may have
# moved to the WHATWG HTML specification site.
entities_url = 'http://dev.w3.org/html5/spec/entities.json'


def get_json(url):
    """Download the JSON document at *url* and return the decoded object.

    The response body is decoded as UTF-8 before being parsed.
    """
    with urlopen(url) as f:
        data = f.read().decode('utf-8')
    return json.loads(data)


def create_dict(entities):
    """Create the html5 dict from the decoded JSON object.

    *entities* maps names such as '&amp;' to a mapping that has a
    'characters' item.  The returned dict maps each name, with its leading
    '&' removed, to that 'characters' value.
    """
    return {name.lstrip('&'): value['characters']
            for name, value in entities.items()}


def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences.

    Three sections are printed, each only when it is non-empty: the added
    keys, the removed keys, and the keys whose value changed.  Nothing is
    printed when both dicts are equal.
    """
    added = new.keys() - old.keys()
    if added:
        print('{} entitie(s) have been added:'.format(len(added)))
        for name in sorted(added):
            print(' {!r}: {!r}'.format(name, new[name]))
    removed = old.keys() - new.keys()
    if removed:
        print('{} entitie(s) have been removed:'.format(len(removed)))
        for name in sorted(removed):
            print(' {!r}: {!r}'.format(name, old[name]))
    # Names are unique, so sorting the tuples orders the entries by name.
    changed = sorted((name, old[name], new[name])
                     for name in old.keys() & new.keys()
                     if old[name] != new[name])
    if changed:
        print('{} entitie(s) have been modified:'.format(len(changed)))
        for item in changed:
            print(' {!r}: {!r} -> {!r}'.format(*item))


def write_items(entities, file=sys.stdout):
    """Write the items of the dictionary in the specified file.

    The output is a Python dict literal assigned to ``html5``, one item per
    line, with the values written as ASCII-only reprs.
    """
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal,
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased key, key) gives exactly that order: ties in
    # the case-insensitive comparison are broken by the case-sensitive
    # one, where the uppercase chars come first.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print('html5 = {', file=file)
    for name in keys:
        print(' {!r}: {!a},'.format(name, entities[name]), file=file)
    print('}', file=file)


def patch_file(fname, entities):
    """Replace the ``html5 = {...}`` dict in the file *fname*.

    Every line from the one starting with 'html5 = {' up to and including
    the next line starting with '}' is replaced by the output of
    write_items(entities).  All the other lines are copied unchanged.

    The new content is written to ``fname + '.temp'`` first and then moved
    over *fname* in a single step, so *fname* is never left half-written.

    Raises ValueError, leaving *fname* untouched, if the start of the dict
    is not found or if the dict is never closed.
    """
    temp_fname = fname + '.temp'
    found = False
    skip = False
    try:
        with open(fname, encoding='utf-8') as src, \
                open(temp_fname, 'w', encoding='utf-8') as dst:
            for line in src:
                if line.startswith('html5 = {'):
                    write_items(entities, file=dst)
                    found = skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                dst.write(line)
        if not found:
            raise ValueError(
                "no line starting with 'html5 = {{' in {}".format(fname))
        if skip:
            # Replacing the file now would drop everything after the dict.
            raise ValueError(
                "the html5 dict in {} is never closed".format(fname))
        os.replace(temp_fname, fname)
    finally:
        # After a successful os.replace() the temp file is already gone;
        # on any failure make sure it is not left behind.
        if os.path.exists(temp_fname):
            os.remove(temp_fname)


def main():
    """Run the script according to the options found in sys.argv."""
    # without args print a diff between html.entities.html5 and new_html5
    # with --create print the new html5 dict
    # with --patch patch the Lib/html/entities.py file
    new_html5 = create_dict(get_json(entities_url))
    if '--create' in sys.argv:
        print('# map the HTML5 named character references to the '
              'equivalent Unicode character(s)')
        print('# Generated by {}. Do not edit manually.'.format(__file__))
        write_items(new_html5)
    elif '--patch' in sys.argv:
        try:
            patch_file('Lib/html/entities.py', new_html5)
        except ValueError as err:
            sys.exit('error: {}'.format(err))
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))


if __name__ == '__main__':
    main()