Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/env python3
      2 """
      3 Utility for parsing HTML5 entity definitions available from:
      4 
      5     http://dev.w3.org/html5/spec/entities.json
      6 
      7 Written by Ezio Melotti and Iuliia Proskurnia.
      8 
      9 """
     10 
     11 import os
     12 import sys
     13 import json
     14 from urllib.request import urlopen
     15 from html.entities import html5
     16 
     17 entities_url = 'http://dev.w3.org/html5/spec/entities.json'
     18 
     19 def get_json(url):
     20     """Download the json file from the url and returns a decoded object."""
     21     with urlopen(url) as f:
     22         data = f.read().decode('utf-8')
     23     return json.loads(data)
     24 
     25 def create_dict(entities):
     26     """Create the html5 dict from the decoded json object."""
     27     new_html5 = {}
     28     for name, value in entities.items():
     29         new_html5[name.lstrip('&')] = value['characters']
     30     return new_html5
     31 
     32 def compare_dicts(old, new):
     33     """Compare the old and new dicts and print the differences."""
     34     added = new.keys() - old.keys()
     35     if added:
     36         print('{} entitie(s) have been added:'.format(len(added)))
     37         for name in sorted(added):
     38             print('  {!r}: {!r}'.format(name, new[name]))
     39     removed = old.keys() - new.keys()
     40     if removed:
     41         print('{} entitie(s) have been removed:'.format(len(removed)))
     42         for name in sorted(removed):
     43             print('  {!r}: {!r}'.format(name, old[name]))
     44     changed = set()
     45     for name in (old.keys() & new.keys()):
     46         if old[name] != new[name]:
     47             changed.add((name, old[name], new[name]))
     48     if changed:
     49         print('{} entitie(s) have been modified:'.format(len(changed)))
     50         for item in sorted(changed):
     51             print('  {!r}: {!r} -> {!r}'.format(*item))
     52 
     53 def write_items(entities, file=sys.stdout):
     54     """Write the items of the dictionary in the specified file."""
     55     # The keys in the generated dictionary should be sorted
     56     # in a case-insensitive way, however, when two keys are equal,
     57     # the uppercase version should come first so that the result
     58     # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
     59     # To do this we first sort in a case-sensitive way (so all the
     60     # uppercase chars come first) and then sort with key=str.lower.
     61     # Since the sorting is stable the uppercase keys will eventually
     62     # be before their equivalent lowercase version.
     63     keys = sorted(entities.keys())
     64     keys = sorted(keys, key=str.lower)
     65     print('html5 = {', file=file)
     66     for name in keys:
     67         print('    {!r}: {!a},'.format(name, entities[name]), file=file)
     68     print('}', file=file)
     69 
     70 
     71 if __name__ == '__main__':
     72     # without args print a diff between html.entities.html5 and new_html5
     73     # with --create print the new html5 dict
     74     # with --patch patch the Lib/html/entities.py file
     75     new_html5 = create_dict(get_json(entities_url))
     76     if '--create' in sys.argv:
     77         print('# map the HTML5 named character references to the '
     78               'equivalent Unicode character(s)')
     79         print('# Generated by {}.  Do not edit manually.'.format(__file__))
     80         write_items(new_html5)
     81     elif '--patch' in sys.argv:
     82         fname = 'Lib/html/entities.py'
     83         temp_fname = fname + '.temp'
     84         with open(fname) as f1, open(temp_fname, 'w') as f2:
     85             skip = False
     86             for line in f1:
     87                 if line.startswith('html5 = {'):
     88                     write_items(new_html5, file=f2)
     89                     skip = True
     90                     continue
     91                 if skip:
     92                     # skip the old items until the }
     93                     if line.startswith('}'):
     94                         skip = False
     95                     continue
     96                 f2.write(line)
     97         os.remove(fname)
     98         os.rename(temp_fname, fname)
     99     else:
    100         if html5 == new_html5:
    101             print('The current dictionary is updated.')
    102         else:
    103             compare_dicts(html5, new_html5)
    104             print('Run "./python {0} --patch" to update Lib/html/entities.html '
    105                   'or "./python {0} --create" to see the generated ' 'dictionary.'.format(__file__))
    106