Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/env python3
      2 """ Utility for parsing HTML entity definitions available from:
      3 
      4       http://www.w3.org/ as e.g.
      5       http://www.w3.org/TR/REC-html40/HTMLlat1.ent
      6 
      7     Input is read from stdin, output is written to stdout in form of a
      8     Python snippet defining a dictionary "entitydefs" mapping literal
      9     entity name to character or numeric entity.
     10 
     11     Marc-Andre Lemburg, mal@lemburg.com, 1999.
     12     Use as you like. NO WARRANTIES.
     13 
     14 """
     15 import re,sys
     16 
     17 entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
     18 
     19 def parse(text,pos=0,endpos=None):
     20 
     21     pos = 0
     22     if endpos is None:
     23         endpos = len(text)
     24     d = {}
     25     while 1:
     26         m = entityRE.search(text,pos,endpos)
     27         if not m:
     28             break
     29         name,charcode,comment = m.groups()
     30         d[name] = charcode,comment
     31         pos = m.end()
     32     return d
     33 
     34 def writefile(f,defs):
     35 
     36     f.write("entitydefs = {\n")
     37     items = sorted(defs.items())
     38     for name, (charcode,comment) in items:
     39         if charcode[:2] == '&#':
     40             code = int(charcode[2:-1])
     41             if code < 256:
     42                 charcode = r"'\%o'" % code
     43             else:
     44                 charcode = repr(charcode)
     45         else:
     46             charcode = repr(charcode)
     47         comment = ' '.join(comment.split())
     48         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
     49     f.write('\n}\n')
     50 
     51 if __name__ == '__main__':
     52     if len(sys.argv) > 1:
     53         infile = open(sys.argv[1])
     54     else:
     55         infile = sys.stdin
     56     if len(sys.argv) > 2:
     57         outfile = open(sys.argv[2],'w')
     58     else:
     59         outfile = sys.stdout
     60     text = infile.read()
     61     defs = parse(text)
     62     writefile(outfile,defs)
     63