Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/env python
      2 """ Utility for parsing HTML entity definitions available from:
      3 
      4       http://www.w3.org/ as e.g.
      5       http://www.w3.org/TR/REC-html40/HTMLlat1.ent
      6 
      7     Input is read from stdin, output is written to stdout in form of a
      8     Python snippet defining a dictionary "entitydefs" mapping literal
      9     entity name to character or numeric entity.
     10 
     11     Marc-Andre Lemburg, mal@lemburg.com, 1999.
     12     Use as you like. NO WARRANTIES.
     13 
     14 """
     15 import re,sys
     16 import TextTools
     17 
     18 entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
     19 
     20 def parse(text,pos=0,endpos=None):
     21 
     22     pos = 0
     23     if endpos is None:
     24         endpos = len(text)
     25     d = {}
     26     while 1:
     27         m = entityRE.search(text,pos,endpos)
     28         if not m:
     29             break
     30         name,charcode,comment = m.groups()
     31         d[name] = charcode,comment
     32         pos = m.end()
     33     return d
     34 
     35 def writefile(f,defs):
     36 
     37     f.write("entitydefs = {\n")
     38     items = defs.items()
     39     items.sort()
     40     for name,(charcode,comment) in items:
     41         if charcode[:2] == '&#':
     42             code = int(charcode[2:-1])
     43             if code < 256:
     44                 charcode = "'\%o'" % code
     45             else:
     46                 charcode = repr(charcode)
     47         else:
     48             charcode = repr(charcode)
     49         comment = TextTools.collapse(comment)
     50         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
     51     f.write('\n}\n')
     52 
     53 if __name__ == '__main__':
     54     if len(sys.argv) > 1:
     55         infile = open(sys.argv[1])
     56     else:
     57         infile = sys.stdin
     58     if len(sys.argv) > 2:
     59         outfile = open(sys.argv[2],'w')
     60     else:
     61         outfile = sys.stdout
     62     text = infile.read()
     63     defs = parse(text)
     64     writefile(outfile,defs)
     65