Home | History | Annotate | Download | only in unicode
      1 """ Unicode Mapping Parser and Codec Generator.
      2 
      3 This script parses Unicode mapping files as available from the Unicode
      4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
      5 modules from them. The codecs use the standard character mapping codec
      6 to actually apply the mapping.
      7 
      8 Synopsis: gencodec.py dir codec_prefix
      9 
     10 All files in dir are scanned and those producing non-empty mappings
     11 will be written to <codec_prefix><mapname>.py with <mapname> being the
     12 first part of the map's filename ('a' in a.b.c.txt) converted to
     13 lowercase with hyphens replaced by underscores.
     14 
     15 The tool also writes marshalled versions of the mapping tables to the
     16 same location (with .mapping extension).
     17 
Written by Marc-Andre Lemburg (mal@lemburg.com).
     19 
     20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
     21 (c) Copyright Guido van Rossum, 2000.
     22 
     23 Table generation:
     24 (c) Copyright Marc-Andre Lemburg, 2005.
     25     Licensed to PSF under a Contributor Agreement.
     26 
     27 """#"

     28 
     29 import re, os, marshal, codecs
     30 
# Maximum allowed size of charmap tables: python_tabledef_code() returns
# None (no table is generated) when the largest mapped key exceeds this
# value.
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point; python_tabledef_code() emits it
# for every table slot whose mapping value is None.
UNI_UNDEFINED = unichr(0xFFFE)
     36 
     37 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
     38                    '\s+'
     39                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
     40                    '\s*'
     41                    '(#.+)?')
     42 
     43 def parsecodes(codes, len=len, range=range):
     44 
     45     """ Converts code combinations to either a single code integer
     46         or a tuple of integers.
     47 
     48         meta-codes (in angular brackets, e.g. <LR> and <RL>) are
     49         ignored.
     50 
     51         Empty codes or illegal ones are returned as None.
     52 
     53     """
     54     if not codes:
     55         return None
     56     l = codes.split('+')
     57     if len(l) == 1:
     58         return int(l[0],16)
     59     for i in range(len(l)):
     60         try:
     61             l[i] = int(l[i],16)
     62         except ValueError:
     63             l[i] = None
     64     l = [x for x in l if x is not None]
     65     if len(l) == 1:
     66         return l[0]
     67     else:
     68         return tuple(l)
     69 
     70 def readmap(filename):
     71 
     72     f = open(filename,'r')
     73     lines = f.readlines()
     74     f.close()
     75     enc2uni = {}
     76     identity = []
     77     unmapped = range(256)
     78 
     79     # UTC mapping tables per convention don't include the identity

     80     # mappings for code points 0x00 - 0x1F and 0x7F, unless these are

     81     # explicitly mapped to different characters or undefined

     82     for i in range(32) + [127]:
     83         identity.append(i)
     84         unmapped.remove(i)
     85         enc2uni[i] = (i, 'CONTROL CHARACTER')
     86 
     87     for line in lines:
     88         line = line.strip()
     89         if not line or line[0] == '#':
     90             continue
     91         m = mapRE.match(line)
     92         if not m:
     93             #print '* not matched: %s' % repr(line)

     94             continue
     95         enc,uni,comment = m.groups()
     96         enc = parsecodes(enc)
     97         uni = parsecodes(uni)
     98         if comment is None:
     99             comment = ''
    100         else:
    101             comment = comment[1:].strip()
    102         if enc < 256:
    103             if enc in unmapped:
    104                 unmapped.remove(enc)
    105             if enc == uni:
    106                 identity.append(enc)
    107             enc2uni[enc] = (uni,comment)
    108         else:
    109             enc2uni[enc] = (uni,comment)
    110 
    111     # If there are more identity-mapped entries than unmapped entries,

    112     # it pays to generate an identity dictionary first, and add explicit

    113     # mappings to None for the rest

    114     if len(identity) >= len(unmapped):
    115         for enc in unmapped:
    116             enc2uni[enc] = (None, "")
    117         enc2uni['IDENTITY'] = 256
    118 
    119     return enc2uni
    120 
    121 def hexrepr(t, precision=4):
    122 
    123     if t is None:
    124         return 'None'
    125     try:
    126         len(t)
    127     except:
    128         return '0x%0*X' % (precision, t)
    129     try:
    130         return '(' + ', '.join(['0x%0*X' % (precision, item)
    131                                 for item in t]) + ')'
    132     except TypeError, why:
    133         print '* failed to convert %r: %s' % (t, why)
    134         raise
    135 
    136 def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
    137 
    138     l = []
    139     append = l.append
    140     if "IDENTITY" in map:
    141         append("%s = codecs.make_identity_dict(range(%d))" %
    142                (varname, map["IDENTITY"]))
    143         append("%s.update({" % varname)
    144         splits = 1
    145         del map["IDENTITY"]
    146         identity = 1
    147     else:
    148         append("%s = {" % varname)
    149         splits = 0
    150         identity = 0
    151 
    152     mappings = sorted(map.items())
    153     i = 0
    154     key_precision, value_precision = precisions
    155     for mapkey, mapvalue in mappings:
    156         mapcomment = ''
    157         if isinstance(mapkey, tuple):
    158             (mapkey, mapcomment) = mapkey
    159         if isinstance(mapvalue, tuple):
    160             (mapvalue, mapcomment) = mapvalue
    161         if mapkey is None:
    162             continue
    163         if (identity and
    164             mapkey == mapvalue and
    165             mapkey < 256):
    166             # No need to include identity mappings, since these

    167             # are already set for the first 256 code points.

    168             continue
    169         key = hexrepr(mapkey, key_precision)
    170         value = hexrepr(mapvalue, value_precision)
    171         if mapcomment and comments:
    172             append('    %s: %s,\t#  %s' % (key, value, mapcomment))
    173         else:
    174             append('    %s: %s,' % (key, value))
    175         i += 1
    176         if i == 4096:
    177             # Split the definition into parts to that the Python

    178             # parser doesn't dump core

    179             if splits == 0:
    180                 append('}')
    181             else:
    182                 append('})')
    183             append('%s.update({' % varname)
    184             i = 0
    185             splits = splits + 1
    186     if splits == 0:
    187         append('}')
    188     else:
    189         append('})')
    190 
    191     return l
    192 
    193 def python_tabledef_code(varname, map, comments=1, key_precision=2):
    194 
    195     l = []
    196     append = l.append
    197     append('%s = (' % varname)
    198 
    199     # Analyze map and create table dict

    200     mappings = sorted(map.items())
    201     table = {}
    202     maxkey = 0
    203     if 'IDENTITY' in map:
    204         for key in range(256):
    205             table[key] = (key, '')
    206         maxkey = 255
    207         del map['IDENTITY']
    208     for mapkey, mapvalue in mappings:
    209         mapcomment = ''
    210         if isinstance(mapkey, tuple):
    211             (mapkey, mapcomment) = mapkey
    212         if isinstance(mapvalue, tuple):
    213             (mapvalue, mapcomment) = mapvalue
    214         if mapkey is None:
    215             continue
    216         table[mapkey] = (mapvalue, mapcomment)
    217         if mapkey > maxkey:
    218             maxkey = mapkey
    219     if maxkey > MAX_TABLE_SIZE:
    220         # Table too large

    221         return None
    222 
    223     # Create table code

    224     for key in range(maxkey + 1):
    225         if key not in table:
    226             mapvalue = None
    227             mapcomment = 'UNDEFINED'
    228         else:
    229             mapvalue, mapcomment = table[key]
    230         if mapvalue is None:
    231             mapchar = UNI_UNDEFINED
    232         else:
    233             if isinstance(mapvalue, tuple):
    234                 # 1-n mappings not supported

    235                 return None
    236             else:
    237                 mapchar = unichr(mapvalue)
    238         if mapcomment and comments:
    239             append('    %r\t#  %s -> %s' % (mapchar,
    240                                             hexrepr(key, key_precision),
    241                                             mapcomment))
    242         else:
    243             append('    %r' % mapchar)
    244 
    245     append(')')
    246     return l
    247 
    248 def codegen(name, map, encodingname, comments=1):
    249 
    250     """ Returns Python source for the given map.
    251 
    252         Comments are included in the source, if comments is true (default).
    253 
    254     """
    255     # Generate code

    256     decoding_map_code = python_mapdef_code(
    257         'decoding_map',
    258         map,
    259         comments=comments)
    260     decoding_table_code = python_tabledef_code(
    261         'decoding_table',
    262         map,
    263         comments=comments)
    264     encoding_map_code = python_mapdef_code(
    265         'encoding_map',
    266         codecs.make_encoding_map(map),
    267         comments=comments,
    268         precisions=(4, 2))
    269 
    270     if decoding_table_code:
    271         suffix = 'table'
    272     else:
    273         suffix = 'map'
    274 
    275     l = [
    276         '''\
    277 """ Python Character Mapping Codec %s generated from '%s' with gencodec.py.
    278 
    279 """#"
    280 
    281 import codecs
    282 
    283 ### Codec APIs
    284 
    285 class Codec(codecs.Codec):
    286 
    287     def encode(self,input,errors='strict'):
    288         return codecs.charmap_encode(input,errors,encoding_%s)
    289 
    290     def decode(self,input,errors='strict'):
    291         return codecs.charmap_decode(input,errors,decoding_%s)
    292 ''' % (encodingname, name, suffix, suffix)]
    293     l.append('''\
    294 class IncrementalEncoder(codecs.IncrementalEncoder):
    295     def encode(self, input, final=False):
    296         return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
    297 
    298 class IncrementalDecoder(codecs.IncrementalDecoder):
    299     def decode(self, input, final=False):
    300         return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
    301         (suffix, suffix))
    302 
    303     l.append('''
    304 class StreamWriter(Codec,codecs.StreamWriter):
    305     pass
    306 
    307 class StreamReader(Codec,codecs.StreamReader):
    308     pass
    309 
    310 ### encodings module API
    311 
    312 def getregentry():
    313     return codecs.CodecInfo(
    314         name=%r,
    315         encode=Codec().encode,
    316         decode=Codec().decode,
    317         incrementalencoder=IncrementalEncoder,
    318         incrementaldecoder=IncrementalDecoder,
    319         streamreader=StreamReader,
    320         streamwriter=StreamWriter,
    321     )
    322 ''' % encodingname.replace('_', '-'))
    323 
    324     # Add decoding table or map (with preference to the table)

    325     if not decoding_table_code:
    326         l.append('''
    327 ### Decoding Map
    328 ''')
    329         l.extend(decoding_map_code)
    330     else:
    331         l.append('''
    332 ### Decoding Table
    333 ''')
    334         l.extend(decoding_table_code)
    335 
    336     # Add encoding map

    337     if decoding_table_code:
    338         l.append('''
    339 ### Encoding table
    340 encoding_table=codecs.charmap_build(decoding_table)
    341 ''')
    342     else:
    343         l.append('''
    344 ### Encoding Map
    345 ''')
    346         l.extend(encoding_map_code)
    347 
    348     # Final new-line

    349     l.append('')
    350 
    351     return '\n'.join(l).expandtabs()
    352 
    353 def pymap(name,map,pyfile,encodingname,comments=1):
    354 
    355     code = codegen(name,map,encodingname,comments)
    356     f = open(pyfile,'w')
    357     f.write(code)
    358     f.close()
    359 
    360 def marshalmap(name,map,marshalfile):
    361 
    362     d = {}
    363     for e,(u,c) in map.items():
    364         d[e] = (u,c)
    365     f = open(marshalfile,'wb')
    366     marshal.dump(d,f)
    367     f.close()
    368 
    369 def convertdir(dir, dirprefix='', nameprefix='', comments=1):
    370 
    371     mapnames = os.listdir(dir)
    372     for mapname in mapnames:
    373         mappathname = os.path.join(dir, mapname)
    374         if not os.path.isfile(mappathname):
    375             continue
    376         name = os.path.split(mapname)[1]
    377         name = name.replace('-','_')
    378         name = name.split('.')[0]
    379         name = name.lower()
    380         name = nameprefix + name
    381         codefile = name + '.py'
    382         marshalfile = name + '.mapping'
    383         print 'converting %s to %s and %s' % (mapname,
    384                                               dirprefix + codefile,
    385                                               dirprefix + marshalfile)
    386         try:
    387             map = readmap(os.path.join(dir,mapname))
    388             if not map:
    389                 print '* map is empty; skipping'
    390             else:
    391                 pymap(mappathname, map, dirprefix + codefile,name,comments)
    392                 marshalmap(mappathname, map, dirprefix + marshalfile)
    393         except ValueError, why:
    394             print '* conversion failed: %s' % why
    395             raise
    396 
    397 def rewritepythondir(dir, dirprefix='', comments=1):
    398 
    399     mapnames = os.listdir(dir)
    400     for mapname in mapnames:
    401         if not mapname.endswith('.mapping'):
    402             continue
    403         name = mapname[:-len('.mapping')]
    404         codefile = name + '.py'
    405         print 'converting %s to %s' % (mapname,
    406                                        dirprefix + codefile)
    407         try:
    408             map = marshal.load(open(os.path.join(dir,mapname),
    409                                'rb'))
    410             if not map:
    411                 print '* map is empty; skipping'
    412             else:
    413                 pymap(mapname, map, dirprefix + codefile,name,comments)
    414         except ValueError, why:
    415             print '* conversion failed: %s' % why
    416 
    417 if __name__ == '__main__':
    418 
    419     import sys
    420     if 1:
    421         convertdir(*sys.argv[1:])
    422     else:
    423         rewritepythondir(*sys.argv[1:])
    424