#
# (re)generate unicode property and type databases
#
# this script converts Unicode database files to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.6"

# The Unicode Database
UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
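
# The %s slot carries an optional version suffix: the current data is
# read from e.g. "UnicodeData.txt" (UNICODE_DATA % ""), while old
# versions are read from e.g. "UnicodeData-3.2.0.txt"
# (UNICODE_DATA % ("-" + version)); see maketables() below.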

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
NODELTA_MASK = 0x100
NUMERIC_MASK = 0x200
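
# A small illustration of how these flags combine (the real tests live
# in Objects/unicodectype.c; names here are only for the example): a
# titlecased letter such as U+01C5 gets ALPHA_MASK | TITLE_MASK, so
#
#     flags = ALPHA_MASK | TITLE_MASK
#     assert flags & TITLE_MASK
#     assert not (flags & (UPPER_MASK | LOWER_MASK))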

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
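                # the first entry packs the decomposition header: the
                # low byte is the prefix index, bits 8 and up hold the
                # length, so a two-code-point canonical decomposition
                # (prefix "") is stored as [2<<8, cp1, cp2]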
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
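
    # i.e. the composed character for the pair (first, last) lives at
    # comp_data[comp_first[first]*total_last + comp_last[last]], with
    # 0 meaning "no composition"; the COMP_SHIFT tables written below
    # compress exactly this array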

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "static struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "static struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title if it fits
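            # (e.g. 'a', U+0061, uppercases to U+0041, so upper_d is
            # -32, which fits in 16 bits and is stored in two's
            # complement as 0xFFE0; mappings that do not fit get
            # NODELTA_MASK and keep absolute values instead)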
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                # UCD.html says that a missing title char means that
                # it defaults to the uppercase character, not to the
                # character itself. Apparently, in the current UCD (5.x)
                # this feature is never used
                title = upper
            upper_d = upper - char
            lower_d = lower - char
            title_d = title - char
            if -32768 <= upper_d <= 32767 and \
               -32768 <= lower_d <= 32767 and \
               -32768 <= title_d <= 32767:
                # use deltas
                upper = upper_d & 0xffff
                lower = lower_d & 0xffff
                title = title_d & 0xffff
            else:
                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"
    print sum(map(len, numeric.values())), "numeric code points"
    print len(spaces), "whitespace code points"
    print len(linebreaks), "linebreak code points"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print >>fp, '/* Returns the numeric value as double for Unicode characters'
    print >>fp, ' * having this property, -1.0 otherwise.'
    print >>fp, ' */'
    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        haswide = False
        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
            if codepoint < 0x10000:
                hasnonewide = True
            if codepoint >= 0x10000 and not haswide:
                print >>fp, '#ifdef Py_UNICODE_WIDE'
                haswide = True
            print >>fp, '    case 0x%04X:' % (codepoint,)
        if haswide and hasnonewide:
            print >>fp, '#endif'
        print >>fp, '        return (double) %s;' % (value,)
        if haswide and not hasnonewide:
            print >>fp, '#endif'
    print >>fp,'    }'
    print >>fp,'    return -1.0;'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsWhitespace()
    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
    print >>fp, '    return iswspace(ch);'
    print >>fp, '#else'
    print >>fp, '    switch (ch) {'

    haswide = False
    hasnonewide = False
    for codepoint in sorted(spaces):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp, '#endif'
    print >>fp,'}'
    print >>fp

    # Generate code for _PyUnicode_IsLinebreak()
    print >>fp, "/* Returns 1 for Unicode characters having the line break"
    print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
    print >>fp, " * type 'B', 0 otherwise."
    print >>fp, " */"
    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
    print >>fp, '{'
    print >>fp, '    switch (ch) {'
    haswide = False
    hasnonewide = False
    for codepoint in sorted(linebreaks):
        if codepoint < 0x10000:
            hasnonewide = True
        if codepoint >= 0x10000 and not haswide:
            print >>fp, '#ifdef Py_UNICODE_WIDE'
            haswide = True
        print >>fp, '    case 0x%04X:' % (codepoint,)
    if haswide and hasnonewide:
        print >>fp, '#endif'
    print >>fp, '        return 1;'
    if haswide and not hasnonewide:
        print >>fp, '#endif'

    print >>fp,'    }'
    print >>fp,'    return 0;'
    print >>fp,'}'
    print >>fp

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish
    # between words inside a name and words ending a name; the
    # latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes
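    # (escapes == ceil(len(wordlist)/256); e.g. 10000 distinct words
    # give escapes == 40 and short == 216, so word indexes 0..215 fit
    # in one phrasebook byte and the rest take two)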

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
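        # (e.g. "CAT" is stored as "CA" + chr(ord("T")+128); a word
        # that ends a name still carries the name's trailing NUL, so
        # its stored form ends with chr(0+128) == chr(128))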
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
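                    # (the escape byte (i>>8) + short is always
                    # >= short, so a decoder can tell the two forms
                    # apart and recover i as ((b1 - short) << 8) + b2)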

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 16:
                        # change to properties; not yet
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, mirrored_changes,
                                     numeric_changes),
                        normalization_changes))


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  properties] (17)

    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                 derivednormalizationprops=None, linebreakprops=None,
                 expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # all code points, U+0000..U+10FFFF

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
        if linebreakprops:
            for s in open(linebreakprops):
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        if derivednormalizationprops:
            quickchecks = [0] * 0x110000 # default is Yes
            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
            for s in open(derivednormalizationprops):
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
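                # each property occupies a 2-bit field: 0 = Yes (the
                # default), 1 = Maybe, 2 = No, placed at bit offset
                # 2 * qc_order.index(property)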
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
            for i in range(0, 0x110000):
                if table[i] is not None:
                    table[i].append(quickchecks[i])

        for line in open(unihan):
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
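
# A sketch of the matching lookup side (illustrative only; the real
# probing is done in C against the generated tables):
#
#     h = myhash(name, magic)
#     i = (~h) & mask
#     incr = (h ^ (h >> 3)) & mask or mask
#     # inspect slot i; on a miss, set i = (i + incr) & mask and
#     # double incr, xor-ing in poly whenever incr overflows mask,
#     # until the slot is empty (not found) or its code matches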

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
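
# e.g. getsize([0, 255]) == 1, getsize([256]) == 2,
# getsize([65536]) == 4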

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
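
    # Illustrative usage (the asserted identity is guaranteed by the
    # __debug__ check at the end of this function):
    #
    #     t = [1]*64 + [2]*64
    #     t1, t2, shift = splitbins(t)
    #     mask = (1 << shift) - 1
    #     assert all(t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    #                for i in range(len(t)))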

    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)