Home | History | Annotate | Download | only in re2
      1 #!/usr/bin/python
      2 # coding=utf-8
      3 #
      4 # Copyright 2008 The RE2 Authors.  All Rights Reserved.
      5 # Use of this source code is governed by a BSD-style
      6 # license that can be found in the LICENSE file.
      7 
      8 # See unicode_casefold.h for description of case folding tables.
      9 
     10 """Generate C++ table for Unicode case folding."""
     11 
     12 import unicode, sys
     13 
     14 _header = """
     15 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
     16 // make_unicode_casefold.py >unicode_casefold.cc
     17 
     18 #include "re2/unicode_casefold.h"
     19 
     20 namespace re2 {
     21 
     22 """
     23 
     24 _trailer = """
     25 
     26 } // namespace re2
     27 
     28 """
     29 
     30 def _Delta(a, b):
     31   """Compute the delta for b - a.  Even/odd and odd/even
     32      are handled specially, as described above."""
     33   if a+1 == b:
     34     if a%2 == 0:
     35       return 'EvenOdd'
     36     else:
     37       return 'OddEven'
     38   if a == b+1:
     39     if a%2 == 0:
     40       return 'OddEven'
     41     else:
     42       return 'EvenOdd'
     43   return b - a
     44 
     45 def _AddDelta(a, delta):
     46   """Return a + delta, handling EvenOdd and OddEven specially."""
     47   if type(delta) == int:
     48     return a+delta
     49   if delta == 'EvenOdd':
     50     if a%2 == 0:
     51       return a+1
     52     else:
     53       return a-1
     54   if delta == 'OddEven':
     55     if a%2 == 1:
     56       return a+1
     57     else:
     58       return a-1
     59   print >>sys.stderr, "Bad Delta: ", delta
     60   raise "Bad Delta"
     61 
     62 def _MakeRanges(pairs):
     63   """Turn a list like [(65,97), (66, 98), ..., (90,122)]
     64      into [(65, 90, +32)]."""
     65   ranges = []
     66   last = -100
     67 
     68   def evenodd(last, a, b, r):
     69     if a != last+1 or b != _AddDelta(a, r[2]):
     70       return False
     71     r[1] = a
     72     return True
     73 
     74   def evenoddpair(last, a, b, r):
     75     if a != last+2:
     76       return False
     77     delta = r[2]
     78     d = delta
     79     if type(delta) is not str:
     80       return False
     81     if delta.endswith('Skip'):
     82       d = delta[:-4]
     83     else:
     84       delta = d + 'Skip'
     85     if b != _AddDelta(a, d):
     86       return False
     87     r[1] = a
     88     r[2] = delta
     89     return True
     90 
     91   for a, b in pairs:
     92     if ranges and evenodd(last, a, b, ranges[-1]):
     93       pass
     94     elif ranges and evenoddpair(last, a, b, ranges[-1]):
     95       pass
     96     else:
     97       ranges.append([a, a, _Delta(a, b)])
     98     last = a
     99   return ranges
    100 
    101 # The maximum size of a case-folding group.
    102 # Case folding is implemented in parse.cc by a recursive process
    103 # with a recursion depth equal to the size of the largest
    104 # case-folding group, so it is important that this bound be small.
    105 # The current tables have no group bigger than 4.
    106 # If there are ever groups bigger than 10 or so, it will be
    107 # time to rework the code in parse.cc.
    108 MaxCasefoldGroup = 4
    109 
    110 def main():
    111   lowergroups, casegroups = unicode.CaseGroups()
    112   foldpairs = []
    113   seen = {}
    114   for c in casegroups:
    115     if len(c) > MaxCasefoldGroup:
    116       raise unicode.Error("casefold group too long: %s" % (c,))
    117     for i in range(len(c)):
    118       if c[i-1] in seen:
    119         raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
    120       seen[c[i-1]] = True
    121       foldpairs.append([c[i-1], c[i]])
    122 
    123   lowerpairs = []
    124   for lower, group in lowergroups.iteritems():
    125     for g in group:
    126       if g != lower:
    127         lowerpairs.append([g, lower])
    128 
    129   def printpairs(name, foldpairs):
    130     foldpairs.sort()
    131     foldranges = _MakeRanges(foldpairs)
    132     print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
    133     print "CaseFold unicode_%s[] = {" % (name,)
    134     for lo, hi, delta in foldranges:
    135       print "\t{ %d, %d, %s }," % (lo, hi, delta)
    136     print "};"
    137     print "int num_unicode_%s = %d;" % (name, len(foldranges),)
    138     print ""
    139 
    140   print _header
    141   printpairs("casefold", foldpairs)
    142   printpairs("tolower", lowerpairs)
    143   print _trailer
    144 
    145 if __name__ == '__main__':
    146   main()
    147