1 #!/usr/bin/python 2 # coding=utf-8 3 # 4 # Copyright 2008 The RE2 Authors. All Rights Reserved. 5 # Use of this source code is governed by a BSD-style 6 # license that can be found in the LICENSE file. 7 8 # See unicode_casefold.h for description of case folding tables. 9 10 """Generate C++ table for Unicode case folding.""" 11 12 import unicode, sys 13 14 _header = """ 15 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. 16 // make_unicode_casefold.py >unicode_casefold.cc 17 18 #include "re2/unicode_casefold.h" 19 20 namespace re2 { 21 22 """ 23 24 _trailer = """ 25 26 } // namespace re2 27 28 """ 29 30 def _Delta(a, b): 31 """Compute the delta for b - a. Even/odd and odd/even 32 are handled specially, as described above.""" 33 if a+1 == b: 34 if a%2 == 0: 35 return 'EvenOdd' 36 else: 37 return 'OddEven' 38 if a == b+1: 39 if a%2 == 0: 40 return 'OddEven' 41 else: 42 return 'EvenOdd' 43 return b - a 44 45 def _AddDelta(a, delta): 46 """Return a + delta, handling EvenOdd and OddEven specially.""" 47 if type(delta) == int: 48 return a+delta 49 if delta == 'EvenOdd': 50 if a%2 == 0: 51 return a+1 52 else: 53 return a-1 54 if delta == 'OddEven': 55 if a%2 == 1: 56 return a+1 57 else: 58 return a-1 59 print >>sys.stderr, "Bad Delta: ", delta 60 raise "Bad Delta" 61 62 def _MakeRanges(pairs): 63 """Turn a list like [(65,97), (66, 98), ..., (90,122)] 64 into [(65, 90, +32)].""" 65 ranges = [] 66 last = -100 67 68 def evenodd(last, a, b, r): 69 if a != last+1 or b != _AddDelta(a, r[2]): 70 return False 71 r[1] = a 72 return True 73 74 def evenoddpair(last, a, b, r): 75 if a != last+2: 76 return False 77 delta = r[2] 78 d = delta 79 if type(delta) is not str: 80 return False 81 if delta.endswith('Skip'): 82 d = delta[:-4] 83 else: 84 delta = d + 'Skip' 85 if b != _AddDelta(a, d): 86 return False 87 r[1] = a 88 r[2] = delta 89 return True 90 91 for a, b in pairs: 92 if ranges and evenodd(last, a, b, ranges[-1]): 93 pass 94 elif ranges and evenoddpair(last, a, b, ranges[-1]): 95 pass 96 else: 97 ranges.append([a, a, _Delta(a, b)]) 98 last = a 99 return ranges 100 101 # The maximum size of a case-folding group. 102 # Case folding is implemented in parse.cc by a recursive process 103 # with a recursion depth equal to the size of the largest 104 # case-folding group, so it is important that this bound be small. 105 # The current tables have no group bigger than 4. 106 # If there are ever groups bigger than 10 or so, it will be 107 # time to rework the code in parse.cc. 108 MaxCasefoldGroup = 4 109 110 def main(): 111 lowergroups, casegroups = unicode.CaseGroups() 112 foldpairs = [] 113 seen = {} 114 for c in casegroups: 115 if len(c) > MaxCasefoldGroup: 116 raise unicode.Error("casefold group too long: %s" % (c,)) 117 for i in range(len(c)): 118 if c[i-1] in seen: 119 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) 120 seen[c[i-1]] = True 121 foldpairs.append([c[i-1], c[i]]) 122 123 lowerpairs = [] 124 for lower, group in lowergroups.iteritems(): 125 for g in group: 126 if g != lower: 127 lowerpairs.append([g, lower]) 128 129 def printpairs(name, foldpairs): 130 foldpairs.sort() 131 foldranges = _MakeRanges(foldpairs) 132 print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) 133 print "CaseFold unicode_%s[] = {" % (name,) 134 for lo, hi, delta in foldranges: 135 print "\t{ %d, %d, %s }," % (lo, hi, delta) 136 print "};" 137 print "int num_unicode_%s = %d;" % (name, len(foldranges),) 138 print "" 139 140 print _header 141 printpairs("casefold", foldpairs) 142 printpairs("tolower", lowerpairs) 143 print _trailer 144 145 if __name__ == '__main__': 146 main() 147