#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

import sys
import unicode

_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

_trailer = """

} // namespace re2

"""

# Running totals of the 16-bit and 32-bit ranges emitted so far.
# PrintGroup adds to them; main reports them in a comment in the output.
n16 = 0
n32 = 0

# NOTE: every print call in this file passes exactly one argument, so the
# output is the same whether it is parsed as a Python 2 print statement
# or called as the Python 3 print function.


def MakeRanges(codes):
    """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

    A value extends the current range only when it is exactly one more
    than the value before it in ``codes``, so callers should pass the
    codes in ascending order to get maximal ranges.

    Args:
      codes: iterable of integers.

    Returns:
      A list of two-element lists [lo, hi] with inclusive bounds.
    """
    ranges = []
    for c in codes:
        # Compare against the upper bound of the current range (if any)
        # rather than a magic "previous value" sentinel, so no input can
        # try to extend a range that does not exist yet.
        if ranges and c == ranges[-1][1] + 1:
            ranges[-1][1] = c
        else:
            ranges.append([c, c])
    return ranges


def PrintRanges(type, name, ranges):
    """Print the ranges as an array of type named name.

    Args:
      type: C++ element type name to use in the array declaration.
      name: C++ identifier for the array.
      ranges: iterable of (lo, hi) integer pairs, one array entry each.
    """
    print("static %s %s[] = {" % (type, name,))
    for lo, hi in ranges:
        print("\t{ %d, %d }," % (lo, hi))
    print("};")


# def PrintCodes(type, name, codes):
#     """Print the codes as an array of type named name."""
#     print("static %s %s[] = {" % (type, name,))
#     for c in codes:
#         print("\t%d," % (c,))
#     print("};")


def PrintGroup(name, codes):
    """Print the data structures for the group of codes.

    Emits a ``<name>_range16`` array and/or a ``<name>_range32`` array
    (each only when non-empty) and adds their lengths to the module-level
    counters ``n16`` and ``n32``.

    Args:
      name: group name; used as the quoted name in the literal and as the
        prefix of the emitted array identifiers.
      codes: iterable of integer code points belonging to the group.

    Returns:
      A UGroup literal (a string) for the group.
    """

    # See unicode_groups.h for a description of the data structure.

    # Split codes into 16-bit ranges and 32-bit ranges.
    range16 = MakeRanges([c for c in codes if c < 65536])
    range32 = MakeRanges([c for c in codes if c >= 65536])

    # Pull singleton ranges out of range16.
    # code16 = [lo for lo, hi in range16 if lo == hi]
    # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

    global n16
    global n32
    n16 += len(range16)
    n32 += len(range32)

    ugroup = "{ \"%s\", +1" % (name,)
    # if len(code16) > 0:
    #     PrintCodes("uint16", name+"_code16", code16)
    #     ugroup += ", %s_code16, %d" % (name, len(code16))
    # else:
    #     ugroup += ", 0, 0"
    if range16:
        PrintRanges("URange16", name+"_range16", range16)
        ugroup += ", %s_range16, %d" % (name, len(range16))
    else:
        # No array was emitted for this width: null pointer, zero length.
        ugroup += ", 0, 0"
    if range32:
        PrintRanges("URange32", name+"_range32", range32)
        ugroup += ", %s_range32, %d" % (name, len(range32))
    else:
        ugroup += ", 0, 0"
    ugroup += " }"
    return ugroup


def main():
    """Write the generated C++ source for all Unicode groups to stdout.

    Prints the header, the per-group range arrays for every category and
    script reported by the ``unicode`` module, the sorted table of UGroup
    literals with its length, and finally the trailer.
    """
    print(_header)
    ugroups = []
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for name, codes in unicode.Categories().items():
        ugroups.append(PrintGroup(name, codes))
    for name, codes in unicode.Scripts().items():
        ugroups.append(PrintGroup(name, codes))
    print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
    print("UGroup unicode_groups[] = {")
    # Each literal starts with the quoted group name, so sorting the
    # literals orders the table by name.
    ugroups.sort()
    for ug in ugroups:
        print("\t%s," % (ug,))
    print("};")
    print("int num_unicode_groups = %d;" % (len(ugroups),))
    print(_trailer)


if __name__ == '__main__':
    main()