Home | History | Annotate | Download | only in re2
      1 #!/usr/bin/python
      2 # Copyright 2008 The RE2 Authors.  All Rights Reserved.
      3 # Use of this source code is governed by a BSD-style
      4 # license that can be found in the LICENSE file.
      5 
      6 """Generate C++ tables for Unicode Script and Category groups."""
      7 
      8 import sys
      9 import unicode
     10 
# Preamble printed by main() before any tables.  The text is emitted verbatim
# into the generated C++ output, so it must not be reformatted.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Postamble printed by main() after all tables; it closes the namespace
# opened in _header.
_trailer = """

}  // namespace re2

"""

# Running totals of the 16-bit and 32-bit ranges produced so far.
# PrintGroup() adds to them and main() reports them in a comment in the output.
n16 = 0
n32 = 0
     29 
     30 def MakeRanges(codes):
     31   """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
     32   ranges = []
     33   last = -100
     34   for c in codes:
     35     if c == last+1:
     36       ranges[-1][1] = c
     37     else:
     38       ranges.append([c, c])
     39     last = c
     40   return ranges
     41 
     42 def PrintRanges(type, name, ranges):
     43   """Print the ranges as an array of type named name."""
     44   print "static %s %s[] = {" % (type, name,)
     45   for lo, hi in ranges:
     46     print "\t{ %d, %d }," % (lo, hi)
     47   print "};"
     48 
     49 # def PrintCodes(type, name, codes):
     50 #   """Print the codes as an array of type named name."""
     51 #   print "static %s %s[] = {" % (type, name,)
     52 #   for c in codes:
     53 #     print "\t%d," % (c,)
     54 #   print "};"
     55 
     56 def PrintGroup(name, codes):
     57   """Print the data structures for the group of codes.
     58   Return a UGroup literal for the group."""
     59 
     60   # See unicode_groups.h for a description of the data structure.
     61 
     62   # Split codes into 16-bit ranges and 32-bit ranges.
     63   range16 = MakeRanges([c for c in codes if c < 65536])
     64   range32 = MakeRanges([c for c in codes if c >= 65536])
     65 
     66   # Pull singleton ranges out of range16.
     67   # code16 = [lo for lo, hi in range16 if lo == hi]
     68   # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
     69 
     70   global n16
     71   global n32
     72   n16 += len(range16)
     73   n32 += len(range32)
     74 
     75   ugroup = "{ \"%s\", +1" % (name,)
     76   # if len(code16) > 0:
     77   #   PrintCodes("uint16", name+"_code16", code16)
     78   #   ugroup += ", %s_code16, %d" % (name, len(code16))
     79   # else:
     80   #   ugroup += ", 0, 0"
     81   if len(range16) > 0:
     82     PrintRanges("URange16", name+"_range16", range16)
     83     ugroup += ", %s_range16, %d" % (name, len(range16))
     84   else:
     85     ugroup += ", 0, 0"
     86   if len(range32) > 0:
     87     PrintRanges("URange32", name+"_range32", range32)
     88     ugroup += ", %s_range32, %d" % (name, len(range32))
     89   else:
     90     ugroup += ", 0, 0"
     91   ugroup += " }"
     92   return ugroup
     93 
     94 def main():
     95   print _header
     96   ugroups = []
     97   for name, codes in unicode.Categories().iteritems():
     98     ugroups.append(PrintGroup(name, codes))
     99   for name, codes in unicode.Scripts().iteritems():
    100     ugroups.append(PrintGroup(name, codes))
    101   print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
    102   print "UGroup unicode_groups[] = {";
    103   ugroups.sort()
    104   for ug in ugroups:
    105     print "\t%s," % (ug,)
    106   print "};"
    107   print "int num_unicode_groups = %d;" % (len(ugroups),)
    108   print _trailer
    109 
    110 if __name__ == '__main__':
    111   main()
    112