Home | History | Annotate | Download | only in re2
      1 # Copyright 2008 The RE2 Authors.  All Rights Reserved.
      2 # Use of this source code is governed by a BSD-style
      3 # license that can be found in the LICENSE file.
      4 
      5 """Parser for Unicode data files (as distributed by unicode.org)."""
      6 
      7 import os
      8 import re
      9 import urllib2
     10 
     11 # Directory or URL where Unicode tables reside.
     12 _UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"
     13 
     14 # Largest valid Unicode code value.
     15 _RUNE_MAX = 0x10FFFF
     16 
     17 
     18 class Error(Exception):
     19   """Unicode error base class."""
     20 
     21 
     22 class InputError(Error):
     23   """Unicode input error class.  Raised on invalid input."""
     24 
     25 
     26 def _UInt(s):
     27   """Converts string to Unicode code point ('263A' => 0x263a).
     28 
     29   Args:
     30     s: string to convert
     31 
     32   Returns:
     33     Unicode code point
     34 
     35   Raises:
     36     InputError: the string is not a valid Unicode value.
     37   """
     38 
     39   try:
     40     v = int(s, 16)
     41   except ValueError:
     42     v = -1
     43   if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
     44     raise InputError("invalid Unicode value %s" % (s,))
     45   return v
     46 
     47 
     48 def _URange(s):
     49   """Converts string to Unicode range.
     50 
     51     '0001..0003' => [1, 2, 3].
     52     '0001' => [1].
     53 
     54   Args:
     55     s: string to convert
     56 
     57   Returns:
     58     Unicode range
     59 
     60   Raises:
     61     InputError: the string is not a valid Unicode range.
     62   """
     63   a = s.split("..")
     64   if len(a) == 1:
     65     return [_UInt(a[0])]
     66   if len(a) == 2:
     67     lo = _UInt(a[0])
     68     hi = _UInt(a[1])
     69     if lo < hi:
     70       return range(lo, hi + 1)
     71   raise InputError("invalid Unicode range %s" % (s,))
     72 
     73 
     74 def _UStr(v):
     75   """Converts Unicode code point to hex string.
     76 
     77     0x263a => '0x263A'.
     78 
     79   Args:
     80     v: code point to convert
     81 
     82   Returns:
     83     Unicode string
     84 
     85   Raises:
     86     InputError: the argument is not a valid Unicode value.
     87   """
     88   if v < 0 or v > _RUNE_MAX:
     89     raise InputError("invalid Unicode value %s" % (v,))
     90   return "0x%04X" % (v,)
     91 
     92 
     93 def _ParseContinue(s):
     94   """Parses a Unicode continuation field.
     95 
     96   These are of the form '<Name, First>' or '<Name, Last>'.
     97   Instead of giving an explicit range in a single table entry,
     98   some Unicode tables use two entries, one for the first
     99   code value in the range and one for the last.
    100   The first entry's description is '<Name, First>' instead of 'Name'
    101   and the second is '<Name, Last>'.
    102 
    103     '<Name, First>' => ('Name', 'First')
    104     '<Name, Last>' => ('Name', 'Last')
    105     'Anything else' => ('Anything else', None)
    106 
    107   Args:
    108     s: continuation field string
    109 
    110   Returns:
    111     pair: name and ('First', 'Last', or None)
    112   """
    113 
    114   match = re.match("<(.*), (First|Last)>", s)
    115   if match is not None:
    116     return match.groups()
    117   return (s, None)
    118 
    119 
    120 def ReadUnicodeTable(filename, nfields, doline):
    121   """Generic Unicode table text file reader.
    122 
    123   The reader takes care of stripping out comments and also
    124   parsing the two different ways that the Unicode tables specify
    125   code ranges (using the .. notation and splitting the range across
    126   multiple lines).
    127 
    128   Each non-comment line in the table is expected to have the given
    129   number of fields.  The first field is known to be the Unicode value
    130   and the second field its description.
    131 
    132   The reader calls doline(codes, fields) for each entry in the table.
    133   If fn raises an exception, the reader prints that exception,
    134   prefixed with the file name and line number, and continues
    135   processing the file.  When done with the file, the reader re-raises
    136   the first exception encountered during the file.
    137 
    138   Arguments:
    139     filename: the Unicode data file to read, or a file-like object.
    140     nfields: the number of expected fields per line in that file.
    141     doline: the function to call for each table entry.
    142 
    143   Raises:
    144     InputError: nfields is invalid (must be >= 2).
    145   """
    146 
    147   if nfields < 2:
    148     raise InputError("invalid number of fields %d" % (nfields,))
    149 
    150   if type(filename) == str:
    151     if filename.startswith("http://"):
    152       fil = urllib2.urlopen(filename)
    153     else:
    154       fil = open(filename, "r")
    155   else:
    156     fil = filename
    157 
    158   first = None        # first code in multiline range
    159   expect_last = None  # tag expected for "Last" line in multiline range
    160   lineno = 0          # current line number
    161   for line in fil:
    162     lineno += 1
    163     try:
    164       # Chop # comments and white space; ignore empty lines.
    165       sharp = line.find("#")
    166       if sharp >= 0:
    167         line = line[:sharp]
    168       line = line.strip()
    169       if not line:
    170         continue
    171 
    172       # Split fields on ";", chop more white space.
    173       # Must have the expected number of fields.
    174       fields = [s.strip() for s in line.split(";")]
    175       if len(fields) != nfields:
    176         raise InputError("wrong number of fields %d %d - %s" %
    177                          (len(fields), nfields, line))
    178 
    179       # The Unicode text files have two different ways
    180       # to list a Unicode range.  Either the first field is
    181       # itself a range (0000..FFFF), or the range is split
    182       # across two lines, with the second field noting
    183       # the continuation.
    184       codes = _URange(fields[0])
    185       (name, cont) = _ParseContinue(fields[1])
    186 
    187       if expect_last is not None:
    188         # If the last line gave the First code in a range,
    189         # this one had better give the Last one.
    190         if (len(codes) != 1 or codes[0] <= first or
    191             cont != "Last" or name != expect_last):
    192           raise InputError("expected Last line for %s" %
    193                            (expect_last,))
    194         codes = range(first, codes[0] + 1)
    195         first = None
    196         expect_last = None
    197         fields[0] = "%04X..%04X" % (codes[0], codes[-1])
    198         fields[1] = name
    199       elif cont == "First":
    200         # Otherwise, if this is the First code in a range,
    201         # remember it and go to the next line.
    202         if len(codes) != 1:
    203           raise InputError("bad First line: range given")
    204         expect_last = name
    205         first = codes[0]
    206         continue
    207 
    208       doline(codes, fields)
    209 
    210     except Exception, e:
    211       print "%s:%d: %s" % (filename, lineno, e)
    212       raise
    213 
    214   if expect_last is not None:
    215     raise InputError("expected Last line for %s; got EOF" %
    216                      (expect_last,))
    217 
    218 
    219 def CaseGroups(unicode_dir=_UNICODE_DIR):
    220   """Returns list of Unicode code groups equivalent under case folding.
    221 
    222   Each group is a sorted list of code points,
    223   and the list of groups is sorted by first code point
    224   in the group.
    225 
    226   Args:
    227     unicode_dir: Unicode data directory
    228 
    229   Returns:
    230     list of Unicode code groups
    231   """
    232 
    233   # Dict mapping lowercase code point to fold-equivalent group.
    234   togroup = {}
    235 
    236   def DoLine(codes, fields):
    237     """Process single CaseFolding.txt line, updating togroup."""
    238     (_, foldtype, lower, _) = fields
    239     if foldtype not in ("C", "S"):
    240       return
    241     lower = _UInt(lower)
    242     togroup.setdefault(lower, [lower]).extend(codes)
    243 
    244   ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
    245 
    246   groups = togroup.values()
    247   for g in groups:
    248     g.sort()
    249   groups.sort()
    250   return togroup, groups
    251 
    252 
    253 def Scripts(unicode_dir=_UNICODE_DIR):
    254   """Returns dict mapping script names to code lists.
    255 
    256   Args:
    257     unicode_dir: Unicode data directory
    258 
    259   Returns:
    260     dict mapping script names to code lists
    261   """
    262 
    263   scripts = {}
    264 
    265   def DoLine(codes, fields):
    266     """Process single Scripts.txt line, updating scripts."""
    267     (_, name) = fields
    268     scripts.setdefault(name, []).extend(codes)
    269 
    270   ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
    271   return scripts
    272 
    273 
    274 def Categories(unicode_dir=_UNICODE_DIR):
    275   """Returns dict mapping category names to code lists.
    276 
    277   Args:
    278     unicode_dir: Unicode data directory
    279 
    280   Returns:
    281     dict mapping category names to code lists
    282   """
    283 
    284   categories = {}
    285 
    286   def DoLine(codes, fields):
    287     """Process single UnicodeData.txt line, updating categories."""
    288     category = fields[2]
    289     categories.setdefault(category, []).extend(codes)
    290     # Add codes from Lu into L, etc.
    291     if len(category) > 1:
    292       short = category[0]
    293       categories.setdefault(short, []).extend(codes)
    294 
    295   ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
    296   return categories
    297 
    298