# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

import os
import re

try:
    import urllib2
except ImportError:
    # Python 3 moved urlopen to urllib.request; keep the old module name
    # bound so the rest of this file reads the same on both versions.
    import urllib.request as urllib2

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF

# A code point is written as 4 to 6 hex digits, with no sign or prefix.
_HEX_RE = re.compile(r"[0-9A-Fa-f]{4,6}\Z")

# Matches the '<Name, First>' / '<Name, Last>' continuation markers.
_CONTINUE_RE = re.compile(r"<(.*), (First|Last)>")

# Types accepted as a path or URL (Python 2 str/unicode, Python 3 str).
_STRING_TYPES = (str, type(u""))

# URL schemes that are fetched with urlopen instead of open().
_URL_PREFIXES = ("http://", "https://")


class Error(Exception):
    """Unicode error base class."""


class InputError(Error):
    """Unicode input error class. Raised on invalid input."""


def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    Args:
      s: string to convert; must be 4 to 6 hexadecimal digits

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # int(s, 16) alone would also accept '0x41', '+041' or '1_000',
    # none of which are valid in a Unicode data file.
    if not _HEX_RE.match(s):
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v


def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    Args:
      s: string to convert

    Returns:
      list of the code points in the range

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        if lo < hi:
            # Materialize so callers get a real list on Python 3 too.
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))


def _UStr(v):
    """Converts Unicode code point to hex string.

      0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      hex string, zero-padded to at least four digits

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)


def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    match = _CONTINUE_RE.match(s)
    if match is not None:
        return match.groups()
    return (s, None)


def _ToText(line):
    """Returns line as native text, decoding UTF-8 bytes on Python 3."""
    # On Python 2 bytes is str, so this is a no-op there.
    if isinstance(line, bytes) and not isinstance(line, str):
        return line.decode("utf-8")
    return line


def _ReadTableLines(fil, label, nfields, doline):
    """Parses the lines of fil, calling doline(codes, fields) per entry.

    Args:
      fil: iterable of lines (text or UTF-8 bytes).
      label: value printed in front of the line number in diagnostics.
      nfields: the number of expected fields per line.
      doline: the function to call for each table entry.

    Raises:
      InputError: a line is malformed, or a First line has no Last line.
    """
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    for line in fil:
        lineno += 1
        try:
            # Chop # comments and white space; ignore empty lines.
            line = _ToText(line)
            sharp = line.find("#")
            if sharp >= 0:
                line = line[:sharp]
            line = line.strip()
            if not line:
                continue

            # Split fields on ";", chop more white space.
            # Must have the expected number of fields.
            fields = [s.strip() for s in line.split(";")]
            if len(fields) != nfields:
                raise InputError("wrong number of fields %d %d - %s" %
                                 (len(fields), nfields, line))

            # The Unicode text files have two different ways
            # to list a Unicode range. Either the first field is
            # itself a range (0000..FFFF), or the range is split
            # across two lines, with the second field noting
            # the continuation.
            codes = _URange(fields[0])
            (name, cont) = _ParseContinue(fields[1])

            if expect_last is not None:
                # If the last line gave the First code in a range,
                # this one had better give the Last one.
                if (len(codes) != 1 or codes[0] <= first or
                        cont != "Last" or name != expect_last):
                    raise InputError("expected Last line for %s" %
                                     (expect_last,))
                codes = list(range(first, codes[0] + 1))
                first = None
                expect_last = None
                fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                fields[1] = name
            elif cont == "First":
                # Otherwise, if this is the First code in a range,
                # remember it and go to the next line.
                if len(codes) != 1:
                    raise InputError("bad First line: range given")
                expect_last = name
                first = codes[0]
                continue

            doline(codes, fields)

        except Exception as e:
            # Say where the failure happened, then let it propagate.
            print("%s:%d: %s" % (label, lineno, e))
            raise

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))


def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields. The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If a line is malformed or doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it; the rest of the file is not processed.

    Args:
      filename: the Unicode data file to read (a path or an http/https
        URL), or an iterable of lines such as a file-like object.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or a First line is not followed by its Last line.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    if not isinstance(filename, _STRING_TYPES):
        # Caller-supplied stream: the caller owns it, so do not close it.
        _ReadTableLines(filename, filename, nfields, doline)
        return

    if filename.startswith(_URL_PREFIXES):
        fil = urllib2.urlopen(filename)
    else:
        # Binary mode so local files and URLs both yield bytes on
        # Python 3 and are decoded the same way, whatever the locale.
        fil = open(filename, "rb")
    try:
        _ReadTableLines(fil, filename, nfields, doline)
    finally:
        fil.close()


def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      (lowercase) code point to its group, and groups is the sorted list
      of those same group lists.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        # Only common (C) and simple (S) foldings map to one code point.
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    # Copy the view into a list: dict views cannot be sorted on Python 3.
    groups = list(togroup.values())
    for g in groups:
        g.sort()
    groups.sort()
    return togroup, groups


def Scripts(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping script names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    scripts = {}

    def DoLine(codes, fields):
        """Process single Scripts.txt line, updating scripts."""
        (_, name) = fields
        scripts.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, DoLine)
    return scripts


def Categories(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping category names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """

    categories = {}

    def DoLine(codes, fields):
        """Process single UnicodeData.txt line, updating categories."""
        category = fields[2]
        categories.setdefault(category, []).extend(codes)
        # Add codes from Lu into L, etc.
        if len(category) > 1:
            short = category[0]
            categories.setdefault(short, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, DoLine)
    return categories