def lines_get(f):
    '''Parse a file like object, removing comments and returning a list of
    lines.

    Everything from the first '#' to the end of a line is dropped, together
    with the line's trailing newline. Lines that are empty after that are
    omitted; lines that still contain only whitespace are kept unchanged.
    '''
    def cut_comment(line):
        # partition() hands back the whole line when it contains no '#'.
        return line.partition('#')[0]

    # Only remove a newline that is really present: the last line of a file
    # may not be terminated, and blindly slicing off the final character
    # would then discard real data.
    cleaned = [cut_comment(raw.rstrip('\n')) for raw in f.readlines()]
    return [text for text in cleaned if text]


def line_split(line):
    '''Split a line based on a semicolon separator.

    Returns the list of fields with leading and trailing whitespace removed
    from each one.
    '''
    return [field.strip() for field in line.split(';')]


def codepoints_parse(token):
    '''Parse a Unicode style code-point range. Return either a single value or
    a tuple of (start, end) for a range of code-points.

    @token is either one hexadecimal number ("0041") or two of them joined by
    '..' ("0041..005A"). Raises ValueError when the token has more than one
    '..' separator or when a part is not valid hexadecimal.
    '''
    def from_hex(text):
        return int(text, 16)

    parts = token.split('..')
    if len(parts) == 1:
        return from_hex(parts[0])
    if len(parts) == 2:
        return (from_hex(parts[0]), from_hex(parts[1]))
    raise ValueError(token)


def unicode_file_parse(input, map, default_value = None):
    '''Parse a file like object, @input where the first column is a code-point
    range and the second column is mapped via the given dict, @map.

    Returns a list of (start, end, value) tuples in file order; a single
    code-point is stored as a range whose start and end are equal. Entries
    whose mapped value equals @default_value are left out.

    Raises ValueError for a line that does not have exactly two
    semicolon-separated columns, and KeyError when the second column is not a
    key of @map.
    '''
    ranges = []
    for columns in (line_split(x) for x in lines_get(input)):
        if len(columns) != 2:
            raise ValueError(columns)

        codepoints = codepoints_parse(columns[0])
        value = map[columns[1]]
        if value == default_value:
            continue

        # Normalise a lone code-point to a one-element range.
        if isinstance(codepoints, int):
            codepoints = (codepoints, codepoints)

        ranges.append((codepoints[0], codepoints[1], value))

    return ranges


def sort_and_merge(ranges):
    '''Given a list of (start, end, value), merge elements where the ranges are
    continuous and the values are the same.

    Note that @ranges is sorted in place. The merged result is returned as a
    new list; an empty input gives an empty list.
    '''
    output = []
    ranges.sort()
    current = None
    for item in ranges:
        if current is None:
            current = item
        elif current[1] + 1 == item[0] and current[2] == item[2]:
            # Directly adjacent with the same value: extend the open range.
            current = (current[0], item[1], item[2])
        else:
            output.append(current)
            current = item

    # Flush the range that was still open when the input ran out.
    if current is not None:
        output.append(current)

    return output