# contrib/tables/unicode_parse_common.py

def lines_get(f):
  '''Parse a file like object, removing comments and returning a list of
     lines.

     Everything from the first '#' on a line to its end is treated as a
     comment and dropped. The line terminator is removed, and lines that are
     empty once the comment and terminator are gone are omitted. Any other
     whitespace is left untouched.

     @f must provide a readlines() method returning strings.'''
  def cut_comment(line):
    # Keep only the text before the first '#', if there is one.
    first_hash = line.find('#')
    if first_hash == -1:
      return line
    return line[:first_hash]

  # Strip only real line terminators rather than blindly chopping the last
  # character: the final line of a file may not end with a newline, and
  # slicing would then eat a meaningful character.
  stripped = [cut_comment(x.rstrip('\r\n')) for x in f.readlines()]
  return [x for x in stripped if x]

def line_split(line):
  '''Break @line into its semicolon separated fields, returning them as a
     list with the surrounding whitespace of every field removed.'''
  fields = []
  for raw_field in line.split(';'):
    fields.append(raw_field.strip())
  return fields

def codepoints_parse(token):
  '''Convert a Unicode style code-point token into numbers.

     A token of the form "XXXX" yields a single integer, while a token of the
     form "XXXX..YYYY" yields a (start, end) tuple. The digits are read as
     hexadecimal. ValueError is raised when the token holds more than one
     ".." separator or when a part is not valid hexadecimal.'''
  pieces = token.split('..')
  if len(pieces) > 2:
    raise ValueError(token)

  numbers = [int(piece, 16) for piece in pieces]
  if len(numbers) == 1:
    return numbers[0]
  return (numbers[0], numbers[1])

def unicode_file_parse(input, map, default_value = None):
  '''Read the file like object @input, whose lines hold two semicolon
     separated columns: a code-point (or code-point range) and a key that is
     looked up in @map.

     Returns a list of (start, end, value) tuples in file order, where a
     single code-point is reported as a range with start == end. Lines whose
     mapped value equals @default_value are left out. ValueError is raised
     for a line that does not have exactly two columns; a failed lookup in
     @map propagates to the caller.'''
  collected = []
  for raw_line in lines_get(input):
    fields = line_split(raw_line)
    if len(fields) != 2:
      raise ValueError(fields)

    span = codepoints_parse(fields[0])
    mapped = map[fields[1]]
    if mapped == default_value:
      continue

    # A lone code-point is widened to a one element range.
    if type(span) is int:
      first = last = span
    else:
      first, last = span[0], span[1]

    collected.append((first, last, mapped))

  return collected

def sort_and_merge(ranges):
  '''Given an iterable of (start, end, value), merge elements where the
     ranges are continuous and the values are the same.

     Two neighbouring elements (after sorting) are merged when the second
     starts exactly one past the end of the first and both carry an equal
     value. Overlapping ranges are not merged.

     Returns a new list ordered by the natural ordering of the elements.
     @ranges itself is not modified.'''
  output = []
  current = None
  # sorted() works on a copy, so the caller's list keeps its original order
  # and any iterable (not just a list) is accepted.
  for v in sorted(ranges):
    if current is None:
      current = v
    elif current[1] + 1 == v[0] and current[2] == v[2]:
      # Directly adjacent with the same value: extend the pending range.
      current = (current[0], v[1], v[2])
    else:
      output.append(current)
      current = v
  # Flush the last pending range, if there was any input at all.
  if current is not None:
    output.append(current)

  return output