# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import itertools
import token
import tokenize


def _Pairwise(iterable):
  """s -> (None, s0), (s0, s1), (s1, s2), (s2, s3), ..."""
  a, b = itertools.tee(iterable)
  a = itertools.chain((None,), a)
  # BUG FIX: itertools.izip exists only in Python 2; the built-in zip is
  # the Python 3 equivalent and behaves identically for this pairing.
  return zip(a, b)


class OffsetToken(object):
  """A Python token with a relative position.

  A token is represented by a type defined in Python's token module, a string
  representing the content, and an offset. Using relative positions makes it
  easy to insert and remove tokens.
  """

  def __init__(self, token_type, string, offset):
    self._type = token_type
    self._string = string
    self._offset = offset

  @property
  def type(self):
    # Numeric token type, as defined in the stdlib token module.
    return self._type

  @property
  def type_name(self):
    # Human-readable name for the token type (e.g. 'NAME', 'OP').
    return token.tok_name[self._type]

  @property
  def string(self):
    return self._string

  @string.setter
  def string(self, value):
    # The string is the only mutable part of a token; callers rewrite
    # token text in place without recomputing offsets.
    self._string = value

  @property
  def offset(self):
    # (row_delta, col) relative to the end of the previous token. When
    # row_delta is 0, col is the column gap from the previous token's end;
    # otherwise col is an absolute column on the new row.
    return self._offset

  def __str__(self):
    return str((self.type_name, self.string, self.offset))


def Tokenize(f):
  """Read tokens from a file-like object.

  Args:
    f: Any object that has a readline method.

  Returns:
    A collections.deque containing OffsetTokens. Deques are cheaper and easier
    to manipulate sequentially than lists.
  """
  f.seek(0)
  tokenize_tokens = tokenize.generate_tokens(f.readline)

  offset_tokens = collections.deque()
  for prev_token, next_token in _Pairwise(tokenize_tokens):
    token_type, string, (srow, scol), _, _ = next_token
    if not prev_token:
      # The very first token anchors the stream at the origin.
      offset_tokens.append(OffsetToken(token_type, string, (0, 0)))
    else:
      erow, ecol = prev_token[3]
      if erow == srow:
        # Same row: store the horizontal gap from the previous token's end.
        offset_tokens.append(OffsetToken(token_type, string, (0, scol - ecol)))
      else:
        # New row: store the row delta and the absolute column.
        offset_tokens.append(OffsetToken(
            token_type, string, (srow - erow, scol)))

  return offset_tokens


def Untokenize(offset_tokens):
  """Return the string representation of an iterable of OffsetTokens."""
  # Make a copy. Don't modify the original.
  offset_tokens = collections.deque(offset_tokens)

  # Strip leading NL tokens.
  while offset_tokens[0].type == tokenize.NL:
    offset_tokens.popleft()

  # Strip leading vertical whitespace.
  first_token = offset_tokens.popleft()
  # Take care not to modify the existing token. Create a new one in its place.
  first_token = OffsetToken(first_token.type, first_token.string,
                            (0, first_token.offset[1]))
  offset_tokens.appendleft(first_token)

  # Convert OffsetTokens to tokenize tokens by re-accumulating the relative
  # offsets into absolute (row, col) positions. The start and end positions
  # are deliberately identical, so tokenize.untokenize inserts exactly the
  # stored gap as whitespace between tokens.
  tokenize_tokens = []
  row = 1
  col = 0
  for t in offset_tokens:
    offset_row, offset_col = t.offset
    if offset_row == 0:
      col += offset_col
    else:
      row += offset_row
      col = offset_col
    tokenize_tokens.append((t.type, t.string, (row, col), (row, col), None))

  # tokenize can't handle whitespace before line continuations.
  # So add a space.
  return tokenize.untokenize(tokenize_tokens).replace('\\\n', ' \\\n')