Home | History | Annotate | Download | only in refactor
      1 # Copyright 2015 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import parser
      6 import symbol
      7 import sys
      8 import token
      9 import tokenize
     10 
     11 from py_utils.refactor import offset_token
     12 
     13 
     14 class Snippet(object):
     15   """A node in the Python parse tree.
     16 
     17   The Python grammar is defined at:
     18   https://docs.python.org/2/reference/grammar.html
     19 
     20   There are two types of Snippets:
     21     TokenSnippets are leaf nodes containing actual text.
     22     Symbols are internal nodes representing higher-level groupings, and are
     23         defined by the left-hand sides of the BNFs in the above link.
     24   """
     25   @property
     26   def type(self):
     27     raise NotImplementedError()
     28 
     29   @property
     30   def type_name(self):
     31     raise NotImplementedError()
     32 
     33   @property
     34   def children(self):
     35     """Return a list of this node's children."""
     36     raise NotImplementedError()
     37 
     38   @property
     39   def tokens(self):
     40     """Return a tuple of the tokens this Snippet contains."""
     41     raise NotImplementedError()
     42 
     43   def PrintTree(self, indent=0, stream=sys.stdout):
     44     """Spew a pretty-printed parse tree. Mostly useful for debugging."""
     45     raise NotImplementedError()
     46 
     47   def __str__(self):
     48     return offset_token.Untokenize(self.tokens)
     49 
     50   def FindAll(self, snippet_type):
     51     if isinstance(snippet_type, int):
     52       if self.type == snippet_type:
     53         yield self
     54     else:
     55       if isinstance(self, snippet_type):
     56         yield self
     57 
     58     for child in self.children:
     59       for snippet in child.FindAll(snippet_type):
     60         yield snippet
     61 
     62   def FindChild(self, snippet_type, **kwargs):
     63     for child in self.children:
     64       if isinstance(snippet_type, int):
     65         if child.type != snippet_type:
     66           continue
     67       else:
     68         if not isinstance(child, snippet_type):
     69           continue
     70 
     71       for attribute, value in kwargs:
     72         if getattr(child, attribute) != value:
     73           break
     74       else:
     75         return child
     76     raise ValueError('%s is not in %s. Children are: %s' %
     77                      (snippet_type, self, self.children))
     78 
     79   def FindChildren(self, snippet_type):
     80     if isinstance(snippet_type, int):
     81       for child in self.children:
     82         if child.type == snippet_type:
     83           yield child
     84     else:
     85       for child in self.children:
     86         if isinstance(child, snippet_type):
     87           yield child
     88 
     89 
     90 class TokenSnippet(Snippet):
     91   """A Snippet containing a list of tokens.
     92 
     93   A list of tokens may start with any number of comments and non-terminating
     94   newlines, but must end with a syntactically meaningful token.
     95   """
     96 
     97   def __init__(self, token_type, tokens):
     98     # For operators and delimiters, the TokenSnippet's type may be more specific
     99     # than the type of the constituent token. E.g. the TokenSnippet type is
    100     # token.DOT, but the token type is token.OP. This is because the parser
    101     # has more context than the tokenizer.
    102     self._type = token_type
    103     self._tokens = tokens
    104     self._modified = False
    105 
    106   @classmethod
    107   def Create(cls, token_type, string, offset=(0, 0)):
    108     return cls(token_type,
    109                [offset_token.OffsetToken(token_type, string, offset)])
    110 
    111   @property
    112   def type(self):
    113     return self._type
    114 
    115   @property
    116   def type_name(self):
    117     return token.tok_name[self.type]
    118 
    119   @property
    120   def value(self):
    121     return self._tokens[-1].string
    122 
    123   @value.setter
    124   def value(self, value):
    125     self._tokens[-1].string = value
    126     self._modified = True
    127 
    128   @property
    129   def children(self):
    130     return []
    131 
    132   @property
    133   def tokens(self):
    134     return tuple(self._tokens)
    135 
    136   @property
    137   def modified(self):
    138     return self._modified
    139 
    140   def PrintTree(self, indent=0, stream=sys.stdout):
    141     stream.write(' ' * indent)
    142     if not self.tokens:
    143       print >> stream, self.type_name
    144       return
    145 
    146     print >> stream, '%-4s' % self.type_name, repr(self.tokens[0].string)
    147     for tok in self.tokens[1:]:
    148       stream.write(' ' * indent)
    149       print >> stream, ' ' * max(len(self.type_name), 4), repr(tok.string)
    150 
    151 
    152 class Symbol(Snippet):
    153   """A Snippet containing sub-Snippets.
    154 
    155   The possible types and type_names are defined in Python's symbol module."""
    156 
    157   def __init__(self, symbol_type, children):
    158     self._type = symbol_type
    159     self._children = children
    160 
    161   @property
    162   def type(self):
    163     return self._type
    164 
    165   @property
    166   def type_name(self):
    167     return symbol.sym_name[self.type]
    168 
    169   @property
    170   def children(self):
    171     return self._children
    172 
    173   @children.setter
    174   def children(self, value):  # pylint: disable=arguments-differ
    175     self._children = value
    176 
    177   @property
    178   def tokens(self):
    179     tokens = []
    180     for child in self.children:
    181       tokens += child.tokens
    182     return tuple(tokens)
    183 
    184   @property
    185   def modified(self):
    186     return any(child.modified for child in self.children)
    187 
    188   def PrintTree(self, indent=0, stream=sys.stdout):
    189     stream.write(' ' * indent)
    190 
    191     # If there's only one child, collapse it onto the same line.
    192     node = self
    193     while len(node.children) == 1 and len(node.children[0].children) == 1:
    194       print >> stream, node.type_name,
    195       node = node.children[0]
    196 
    197     print >> stream, node.type_name
    198     for child in node.children:
    199       child.PrintTree(indent + 2, stream)
    200 
    201 
def Snippetize(f):
  """Return the syntax tree of the given file.

  Args:
    f: An open file-like object containing Python source. It must be
        seekable: it is rewound here for the parser, and then handed to the
        tokenizer, which presumably rewinds/reads it again — the parse tree
        and token stream must come from identical text (TODO confirm
        offset_token.Tokenize's rewind behavior).

  Returns:
    The root Snippet of the parse tree, annotated with enough token
    information to exactly reproduce the original source.
  """
  f.seek(0)
  syntax_tree = parser.st2list(parser.suite(f.read()))
  tokens = offset_token.Tokenize(f)

  snippet = _SnippetizeNode(syntax_tree, tokens)
  # Every token must have been consumed by the tree walk; leftovers would
  # mean the tree and token stream got out of sync.
  assert not tokens
  return snippet
    211 
    212 
    213 def _SnippetizeNode(node, tokens):
    214   # The parser module gives a syntax tree that discards comments,
    215   # non-terminating newlines, and whitespace information. Use the tokens given
    216   # by the tokenize module to annotate the syntax tree with the information
    217   # needed to exactly reproduce the original source code.
    218   node_type = node[0]
    219 
    220   if node_type >= token.NT_OFFSET:
    221     # Symbol.
    222     children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    223     return Symbol(node_type, children)
    224   else:
    225     # Token.
    226     grabbed_tokens = []
    227     while tokens and (
    228         tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
    229       grabbed_tokens.append(tokens.popleft())
    230 
    231     # parser has 2 NEWLINEs right before the end.
    232     # tokenize has 0 or 1 depending on if the file has one.
    233     # Create extra nodes without consuming tokens to account for this.
    234     if node_type == token.NEWLINE:
    235       for tok in tokens:
    236         if tok.type == token.ENDMARKER:
    237           return TokenSnippet(node_type, grabbed_tokens)
    238         if tok.type != token.DEDENT:
    239           break
    240 
    241     assert tokens[0].type == token.OP or node_type == tokens[0].type
    242 
    243     grabbed_tokens.append(tokens.popleft())
    244     return TokenSnippet(node_type, grabbed_tokens)
    245