"""Better tokenizing for coverage.py."""

import keyword, re, token, tokenize
from coverage.backward import StringIO              # pylint: disable=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno

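# A worked example (illustrative sketch): for source that uses an explicit
# backslash continuation, such as::
#
#   src = "a = 1 + \\\n    2\n"
#   toks = tokenize.generate_tokens(StringIO(src).readline)
#
# iterating phys_tokens(toks) yields everything generate_tokens() yields, plus
# one synthetic (99999, "\\\n", ...) token for the backslash, so the physical
# source lines can be reconstructed exactly.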

def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
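    # Token types that are treated as pure whitespace: they never contribute a
    # (class, text) pair to the output lines.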
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
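        # Split the token text on newlines, keeping the '\n' separators, so
        # that multi-line tokens (triple-quoted strings, for example) are
        # emitted line by line.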
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
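

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): tokenize a small snippet and
    # print the list of (class, text) pairs produced for each of its lines.
    example_source = (
        "def hello():\n"
        "    # greet\n"
        "    print('hi')\n"
        )
    for token_line in source_token_lines(example_source):
        print(token_line)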