"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

The older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except that instead of generating tokens, tokeneater is a
callback function to which the 5 fields described above are passed as 5
arguments each time a new token is found."""
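
# A minimal usage sketch (illustrative, not part of the original module):
# feed generate_tokens() any readline-style callable and iterate over the
# 5-tuples it yields.
#
#     import StringIO
#     for tok in generate_tokens(StringIO.StringIO("x = 1\n").readline):
#         printtoken(*tok)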

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

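# For example (derived directly from the definitions above): group('a', 'b')
# produces '(a|b)', any(r'\d') produces r'(\d)*', and maybe('x')
# produces '(x)?'.
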
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

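# For example, Number matches '0x1fL' (Hexnumber), '3.14e-10' (Pointfloat
# with an exponent), and '10j' (Imagnumber).
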
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Python's re module tries alternatives from left to right, so be sure to
# put the longest operators first (e.g., if = came before ==, == would
# get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
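
# For example, pseudoprog.match("    print x") consumes the leading
# whitespace, and group 1 captures the token text: span(1) == (4, 9)
# and group(1) == 'print'.
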
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

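# Both mappings are used only for fast membership tests in the tokenizer
# below, e.g. token in triple_quoted and token[:2] in single_quoted.
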
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

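# Sample printtoken output (tab-separated), e.g. for the name 'x'
# starting at row 1, column 0:
#     1,0-1,1:    NAME    'x'
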
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards-compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

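# A hedged sketch of a custom tokeneater callback ("example.py" is a
# hypothetical input file):
#
#     counts = {}
#     def count_names(type, token, start, end, line):
#         if type == NAME:
#             counts[token] = counts.get(token, 0) + 1
#     tokenize(open("example.py").readline, count_names)
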
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

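    # For example, moving from a previous end of (1, 5) to a start of
    # (3, 2) appends two escaped line breaks ("\\\n" twice) followed by
    # two spaces.
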
    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed, the output has the right tokens but
    only approximate whitespace.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

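# A round-trip sketch with full 5-tuples, mirroring the documented
# full-input invariant ("example.py" is a hypothetical input file):
#
#     import StringIO
#     source = open("example.py").read()
#     toks = generate_tokens(StringIO.StringIO(source).readline)
#     assert untokenize(toks) == source
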
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable object terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    last_line = b''
    line = b''
    while 1:                                   # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this always yields NL
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield (NEWLINE, '', (lnum - 1, len(last_line)),
               (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)