      1 """Tokenization help for Python programs.
      2 
      3 generate_tokens(readline) is a generator that breaks a stream of
      4 text into Python tokens.  It accepts a readline-like method which is called
      5 repeatedly to get the next line of input (or "" for EOF).  It generates
      6 5-tuples with these members:
      7 
      8     the token type (see token.py)
      9     the token (a string)
     10     the starting (row, column) indices of the token (a 2-tuple of ints)
     11     the ending (row, column) indices of the token (a 2-tuple of ints)
     12     the original line (string)
     13 
     14 It is designed to match the working of the Python tokenizer exactly, except
     15 that it produces COMMENT tokens for comments and gives type OP for all
     16 operators
     17 
     18 Older entry points
     19     tokenize_loop(readline, tokeneater)
     20     tokenize(readline, tokeneater=printtoken)
     21 are the same, except instead of generating tokens, tokeneater is a callback
     22 function to which the 5 fields described above are passed as 5 arguments,
     23 each time a new token is found."""
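
# A minimal usage sketch of the older callback interface described above,
# as seen from client code.  Illustrative only: the StringIO source and the
# 'show' callback are examples, not part of this module.
#
#     from StringIO import StringIO
#     import tokenize
#
#     def show(type, token, start, end, line):
#         print tokenize.tok_name[type], repr(token), start, end
#
#     tokenize.tokenize(StringIO("x = 1\n").readline, show)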

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

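# COMMENT and NL are extra token types not defined in token.py: COMMENT
# carries comment text, and NL marks newlines that do not end a logical line.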
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
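# For example, group('a', 'b') yields '(a|b)', any('x') yields '(x)*', and
# maybe('x') yields '(x)?'.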

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
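# PseudoToken is what generate_tokens() actually scans with: the group it
# captures can also be a comment, a backslash line continuation, the opening
# quotes of a triple-quoted string, or the first line of a continued
# single-quoted string, in addition to ordinary tokens.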
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

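# Tab stops are assumed every 8 columns when measuring indentation.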
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

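    # Fallback used when the caller supplies bare (type, string) 2-tuples:
    # without position information, spacing can only be approximated.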
    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed (no position information), the
    resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable that signals the end of input by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
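
    Example (a minimal sketch; 'example.py' is a hypothetical file):

        for tok in generate_tokens(open('example.py').readline):
            print tok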
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

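    # Tokenizer state: contstr/contline accumulate the text of a string that
    # spans multiple lines, parenlev tracks bracket nesting depth, continued
    # flags a backslash line continuation, and indents is the stack of
    # indentation columns seen so far.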
    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)