      1 """Tokenization help for Python programs.
      2 
      3 generate_tokens(readline) is a generator that breaks a stream of
      4 text into Python tokens.  It accepts a readline-like method which is called
      5 repeatedly to get the next line of input (or "" for EOF).  It generates
      6 5-tuples with these members:
      7 
      8     the token type (see token.py)
      9     the token (a string)
     10     the starting (row, column) indices of the token (a 2-tuple of ints)
     11     the ending (row, column) indices of the token (a 2-tuple of ints)
     12     the original line (string)
     13 
     14 It is designed to match the working of the Python tokenizer exactly, except
     15 that it produces COMMENT tokens for comments and gives type OP for all
     16 operators
     17 
     18 Older entry points
     19     tokenize_loop(readline, tokeneater)
     20     tokenize(readline, tokeneater=printtoken)
     21 are the same, except instead of generating tokens, tokeneater is a callback
     22 function to which the 5 fields described above are passed as 5 arguments,
     23 each time a new token is found."""
     24 
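# Illustrative usage (a minimal sketch, not part of the original module;
# it assumes the source text is wrapped in a StringIO object so that its
# readline method can be handed to generate_tokens):
#
#     from StringIO import StringIO
#     for tok_type, tok_str, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str), start, end
#
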
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
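# For example, group('a', 'b') returns '(a|b)', any('x') returns '(x)*',
# and maybe('x') returns '(x)?'.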

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
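# generate_tokens() below uses pseudoprog to pick out the next token on a
# line; for instance, pseudoprog.match('  spam = 1').span(1) should give
# (2, 6): the leading whitespace is skipped and group 1 captures 'spam'.
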
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

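# Illustrative call (a sketch; 'example.py' is a hypothetical file name):
#
#     def show(type, token, start, end, line):
#         print tok_name[type], repr(token)
#     tokenize(open('example.py').readline, show)
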
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

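# Helper behind the untokenize() function below: accumulates source
# fragments in self.tokens and tracks the row/column where the previous
# token ended.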
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

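    # Emit the backslash-newlines and spaces needed to advance from the
    # previous token's end (prev_row, prev_col) to the given start position.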
    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

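    # Rebuild source text from full 5-tuples, reproducing the original
    # positions; if a bare (type, string) pair is seen, fall back to the
    # position-less compat() path for the rest of the stream.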
    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

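    # Fallback for 2-tuple token streams: exact positions are unknown, so
    # spacing is approximated (a trailing space after names and numbers, a
    # space between adjacent strings) and indentation is replayed from
    # INDENT/DEDENT tokens.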
    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements per token are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable that raises StopIteration when the input is exhausted:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
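    # State carried across lines: contstr/contline accumulate a string
    # literal that spans several lines (needcont means each continuation
    # line must end with a backslash), parenlev tracks bracket nesting so
    # newlines inside (), [] or {} are reported as NL rather than NEWLINE,
    # and indents is the stack of indentation columns seen so far.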

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)