      1 """Tokenization help for Python programs.
      2 
      3 generate_tokens(readline) is a generator that breaks a stream of
      4 text into Python tokens.  It accepts a readline-like method which is called
      5 repeatedly to get the next line of input (or "" for EOF).  It generates
      6 5-tuples with these members:
      7 
      8     the token type (see token.py)
      9     the token (a string)
     10     the starting (row, column) indices of the token (a 2-tuple of ints)
     11     the ending (row, column) indices of the token (a 2-tuple of ints)
     12     the original line (string)
     13 
     14 It is designed to match the working of the Python tokenizer exactly, except
     15 that it produces COMMENT tokens for comments and gives type OP for all
     16 operators
     17 
     18 Older entry points
     19     tokenize_loop(readline, tokeneater)
     20     tokenize(readline, tokeneater=printtoken)
     21 are the same, except instead of generating tokens, tokeneater is a callback
     22 function to which the 5 fields described above are passed as 5 arguments,
     23 each time a new token is found."""
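
# Usage sketch (illustrative; StringIO is the standard Python 2 module and
# the source text is a made-up one-liner):
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print tok
#
# This prints the NAME, OP, NUMBER, NEWLINE and ENDMARKER 5-tuples for the
# statement "x = 1".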

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
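
# For illustration: group('a', 'b') yields '(a|b)', any('a', 'b') yields
# '(a|b)*', and maybe('a', 'b') yields '(a|b)?'.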

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
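
# For example, Number matches Python 2 literals such as '0xFFL', '0o17',
# '3.14e-2' and '1j'.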

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because alternatives in a pattern are tried from left to right (the first
# alternative that matches wins, not the longest), be sure to put the longest
# operators first (e.g., if = came before ==, == would get recognized as
# two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
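
# A sketch of why the order matters (hypothetical patterns built with the
# group() helper above):
#
#     re.match(group('==', '='), '==').group()   # -> '=='
#     re.match(group('=', '=='), '==').group()   # -> '='  (first alternative wins)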

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
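
# Sketch of the callback interface (illustrative; 'example.py' is an
# arbitrary file name):
#
#     def show(type, token, start, end, line):
#         print tok_name[type], repr(token)
#
#     tokenize(open('example.py').readline, show)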

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only those two elements are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)