      1 """Tokenization help for Python programs.
      2 
      3 generate_tokens(readline) is a generator that breaks a stream of
      4 text into Python tokens.  It accepts a readline-like method which is called
      5 repeatedly to get the next line of input (or "" for EOF).  It generates
      6 5-tuples with these members:
      7 
      8     the token type (see token.py)
      9     the token (a string)
     10     the starting (row, column) indices of the token (a 2-tuple of ints)
     11     the ending (row, column) indices of the token (a 2-tuple of ints)
     12     the original line (string)
     13 
     14 It is designed to match the working of the Python tokenizer exactly, except
     15 that it produces COMMENT tokens for comments and gives type OP for all
     16 operators
     17 
     18 Older entry points
     19     tokenize_loop(readline, tokeneater)
     20     tokenize(readline, tokeneater=printtoken)
     21 are the same, except instead of generating tokens, tokeneater is a callback
     22 function to which the 5 fields described above are passed as 5 arguments,
     23 each time a new token is found."""
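
# A minimal usage sketch (illustrative only, not part of the original module):
# assuming a readable source file "example.py", generate_tokens() can be
# driven directly from the file's readline method:
#
#     import tokenize
#     f = open("example.py")
#     for toktype, tokstr, start, end, line in tokenize.generate_tokens(f.readline):
#         print tokenize.tok_name[toktype], start, repr(tokstr)
#     f.close()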

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
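# For example, group('==', '=') expands to '(==|=)', any(r'\d') to r'(\d)*',
# and maybe(r'\d') to r'(\d)?'.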

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
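
# Illustrative notes (assumptions, not part of the original source):
# pseudoprog is anchored with match() at the position just past the previous
# token and captures the next pseudo-token in group 1, e.g.
#
#     pseudoprog.match("x = 1", 1).span(1)    # -> (2, 3), the '=' operator
#
# endprogs maps every possible string opener (optional prefix plus quote) to
# the pattern that scans for the matching closing quote; bare one-letter
# prefixes map to None and are resolved against token[1] or token[2] below.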
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
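
# Example of the older callback interface (an illustrative sketch, not part of
# the original module; assumes a readable file "example.py"):
#
#     import tokenize
#     def show(toktype, tokstr, start, end, line):
#         print start[0], tokenize.tok_name[toktype], repr(tokstr)
#     tokenize.tokenize(open("example.py").readline, show)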

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
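
# Example round trip (an illustrative sketch, not part of the original module;
# assumes a readable file "example.py"):
#
#     import tokenize
#     f = open("example.py")
#     toks = list(tokenize.generate_tokens(f.readline))
#     f.close()
#     source = tokenize.untokenize(toks)    # full 5-tuples: exact round trip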

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be any callable that raises StopIteration when the input is
    exhausted:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
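                    # In this branch line[pos] is '\r' or '\n' (the '#' case
                    # was handled above), so the index expression below always
                    # selects NL.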
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)