# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

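# A minimal, illustrative usage sketch (added here for documentation; not part of
# the original module).  It feeds generate_tokens() a readline callable built from
# an in-memory string; the sample source text is hypothetical.
#
#   import io
#   source = "x = 1 + 2\n"
#   for tok_type, tok_string, start, end, logical_line in \
#           generate_tokens(io.StringIO(source).readline):
#       print(tok_name[tok_type], repr(tok_string), start, end)
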
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
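# Worked examples of the helpers above (added for clarity; not in the original):
#   group('a', 'b') -> '(a|b)'     alternation
#   any('a', 'b')   -> '(a|b)*'    zero or more repetitions
#   maybe('a', 'b') -> '(a|b)?'    optional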

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
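# Note (added for clarity; not in the original): PseudoToken keeps the leading
# Whitespace outside the capturing group, so for a match m, m.span(1) covers just
# the token text itself; generate_tokens() below relies on this when it calls
# pseudomatch.span(1).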

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
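
# A hedged usage sketch (added; not part of the original module): driving tokenize()
# with a custom tokeneater callback instead of the default printtoken.  The callback
# name and the sample source are hypothetical.
#
#   import io
#   names = []
#   def name_collector(tok_type, tok_string, start, end, logical_line):
#       if tok_type == NAME:
#           names.append(tok_string)
#   tokenize(io.StringIO("spam = eggs\n").readline, name_collector)
#   # names == ['spam', 'eggs']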

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
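# Added note: cookie_re is intended to match PEP 263 encoding declarations such as
# "# -*- coding: utf-8 -*-" and capture the encoding name, while blank_re matches
# lines that are empty or comment-only, after which a cookie may still appear on
# the second line.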

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the generate_tokens() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError is raised. If the encoding cookie is an invalid
    charset, a SyntaxError is likewise raised.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
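
# A minimal, illustrative sketch (added; not part of the original module): detecting
# the encoding of an in-memory source that carries a PEP 263 cookie.  The sample
# source bytes are hypothetical.
#
#   import io
#   buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#   encoding, lines = detect_encoding(buf.readline)
#   # encoding == "iso-8859-1"; lines holds the raw line(s) already consumed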

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
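
# A minimal round-trip illustration (added; not part of the original module), using
# an in-memory source string; the sample text is hypothetical.
#
#   import io
#   source = "x = 1\nif x:\n    x = 2\n"
#   toks = list(generate_tokens(io.StringIO(source).readline))
#   assert untokenize(toks) == source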

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)