# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

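# A minimal usage sketch (kept as a comment so importing this module stays
# side-effect free): feed generate_tokens() a readline callable, here taken
# from an in-memory string.
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok[0], repr(tok[1]))
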
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from itertools import chain
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )
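# For example, _combinations('r', 'R', 'b', 'B') yields the twelve usable
# prefix spellings {'r', 'R', 'b', 'B', 'rb', 'rB', 'Rb', 'RB', 'br', 'bR',
# 'Br', 'BR'}; pairs that differ only in case, such as 'rR', are excluded by
# the casefold() test.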

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because alternatives are tried in order (the first one that matches wins,
# not the longest), be sure to put the longest operators first (e.g., if =
# came before ==, == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
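# Illustrative sketch of how the main loop below uses pseudoprog: matched
# against "  spam = 1\n" at position 0, the leading whitespace is consumed
# and group 1 captures the first real token, so
#
#     pseudoprog.match("  spam = 1\n", 0).span(1)
#
# would give (2, 6), the span of the NAME token "spam".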

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)
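# So, for instance, "'''", 'r"""', and "Rb'''" are members of triple_quoted,
# while "'", 'b"', and "rb'" are members of single_quoted.  The prefix-only
# keys in endprogs map to None and are resolved to a concrete end pattern by
# the tokenizer loop below.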

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
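# Each line printtoken emits starts with "srow,scol-erow,ecol:" followed by
# the token type name and the token's repr(), separated by tabs, e.g. a line
# such as "1,0-1,4:", then NAME, then 'spam'.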

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
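
# Minimal sketch of the callback-style entry point above ("example.py" is a
# hypothetical file name):
#
#     with open("example.py") as f:
#         tokenize(f.readline)   # prints one line per token via printtoken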

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

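    # compat() handles the degraded two-field form documented in untokenize():
    # with only (type, string) pairs available, spacing is reconstructed
    # heuristically rather than from the recorded start/end positions.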
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        # Chain the first token back in front of the rest of the stream so it
        # is emitted too, preserving the round-trip invariant of untokenize().
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
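# For example, _get_normal_name("UTF-8") returns "utf-8",
# _get_normal_name("Latin_1") returns "iso-8859-1", and anything it does not
# recognise (say "ascii") is returned unchanged.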

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, a SyntaxError is raised as well.  Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
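
# Illustrative sketch of calling detect_encoding() on a bytes readline ("buf"
# is just an in-memory stand-in for a source file):
#
#     import io
#     buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])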

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the first two fields are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
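
# Illustrative round-trip sketch for the limited, two-field form described in
# the docstring above:
#
#     import io
#     pairs = [tok[:2]
#              for tok in generate_tokens(io.StringIO("if x:\n    y = 1\n").readline)]
#     source = untokenize(pairs)
#     # re-tokenizing `source` yields the same (type, string) pairs as `pairs`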

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable object terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)