# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except that instead of generating tokens, tokeneater is a
callback function to which the 5 fields described above are passed as 5
arguments each time a new token is found."""

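# Illustrative usage sketch ("example.py" is a placeholder file name):
#
#     f = open("example.py")
#     for tok_type, tok_str, start, end, logical_line in generate_tokens(f.readline):
#         print tok_name[tok_type], repr(tok_str), start, end
#
# tok_name comes from pgen2.token and is re-exported by the star import below.
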
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
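# For example, group('a', 'b') yields '(a|b)', any('a', 'b') yields '(a|b)*',
# and maybe('a', 'b') yields '(a|b)?'.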

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

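# Note: generate_tokens() below tries pseudoprog with .match() at successive
# positions in each line; match.span(1) covers the token text after any
# leading whitespace.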
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}
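# The bare prefix keys ('r', 'u', 'b' and their upper-case variants) map to
# None on purpose: in generate_tokens() the expression
#     endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]
# then falls through to the entry keyed by the actual quote character.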

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
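# Example lines matched by cookie_re:
#   # -*- coding: utf-8 -*-
#   # vim: set fileencoding=iso-8859-1 :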

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

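# Illustrative usage sketch for detect_encoding() ("example.py" is a
# placeholder file name):
#
#     import io
#     fp = io.open("example.py", "rb")
#     encoding, consumed_lines = detect_encoding(fp.readline)
#     # consumed_lines holds the raw, still-undecoded lines already read.
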
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
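    # State carried across lines:
    #   lnum      -- current line number (1-based)
    #   parenlev  -- current nesting depth of (), [] and {}
    #   continued -- true when the previous line ended with a backslash
    #   contstr, contline, strstart -- accumulate an unfinished multi-line string
    #   needcont  -- set when an open single-quoted string needs a backslash continuation
    #   indents   -- stack of indentation columns, used to emit INDENT/DEDENT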
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)