      1 """Tokenization help for Python programs.
      2 
      3 tokenize(readline) is a generator that breaks a stream of bytes into
      4 Python tokens.  It decodes the bytes according to PEP-0263 for
      5 determining source file encoding.
      6 
      7 It accepts a readline-like method which is called repeatedly to get the
      8 next line of input (or b"" for EOF).  It generates 5-tuples with these
      9 members:
     10 
     11     the token type (see token.py)
     12     the token (a string)
     13     the starting (row, column) indices of the token (a 2-tuple of ints)
     14     the ending (row, column) indices of the token (a 2-tuple of ints)
     15     the original line (string)
     16 
     17 It is designed to match the working of the Python tokenizer exactly, except
     18 that it produces COMMENT tokens for comments and gives type OP for all
     19 operators.  Additionally, all token lists start with an ENCODING token
     20 which tells you which encoding was used to decode the bytes stream.
     21 """

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    '@=':  ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

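# Example (editor's illustration): exact_type refines OP tokens into specific
# operator types, while .type stays OP.
#
#     from io import BytesIO
#     from token import PLUS
#     from tokenize import tokenize, OP
#     toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     plus = next(t for t in toks if t.string == '+')
#     assert plus.type == OP and plus.exact_type == PLUS
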
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contains the lower case versions,
    #  and doesn't contain any permutations (includes 'fr', but not
    #  'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

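# Example (editor's illustration): _all_string_prefixes() expands the listed
# prefixes into every ordering and capitalization, plus the empty string.
#
#     prefixes = _all_string_prefixes()
#     assert '' in prefixes
#     assert 'Rb' in prefixes and 'bR' in prefixes   # generated from 'br'
#     assert 'fR' in prefixes and 'Rf' in prefixes   # generated from 'fr'
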
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

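# Example (editor's illustration): PseudoToken matches one token (group 1)
# after optional leading whitespace; _tokenize() below scans each line by
# repeatedly matching it at the current position.
#
#     m = _compile(PseudoToken).match("    spam = 42\n", 0)
#     assert m.group(1) == "spam"
#     m = _compile(PseudoToken).match("    spam = 42\n", m.end())
#     assert m.group(1) == "="
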
# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        # Pad with backslash-continuations and spaces so that the next token
        # lands at position `start` relative to the previous token's end.
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                # Position information is missing; fall back to compat mode.
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # Reconstruct source from (type, string) pairs only; spacing is
        # approximated, so the output need not match the original layout.
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the two-element form is passed, spacing in the resulting
    output is approximated rather than reproduced exactly.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

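# Example (editor's illustration): the full-input round trip described in the
# docstring above, using complete TokenInfo tuples.
#
#     from io import BytesIO
#     from tokenize import tokenize, untokenize
#     source = b"x = 1\nif x:\n    y = 2\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source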

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

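# Example (editor's illustration): _get_normal_name() canonicalizes common
# spellings so the codec lookup in detect_encoding() sees one name per family.
#
#     assert _get_normal_name("UTF_8") == "utf-8"
#     assert _get_normal_name("Latin-1") == "iso-8859-1"
#     assert _get_normal_name("ascii") == "ascii"
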
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


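# Example (editor's illustration): detect_encoding() reads at most two lines
# and returns the declared encoding plus the raw lines it consumed.
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     enc, consumed = detect_encoding(buf.readline)
#     assert enc == "iso-8859-1"
#     assert consumed == [b"# -*- coding: latin-1 -*-\n"]
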
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

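# Example (editor's illustration): generate_tokens() accepts a readline that
# returns str (e.g. from io.StringIO); no ENCODING token is produced because
# no byte decoding takes place.
#
#     from io import StringIO
#     toks = list(generate_tokens(StringIO("x = 1\n").readline))
#     assert toks[0].type == NAME and toks[0].string == "x"
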
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

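# Example (editor's illustration): command-line usage; "myscript.py" is a
# placeholder file name.
#
#     $ python -m tokenize myscript.py
#     $ python -m tokenize -e myscript.py    # report exact operator token types
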
if __name__ == "__main__":
    main()