1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 2 # All rights reserved. 3 4 """Tokenization help for Python programs. 5 6 generate_tokens(readline) is a generator that breaks a stream of 7 text into Python tokens. It accepts a readline-like method which is called 8 repeatedly to get the next line of input (or "" for EOF). It generates 9 5-tuples with these members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17 It is designed to match the working of the Python tokenizer exactly, except 18 that it produces COMMENT tokens for comments and gives type OP for all 19 operators 20 21 Older entry points 22 tokenize_loop(readline, tokeneater) 23 tokenize(readline, tokeneater=printtoken) 24 are the same, except instead of generating tokens, tokeneater is a callback 25 function to which the 5 fields described above are passed as 5 arguments, 26 each time a new token is found.""" 27 28 __author__ = 'Ka-Ping Yee <ping (at] lfw.org>' 29 __credits__ = \ 30 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' 31 32 import string, re 33 from codecs import BOM_UTF8, lookup 34 from lib2to3.pgen2.token import * 35 36 from . import token 37 __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", 38 "generate_tokens", "untokenize"] 39 del token 40 41 try: 42 bytes 43 except NameError: 44 # Support bytes type in Python <= 2.5, so 2to3 turns itself into 45 # valid Python 3 code. 46 bytes = str 47 48 def group(*choices): return '(' + '|'.join(choices) + ')' 49 def any(*choices): return group(*choices) + '*' 50 def maybe(*choices): return group(*choices) + '?' 51 52 Whitespace = r'[ \f\t]*' 53 Comment = r'#[^\r\n]*' 54 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 55 Name = r'[a-zA-Z_]\w*' 56 57 Binnumber = r'0[bB][01]*' 58 Hexnumber = r'0[xX][\da-fA-F]*[lL]?' 59 Octnumber = r'0[oO]?[0-7]*[lL]?' 60 Decnumber = r'[1-9]\d*[lL]?' 61 Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) 62 Exponent = r'[eE][-+]?\d+' 63 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 64 Expfloat = r'\d+' + Exponent 65 Floatnumber = group(Pointfloat, Expfloat) 66 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 67 Number = group(Imagnumber, Floatnumber, Intnumber) 68 69 # Tail end of ' string. 70 Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 71 # Tail end of " string. 72 Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 73 # Tail end of ''' string. 74 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 75 # Tail end of """ string. 76 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 77 Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""') 78 # Single-line ' or " string. 79 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 80 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 81 82 # Because of leftmost-then-longest match semantics, be sure to put the 83 # longest operators first (e.g., if = came before ==, == would get 84 # recognized as two instances of =). 85 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 86 r"//=?", r"->", 87 r"[+\-*/%&|^=<>]=?", 88 r"~") 89 90 Bracket = '[][(){}]' 91 Special = group(r'\r?\n', r'[:;.,`@]') 92 Funny = group(Operator, Bracket, Special) 93 94 PlainToken = group(Number, Funny, String, Name) 95 Token = Ignore + PlainToken 96 97 # First (or only) line of ' or " string. 98 ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 99 group("'", r'\\\r?\n'), 100 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 101 group('"', r'\\\r?\n')) 102 PseudoExtras = group(r'\\\r?\n', Comment, Triple) 103 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 104 105 tokenprog, pseudoprog, single3prog, double3prog = map( 106 re.compile, (Token, PseudoToken, Single3, Double3)) 107 endprogs = {"'": re.compile(Single), '"': re.compile(Double), 108 "'''": single3prog, '"""': double3prog, 109 "r'''": single3prog, 'r"""': double3prog, 110 "u'''": single3prog, 'u"""': double3prog, 111 "b'''": single3prog, 'b"""': double3prog, 112 "ur'''": single3prog, 'ur"""': double3prog, 113 "br'''": single3prog, 'br"""': double3prog, 114 "R'''": single3prog, 'R"""': double3prog, 115 "U'''": single3prog, 'U"""': double3prog, 116 "B'''": single3prog, 'B"""': double3prog, 117 "uR'''": single3prog, 'uR"""': double3prog, 118 "Ur'''": single3prog, 'Ur"""': double3prog, 119 "UR'''": single3prog, 'UR"""': double3prog, 120 "bR'''": single3prog, 'bR"""': double3prog, 121 "Br'''": single3prog, 'Br"""': double3prog, 122 "BR'''": single3prog, 'BR"""': double3prog, 123 'r': None, 'R': None, 124 'u': None, 'U': None, 125 'b': None, 'B': None} 126 127 triple_quoted = {} 128 for t in ("'''", '"""', 129 "r'''", 'r"""', "R'''", 'R"""', 130 "u'''", 'u"""', "U'''", 'U"""', 131 "b'''", 'b"""', "B'''", 'B"""', 132 "ur'''", 'ur"""', "Ur'''", 'Ur"""', 133 "uR'''", 'uR"""', "UR'''", 'UR"""', 134 "br'''", 'br"""', "Br'''", 'Br"""', 135 "bR'''", 'bR"""', "BR'''", 'BR"""',): 136 triple_quoted[t] = t 137 single_quoted = {} 138 for t in ("'", '"', 139 "r'", 'r"', "R'", 'R"', 140 "u'", 'u"', "U'", 'U"', 141 "b'", 'b"', "B'", 'B"', 142 "ur'", 'ur"', "Ur'", 'Ur"', 143 "uR'", 'uR"', "UR'", 'UR"', 144 "br'", 'br"', "Br'", 'Br"', 145 "bR'", 'bR"', "BR'", 'BR"', ): 146 single_quoted[t] = t 147 148 tabsize = 8 149 150 class TokenError(Exception): pass 151 152 class StopTokenizing(Exception): pass 153 154 def printtoken(type, token, start, end, line): # for testing 155 (srow, scol) = start 156 (erow, ecol) = end 157 print "%d,%d-%d,%d:\t%s\t%s" % \ 158 (srow, scol, erow, ecol, tok_name[type], repr(token)) 159 160 def tokenize(readline, tokeneater=printtoken): 161 """ 162 The tokenize() function accepts two parameters: one representing the 163 input stream, and one providing an output mechanism for tokenize(). 164 165 The first parameter, readline, must be a callable object which provides 166 the same interface as the readline() method of built-in file objects. 167 Each call to the function should return one line of input as a string. 168 169 The second parameter, tokeneater, must also be a callable object. It is 170 called once for each token, with five arguments, corresponding to the 171 tuples generated by generate_tokens(). 172 """ 173 try: 174 tokenize_loop(readline, tokeneater) 175 except StopTokenizing: 176 pass 177 178 # backwards compatible interface 179 def tokenize_loop(readline, tokeneater): 180 for token_info in generate_tokens(readline): 181 tokeneater(*token_info) 182 183 class Untokenizer: 184 185 def __init__(self): 186 self.tokens = [] 187 self.prev_row = 1 188 self.prev_col = 0 189 190 def add_whitespace(self, start): 191 row, col = start 192 assert row <= self.prev_row 193 col_offset = col - self.prev_col 194 if col_offset: 195 self.tokens.append(" " * col_offset) 196 197 def untokenize(self, iterable): 198 for t in iterable: 199 if len(t) == 2: 200 self.compat(t, iterable) 201 break 202 tok_type, token, start, end, line = t 203 self.add_whitespace(start) 204 self.tokens.append(token) 205 self.prev_row, self.prev_col = end 206 if tok_type in (NEWLINE, NL): 207 self.prev_row += 1 208 self.prev_col = 0 209 return "".join(self.tokens) 210 211 def compat(self, token, iterable): 212 startline = False 213 indents = [] 214 toks_append = self.tokens.append 215 toknum, tokval = token 216 if toknum in (NAME, NUMBER): 217 tokval += ' ' 218 if toknum in (NEWLINE, NL): 219 startline = True 220 for tok in iterable: 221 toknum, tokval = tok[:2] 222 223 if toknum in (NAME, NUMBER): 224 tokval += ' ' 225 226 if toknum == INDENT: 227 indents.append(tokval) 228 continue 229 elif toknum == DEDENT: 230 indents.pop() 231 continue 232 elif toknum in (NEWLINE, NL): 233 startline = True 234 elif startline and indents: 235 toks_append(indents[-1]) 236 startline = False 237 toks_append(tokval) 238 239 cookie_re = re.compile("coding[:=]\s*([-\w.]+)") 240 241 def _get_normal_name(orig_enc): 242 """Imitates get_normal_name in tokenizer.c.""" 243 # Only care about the first 12 characters. 244 enc = orig_enc[:12].lower().replace("_", "-") 245 if enc == "utf-8" or enc.startswith("utf-8-"): 246 return "utf-8" 247 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 248 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 249 return "iso-8859-1" 250 return orig_enc 251 252 def detect_encoding(readline): 253 """ 254 The detect_encoding() function is used to detect the encoding that should 255 be used to decode a Python source file. It requires one argment, readline, 256 in the same way as the tokenize() generator. 257 258 It will call readline a maximum of twice, and return the encoding used 259 (as a string) and a list of any lines (left as bytes) it has read 260 in. 261 262 It detects the encoding from the presence of a utf-8 bom or an encoding 263 cookie as specified in pep-0263. If both a bom and a cookie are present, but 264 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid 265 charset, raise a SyntaxError. Note that if a utf-8 bom is found, 266 'utf-8-sig' is returned. 267 268 If no encoding is specified, then the default of 'utf-8' will be returned. 269 """ 270 bom_found = False 271 encoding = None 272 default = 'utf-8' 273 def read_or_stop(): 274 try: 275 return readline() 276 except StopIteration: 277 return bytes() 278 279 def find_cookie(line): 280 try: 281 line_string = line.decode('ascii') 282 except UnicodeDecodeError: 283 return None 284 285 matches = cookie_re.findall(line_string) 286 if not matches: 287 return None 288 encoding = _get_normal_name(matches[0]) 289 try: 290 codec = lookup(encoding) 291 except LookupError: 292 # This behaviour mimics the Python interpreter 293 raise SyntaxError("unknown encoding: " + encoding) 294 295 if bom_found: 296 if codec.name != 'utf-8': 297 # This behaviour mimics the Python interpreter 298 raise SyntaxError('encoding problem: utf-8') 299 encoding += '-sig' 300 return encoding 301 302 first = read_or_stop() 303 if first.startswith(BOM_UTF8): 304 bom_found = True 305 first = first[3:] 306 default = 'utf-8-sig' 307 if not first: 308 return default, [] 309 310 encoding = find_cookie(first) 311 if encoding: 312 return encoding, [first] 313 314 second = read_or_stop() 315 if not second: 316 return default, [first] 317 318 encoding = find_cookie(second) 319 if encoding: 320 return encoding, [first, second] 321 322 return default, [first, second] 323 324 def untokenize(iterable): 325 """Transform tokens back into Python source code. 326 327 Each element returned by the iterable must be a token sequence 328 with at least two elements, a token number and token value. If 329 only two tokens are passed, the resulting output is poor. 330 331 Round-trip invariant for full input: 332 Untokenized source will match input source exactly 333 334 Round-trip invariant for limited intput: 335 # Output text will tokenize the back to the input 336 t1 = [tok[:2] for tok in generate_tokens(f.readline)] 337 newcode = untokenize(t1) 338 readline = iter(newcode.splitlines(1)).next 339 t2 = [tok[:2] for tokin generate_tokens(readline)] 340 assert t1 == t2 341 """ 342 ut = Untokenizer() 343 return ut.untokenize(iterable) 344 345 def generate_tokens(readline): 346 """ 347 The generate_tokens() generator requires one argment, readline, which 348 must be a callable object which provides the same interface as the 349 readline() method of built-in file objects. Each call to the function 350 should return one line of input as a string. Alternately, readline 351 can be a callable function terminating with StopIteration: 352 readline = open(myfile).next # Example of alternate readline 353 354 The generator produces 5-tuples with these members: the token type; the 355 token string; a 2-tuple (srow, scol) of ints specifying the row and 356 column where the token begins in the source; a 2-tuple (erow, ecol) of 357 ints specifying the row and column where the token ends in the source; 358 and the line on which the token was found. The line passed is the 359 logical line; continuation lines are included. 360 """ 361 lnum = parenlev = continued = 0 362 namechars, numchars = string.ascii_letters + '_', '0123456789' 363 contstr, needcont = '', 0 364 contline = None 365 indents = [0] 366 367 while 1: # loop over lines in stream 368 try: 369 line = readline() 370 except StopIteration: 371 line = '' 372 lnum = lnum + 1 373 pos, max = 0, len(line) 374 375 if contstr: # continued string 376 if not line: 377 raise TokenError, ("EOF in multi-line string", strstart) 378 endmatch = endprog.match(line) 379 if endmatch: 380 pos = end = endmatch.end(0) 381 yield (STRING, contstr + line[:end], 382 strstart, (lnum, end), contline + line) 383 contstr, needcont = '', 0 384 contline = None 385 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 386 yield (ERRORTOKEN, contstr + line, 387 strstart, (lnum, len(line)), contline) 388 contstr = '' 389 contline = None 390 continue 391 else: 392 contstr = contstr + line 393 contline = contline + line 394 continue 395 396 elif parenlev == 0 and not continued: # new statement 397 if not line: break 398 column = 0 399 while pos < max: # measure leading whitespace 400 if line[pos] == ' ': column = column + 1 401 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize 402 elif line[pos] == '\f': column = 0 403 else: break 404 pos = pos + 1 405 if pos == max: break 406 407 if line[pos] in '#\r\n': # skip comments or blank lines 408 if line[pos] == '#': 409 comment_token = line[pos:].rstrip('\r\n') 410 nl_pos = pos + len(comment_token) 411 yield (COMMENT, comment_token, 412 (lnum, pos), (lnum, pos + len(comment_token)), line) 413 yield (NL, line[nl_pos:], 414 (lnum, nl_pos), (lnum, len(line)), line) 415 else: 416 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], 417 (lnum, pos), (lnum, len(line)), line) 418 continue 419 420 if column > indents[-1]: # count indents or dedents 421 indents.append(column) 422 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 423 while column < indents[-1]: 424 if column not in indents: 425 raise IndentationError( 426 "unindent does not match any outer indentation level", 427 ("<tokenize>", lnum, pos, line)) 428 indents = indents[:-1] 429 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) 430 431 else: # continued statement 432 if not line: 433 raise TokenError, ("EOF in multi-line statement", (lnum, 0)) 434 continued = 0 435 436 while pos < max: 437 pseudomatch = pseudoprog.match(line, pos) 438 if pseudomatch: # scan for tokens 439 start, end = pseudomatch.span(1) 440 spos, epos, pos = (lnum, start), (lnum, end), end 441 token, initial = line[start:end], line[start] 442 443 if initial in numchars or \ 444 (initial == '.' and token != '.'): # ordinary number 445 yield (NUMBER, token, spos, epos, line) 446 elif initial in '\r\n': 447 newline = NEWLINE 448 if parenlev > 0: 449 newline = NL 450 yield (newline, token, spos, epos, line) 451 elif initial == '#': 452 assert not token.endswith("\n") 453 yield (COMMENT, token, spos, epos, line) 454 elif token in triple_quoted: 455 endprog = endprogs[token] 456 endmatch = endprog.match(line, pos) 457 if endmatch: # all on one line 458 pos = endmatch.end(0) 459 token = line[start:pos] 460 yield (STRING, token, spos, (lnum, pos), line) 461 else: 462 strstart = (lnum, start) # multiple lines 463 contstr = line[start:] 464 contline = line 465 break 466 elif initial in single_quoted or \ 467 token[:2] in single_quoted or \ 468 token[:3] in single_quoted: 469 if token[-1] == '\n': # continued string 470 strstart = (lnum, start) 471 endprog = (endprogs[initial] or endprogs[token[1]] or 472 endprogs[token[2]]) 473 contstr, needcont = line[start:], 1 474 contline = line 475 break 476 else: # ordinary string 477 yield (STRING, token, spos, epos, line) 478 elif initial in namechars: # ordinary name 479 yield (NAME, token, spos, epos, line) 480 elif initial == '\\': # continued stmt 481 # This yield is new; needed for better idempotency: 482 yield (NL, token, spos, (lnum, pos), line) 483 continued = 1 484 else: 485 if initial in '([{': parenlev = parenlev + 1 486 elif initial in ')]}': parenlev = parenlev - 1 487 yield (OP, token, spos, epos, line) 488 else: 489 yield (ERRORTOKEN, line[pos], 490 (lnum, pos), (lnum, pos+1), line) 491 pos = pos + 1 492 493 for indent in indents[1:]: # pop remaining indent levels 494 yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 495 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 496 497 if __name__ == '__main__': # testing 498 import sys 499 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) 500 else: tokenize(sys.stdin.readline) 501