1 """Tokenization help for Python programs. 2 3 generate_tokens(readline) is a generator that breaks a stream of 4 text into Python tokens. It accepts a readline-like method which is called 5 repeatedly to get the next line of input (or "" for EOF). It generates 6 5-tuples with these members: 7 8 the token type (see token.py) 9 the token (a string) 10 the starting (row, column) indices of the token (a 2-tuple of ints) 11 the ending (row, column) indices of the token (a 2-tuple of ints) 12 the original line (string) 13 14 It is designed to match the working of the Python tokenizer exactly, except 15 that it produces COMMENT tokens for comments and gives type OP for all 16 operators 17 18 Older entry points 19 tokenize_loop(readline, tokeneater) 20 tokenize(readline, tokeneater=printtoken) 21 are the same, except instead of generating tokens, tokeneater is a callback 22 function to which the 5 fields described above are passed as 5 arguments, 23 each time a new token is found.""" 24 25 __author__ = 'Ka-Ping Yee <ping (at] lfw.org>' 26 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 27 'Skip Montanaro, Raymond Hettinger') 28 29 from itertools import chain 30 import string, re 31 from token import * 32 33 import token 34 __all__ = [x for x in dir(token) if not x.startswith("_")] 35 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] 36 del x 37 del token 38 39 COMMENT = N_TOKENS 40 tok_name[COMMENT] = 'COMMENT' 41 NL = N_TOKENS + 1 42 tok_name[NL] = 'NL' 43 N_TOKENS += 2 44 45 def group(*choices): return '(' + '|'.join(choices) + ')' 46 def any(*choices): return group(*choices) + '*' 47 def maybe(*choices): return group(*choices) + '?' 48 49 Whitespace = r'[ \f\t]*' 50 Comment = r'#[^\r\n]*' 51 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 52 Name = r'[a-zA-Z_]\w*' 53 54 Hexnumber = r'0[xX][\da-fA-F]+[lL]?' 55 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' 56 Binnumber = r'0[bB][01]+[lL]?' 
# Remaining number forms; together with the hex/oct/bin patterns defined
# above these cover every Python 2 numeric literal.
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map each opening quote (with any legal prefix) to the compiled pattern
# matching the remainder of that string.  Bare prefix letters map to None
# so they can be looked up without KeyError while never matching.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # Parenthesized single-argument form prints identically under
    # Python 2 (print statement) and parses under Python 3.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:
    """Accumulates tokens and rebuilds source text from them."""

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Emit padding so the next token lands at position `start`."""
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Backslash-continuations reproduce skipped physical lines.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                # Only (type, string) pairs available: degrade gracefully.
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Fallback used when tokens carry no position information."""
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0          # pending multi-line string state
    contline = None
    indents = [0]

    last_line = b''
    line = b''
    while 1:                                   # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string not closed and not continued: error.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield (NEWLINE, '', (lnum - 1, len(last_line)),
               (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)