1 """Tokenization help for Python programs. 2 3 generate_tokens(readline) is a generator that breaks a stream of 4 text into Python tokens. It accepts a readline-like method which is called 5 repeatedly to get the next line of input (or "" for EOF). It generates 6 5-tuples with these members: 7 8 the token type (see token.py) 9 the token (a string) 10 the starting (row, column) indices of the token (a 2-tuple of ints) 11 the ending (row, column) indices of the token (a 2-tuple of ints) 12 the original line (string) 13 14 It is designed to match the working of the Python tokenizer exactly, except 15 that it produces COMMENT tokens for comments and gives type OP for all 16 operators 17 18 Older entry points 19 tokenize_loop(readline, tokeneater) 20 tokenize(readline, tokeneater=printtoken) 21 are the same, except instead of generating tokens, tokeneater is a callback 22 function to which the 5 fields described above are passed as 5 arguments, 23 each time a new token is found.""" 24 25 __author__ = 'Ka-Ping Yee <ping (at] lfw.org>' 26 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 27 'Skip Montanaro, Raymond Hettinger') 28 29 from itertools import chain 30 import string, re 31 from token import * 32 33 import token 34 __all__ = [x for x in dir(token) if not x.startswith("_")] 35 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] 36 del x 37 del token 38 39 COMMENT = N_TOKENS 40 tok_name[COMMENT] = 'COMMENT' 41 NL = N_TOKENS + 1 42 tok_name[NL] = 'NL' 43 N_TOKENS += 2 44 45 def group(*choices): return '(' + '|'.join(choices) + ')' 46 def any(*choices): return group(*choices) + '*' 47 def maybe(*choices): return group(*choices) + '?' 48 49 Whitespace = r'[ \f\t]*' 50 Comment = r'#[^\r\n]*' 51 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 52 Name = r'[a-zA-Z_]\w*' 53 54 Hexnumber = r'0[xX][\da-fA-F]+[lL]?' 55 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' 56 Binnumber = r'0[bB][01]+[lL]?' 57 Decnumber = r'[1-9]\d*[lL]?' 58 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 59 Exponent = r'[eE][-+]?\d+' 60 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 61 Expfloat = r'\d+' + Exponent 62 Floatnumber = group(Pointfloat, Expfloat) 63 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 64 Number = group(Imagnumber, Floatnumber, Intnumber) 65 66 # Tail end of ' string. 67 Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 68 # Tail end of " string. 69 Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 70 # Tail end of ''' string. 71 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 72 # Tail end of """ string. 73 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 74 Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') 75 # Single-line ' or " string. 76 String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 77 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 78 79 # Because of leftmost-then-longest match semantics, be sure to put the 80 # longest operators first (e.g., if = came before ==, == would get 81 # recognized as two instances of =). 82 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 83 r"//=?", 84 r"[+\-*/%&|^=<>]=?", 85 r"~") 86 87 Bracket = '[][(){}]' 88 Special = group(r'\r?\n', r'[:;.,`@]') 89 Funny = group(Operator, Bracket, Special) 90 91 PlainToken = group(Number, Funny, String, Name) 92 Token = Ignore + PlainToken 93 94 # First (or only) line of ' or " string. 95 ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 96 group("'", r'\\\r?\n'), 97 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 98 group('"', r'\\\r?\n')) 99 PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) 100 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 101 102 tokenprog, pseudoprog, single3prog, double3prog = map( 103 re.compile, (Token, PseudoToken, Single3, Double3)) 104 endprogs = {"'": re.compile(Single), '"': re.compile(Double), 105 "'''": single3prog, '"""': double3prog, 106 "r'''": single3prog, 'r"""': double3prog, 107 "u'''": single3prog, 'u"""': double3prog, 108 "ur'''": single3prog, 'ur"""': double3prog, 109 "R'''": single3prog, 'R"""': double3prog, 110 "U'''": single3prog, 'U"""': double3prog, 111 "uR'''": single3prog, 'uR"""': double3prog, 112 "Ur'''": single3prog, 'Ur"""': double3prog, 113 "UR'''": single3prog, 'UR"""': double3prog, 114 "b'''": single3prog, 'b"""': double3prog, 115 "br'''": single3prog, 'br"""': double3prog, 116 "B'''": single3prog, 'B"""': double3prog, 117 "bR'''": single3prog, 'bR"""': double3prog, 118 "Br'''": single3prog, 'Br"""': double3prog, 119 "BR'''": single3prog, 'BR"""': double3prog, 120 'r': None, 'R': None, 'u': None, 'U': None, 121 'b': None, 'B': None} 122 123 triple_quoted = {} 124 for t in ("'''", '"""', 125 "r'''", 'r"""', "R'''", 'R"""', 126 "u'''", 'u"""', "U'''", 'U"""', 127 "ur'''", 'ur"""', "Ur'''", 'Ur"""', 128 "uR'''", 'uR"""', "UR'''", 'UR"""', 129 "b'''", 'b"""', "B'''", 'B"""', 130 "br'''", 'br"""', "Br'''", 'Br"""', 131 "bR'''", 'bR"""', "BR'''", 'BR"""'): 132 triple_quoted[t] = t 133 single_quoted = {} 134 for t in ("'", '"', 135 "r'", 'r"', "R'", 'R"', 136 "u'", 'u"', "U'", 'U"', 137 "ur'", 'ur"', "Ur'", 'Ur"', 138 "uR'", 'uR"', "UR'", 'UR"', 139 "b'", 'b"', "B'", 'B"', 140 "br'", 'br"', "Br'", 'Br"', 141 "bR'", 'bR"', "BR'", 'BR"' ): 142 single_quoted[t] = t 143 144 tabsize = 8 145 146 class TokenError(Exception): pass 147 148 class StopTokenizing(Exception): pass 149 150 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing 151 srow, scol = srow_scol 152 erow, ecol = erow_ecol 153 print "%d,%d-%d,%d:\t%s\t%s" % \ 154 (srow, scol, erow, ecol, tok_name[type], repr(token)) 155 156 def tokenize(readline, tokeneater=printtoken): 157 """ 158 The tokenize() function accepts two parameters: one representing the 159 input stream, and one providing an output mechanism for tokenize(). 160 161 The first parameter, readline, must be a callable object which provides 162 the same interface as the readline() method of built-in file objects. 163 Each call to the function should return one line of input as a string. 164 165 The second parameter, tokeneater, must also be a callable object. It is 166 called once for each token, with five arguments, corresponding to the 167 tuples generated by generate_tokens(). 168 """ 169 try: 170 tokenize_loop(readline, tokeneater) 171 except StopTokenizing: 172 pass 173 174 # backwards compatible interface 175 def tokenize_loop(readline, tokeneater): 176 for token_info in generate_tokens(readline): 177 tokeneater(*token_info) 178 179 class Untokenizer: 180 181 def __init__(self): 182 self.tokens = [] 183 self.prev_row = 1 184 self.prev_col = 0 185 186 def add_whitespace(self, start): 187 row, col = start 188 if row < self.prev_row or row == self.prev_row and col < self.prev_col: 189 raise ValueError("start ({},{}) precedes previous end ({},{})" 190 .format(row, col, self.prev_row, self.prev_col)) 191 row_offset = row - self.prev_row 192 if row_offset: 193 self.tokens.append("\\\n" * row_offset) 194 self.prev_col = 0 195 col_offset = col - self.prev_col 196 if col_offset: 197 self.tokens.append(" " * col_offset) 198 199 def untokenize(self, iterable): 200 it = iter(iterable) 201 indents = [] 202 startline = False 203 for t in it: 204 if len(t) == 2: 205 self.compat(t, it) 206 break 207 tok_type, token, start, end, line = t 208 if tok_type == ENDMARKER: 209 break 210 if tok_type == INDENT: 211 indents.append(token) 212 continue 213 elif tok_type == DEDENT: 214 indents.pop() 215 self.prev_row, self.prev_col = end 216 continue 217 elif tok_type in (NEWLINE, NL): 218 startline = True 219 elif startline and indents: 220 indent = indents[-1] 221 if start[1] >= len(indent): 222 self.tokens.append(indent) 223 self.prev_col = len(indent) 224 startline = False 225 self.add_whitespace(start) 226 self.tokens.append(token) 227 self.prev_row, self.prev_col = end 228 if tok_type in (NEWLINE, NL): 229 self.prev_row += 1 230 self.prev_col = 0 231 return "".join(self.tokens) 232 233 def compat(self, token, iterable): 234 indents = [] 235 toks_append = self.tokens.append 236 startline = token[0] in (NEWLINE, NL) 237 prevstring = False 238 239 for tok in chain([token], iterable): 240 toknum, tokval = tok[:2] 241 242 if toknum in (NAME, NUMBER): 243 tokval += ' ' 244 245 # Insert a space between two consecutive strings 246 if toknum == STRING: 247 if prevstring: 248 tokval = ' ' + tokval 249 prevstring = True 250 else: 251 prevstring = False 252 253 if toknum == INDENT: 254 indents.append(tokval) 255 continue 256 elif toknum == DEDENT: 257 indents.pop() 258 continue 259 elif toknum in (NEWLINE, NL): 260 startline = True 261 elif startline and indents: 262 toks_append(indents[-1]) 263 startline = False 264 toks_append(tokval) 265 266 def untokenize(iterable): 267 """Transform tokens back into Python source code. 268 269 Each element returned by the iterable must be a token sequence 270 with at least two elements, a token number and token value. If 271 only two tokens are passed, the resulting output is poor. 272 273 Round-trip invariant for full input: 274 Untokenized source will match input source exactly 275 276 Round-trip invariant for limited intput: 277 # Output text will tokenize the back to the input 278 t1 = [tok[:2] for tok in generate_tokens(f.readline)] 279 newcode = untokenize(t1) 280 readline = iter(newcode.splitlines(1)).next 281 t2 = [tok[:2] for tok in generate_tokens(readline)] 282 assert t1 == t2 283 """ 284 ut = Untokenizer() 285 return ut.untokenize(iterable) 286 287 def generate_tokens(readline): 288 """ 289 The generate_tokens() generator requires one argument, readline, which 290 must be a callable object which provides the same interface as the 291 readline() method of built-in file objects. Each call to the function 292 should return one line of input as a string. Alternately, readline 293 can be a callable function terminating with StopIteration: 294 readline = open(myfile).next # Example of alternate readline 295 296 The generator produces 5-tuples with these members: the token type; the 297 token string; a 2-tuple (srow, scol) of ints specifying the row and 298 column where the token begins in the source; a 2-tuple (erow, ecol) of 299 ints specifying the row and column where the token ends in the source; 300 and the line on which the token was found. The line passed is the 301 logical line; continuation lines are included. 302 """ 303 lnum = parenlev = continued = 0 304 namechars, numchars = string.ascii_letters + '_', '0123456789' 305 contstr, needcont = '', 0 306 contline = None 307 indents = [0] 308 309 while 1: # loop over lines in stream 310 try: 311 line = readline() 312 except StopIteration: 313 line = '' 314 lnum += 1 315 pos, max = 0, len(line) 316 317 if contstr: # continued string 318 if not line: 319 raise TokenError, ("EOF in multi-line string", strstart) 320 endmatch = endprog.match(line) 321 if endmatch: 322 pos = end = endmatch.end(0) 323 yield (STRING, contstr + line[:end], 324 strstart, (lnum, end), contline + line) 325 contstr, needcont = '', 0 326 contline = None 327 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 328 yield (ERRORTOKEN, contstr + line, 329 strstart, (lnum, len(line)), contline) 330 contstr = '' 331 contline = None 332 continue 333 else: 334 contstr = contstr + line 335 contline = contline + line 336 continue 337 338 elif parenlev == 0 and not continued: # new statement 339 if not line: break 340 column = 0 341 while pos < max: # measure leading whitespace 342 if line[pos] == ' ': 343 column += 1 344 elif line[pos] == '\t': 345 column = (column//tabsize + 1)*tabsize 346 elif line[pos] == '\f': 347 column = 0 348 else: 349 break 350 pos += 1 351 if pos == max: 352 break 353 354 if line[pos] in '#\r\n': # skip comments or blank lines 355 if line[pos] == '#': 356 comment_token = line[pos:].rstrip('\r\n') 357 nl_pos = pos + len(comment_token) 358 yield (COMMENT, comment_token, 359 (lnum, pos), (lnum, pos + len(comment_token)), line) 360 yield (NL, line[nl_pos:], 361 (lnum, nl_pos), (lnum, len(line)), line) 362 else: 363 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], 364 (lnum, pos), (lnum, len(line)), line) 365 continue 366 367 if column > indents[-1]: # count indents or dedents 368 indents.append(column) 369 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 370 while column < indents[-1]: 371 if column not in indents: 372 raise IndentationError( 373 "unindent does not match any outer indentation level", 374 ("<tokenize>", lnum, pos, line)) 375 indents = indents[:-1] 376 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) 377 378 else: # continued statement 379 if not line: 380 raise TokenError, ("EOF in multi-line statement", (lnum, 0)) 381 continued = 0 382 383 while pos < max: 384 pseudomatch = pseudoprog.match(line, pos) 385 if pseudomatch: # scan for tokens 386 start, end = pseudomatch.span(1) 387 spos, epos, pos = (lnum, start), (lnum, end), end 388 if start == end: 389 continue 390 token, initial = line[start:end], line[start] 391 392 if initial in numchars or \ 393 (initial == '.' and token != '.'): # ordinary number 394 yield (NUMBER, token, spos, epos, line) 395 elif initial in '\r\n': 396 yield (NL if parenlev > 0 else NEWLINE, 397 token, spos, epos, line) 398 elif initial == '#': 399 assert not token.endswith("\n") 400 yield (COMMENT, token, spos, epos, line) 401 elif token in triple_quoted: 402 endprog = endprogs[token] 403 endmatch = endprog.match(line, pos) 404 if endmatch: # all on one line 405 pos = endmatch.end(0) 406 token = line[start:pos] 407 yield (STRING, token, spos, (lnum, pos), line) 408 else: 409 strstart = (lnum, start) # multiple lines 410 contstr = line[start:] 411 contline = line 412 break 413 elif initial in single_quoted or \ 414 token[:2] in single_quoted or \ 415 token[:3] in single_quoted: 416 if token[-1] == '\n': # continued string 417 strstart = (lnum, start) 418 endprog = (endprogs[initial] or endprogs[token[1]] or 419 endprogs[token[2]]) 420 contstr, needcont = line[start:], 1 421 contline = line 422 break 423 else: # ordinary string 424 yield (STRING, token, spos, epos, line) 425 elif initial in namechars: # ordinary name 426 yield (NAME, token, spos, epos, line) 427 elif initial == '\\': # continued stmt 428 continued = 1 429 else: 430 if initial in '([{': 431 parenlev += 1 432 elif initial in ')]}': 433 parenlev -= 1 434 yield (OP, token, spos, epos, line) 435 else: 436 yield (ERRORTOKEN, line[pos], 437 (lnum, pos), (lnum, pos+1), line) 438 pos += 1 439 440 for indent in indents[1:]: # pop remaining indent levels 441 yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 442 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 443 444 if __name__ == '__main__': # testing 445 import sys 446 if len(sys.argv) > 1: 447 tokenize(open(sys.argv[1]).readline) 448 else: 449 tokenize(sys.stdin.readline) 450