1 """Tokenization help for Python programs. 2 3 generate_tokens(readline) is a generator that breaks a stream of 4 text into Python tokens. It accepts a readline-like method which is called 5 repeatedly to get the next line of input (or "" for EOF). It generates 6 5-tuples with these members: 7 8 the token type (see token.py) 9 the token (a string) 10 the starting (row, column) indices of the token (a 2-tuple of ints) 11 the ending (row, column) indices of the token (a 2-tuple of ints) 12 the original line (string) 13 14 It is designed to match the working of the Python tokenizer exactly, except 15 that it produces COMMENT tokens for comments and gives type OP for all 16 operators 17 18 Older entry points 19 tokenize_loop(readline, tokeneater) 20 tokenize(readline, tokeneater=printtoken) 21 are the same, except instead of generating tokens, tokeneater is a callback 22 function to which the 5 fields described above are passed as 5 arguments, 23 each time a new token is found.""" 24 25 __author__ = 'Ka-Ping Yee <ping (at] lfw.org>' 26 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 27 'Skip Montanaro, Raymond Hettinger') 28 29 from itertools import chain 30 import string, re 31 from token import * 32 33 import token 34 __all__ = [x for x in dir(token) if not x.startswith("_")] 35 __all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"] 36 del x 37 del token 38 39 COMMENT = N_TOKENS 40 tok_name[COMMENT] = 'COMMENT' 41 NL = N_TOKENS + 1 42 tok_name[NL] = 'NL' 43 N_TOKENS += 2 44 45 def group(*choices): return '(' + '|'.join(choices) + ')' 46 def any(*choices): return group(*choices) + '*' 47 def maybe(*choices): return group(*choices) + '?' 48 49 Whitespace = r'[ \f\t]*' 50 Comment = r'#[^\r\n]*' 51 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 52 Name = r'[a-zA-Z_]\w*' 53 54 Hexnumber = r'0[xX][\da-fA-F]+[lL]?' 55 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?' 56 Binnumber = r'0[bB][01]+[lL]?' 57 Decnumber = r'[1-9]\d*[lL]?' 58 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 59 Exponent = r'[eE][-+]?\d+' 60 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 61 Expfloat = r'\d+' + Exponent 62 Floatnumber = group(Pointfloat, Expfloat) 63 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 64 Number = group(Imagnumber, Floatnumber, Intnumber) 65 66 # Tail end of ' string. 67 Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 68 # Tail end of " string. 69 Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 70 # Tail end of ''' string. 71 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 72 # Tail end of """ string. 73 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 74 Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""') 75 # Single-line ' or " string. 76 String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 77 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 78 79 # Because of leftmost-then-longest match semantics, be sure to put the 80 # longest operators first (e.g., if = came before ==, == would get 81 # recognized as two instances of =). 82 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 83 r"//=?", 84 r"[+\-*/%&|^=<>]=?", 85 r"~") 86 87 Bracket = '[][(){}]' 88 Special = group(r'\r?\n', r'[:;.,`@]') 89 Funny = group(Operator, Bracket, Special) 90 91 PlainToken = group(Number, Funny, String, Name) 92 Token = Ignore + PlainToken 93 94 # First (or only) line of ' or " string. 

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
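
# For example (illustrative values), printtoken(NAME, 'spam', (1, 0), (1, 4),
# 'spam = 1\n') prints a line of the form:
#
#     1,0-1,4:    NAME    'spam'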
168 """ 169 try: 170 tokenize_loop(readline, tokeneater) 171 except StopTokenizing: 172 pass 173 174 # backwards compatible interface 175 def tokenize_loop(readline, tokeneater): 176 for token_info in generate_tokens(readline): 177 tokeneater(*token_info) 178 179 class Untokenizer: 180 181 def __init__(self): 182 self.tokens = [] 183 self.prev_row = 1 184 self.prev_col = 0 185 186 def add_whitespace(self, start): 187 row, col = start 188 if row < self.prev_row or row == self.prev_row and col < self.prev_col: 189 raise ValueError("start ({},{}) precedes previous end ({},{})" 190 .format(row, col, self.prev_row, self.prev_col)) 191 row_offset = row - self.prev_row 192 if row_offset: 193 self.tokens.append("\\\n" * row_offset) 194 self.prev_col = 0 195 col_offset = col - self.prev_col 196 if col_offset: 197 self.tokens.append(" " * col_offset) 198 199 def untokenize(self, iterable): 200 it = iter(iterable) 201 for t in it: 202 if len(t) == 2: 203 self.compat(t, it) 204 break 205 tok_type, token, start, end, line = t 206 if tok_type == ENDMARKER: 207 break 208 self.add_whitespace(start) 209 self.tokens.append(token) 210 self.prev_row, self.prev_col = end 211 if tok_type in (NEWLINE, NL): 212 self.prev_row += 1 213 self.prev_col = 0 214 return "".join(self.tokens) 215 216 def compat(self, token, iterable): 217 indents = [] 218 toks_append = self.tokens.append 219 startline = token[0] in (NEWLINE, NL) 220 prevstring = False 221 222 for tok in chain([token], iterable): 223 toknum, tokval = tok[:2] 224 225 if toknum in (NAME, NUMBER): 226 tokval += ' ' 227 228 # Insert a space between two consecutive strings 229 if toknum == STRING: 230 if prevstring: 231 tokval = ' ' + tokval 232 prevstring = True 233 else: 234 prevstring = False 235 236 if toknum == INDENT: 237 indents.append(tokval) 238 continue 239 elif toknum == DEDENT: 240 indents.pop() 241 continue 242 elif toknum in (NEWLINE, NL): 243 startline = True 244 elif startline and indents: 245 toks_append(indents[-1]) 246 startline = False 247 toks_append(tokval) 248 249 def untokenize(iterable): 250 """Transform tokens back into Python source code. 251 252 Each element returned by the iterable must be a token sequence 253 with at least two elements, a token number and token value. If 254 only two tokens are passed, the resulting output is poor. 255 256 Round-trip invariant for full input: 257 Untokenized source will match input source exactly 258 259 Round-trip invariant for limited intput: 260 # Output text will tokenize the back to the input 261 t1 = [tok[:2] for tok in generate_tokens(f.readline)] 262 newcode = untokenize(t1) 263 readline = iter(newcode.splitlines(1)).next 264 t2 = [tok[:2] for tok in generate_tokens(readline)] 265 assert t1 == t2 266 """ 267 ut = Untokenizer() 268 return ut.untokenize(iterable) 269 270 def generate_tokens(readline): 271 """ 272 The generate_tokens() generator requires one argument, readline, which 273 must be a callable object which provides the same interface as the 274 readline() method of built-in file objects. Each call to the function 275 should return one line of input as a string. 

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]
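                # Dispatch on the matched text: numbers, newlines, comments,
                # triple-quoted strings (which may span lines and set up the
                # contstr/endprog continuation state), single-quoted strings,
                # names, backslash continuations, and operators/brackets
                # (tracking parenlev so NL vs. NEWLINE can be decided).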
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)
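
# Example usage (illustrative; 'example.py' is a hypothetical file):
#
#     python tokenize.py example.py        # print every token in the file
#     python tokenize.py < example.py      # or tokenize standard input
#
# Programmatic use of the generator form looks like:
#
#     for toktype, tokstr, start, end, line in generate_tokens(
#             open('example.py').readline):
#         if toktype == COMMENT:
#             print start, tokstr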