1 import re 2 import sys 3 4 # Reason last stmt is continued (or C_NONE if it's not). 5 (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE, 6 C_STRING_NEXT_LINES, C_BRACKET) = range(5) 7 8 if 0: # for throwaway debugging output 9 def dump(*stuff): 10 sys.__stdout__.write(" ".join(map(str, stuff)) + "\n") 11 12 # Find what looks like the start of a popular stmt. 13 14 _synchre = re.compile(r""" 15 ^ 16 [ \t]* 17 (?: while 18 | else 19 | def 20 | return 21 | assert 22 | break 23 | class 24 | continue 25 | elif 26 | try 27 | except 28 | raise 29 | import 30 | yield 31 ) 32 \b 33 """, re.VERBOSE | re.MULTILINE).search 34 35 # Match blank line or non-indenting comment line. 36 37 _junkre = re.compile(r""" 38 [ \t]* 39 (?: \# \S .* )? 40 \n 41 """, re.VERBOSE).match 42 43 # Match any flavor of string; the terminating quote is optional 44 # so that we're robust in the face of incomplete program text. 45 46 _match_stringre = re.compile(r""" 47 \""" [^"\\]* (?: 48 (?: \\. | "(?!"") ) 49 [^"\\]* 50 )* 51 (?: \""" )? 52 53 | " [^"\\\n]* (?: \\. [^"\\\n]* )* "? 54 55 | ''' [^'\\]* (?: 56 (?: \\. | '(?!'') ) 57 [^'\\]* 58 )* 59 (?: ''' )? 60 61 | ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? 62 """, re.VERBOSE | re.DOTALL).match 63 64 # Match a line that starts with something interesting; 65 # used to find the first item of a bracket structure. 66 67 _itemre = re.compile(r""" 68 [ \t]* 69 [^\s#\\] # if we match, m.end()-1 is the interesting char 70 """, re.VERBOSE).match 71 72 # Match start of stmts that should be followed by a dedent. 73 74 _closere = re.compile(r""" 75 \s* 76 (?: return 77 | break 78 | continue 79 | raise 80 | pass 81 ) 82 \b 83 """, re.VERBOSE).match 84 85 # Chew up non-special chars as quickly as possible. If match is 86 # successful, m.end() less 1 is the index of the last boring char 87 # matched. If match is unsuccessful, the string starts with an 88 # interesting char. 89 90 _chew_ordinaryre = re.compile(r""" 91 [^[\](){}#'"\\]+ 92 """, re.VERBOSE).match 93 94 # Build translation table to map uninteresting chars to "x", open 95 # brackets to "(", and close brackets to ")". 96 97 _tran = ['x'] * 256 98 for ch in "({[": 99 _tran[ord(ch)] = '(' 100 for ch in ")}]": 101 _tran[ord(ch)] = ')' 102 for ch in "\"'\\\n#": 103 _tran[ord(ch)] = ch 104 _tran = ''.join(_tran) 105 del ch 106 107 try: 108 UnicodeType = type(unicode("")) 109 except NameError: 110 UnicodeType = None 111 112 class Parser: 113 114 def __init__(self, indentwidth, tabwidth): 115 self.indentwidth = indentwidth 116 self.tabwidth = tabwidth 117 118 def set_str(self, str): 119 assert len(str) == 0 or str[-1] == '\n' 120 if type(str) is UnicodeType: 121 # The parse functions have no idea what to do with Unicode, so 122 # replace all Unicode characters with "x". This is "safe" 123 # so long as the only characters germane to parsing the structure 124 # of Python are 7-bit ASCII. It's *necessary* because Unicode 125 # strings don't have a .translate() method that supports 126 # deletechars. 127 uniphooey = str 128 str = [] 129 push = str.append 130 for raw in map(ord, uniphooey): 131 push(raw < 127 and chr(raw) or "x") 132 str = "".join(str) 133 self.str = str 134 self.study_level = 0 135 136 # Return index of a good place to begin parsing, as close to the 137 # end of the string as possible. This will be the start of some 138 # popular stmt like "if" or "def". Return None if none found: 139 # the caller should pass more prior context then, if possible, or 140 # if not (the entire program text up until the point of interest 141 # has already been tried) pass 0 to set_lo. 142 # 143 # This will be reliable iff given a reliable is_char_in_string 144 # function, meaning that when it says "no", it's absolutely 145 # guaranteed that the char is not in a string. 146 147 def find_good_parse_start(self, is_char_in_string=None, 148 _synchre=_synchre): 149 str, pos = self.str, None 150 151 if not is_char_in_string: 152 # no clue -- make the caller pass everything 153 return None 154 155 # Peek back from the end for a good place to start, 156 # but don't try too often; pos will be left None, or 157 # bumped to a legitimate synch point. 158 limit = len(str) 159 for tries in range(5): 160 i = str.rfind(":\n", 0, limit) 161 if i < 0: 162 break 163 i = str.rfind('\n', 0, i) + 1 # start of colon line 164 m = _synchre(str, i, limit) 165 if m and not is_char_in_string(m.start()): 166 pos = m.start() 167 break 168 limit = i 169 if pos is None: 170 # Nothing looks like a block-opener, or stuff does 171 # but is_char_in_string keeps returning true; most likely 172 # we're in or near a giant string, the colorizer hasn't 173 # caught up enough to be helpful, or there simply *aren't* 174 # any interesting stmts. In any of these cases we're 175 # going to have to parse the whole thing to be sure, so 176 # give it one last try from the start, but stop wasting 177 # time here regardless of the outcome. 178 m = _synchre(str) 179 if m and not is_char_in_string(m.start()): 180 pos = m.start() 181 return pos 182 183 # Peeking back worked; look forward until _synchre no longer 184 # matches. 185 i = pos + 1 186 while 1: 187 m = _synchre(str, i) 188 if m: 189 s, i = m.span() 190 if not is_char_in_string(s): 191 pos = s 192 else: 193 break 194 return pos 195 196 # Throw away the start of the string. Intended to be called with 197 # find_good_parse_start's result. 198 199 def set_lo(self, lo): 200 assert lo == 0 or self.str[lo-1] == '\n' 201 if lo > 0: 202 self.str = self.str[lo:] 203 204 # As quickly as humanly possible <wink>, find the line numbers (0- 205 # based) of the non-continuation lines. 206 # Creates self.{goodlines, continuation}. 207 208 def _study1(self): 209 if self.study_level >= 1: 210 return 211 self.study_level = 1 212 213 # Map all uninteresting characters to "x", all open brackets 214 # to "(", all close brackets to ")", then collapse runs of 215 # uninteresting characters. This can cut the number of chars 216 # by a factor of 10-40, and so greatly speed the following loop. 217 str = self.str 218 str = str.translate(_tran) 219 str = str.replace('xxxxxxxx', 'x') 220 str = str.replace('xxxx', 'x') 221 str = str.replace('xx', 'x') 222 str = str.replace('xx', 'x') 223 str = str.replace('\nx', '\n') 224 # note that replacing x\n with \n would be incorrect, because 225 # x may be preceded by a backslash 226 227 # March over the squashed version of the program, accumulating 228 # the line numbers of non-continued stmts, and determining 229 # whether & why the last stmt is a continuation. 230 continuation = C_NONE 231 level = lno = 0 # level is nesting level; lno is line number 232 self.goodlines = goodlines = [0] 233 push_good = goodlines.append 234 i, n = 0, len(str) 235 while i < n: 236 ch = str[i] 237 i = i+1 238 239 # cases are checked in decreasing order of frequency 240 if ch == 'x': 241 continue 242 243 if ch == '\n': 244 lno = lno + 1 245 if level == 0: 246 push_good(lno) 247 # else we're in an unclosed bracket structure 248 continue 249 250 if ch == '(': 251 level = level + 1 252 continue 253 254 if ch == ')': 255 if level: 256 level = level - 1 257 # else the program is invalid, but we can't complain 258 continue 259 260 if ch == '"' or ch == "'": 261 # consume the string 262 quote = ch 263 if str[i-1:i+2] == quote * 3: 264 quote = quote * 3 265 firstlno = lno 266 w = len(quote) - 1 267 i = i+w 268 while i < n: 269 ch = str[i] 270 i = i+1 271 272 if ch == 'x': 273 continue 274 275 if str[i-1:i+w] == quote: 276 i = i+w 277 break 278 279 if ch == '\n': 280 lno = lno + 1 281 if w == 0: 282 # unterminated single-quoted string 283 if level == 0: 284 push_good(lno) 285 break 286 continue 287 288 if ch == '\\': 289 assert i < n 290 if str[i] == '\n': 291 lno = lno + 1 292 i = i+1 293 continue 294 295 # else comment char or paren inside string 296 297 else: 298 # didn't break out of the loop, so we're still 299 # inside a string 300 if (lno - 1) == firstlno: 301 # before the previous \n in str, we were in the first 302 # line of the string 303 continuation = C_STRING_FIRST_LINE 304 else: 305 continuation = C_STRING_NEXT_LINES 306 continue # with outer loop 307 308 if ch == '#': 309 # consume the comment 310 i = str.find('\n', i) 311 assert i >= 0 312 continue 313 314 assert ch == '\\' 315 assert i < n 316 if str[i] == '\n': 317 lno = lno + 1 318 if i+1 == n: 319 continuation = C_BACKSLASH 320 i = i+1 321 322 # The last stmt may be continued for all 3 reasons. 323 # String continuation takes precedence over bracket 324 # continuation, which beats backslash continuation. 325 if (continuation != C_STRING_FIRST_LINE 326 and continuation != C_STRING_NEXT_LINES and level > 0): 327 continuation = C_BRACKET 328 self.continuation = continuation 329 330 # Push the final line number as a sentinel value, regardless of 331 # whether it's continued. 332 assert (continuation == C_NONE) == (goodlines[-1] == lno) 333 if goodlines[-1] != lno: 334 push_good(lno) 335 336 def get_continuation_type(self): 337 self._study1() 338 return self.continuation 339 340 # study1 was sufficient to determine the continuation status, 341 # but doing more requires looking at every character. study2 342 # does this for the last interesting statement in the block. 343 # Creates: 344 # self.stmt_start, stmt_end 345 # slice indices of last interesting stmt 346 # self.stmt_bracketing 347 # the bracketing structure of the last interesting stmt; 348 # for example, for the statement "say(boo) or die", stmt_bracketing 349 # will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are 350 # treated as brackets, for the matter. 351 # self.lastch 352 # last non-whitespace character before optional trailing 353 # comment 354 # self.lastopenbracketpos 355 # if continuation is C_BRACKET, index of last open bracket 356 357 def _study2(self): 358 if self.study_level >= 2: 359 return 360 self._study1() 361 self.study_level = 2 362 363 # Set p and q to slice indices of last interesting stmt. 364 str, goodlines = self.str, self.goodlines 365 i = len(goodlines) - 1 366 p = len(str) # index of newest line 367 while i: 368 assert p 369 # p is the index of the stmt at line number goodlines[i]. 370 # Move p back to the stmt at line number goodlines[i-1]. 371 q = p 372 for nothing in range(goodlines[i-1], goodlines[i]): 373 # tricky: sets p to 0 if no preceding newline 374 p = str.rfind('\n', 0, p-1) + 1 375 # The stmt str[p:q] isn't a continuation, but may be blank 376 # or a non-indenting comment line. 377 if _junkre(str, p): 378 i = i-1 379 else: 380 break 381 if i == 0: 382 # nothing but junk! 383 assert p == 0 384 q = p 385 self.stmt_start, self.stmt_end = p, q 386 387 # Analyze this stmt, to find the last open bracket (if any) 388 # and last interesting character (if any). 389 lastch = "" 390 stack = [] # stack of open bracket indices 391 push_stack = stack.append 392 bracketing = [(p, 0)] 393 while p < q: 394 # suck up all except ()[]{}'"#\\ 395 m = _chew_ordinaryre(str, p, q) 396 if m: 397 # we skipped at least one boring char 398 newp = m.end() 399 # back up over totally boring whitespace 400 i = newp - 1 # index of last boring char 401 while i >= p and str[i] in " \t\n": 402 i = i-1 403 if i >= p: 404 lastch = str[i] 405 p = newp 406 if p >= q: 407 break 408 409 ch = str[p] 410 411 if ch in "([{": 412 push_stack(p) 413 bracketing.append((p, len(stack))) 414 lastch = ch 415 p = p+1 416 continue 417 418 if ch in ")]}": 419 if stack: 420 del stack[-1] 421 lastch = ch 422 p = p+1 423 bracketing.append((p, len(stack))) 424 continue 425 426 if ch == '"' or ch == "'": 427 # consume string 428 # Note that study1 did this with a Python loop, but 429 # we use a regexp here; the reason is speed in both 430 # cases; the string may be huge, but study1 pre-squashed 431 # strings to a couple of characters per line. study1 432 # also needed to keep track of newlines, and we don't 433 # have to. 434 bracketing.append((p, len(stack)+1)) 435 lastch = ch 436 p = _match_stringre(str, p, q).end() 437 bracketing.append((p, len(stack))) 438 continue 439 440 if ch == '#': 441 # consume comment and trailing newline 442 bracketing.append((p, len(stack)+1)) 443 p = str.find('\n', p, q) + 1 444 assert p > 0 445 bracketing.append((p, len(stack))) 446 continue 447 448 assert ch == '\\' 449 p = p+1 # beyond backslash 450 assert p < q 451 if str[p] != '\n': 452 # the program is invalid, but can't complain 453 lastch = ch + str[p] 454 p = p+1 # beyond escaped char 455 456 # end while p < q: 457 458 self.lastch = lastch 459 if stack: 460 self.lastopenbracketpos = stack[-1] 461 self.stmt_bracketing = tuple(bracketing) 462 463 # Assuming continuation is C_BRACKET, return the number 464 # of spaces the next line should be indented. 465 466 def compute_bracket_indent(self): 467 self._study2() 468 assert self.continuation == C_BRACKET 469 j = self.lastopenbracketpos 470 str = self.str 471 n = len(str) 472 origi = i = str.rfind('\n', 0, j) + 1 473 j = j+1 # one beyond open bracket 474 # find first list item; set i to start of its line 475 while j < n: 476 m = _itemre(str, j) 477 if m: 478 j = m.end() - 1 # index of first interesting char 479 extra = 0 480 break 481 else: 482 # this line is junk; advance to next line 483 i = j = str.find('\n', j) + 1 484 else: 485 # nothing interesting follows the bracket; 486 # reproduce the bracket line's indentation + a level 487 j = i = origi 488 while str[j] in " \t": 489 j = j+1 490 extra = self.indentwidth 491 return len(str[i:j].expandtabs(self.tabwidth)) + extra 492 493 # Return number of physical lines in last stmt (whether or not 494 # it's an interesting stmt! this is intended to be called when 495 # continuation is C_BACKSLASH). 496 497 def get_num_lines_in_stmt(self): 498 self._study1() 499 goodlines = self.goodlines 500 return goodlines[-1] - goodlines[-2] 501 502 # Assuming continuation is C_BACKSLASH, return the number of spaces 503 # the next line should be indented. Also assuming the new line is 504 # the first one following the initial line of the stmt. 505 506 def compute_backslash_indent(self): 507 self._study2() 508 assert self.continuation == C_BACKSLASH 509 str = self.str 510 i = self.stmt_start 511 while str[i] in " \t": 512 i = i+1 513 startpos = i 514 515 # See whether the initial line starts an assignment stmt; i.e., 516 # look for an = operator 517 endpos = str.find('\n', startpos) + 1 518 found = level = 0 519 while i < endpos: 520 ch = str[i] 521 if ch in "([{": 522 level = level + 1 523 i = i+1 524 elif ch in ")]}": 525 if level: 526 level = level - 1 527 i = i+1 528 elif ch == '"' or ch == "'": 529 i = _match_stringre(str, i, endpos).end() 530 elif ch == '#': 531 break 532 elif level == 0 and ch == '=' and \ 533 (i == 0 or str[i-1] not in "=<>!") and \ 534 str[i+1] != '=': 535 found = 1 536 break 537 else: 538 i = i+1 539 540 if found: 541 # found a legit =, but it may be the last interesting 542 # thing on the line 543 i = i+1 # move beyond the = 544 found = re.match(r"\s*\\", str[i:endpos]) is None 545 546 if not found: 547 # oh well ... settle for moving beyond the first chunk 548 # of non-whitespace chars 549 i = startpos 550 while str[i] not in " \t\n": 551 i = i+1 552 553 return len(str[self.stmt_start:i].expandtabs(\ 554 self.tabwidth)) + 1 555 556 # Return the leading whitespace on the initial line of the last 557 # interesting stmt. 558 559 def get_base_indent_string(self): 560 self._study2() 561 i, n = self.stmt_start, self.stmt_end 562 j = i 563 str = self.str 564 while j < n and str[j] in " \t": 565 j = j + 1 566 return str[i:j] 567 568 # Did the last interesting stmt open a block? 569 570 def is_block_opener(self): 571 self._study2() 572 return self.lastch == ':' 573 574 # Did the last interesting stmt close a block? 575 576 def is_block_closer(self): 577 self._study2() 578 return _closere(self.str, self.stmt_start) is not None 579 580 # index of last open bracket ({[, or None if none 581 lastopenbracketpos = None 582 583 def get_last_open_bracket_pos(self): 584 self._study2() 585 return self.lastopenbracketpos 586 587 # the structure of the bracketing of the last interesting statement, 588 # in the format defined in _study2, or None if the text didn't contain 589 # anything 590 stmt_bracketing = None 591 592 def get_last_stmt_bracketing(self): 593 self._study2() 594 return self.stmt_bracketing 595