1 """ 2 Module difflib -- helpers for computing deltas between objects. 3 4 Function get_close_matches(word, possibilities, n=3, cutoff=0.6): 5 Use SequenceMatcher to return list of the best "good enough" matches. 6 7 Function context_diff(a, b): 8 For two lists of strings, return a delta in context diff format. 9 10 Function ndiff(a, b): 11 Return a delta: the difference between `a` and `b` (lists of strings). 12 13 Function restore(delta, which): 14 Return one of the two sequences that generated an ndiff delta. 15 16 Function unified_diff(a, b): 17 For two lists of strings, return a delta in unified diff format. 18 19 Class SequenceMatcher: 20 A flexible class for comparing pairs of sequences of any type. 21 22 Class Differ: 23 For producing human-readable deltas from sequences of lines of text. 24 25 Class HtmlDiff: 26 For producing HTML side by side comparison with change highlights. 27 """ 28 29 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 30 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 31 'unified_diff', 'HtmlDiff', 'Match'] 32 33 import heapq 34 from collections import namedtuple as _namedtuple 35 from functools import reduce 36 37 Match = _namedtuple('Match', 'a b size') 38 39 def _calculate_ratio(matches, length): 40 if length: 41 return 2.0 * matches / length 42 return 1.0 43 44 class SequenceMatcher: 45 46 """ 47 SequenceMatcher is a flexible class for comparing pairs of sequences of 48 any type, so long as the sequence elements are hashable. The basic 49 algorithm predates, and is a little fancier than, an algorithm 50 published in the late 1980's by Ratcliff and Obershelp under the 51 hyperbolic name "gestalt pattern matching". The basic idea is to find 52 the longest contiguous matching subsequence that contains no "junk" 53 elements (R-O doesn't address junk). The same idea is then applied 54 recursively to the pieces of the sequences to the left and to the right 55 of the matching subsequence. This does not yield minimal edit 56 sequences, but does tend to yield matches that "look right" to people. 57 58 SequenceMatcher tries to compute a "human-friendly diff" between two 59 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the 60 longest *contiguous* & junk-free matching subsequence. That's what 61 catches peoples' eyes. The Windows(tm) windiff has another interesting 62 notion, pairing up elements that appear uniquely in each sequence. 63 That, and the method here, appear to yield more intuitive difference 64 reports than does diff. This method appears to be the least vulnerable 65 to synching up on blocks of "junk lines", though (like blank lines in 66 ordinary text files, or maybe "<P>" lines in HTML files). That may be 67 because this is the only method of the 3 that has a *concept* of 68 "junk" <wink>. 69 70 Example, comparing two strings, and considering blanks to be "junk": 71 72 >>> s = SequenceMatcher(lambda x: x == " ", 73 ... "private Thread currentThread;", 74 ... "private volatile Thread currentThread;") 75 >>> 76 77 .ratio() returns a float in [0, 1], measuring the "similarity" of the 78 sequences. As a rule of thumb, a .ratio() value over 0.6 means the 79 sequences are close matches: 80 81 >>> print round(s.ratio(), 3) 82 0.866 83 >>> 84 85 If you're only interested in where the sequences match, 86 .get_matching_blocks() is handy: 87 88 >>> for block in s.get_matching_blocks(): 89 ... print "a[%d] and b[%d] match for %d elements" % block 90 a[0] and b[0] match for 8 elements 91 a[8] and b[17] match for 21 elements 92 a[29] and b[38] match for 0 elements 93 94 Note that the last tuple returned by .get_matching_blocks() is always a 95 dummy, (len(a), len(b), 0), and this is the only case in which the last 96 tuple element (number of elements matched) is 0. 97 98 If you want to know how to change the first sequence into the second, 99 use .get_opcodes(): 100 101 >>> for opcode in s.get_opcodes(): 102 ... print "%6s a[%d:%d] b[%d:%d]" % opcode 103 equal a[0:8] b[0:8] 104 insert a[8:8] b[8:17] 105 equal a[8:29] b[17:38] 106 107 See the Differ class for a fancy human-friendly file differencer, which 108 uses SequenceMatcher both to compare sequences of lines, and to compare 109 sequences of characters within similar (near-matching) lines. 110 111 See also function get_close_matches() in this module, which shows how 112 simple code building on SequenceMatcher can be used to do useful work. 113 114 Timing: Basic R-O is cubic time worst case and quadratic time expected 115 case. SequenceMatcher is quadratic time for the worst case and has 116 expected-case behavior dependent in a complicated way on how many 117 elements the sequences have in common; best case time is linear. 118 119 Methods: 120 121 __init__(isjunk=None, a='', b='') 122 Construct a SequenceMatcher. 123 124 set_seqs(a, b) 125 Set the two sequences to be compared. 126 127 set_seq1(a) 128 Set the first sequence to be compared. 129 130 set_seq2(b) 131 Set the second sequence to be compared. 132 133 find_longest_match(alo, ahi, blo, bhi) 134 Find longest matching block in a[alo:ahi] and b[blo:bhi]. 135 136 get_matching_blocks() 137 Return list of triples describing matching subsequences. 138 139 get_opcodes() 140 Return list of 5-tuples describing how to turn a into b. 141 142 ratio() 143 Return a measure of the sequences' similarity (float in [0,1]). 144 145 quick_ratio() 146 Return an upper bound on .ratio() relatively quickly. 147 148 real_quick_ratio() 149 Return an upper bound on ratio() very quickly. 150 """ 151 152 def __init__(self, isjunk=None, a='', b='', autojunk=True): 153 """Construct a SequenceMatcher. 154 155 Optional arg isjunk is None (the default), or a one-argument 156 function that takes a sequence element and returns true iff the 157 element is junk. None is equivalent to passing "lambda x: 0", i.e. 158 no elements are considered to be junk. For example, pass 159 lambda x: x in " \\t" 160 if you're comparing lines as sequences of characters, and don't 161 want to synch up on blanks or hard tabs. 162 163 Optional arg a is the first of two sequences to be compared. By 164 default, an empty string. The elements of a must be hashable. See 165 also .set_seqs() and .set_seq1(). 166 167 Optional arg b is the second of two sequences to be compared. By 168 default, an empty string. The elements of b must be hashable. See 169 also .set_seqs() and .set_seq2(). 170 171 Optional arg autojunk should be set to False to disable the 172 "automatic junk heuristic" that treats popular elements as junk 173 (see module documentation for more information). 174 """ 175 176 # Members: 177 # a 178 # first sequence 179 # b 180 # second sequence; differences are computed as "what do 181 # we need to do to 'a' to change it into 'b'?" 182 # b2j 183 # for x in b, b2j[x] is a list of the indices (into b) 184 # at which x appears; junk elements do not appear 185 # fullbcount 186 # for x in b, fullbcount[x] == the number of times x 187 # appears in b; only materialized if really needed (used 188 # only for computing quick_ratio()) 189 # matching_blocks 190 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; 191 # ascending & non-overlapping in i and in j; terminated by 192 # a dummy (len(a), len(b), 0) sentinel 193 # opcodes 194 # a list of (tag, i1, i2, j1, j2) tuples, where tag is 195 # one of 196 # 'replace' a[i1:i2] should be replaced by b[j1:j2] 197 # 'delete' a[i1:i2] should be deleted 198 # 'insert' b[j1:j2] should be inserted 199 # 'equal' a[i1:i2] == b[j1:j2] 200 # isjunk 201 # a user-supplied function taking a sequence element and 202 # returning true iff the element is "junk" -- this has 203 # subtle but helpful effects on the algorithm, which I'll 204 # get around to writing up someday <0.9 wink>. 205 # DON'T USE! Only __chain_b uses this. Use isbjunk. 206 # isbjunk 207 # for x in b, isbjunk(x) == isjunk(x) but much faster; 208 # it's really the __contains__ method of a hidden dict. 209 # DOES NOT WORK for x in a! 210 # isbpopular 211 # for x in b, isbpopular(x) is true iff b is reasonably long 212 # (at least 200 elements) and x accounts for more than 1 + 1% of 213 # its elements (when autojunk is enabled). 214 # DOES NOT WORK for x in a! 215 216 self.isjunk = isjunk 217 self.a = self.b = None 218 self.autojunk = autojunk 219 self.set_seqs(a, b) 220 221 def set_seqs(self, a, b): 222 """Set the two sequences to be compared. 223 224 >>> s = SequenceMatcher() 225 >>> s.set_seqs("abcd", "bcde") 226 >>> s.ratio() 227 0.75 228 """ 229 230 self.set_seq1(a) 231 self.set_seq2(b) 232 233 def set_seq1(self, a): 234 """Set the first sequence to be compared. 235 236 The second sequence to be compared is not changed. 237 238 >>> s = SequenceMatcher(None, "abcd", "bcde") 239 >>> s.ratio() 240 0.75 241 >>> s.set_seq1("bcde") 242 >>> s.ratio() 243 1.0 244 >>> 245 246 SequenceMatcher computes and caches detailed information about the 247 second sequence, so if you want to compare one sequence S against 248 many sequences, use .set_seq2(S) once and call .set_seq1(x) 249 repeatedly for each of the other sequences. 250 251 See also set_seqs() and set_seq2(). 252 """ 253 254 if a is self.a: 255 return 256 self.a = a 257 self.matching_blocks = self.opcodes = None 258 259 def set_seq2(self, b): 260 """Set the second sequence to be compared. 261 262 The first sequence to be compared is not changed. 263 264 >>> s = SequenceMatcher(None, "abcd", "bcde") 265 >>> s.ratio() 266 0.75 267 >>> s.set_seq2("abcd") 268 >>> s.ratio() 269 1.0 270 >>> 271 272 SequenceMatcher computes and caches detailed information about the 273 second sequence, so if you want to compare one sequence S against 274 many sequences, use .set_seq2(S) once and call .set_seq1(x) 275 repeatedly for each of the other sequences. 276 277 See also set_seqs() and set_seq1(). 278 """ 279 280 if b is self.b: 281 return 282 self.b = b 283 self.matching_blocks = self.opcodes = None 284 self.fullbcount = None 285 self.__chain_b() 286 287 # For each element x in b, set b2j[x] to a list of the indices in 288 # b where x appears; the indices are in increasing order; note that 289 # the number of times x appears in b is len(b2j[x]) ... 290 # when self.isjunk is defined, junk elements don't show up in this 291 # map at all, which stops the central find_longest_match method 292 # from starting any matching block at a junk element ... 293 # also creates the fast isbjunk function ... 294 # b2j also does not contain entries for "popular" elements, meaning 295 # elements that account for more than 1 + 1% of the total elements, and 296 # when the sequence is reasonably large (>= 200 elements); this can 297 # be viewed as an adaptive notion of semi-junk, and yields an enormous 298 # speedup when, e.g., comparing program files with hundreds of 299 # instances of "return NULL;" ... 300 # note that this is only called when b changes; so for cross-product 301 # kinds of matches, it's best to call set_seq2 once, then set_seq1 302 # repeatedly 303 304 def __chain_b(self): 305 # Because isjunk is a user-defined (not C) function, and we test 306 # for junk a LOT, it's important to minimize the number of calls. 307 # Before the tricks described here, __chain_b was by far the most 308 # time-consuming routine in the whole module! If anyone sees 309 # Jim Roskind, thank him again for profile.py -- I never would 310 # have guessed that. 311 # The first trick is to build b2j ignoring the possibility 312 # of junk. I.e., we don't call isjunk at all yet. Throwing 313 # out the junk later is much cheaper than building b2j "right" 314 # from the start. 315 b = self.b 316 self.b2j = b2j = {} 317 318 for i, elt in enumerate(b): 319 indices = b2j.setdefault(elt, []) 320 indices.append(i) 321 322 # Purge junk elements 323 junk = set() 324 isjunk = self.isjunk 325 if isjunk: 326 for elt in list(b2j.keys()): # using list() since b2j is modified 327 if isjunk(elt): 328 junk.add(elt) 329 del b2j[elt] 330 331 # Purge popular elements that are not junk 332 popular = set() 333 n = len(b) 334 if self.autojunk and n >= 200: 335 ntest = n // 100 + 1 336 for elt, idxs in list(b2j.items()): 337 if len(idxs) > ntest: 338 popular.add(elt) 339 del b2j[elt] 340 341 # Now for x in b, isjunk(x) == x in junk, but the latter is much faster. 342 # Sicne the number of *unique* junk elements is probably small, the 343 # memory burden of keeping this set alive is likely trivial compared to 344 # the size of b2j. 345 self.isbjunk = junk.__contains__ 346 self.isbpopular = popular.__contains__ 347 348 def find_longest_match(self, alo, ahi, blo, bhi): 349 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. 350 351 If isjunk is not defined: 352 353 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where 354 alo <= i <= i+k <= ahi 355 blo <= j <= j+k <= bhi 356 and for all (i',j',k') meeting those conditions, 357 k >= k' 358 i <= i' 359 and if i == i', j <= j' 360 361 In other words, of all maximal matching blocks, return one that 362 starts earliest in a, and of all those maximal matching blocks that 363 start earliest in a, return the one that starts earliest in b. 364 365 >>> s = SequenceMatcher(None, " abcd", "abcd abcd") 366 >>> s.find_longest_match(0, 5, 0, 9) 367 Match(a=0, b=4, size=5) 368 369 If isjunk is defined, first the longest matching block is 370 determined as above, but with the additional restriction that no 371 junk element appears in the block. Then that block is extended as 372 far as possible by matching (only) junk elements on both sides. So 373 the resulting block never matches on junk except as identical junk 374 happens to be adjacent to an "interesting" match. 375 376 Here's the same example as before, but considering blanks to be 377 junk. That prevents " abcd" from matching the " abcd" at the tail 378 end of the second sequence directly. Instead only the "abcd" can 379 match, and matches the leftmost "abcd" in the second sequence: 380 381 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") 382 >>> s.find_longest_match(0, 5, 0, 9) 383 Match(a=1, b=0, size=4) 384 385 If no blocks match, return (alo, blo, 0). 386 387 >>> s = SequenceMatcher(None, "ab", "c") 388 >>> s.find_longest_match(0, 2, 0, 1) 389 Match(a=0, b=0, size=0) 390 """ 391 392 # CAUTION: stripping common prefix or suffix would be incorrect. 393 # E.g., 394 # ab 395 # acab 396 # Longest matching block is "ab", but if common prefix is 397 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so 398 # strip, so ends up claiming that ab is changed to acab by 399 # inserting "ca" in the middle. That's minimal but unintuitive: 400 # "it's obvious" that someone inserted "ac" at the front. 401 # Windiff ends up at the same place as diff, but by pairing up 402 # the unique 'b's and then matching the first two 'a's. 403 404 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk 405 besti, bestj, bestsize = alo, blo, 0 406 # find longest junk-free match 407 # during an iteration of the loop, j2len[j] = length of longest 408 # junk-free match ending with a[i-1] and b[j] 409 j2len = {} 410 nothing = [] 411 for i in xrange(alo, ahi): 412 # look at all instances of a[i] in b; note that because 413 # b2j has no junk keys, the loop is skipped if a[i] is junk 414 j2lenget = j2len.get 415 newj2len = {} 416 for j in b2j.get(a[i], nothing): 417 # a[i] matches b[j] 418 if j < blo: 419 continue 420 if j >= bhi: 421 break 422 k = newj2len[j] = j2lenget(j-1, 0) + 1 423 if k > bestsize: 424 besti, bestj, bestsize = i-k+1, j-k+1, k 425 j2len = newj2len 426 427 # Extend the best by non-junk elements on each end. In particular, 428 # "popular" non-junk elements aren't in b2j, which greatly speeds 429 # the inner loop above, but also means "the best" match so far 430 # doesn't contain any junk *or* popular non-junk elements. 431 while besti > alo and bestj > blo and \ 432 not isbjunk(b[bestj-1]) and \ 433 a[besti-1] == b[bestj-1]: 434 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 435 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 436 not isbjunk(b[bestj+bestsize]) and \ 437 a[besti+bestsize] == b[bestj+bestsize]: 438 bestsize += 1 439 440 # Now that we have a wholly interesting match (albeit possibly 441 # empty!), we may as well suck up the matching junk on each 442 # side of it too. Can't think of a good reason not to, and it 443 # saves post-processing the (possibly considerable) expense of 444 # figuring out what to do with it. In the case of an empty 445 # interesting match, this is clearly the right thing to do, 446 # because no other kind of match is possible in the regions. 447 while besti > alo and bestj > blo and \ 448 isbjunk(b[bestj-1]) and \ 449 a[besti-1] == b[bestj-1]: 450 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 451 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 452 isbjunk(b[bestj+bestsize]) and \ 453 a[besti+bestsize] == b[bestj+bestsize]: 454 bestsize = bestsize + 1 455 456 return Match(besti, bestj, bestsize) 457 458 def get_matching_blocks(self): 459 """Return list of triples describing matching subsequences. 460 461 Each triple is of the form (i, j, n), and means that 462 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in 463 i and in j. New in Python 2.5, it's also guaranteed that if 464 (i, j, n) and (i', j', n') are adjacent triples in the list, and 465 the second is not the last triple in the list, then i+n != i' or 466 j+n != j'. IOW, adjacent triples never describe adjacent equal 467 blocks. 468 469 The last triple is a dummy, (len(a), len(b), 0), and is the only 470 triple with n==0. 471 472 >>> s = SequenceMatcher(None, "abxcd", "abcd") 473 >>> s.get_matching_blocks() 474 [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] 475 """ 476 477 if self.matching_blocks is not None: 478 return self.matching_blocks 479 la, lb = len(self.a), len(self.b) 480 481 # This is most naturally expressed as a recursive algorithm, but 482 # at least one user bumped into extreme use cases that exceeded 483 # the recursion limit on their box. So, now we maintain a list 484 # ('queue`) of blocks we still need to look at, and append partial 485 # results to `matching_blocks` in a loop; the matches are sorted 486 # at the end. 487 queue = [(0, la, 0, lb)] 488 matching_blocks = [] 489 while queue: 490 alo, ahi, blo, bhi = queue.pop() 491 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) 492 # a[alo:i] vs b[blo:j] unknown 493 # a[i:i+k] same as b[j:j+k] 494 # a[i+k:ahi] vs b[j+k:bhi] unknown 495 if k: # if k is 0, there was no matching block 496 matching_blocks.append(x) 497 if alo < i and blo < j: 498 queue.append((alo, i, blo, j)) 499 if i+k < ahi and j+k < bhi: 500 queue.append((i+k, ahi, j+k, bhi)) 501 matching_blocks.sort() 502 503 # It's possible that we have adjacent equal blocks in the 504 # matching_blocks list now. Starting with 2.5, this code was added 505 # to collapse them. 506 i1 = j1 = k1 = 0 507 non_adjacent = [] 508 for i2, j2, k2 in matching_blocks: 509 # Is this block adjacent to i1, j1, k1? 510 if i1 + k1 == i2 and j1 + k1 == j2: 511 # Yes, so collapse them -- this just increases the length of 512 # the first block by the length of the second, and the first 513 # block so lengthened remains the block to compare against. 514 k1 += k2 515 else: 516 # Not adjacent. Remember the first block (k1==0 means it's 517 # the dummy we started with), and make the second block the 518 # new block to compare against. 519 if k1: 520 non_adjacent.append((i1, j1, k1)) 521 i1, j1, k1 = i2, j2, k2 522 if k1: 523 non_adjacent.append((i1, j1, k1)) 524 525 non_adjacent.append( (la, lb, 0) ) 526 self.matching_blocks = map(Match._make, non_adjacent) 527 return self.matching_blocks 528 529 def get_opcodes(self): 530 """Return list of 5-tuples describing how to turn a into b. 531 532 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple 533 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the 534 tuple preceding it, and likewise for j1 == the previous j2. 535 536 The tags are strings, with these meanings: 537 538 'replace': a[i1:i2] should be replaced by b[j1:j2] 539 'delete': a[i1:i2] should be deleted. 540 Note that j1==j2 in this case. 541 'insert': b[j1:j2] should be inserted at a[i1:i1]. 542 Note that i1==i2 in this case. 543 'equal': a[i1:i2] == b[j1:j2] 544 545 >>> a = "qabxcd" 546 >>> b = "abycdf" 547 >>> s = SequenceMatcher(None, a, b) 548 >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): 549 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % 550 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])) 551 delete a[0:1] (q) b[0:0] () 552 equal a[1:3] (ab) b[0:2] (ab) 553 replace a[3:4] (x) b[2:3] (y) 554 equal a[4:6] (cd) b[3:5] (cd) 555 insert a[6:6] () b[5:6] (f) 556 """ 557 558 if self.opcodes is not None: 559 return self.opcodes 560 i = j = 0 561 self.opcodes = answer = [] 562 for ai, bj, size in self.get_matching_blocks(): 563 # invariant: we've pumped out correct diffs to change 564 # a[:i] into b[:j], and the next matching block is 565 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump 566 # out a diff to change a[i:ai] into b[j:bj], pump out 567 # the matching block, and move (i,j) beyond the match 568 tag = '' 569 if i < ai and j < bj: 570 tag = 'replace' 571 elif i < ai: 572 tag = 'delete' 573 elif j < bj: 574 tag = 'insert' 575 if tag: 576 answer.append( (tag, i, ai, j, bj) ) 577 i, j = ai+size, bj+size 578 # the list of matching blocks is terminated by a 579 # sentinel with size 0 580 if size: 581 answer.append( ('equal', ai, i, bj, j) ) 582 return answer 583 584 def get_grouped_opcodes(self, n=3): 585 """ Isolate change clusters by eliminating ranges with no changes. 586 587 Return a generator of groups with up to n lines of context. 588 Each group is in the same format as returned by get_opcodes(). 589 590 >>> from pprint import pprint 591 >>> a = map(str, range(1,40)) 592 >>> b = a[:] 593 >>> b[8:8] = ['i'] # Make an insertion 594 >>> b[20] += 'x' # Make a replacement 595 >>> b[23:28] = [] # Make a deletion 596 >>> b[30] += 'y' # Make another replacement 597 >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) 598 [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], 599 [('equal', 16, 19, 17, 20), 600 ('replace', 19, 20, 20, 21), 601 ('equal', 20, 22, 21, 23), 602 ('delete', 22, 27, 23, 23), 603 ('equal', 27, 30, 23, 26)], 604 [('equal', 31, 34, 27, 30), 605 ('replace', 34, 35, 30, 31), 606 ('equal', 35, 38, 31, 34)]] 607 """ 608 609 codes = self.get_opcodes() 610 if not codes: 611 codes = [("equal", 0, 1, 0, 1)] 612 # Fixup leading and trailing groups if they show no changes. 613 if codes[0][0] == 'equal': 614 tag, i1, i2, j1, j2 = codes[0] 615 codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 616 if codes[-1][0] == 'equal': 617 tag, i1, i2, j1, j2 = codes[-1] 618 codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) 619 620 nn = n + n 621 group = [] 622 for tag, i1, i2, j1, j2 in codes: 623 # End the current group and start a new one whenever 624 # there is a large range with no changes. 625 if tag == 'equal' and i2-i1 > nn: 626 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) 627 yield group 628 group = [] 629 i1, j1 = max(i1, i2-n), max(j1, j2-n) 630 group.append((tag, i1, i2, j1 ,j2)) 631 if group and not (len(group)==1 and group[0][0] == 'equal'): 632 yield group 633 634 def ratio(self): 635 """Return a measure of the sequences' similarity (float in [0,1]). 636 637 Where T is the total number of elements in both sequences, and 638 M is the number of matches, this is 2.0*M / T. 639 Note that this is 1 if the sequences are identical, and 0 if 640 they have nothing in common. 641 642 .ratio() is expensive to compute if you haven't already computed 643 .get_matching_blocks() or .get_opcodes(), in which case you may 644 want to try .quick_ratio() or .real_quick_ratio() first to get an 645 upper bound. 646 647 >>> s = SequenceMatcher(None, "abcd", "bcde") 648 >>> s.ratio() 649 0.75 650 >>> s.quick_ratio() 651 0.75 652 >>> s.real_quick_ratio() 653 1.0 654 """ 655 656 matches = reduce(lambda sum, triple: sum + triple[-1], 657 self.get_matching_blocks(), 0) 658 return _calculate_ratio(matches, len(self.a) + len(self.b)) 659 660 def quick_ratio(self): 661 """Return an upper bound on ratio() relatively quickly. 662 663 This isn't defined beyond that it is an upper bound on .ratio(), and 664 is faster to compute. 665 """ 666 667 # viewing a and b as multisets, set matches to the cardinality 668 # of their intersection; this counts the number of matches 669 # without regard to order, so is clearly an upper bound 670 if self.fullbcount is None: 671 self.fullbcount = fullbcount = {} 672 for elt in self.b: 673 fullbcount[elt] = fullbcount.get(elt, 0) + 1 674 fullbcount = self.fullbcount 675 # avail[x] is the number of times x appears in 'b' less the 676 # number of times we've seen it in 'a' so far ... kinda 677 avail = {} 678 availhas, matches = avail.__contains__, 0 679 for elt in self.a: 680 if availhas(elt): 681 numb = avail[elt] 682 else: 683 numb = fullbcount.get(elt, 0) 684 avail[elt] = numb - 1 685 if numb > 0: 686 matches = matches + 1 687 return _calculate_ratio(matches, len(self.a) + len(self.b)) 688 689 def real_quick_ratio(self): 690 """Return an upper bound on ratio() very quickly. 691 692 This isn't defined beyond that it is an upper bound on .ratio(), and 693 is faster to compute than either .ratio() or .quick_ratio(). 694 """ 695 696 la, lb = len(self.a), len(self.b) 697 # can't have more matches than the number of elements in the 698 # shorter sequence 699 return _calculate_ratio(min(la, lb), la + lb) 700 701 def get_close_matches(word, possibilities, n=3, cutoff=0.6): 702 """Use SequenceMatcher to return list of the best "good enough" matches. 703 704 word is a sequence for which close matches are desired (typically a 705 string). 706 707 possibilities is a list of sequences against which to match word 708 (typically a list of strings). 709 710 Optional arg n (default 3) is the maximum number of close matches to 711 return. n must be > 0. 712 713 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities 714 that don't score at least that similar to word are ignored. 715 716 The best (no more than n) matches among the possibilities are returned 717 in a list, sorted by similarity score, most similar first. 718 719 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) 720 ['apple', 'ape'] 721 >>> import keyword as _keyword 722 >>> get_close_matches("wheel", _keyword.kwlist) 723 ['while'] 724 >>> get_close_matches("apple", _keyword.kwlist) 725 [] 726 >>> get_close_matches("accept", _keyword.kwlist) 727 ['except'] 728 """ 729 730 if not n > 0: 731 raise ValueError("n must be > 0: %r" % (n,)) 732 if not 0.0 <= cutoff <= 1.0: 733 raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) 734 result = [] 735 s = SequenceMatcher() 736 s.set_seq2(word) 737 for x in possibilities: 738 s.set_seq1(x) 739 if s.real_quick_ratio() >= cutoff and \ 740 s.quick_ratio() >= cutoff and \ 741 s.ratio() >= cutoff: 742 result.append((s.ratio(), x)) 743 744 # Move the best scorers to head of list 745 result = heapq.nlargest(n, result) 746 # Strip scores for the best n matches 747 return [x for score, x in result] 748 749 def _count_leading(line, ch): 750 """ 751 Return number of `ch` characters at the start of `line`. 752 753 Example: 754 755 >>> _count_leading(' abc', ' ') 756 3 757 """ 758 759 i, n = 0, len(line) 760 while i < n and line[i] == ch: 761 i += 1 762 return i 763 764 class Differ: 765 r""" 766 Differ is a class for comparing sequences of lines of text, and 767 producing human-readable differences or deltas. Differ uses 768 SequenceMatcher both to compare sequences of lines, and to compare 769 sequences of characters within similar (near-matching) lines. 770 771 Each line of a Differ delta begins with a two-letter code: 772 773 '- ' line unique to sequence 1 774 '+ ' line unique to sequence 2 775 ' ' line common to both sequences 776 '? ' line not present in either input sequence 777 778 Lines beginning with '? ' attempt to guide the eye to intraline 779 differences, and were not present in either input sequence. These lines 780 can be confusing if the sequences contain tab characters. 781 782 Note that Differ makes no claim to produce a *minimal* diff. To the 783 contrary, minimal diffs are often counter-intuitive, because they synch 784 up anywhere possible, sometimes accidental matches 100 pages apart. 785 Restricting synch points to contiguous matches preserves some notion of 786 locality, at the occasional cost of producing a longer diff. 787 788 Example: Comparing two texts. 789 790 First we set up the texts, sequences of individual single-line strings 791 ending with newlines (such sequences can also be obtained from the 792 `readlines()` method of file-like objects): 793 794 >>> text1 = ''' 1. Beautiful is better than ugly. 795 ... 2. Explicit is better than implicit. 796 ... 3. Simple is better than complex. 797 ... 4. Complex is better than complicated. 798 ... '''.splitlines(1) 799 >>> len(text1) 800 4 801 >>> text1[0][-1] 802 '\n' 803 >>> text2 = ''' 1. Beautiful is better than ugly. 804 ... 3. Simple is better than complex. 805 ... 4. Complicated is better than complex. 806 ... 5. Flat is better than nested. 807 ... '''.splitlines(1) 808 809 Next we instantiate a Differ object: 810 811 >>> d = Differ() 812 813 Note that when instantiating a Differ object we may pass functions to 814 filter out line and character 'junk'. See Differ.__init__ for details. 815 816 Finally, we compare the two: 817 818 >>> result = list(d.compare(text1, text2)) 819 820 'result' is a list of strings, so let's pretty-print it: 821 822 >>> from pprint import pprint as _pprint 823 >>> _pprint(result) 824 [' 1. Beautiful is better than ugly.\n', 825 '- 2. Explicit is better than implicit.\n', 826 '- 3. Simple is better than complex.\n', 827 '+ 3. Simple is better than complex.\n', 828 '? ++\n', 829 '- 4. Complex is better than complicated.\n', 830 '? ^ ---- ^\n', 831 '+ 4. Complicated is better than complex.\n', 832 '? ++++ ^ ^\n', 833 '+ 5. Flat is better than nested.\n'] 834 835 As a single multi-line string it looks like this: 836 837 >>> print ''.join(result), 838 1. Beautiful is better than ugly. 839 - 2. Explicit is better than implicit. 840 - 3. Simple is better than complex. 841 + 3. Simple is better than complex. 842 ? ++ 843 - 4. Complex is better than complicated. 844 ? ^ ---- ^ 845 + 4. Complicated is better than complex. 846 ? ++++ ^ ^ 847 + 5. Flat is better than nested. 848 849 Methods: 850 851 __init__(linejunk=None, charjunk=None) 852 Construct a text differencer, with optional filters. 853 854 compare(a, b) 855 Compare two sequences of lines; generate the resulting delta. 856 """ 857 858 def __init__(self, linejunk=None, charjunk=None): 859 """ 860 Construct a text differencer, with optional filters. 861 862 The two optional keyword parameters are for filter functions: 863 864 - `linejunk`: A function that should accept a single string argument, 865 and return true iff the string is junk. The module-level function 866 `IS_LINE_JUNK` may be used to filter out lines without visible 867 characters, except for at most one splat ('#'). It is recommended 868 to leave linejunk None; as of Python 2.3, the underlying 869 SequenceMatcher class has grown an adaptive notion of "noise" lines 870 that's better than any static definition the author has ever been 871 able to craft. 872 873 - `charjunk`: A function that should accept a string of length 1. The 874 module-level function `IS_CHARACTER_JUNK` may be used to filter out 875 whitespace characters (a blank or tab; **note**: bad idea to include 876 newline in this!). Use of IS_CHARACTER_JUNK is recommended. 877 """ 878 879 self.linejunk = linejunk 880 self.charjunk = charjunk 881 882 def compare(self, a, b): 883 r""" 884 Compare two sequences of lines; generate the resulting delta. 885 886 Each sequence must contain individual single-line strings ending with 887 newlines. Such sequences can be obtained from the `readlines()` method 888 of file-like objects. The delta generated also consists of newline- 889 terminated strings, ready to be printed as-is via the writeline() 890 method of a file-like object. 891 892 Example: 893 894 >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), 895 ... 'ore\ntree\nemu\n'.splitlines(1))), 896 - one 897 ? ^ 898 + ore 899 ? ^ 900 - two 901 - three 902 ? - 903 + tree 904 + emu 905 """ 906 907 cruncher = SequenceMatcher(self.linejunk, a, b) 908 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): 909 if tag == 'replace': 910 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 911 elif tag == 'delete': 912 g = self._dump('-', a, alo, ahi) 913 elif tag == 'insert': 914 g = self._dump('+', b, blo, bhi) 915 elif tag == 'equal': 916 g = self._dump(' ', a, alo, ahi) 917 else: 918 raise ValueError, 'unknown tag %r' % (tag,) 919 920 for line in g: 921 yield line 922 923 def _dump(self, tag, x, lo, hi): 924 """Generate comparison results for a same-tagged range.""" 925 for i in xrange(lo, hi): 926 yield '%s %s' % (tag, x[i]) 927 928 def _plain_replace(self, a, alo, ahi, b, blo, bhi): 929 assert alo < ahi and blo < bhi 930 # dump the shorter block first -- reduces the burden on short-term 931 # memory if the blocks are of very different sizes 932 if bhi - blo < ahi - alo: 933 first = self._dump('+', b, blo, bhi) 934 second = self._dump('-', a, alo, ahi) 935 else: 936 first = self._dump('-', a, alo, ahi) 937 second = self._dump('+', b, blo, bhi) 938 939 for g in first, second: 940 for line in g: 941 yield line 942 943 def _fancy_replace(self, a, alo, ahi, b, blo, bhi): 944 r""" 945 When replacing one block of lines with another, search the blocks 946 for *similar* lines; the best-matching pair (if any) is used as a 947 synch point, and intraline difference marking is done on the 948 similar pair. Lots of work, but often worth it. 949 950 Example: 951 952 >>> d = Differ() 953 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, 954 ... ['abcdefGhijkl\n'], 0, 1) 955 >>> print ''.join(results), 956 - abcDefghiJkl 957 ? ^ ^ ^ 958 + abcdefGhijkl 959 ? ^ ^ ^ 960 """ 961 962 # don't synch up unless the lines have a similarity score of at 963 # least cutoff; best_ratio tracks the best score seen so far 964 best_ratio, cutoff = 0.74, 0.75 965 cruncher = SequenceMatcher(self.charjunk) 966 eqi, eqj = None, None # 1st indices of equal lines (if any) 967 968 # search for the pair that matches best without being identical 969 # (identical lines must be junk lines, & we don't want to synch up 970 # on junk -- unless we have to) 971 for j in xrange(blo, bhi): 972 bj = b[j] 973 cruncher.set_seq2(bj) 974 for i in xrange(alo, ahi): 975 ai = a[i] 976 if ai == bj: 977 if eqi is None: 978 eqi, eqj = i, j 979 continue 980 cruncher.set_seq1(ai) 981 # computing similarity is expensive, so use the quick 982 # upper bounds first -- have seen this speed up messy 983 # compares by a factor of 3. 984 # note that ratio() is only expensive to compute the first 985 # time it's called on a sequence pair; the expensive part 986 # of the computation is cached by cruncher 987 if cruncher.real_quick_ratio() > best_ratio and \ 988 cruncher.quick_ratio() > best_ratio and \ 989 cruncher.ratio() > best_ratio: 990 best_ratio, best_i, best_j = cruncher.ratio(), i, j 991 if best_ratio < cutoff: 992 # no non-identical "pretty close" pair 993 if eqi is None: 994 # no identical pair either -- treat it as a straight replace 995 for line in self._plain_replace(a, alo, ahi, b, blo, bhi): 996 yield line 997 return 998 # no close pair, but an identical pair -- synch up on that 999 best_i, best_j, best_ratio = eqi, eqj, 1.0 1000 else: 1001 # there's a close pair, so forget the identical pair (if any) 1002 eqi = None 1003 1004 # a[best_i] very similar to b[best_j]; eqi is None iff they're not 1005 # identical 1006 1007 # pump out diffs from before the synch point 1008 for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): 1009 yield line 1010 1011 # do intraline marking on the synch pair 1012 aelt, belt = a[best_i], b[best_j] 1013 if eqi is None: 1014 # pump out a '-', '?', '+', '?' quad for the synched lines 1015 atags = btags = "" 1016 cruncher.set_seqs(aelt, belt) 1017 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): 1018 la, lb = ai2 - ai1, bj2 - bj1 1019 if tag == 'replace': 1020 atags += '^' * la 1021 btags += '^' * lb 1022 elif tag == 'delete': 1023 atags += '-' * la 1024 elif tag == 'insert': 1025 btags += '+' * lb 1026 elif tag == 'equal': 1027 atags += ' ' * la 1028 btags += ' ' * lb 1029 else: 1030 raise ValueError, 'unknown tag %r' % (tag,) 1031 for line in self._qformat(aelt, belt, atags, btags): 1032 yield line 1033 else: 1034 # the synch pair is identical 1035 yield ' ' + aelt 1036 1037 # pump out diffs from after the synch point 1038 for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): 1039 yield line 1040 1041 def _fancy_helper(self, a, alo, ahi, b, blo, bhi): 1042 g = [] 1043 if alo < ahi: 1044 if blo < bhi: 1045 g = self._fancy_replace(a, alo, ahi, b, blo, bhi) 1046 else: 1047 g = self._dump('-', a, alo, ahi) 1048 elif blo < bhi: 1049 g = self._dump('+', b, blo, bhi) 1050 1051 for line in g: 1052 yield line 1053 1054 def _qformat(self, aline, bline, atags, btags): 1055 r""" 1056 Format "?" output and deal with leading tabs. 1057 1058 Example: 1059 1060 >>> d = Differ() 1061 >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n', 1062 ... ' ^ ^ ^ ', ' ^ ^ ^ ') 1063 >>> for line in results: print repr(line) 1064 ... 1065 '- \tabcDefghiJkl\n' 1066 '? \t ^ ^ ^\n' 1067 '+ \tabcdefGhijkl\n' 1068 '? \t ^ ^ ^\n' 1069 """ 1070 1071 # Can hurt, but will probably help most of the time. 1072 common = min(_count_leading(aline, "\t"), 1073 _count_leading(bline, "\t")) 1074 common = min(common, _count_leading(atags[:common], " ")) 1075 common = min(common, _count_leading(btags[:common], " ")) 1076 atags = atags[common:].rstrip() 1077 btags = btags[common:].rstrip() 1078 1079 yield "- " + aline 1080 if atags: 1081 yield "? %s%s\n" % ("\t" * common, atags) 1082 1083 yield "+ " + bline 1084 if btags: 1085 yield "? %s%s\n" % ("\t" * common, btags) 1086 1087 # With respect to junk, an earlier version of ndiff simply refused to 1088 # *start* a match with a junk element. The result was cases like this: 1089 # before: private Thread currentThread; 1090 # after: private volatile Thread currentThread; 1091 # If you consider whitespace to be junk, the longest contiguous match 1092 # not starting with junk is "e Thread currentThread". So ndiff reported 1093 # that "e volatil" was inserted between the 't' and the 'e' in "private". 1094 # While an accurate view, to people that's absurd. The current version 1095 # looks for matching blocks that are entirely junk-free, then extends the 1096 # longest one of those as far as possible but only with matching junk. 1097 # So now "currentThread" is matched, then extended to suck up the 1098 # preceding blank; then "private" is matched, and extended to suck up the 1099 # following blank; then "Thread" is matched; and finally ndiff reports 1100 # that "volatile " was inserted before "Thread". The only quibble 1101 # remaining is that perhaps it was really the case that " volatile" 1102 # was inserted after "private". I can live with that <wink>. 1103 1104 import re 1105 1106 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match): 1107 r""" 1108 Return 1 for ignorable line: iff `line` is blank or contains a single '#'. 1109 1110 Examples: 1111 1112 >>> IS_LINE_JUNK('\n') 1113 True 1114 >>> IS_LINE_JUNK(' # \n') 1115 True 1116 >>> IS_LINE_JUNK('hello\n') 1117 False 1118 """ 1119 1120 return pat(line) is not None 1121 1122 def IS_CHARACTER_JUNK(ch, ws=" \t"): 1123 r""" 1124 Return 1 for ignorable character: iff `ch` is a space or tab. 1125 1126 Examples: 1127 1128 >>> IS_CHARACTER_JUNK(' ') 1129 True 1130 >>> IS_CHARACTER_JUNK('\t') 1131 True 1132 >>> IS_CHARACTER_JUNK('\n') 1133 False 1134 >>> IS_CHARACTER_JUNK('x') 1135 False 1136 """ 1137 1138 return ch in ws 1139 1140 1141 ######################################################################## 1142 ### Unified Diff 1143 ######################################################################## 1144 1145 def _format_range_unified(start, stop): 1146 'Convert range to the "ed" format' 1147 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1148 beginning = start + 1 # lines start numbering with one 1149 length = stop - start 1150 if length == 1: 1151 return '{}'.format(beginning) 1152 if not length: 1153 beginning -= 1 # empty ranges begin at line just before the range 1154 return '{},{}'.format(beginning, length) 1155 1156 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 1157 tofiledate='', n=3, lineterm='\n'): 1158 r""" 1159 Compare two sequences of lines; generate the delta as a unified diff. 1160 1161 Unified diffs are a compact way of showing line changes and a few 1162 lines of context. The number of context lines is set by 'n' which 1163 defaults to three. 1164 1165 By default, the diff control lines (those with ---, +++, or @@) are 1166 created with a trailing newline. This is helpful so that inputs 1167 created from file.readlines() result in diffs that are suitable for 1168 file.writelines() since both the inputs and outputs have trailing 1169 newlines. 1170 1171 For inputs that do not have trailing newlines, set the lineterm 1172 argument to "" so that the output will be uniformly newline free. 1173 1174 The unidiff format normally has a header for filenames and modification 1175 times. Any or all of these may be specified using strings for 1176 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1177 The modification times are normally expressed in the ISO 8601 format. 1178 1179 Example: 1180 1181 >>> for line in unified_diff('one two three four'.split(), 1182 ... 'zero one tree four'.split(), 'Original', 'Current', 1183 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', 1184 ... lineterm=''): 1185 ... print line # doctest: +NORMALIZE_WHITESPACE 1186 --- Original 2005-01-26 23:30:50 1187 +++ Current 2010-04-02 10:20:52 1188 @@ -1,4 +1,4 @@ 1189 +zero 1190 one 1191 -two 1192 -three 1193 +tree 1194 four 1195 """ 1196 1197 started = False 1198 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1199 if not started: 1200 started = True 1201 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1202 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1203 yield '--- {}{}{}'.format(fromfile, fromdate, lineterm) 1204 yield '+++ {}{}{}'.format(tofile, todate, lineterm) 1205 1206 first, last = group[0], group[-1] 1207 file1_range = _format_range_unified(first[1], last[2]) 1208 file2_range = _format_range_unified(first[3], last[4]) 1209 yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm) 1210 1211 for tag, i1, i2, j1, j2 in group: 1212 if tag == 'equal': 1213 for line in a[i1:i2]: 1214 yield ' ' + line 1215 continue 1216 if tag in ('replace', 'delete'): 1217 for line in a[i1:i2]: 1218 yield '-' + line 1219 if tag in ('replace', 'insert'): 1220 for line in b[j1:j2]: 1221 yield '+' + line 1222 1223 1224 ######################################################################## 1225 ### Context Diff 1226 ######################################################################## 1227 1228 def _format_range_context(start, stop): 1229 'Convert range to the "ed" format' 1230 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1231 beginning = start + 1 # lines start numbering with one 1232 length = stop - start 1233 if not length: 1234 beginning -= 1 # empty ranges begin at line just before the range 1235 if length <= 1: 1236 return '{}'.format(beginning) 1237 return '{},{}'.format(beginning, beginning + length - 1) 1238 1239 # See http://www.unix.org/single_unix_specification/ 1240 def context_diff(a, b, fromfile='', tofile='', 1241 fromfiledate='', tofiledate='', n=3, lineterm='\n'): 1242 r""" 1243 Compare two sequences of lines; generate the delta as a context diff. 1244 1245 Context diffs are a compact way of showing line changes and a few 1246 lines of context. The number of context lines is set by 'n' which 1247 defaults to three. 1248 1249 By default, the diff control lines (those with *** or ---) are 1250 created with a trailing newline. This is helpful so that inputs 1251 created from file.readlines() result in diffs that are suitable for 1252 file.writelines() since both the inputs and outputs have trailing 1253 newlines. 1254 1255 For inputs that do not have trailing newlines, set the lineterm 1256 argument to "" so that the output will be uniformly newline free. 1257 1258 The context diff format normally has a header for filenames and 1259 modification times. Any or all of these may be specified using 1260 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1261 The modification times are normally expressed in the ISO 8601 format. 1262 If not specified, the strings default to blanks. 1263 1264 Example: 1265 1266 >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), 1267 ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')), 1268 *** Original 1269 --- Current 1270 *************** 1271 *** 1,4 **** 1272 one 1273 ! two 1274 ! three 1275 four 1276 --- 1,4 ---- 1277 + zero 1278 one 1279 ! tree 1280 four 1281 """ 1282 1283 prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') 1284 started = False 1285 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1286 if not started: 1287 started = True 1288 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1289 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1290 yield '*** {}{}{}'.format(fromfile, fromdate, lineterm) 1291 yield '--- {}{}{}'.format(tofile, todate, lineterm) 1292 1293 first, last = group[0], group[-1] 1294 yield '***************' + lineterm 1295 1296 file1_range = _format_range_context(first[1], last[2]) 1297 yield '*** {} ****{}'.format(file1_range, lineterm) 1298 1299 if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group): 1300 for tag, i1, i2, _, _ in group: 1301 if tag != 'insert': 1302 for line in a[i1:i2]: 1303 yield prefix[tag] + line 1304 1305 file2_range = _format_range_context(first[3], last[4]) 1306 yield '--- {} ----{}'.format(file2_range, lineterm) 1307 1308 if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group): 1309 for tag, _, _, j1, j2 in group: 1310 if tag != 'delete': 1311 for line in b[j1:j2]: 1312 yield prefix[tag] + line 1313 1314 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): 1315 r""" 1316 Compare `a` and `b` (lists of strings); return a `Differ`-style delta. 1317 1318 Optional keyword parameters `linejunk` and `charjunk` are for filter 1319 functions (or None): 1320 1321 - linejunk: A function that should accept a single string argument, and 1322 return true iff the string is junk. The default is None, and is 1323 recommended; as of Python 2.3, an adaptive notion of "noise" lines is 1324 used that does a good job on its own. 1325 1326 - charjunk: A function that should accept a string of length 1. The 1327 default is module-level function IS_CHARACTER_JUNK, which filters out 1328 whitespace characters (a blank or tab; note: bad idea to include newline 1329 in this!). 1330 1331 Tools/scripts/ndiff.py is a command-line front-end to this function. 1332 1333 Example: 1334 1335 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), 1336 ... 'ore\ntree\nemu\n'.splitlines(1)) 1337 >>> print ''.join(diff), 1338 - one 1339 ? ^ 1340 + ore 1341 ? ^ 1342 - two 1343 - three 1344 ? - 1345 + tree 1346 + emu 1347 """ 1348 return Differ(linejunk, charjunk).compare(a, b) 1349 1350 def _mdiff(fromlines, tolines, context=None, linejunk=None, 1351 charjunk=IS_CHARACTER_JUNK): 1352 r"""Returns generator yielding marked up from/to side by side differences. 1353 1354 Arguments: 1355 fromlines -- list of text lines to compared to tolines 1356 tolines -- list of text lines to be compared to fromlines 1357 context -- number of context lines to display on each side of difference, 1358 if None, all from/to text lines will be generated. 1359 linejunk -- passed on to ndiff (see ndiff documentation) 1360 charjunk -- passed on to ndiff (see ndiff documentation) 1361 1362 This function returns an iterator which returns a tuple: 1363 (from line tuple, to line tuple, boolean flag) 1364 1365 from/to line tuple -- (line num, line text) 1366 line num -- integer or None (to indicate a context separation) 1367 line text -- original line text with following markers inserted: 1368 '\0+' -- marks start of added text 1369 '\0-' -- marks start of deleted text 1370 '\0^' -- marks start of changed text 1371 '\1' -- marks end of added/deleted/changed text 1372 1373 boolean flag -- None indicates context separation, True indicates 1374 either "from" or "to" line contains a change, otherwise False. 1375 1376 This function/iterator was originally developed to generate side by side 1377 file difference for making HTML pages (see HtmlDiff class for example 1378 usage). 1379 1380 Note, this function utilizes the ndiff function to generate the side by 1381 side difference markup. Optional ndiff arguments may be passed to this 1382 function and they in turn will be passed to ndiff. 1383 """ 1384 import re 1385 1386 # regular expression for finding intraline change indices 1387 change_re = re.compile('(\++|\-+|\^+)') 1388 1389 # create the difference iterator to generate the differences 1390 diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk) 1391 1392 def _make_line(lines, format_key, side, num_lines=[0,0]): 1393 """Returns line of text with user's change markup and line formatting. 1394 1395 lines -- list of lines from the ndiff generator to produce a line of 1396 text from. When producing the line of text to return, the 1397 lines used are removed from this list. 1398 format_key -- '+' return first line in list with "add" markup around 1399 the entire line. 1400 '-' return first line in list with "delete" markup around 1401 the entire line. 1402 '?' return first line in list with add/delete/change 1403 intraline markup (indices obtained from second line) 1404 None return first line in list with no markup 1405 side -- indice into the num_lines list (0=from,1=to) 1406 num_lines -- from/to current line number. This is NOT intended to be a 1407 passed parameter. It is present as a keyword argument to 1408 maintain memory of the current line numbers between calls 1409 of this function. 1410 1411 Note, this function is purposefully not defined at the module scope so 1412 that data it needs from its parent function (within whose context it 1413 is defined) does not need to be of module scope. 1414 """ 1415 num_lines[side] += 1 1416 # Handle case where no user markup is to be added, just return line of 1417 # text with user's line format to allow for usage of the line number. 1418 if format_key is None: 1419 return (num_lines[side],lines.pop(0)[2:]) 1420 # Handle case of intraline changes 1421 if format_key == '?': 1422 text, markers = lines.pop(0), lines.pop(0) 1423 # find intraline changes (store change type and indices in tuples) 1424 sub_info = [] 1425 def record_sub_info(match_object,sub_info=sub_info): 1426 sub_info.append([match_object.group(1)[0],match_object.span()]) 1427 return match_object.group(1) 1428 change_re.sub(record_sub_info,markers) 1429 # process each tuple inserting our special marks that won't be 1430 # noticed by an xml/html escaper. 1431 for key,(begin,end) in sub_info[::-1]: 1432 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:] 1433 text = text[2:] 1434 # Handle case of add/delete entire line 1435 else: 1436 text = lines.pop(0)[2:] 1437 # if line of text is just a newline, insert a space so there is 1438 # something for the user to highlight and see. 1439 if not text: 1440 text = ' ' 1441 # insert marks that won't be noticed by an xml/html escaper. 1442 text = '\0' + format_key + text + '\1' 1443 # Return line of text, first allow user's line formatter to do its 1444 # thing (such as adding the line number) then replace the special 1445 # marks with what the user's change markup. 1446 return (num_lines[side],text) 1447 1448 def _line_iterator(): 1449 """Yields from/to lines of text with a change indication. 1450 1451 This function is an iterator. It itself pulls lines from a 1452 differencing iterator, processes them and yields them. When it can 1453 it yields both a "from" and a "to" line, otherwise it will yield one 1454 or the other. In addition to yielding the lines of from/to text, a 1455 boolean flag is yielded to indicate if the text line(s) have 1456 differences in them. 1457 1458 Note, this function is purposefully not defined at the module scope so 1459 that data it needs from its parent function (within whose context it 1460 is defined) does not need to be of module scope. 1461 """ 1462 lines = [] 1463 num_blanks_pending, num_blanks_to_yield = 0, 0 1464 while True: 1465 # Load up next 4 lines so we can look ahead, create strings which 1466 # are a concatenation of the first character of each of the 4 lines 1467 # so we can do some very readable comparisons. 1468 while len(lines) < 4: 1469 try: 1470 lines.append(diff_lines_iterator.next()) 1471 except StopIteration: 1472 lines.append('X') 1473 s = ''.join([line[0] for line in lines]) 1474 if s.startswith('X'): 1475 # When no more lines, pump out any remaining blank lines so the 1476 # corresponding add/delete lines get a matching blank line so 1477 # all line pairs get yielded at the next level. 1478 num_blanks_to_yield = num_blanks_pending 1479 elif s.startswith('-?+?'): 1480 # simple intraline change 1481 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True 1482 continue 1483 elif s.startswith('--++'): 1484 # in delete block, add block coming: we do NOT want to get 1485 # caught up on blank lines yet, just process the delete line 1486 num_blanks_pending -= 1 1487 yield _make_line(lines,'-',0), None, True 1488 continue 1489 elif s.startswith(('--?+', '--+', '- ')): 1490 # in delete block and see an intraline change or unchanged line 1491 # coming: yield the delete line and then blanks 1492 from_line,to_line = _make_line(lines,'-',0), None 1493 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0 1494 elif s.startswith('-+?'): 1495 # intraline change 1496 yield _make_line(lines,None,0), _make_line(lines,'?',1), True 1497 continue 1498 elif s.startswith('-?+'): 1499 # intraline change 1500 yield _make_line(lines,'?',0), _make_line(lines,None,1), True 1501 continue 1502 elif s.startswith('-'): 1503 # delete FROM line 1504 num_blanks_pending -= 1 1505 yield _make_line(lines,'-',0), None, True 1506 continue 1507 elif s.startswith('+--'): 1508 # in add block, delete block coming: we do NOT want to get 1509 # caught up on blank lines yet, just process the add line 1510 num_blanks_pending += 1 1511 yield None, _make_line(lines,'+',1), True 1512 continue 1513 elif s.startswith(('+ ', '+-')): 1514 # will be leaving an add block: yield blanks then add line 1515 from_line, to_line = None, _make_line(lines,'+',1) 1516 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0 1517 elif s.startswith('+'): 1518 # inside an add block, yield the add line 1519 num_blanks_pending += 1 1520 yield None, _make_line(lines,'+',1), True 1521 continue 1522 elif s.startswith(' '): 1523 # unchanged text, yield it to both sides 1524 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False 1525 continue 1526 # Catch up on the blank lines so when we yield the next from/to 1527 # pair, they are lined up. 1528 while(num_blanks_to_yield < 0): 1529 num_blanks_to_yield += 1 1530 yield None,('','\n'),True 1531 while(num_blanks_to_yield > 0): 1532 num_blanks_to_yield -= 1 1533 yield ('','\n'),None,True 1534 if s.startswith('X'): 1535 raise StopIteration 1536 else: 1537 yield from_line,to_line,True 1538 1539 def _line_pair_iterator(): 1540 """Yields from/to lines of text with a change indication. 1541 1542 This function is an iterator. It itself pulls lines from the line 1543 iterator. Its difference from that iterator is that this function 1544 always yields a pair of from/to text lines (with the change 1545 indication). If necessary it will collect single from/to lines 1546 until it has a matching pair from/to pair to yield. 1547 1548 Note, this function is purposefully not defined at the module scope so 1549 that data it needs from its parent function (within whose context it 1550 is defined) does not need to be of module scope. 1551 """ 1552 line_iterator = _line_iterator() 1553 fromlines,tolines=[],[] 1554 while True: 1555 # Collecting lines of text until we have a from/to pair 1556 while (len(fromlines)==0 or len(tolines)==0): 1557 from_line, to_line, found_diff =line_iterator.next() 1558 if from_line is not None: 1559 fromlines.append((from_line,found_diff)) 1560 if to_line is not None: 1561 tolines.append((to_line,found_diff)) 1562 # Once we have a pair, remove them from the collection and yield it 1563 from_line, fromDiff = fromlines.pop(0) 1564 to_line, to_diff = tolines.pop(0) 1565 yield (from_line,to_line,fromDiff or to_diff) 1566 1567 # Handle case where user does not want context differencing, just yield 1568 # them up without doing anything else with them. 1569 line_pair_iterator = _line_pair_iterator() 1570 if context is None: 1571 while True: 1572 yield line_pair_iterator.next() 1573 # Handle case where user wants context differencing. We must do some 1574 # storage of lines until we know for sure that they are to be yielded. 1575 else: 1576 context += 1 1577 lines_to_write = 0 1578 while True: 1579 # Store lines up until we find a difference, note use of a 1580 # circular queue because we only need to keep around what 1581 # we need for context. 1582 index, contextLines = 0, [None]*(context) 1583 found_diff = False 1584 while(found_diff is False): 1585 from_line, to_line, found_diff = line_pair_iterator.next() 1586 i = index % context 1587 contextLines[i] = (from_line, to_line, found_diff) 1588 index += 1 1589 # Yield lines that we have collected so far, but first yield 1590 # the user's separator. 1591 if index > context: 1592 yield None, None, None 1593 lines_to_write = context 1594 else: 1595 lines_to_write = index 1596 index = 0 1597 while(lines_to_write): 1598 i = index % context 1599 index += 1 1600 yield contextLines[i] 1601 lines_to_write -= 1 1602 # Now yield the context lines after the change 1603 lines_to_write = context-1 1604 while(lines_to_write): 1605 from_line, to_line, found_diff = line_pair_iterator.next() 1606 # If another change within the context, extend the context 1607 if found_diff: 1608 lines_to_write = context-1 1609 else: 1610 lines_to_write -= 1 1611 yield from_line, to_line, found_diff 1612 1613 1614 _file_template = """ 1615 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 1616 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 1617 1618 <html> 1619 1620 <head> 1621 <meta http-equiv="Content-Type" 1622 content="text/html; charset=ISO-8859-1" /> 1623 <title></title> 1624 <style type="text/css">%(styles)s 1625 </style> 1626 </head> 1627 1628 <body> 1629 %(table)s%(legend)s 1630 </body> 1631 1632 </html>""" 1633 1634 _styles = """ 1635 table.diff {font-family:Courier; border:medium;} 1636 .diff_header {background-color:#e0e0e0} 1637 td.diff_header {text-align:right} 1638 .diff_next {background-color:#c0c0c0} 1639 .diff_add {background-color:#aaffaa} 1640 .diff_chg {background-color:#ffff77} 1641 .diff_sub {background-color:#ffaaaa}""" 1642 1643 _table_template = """ 1644 <table class="diff" id="difflib_chg_%(prefix)s_top" 1645 cellspacing="0" cellpadding="0" rules="groups" > 1646 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1647 <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> 1648 %(header_row)s 1649 <tbody> 1650 %(data_rows)s </tbody> 1651 </table>""" 1652 1653 _legend = """ 1654 <table class="diff" summary="Legends"> 1655 <tr> <th colspan="2"> Legends </th> </tr> 1656 <tr> <td> <table border="" summary="Colors"> 1657 <tr><th> Colors </th> </tr> 1658 <tr><td class="diff_add"> Added </td></tr> 1659 <tr><td class="diff_chg">Changed</td> </tr> 1660 <tr><td class="diff_sub">Deleted</td> </tr> 1661 </table></td> 1662 <td> <table border="" summary="Links"> 1663 <tr><th colspan="2"> Links </th> </tr> 1664 <tr><td>(f)irst change</td> </tr> 1665 <tr><td>(n)ext change</td> </tr> 1666 <tr><td>(t)op</td> </tr> 1667 </table></td> </tr> 1668 </table>""" 1669 1670 class HtmlDiff(object): 1671 """For producing HTML side by side comparison with change highlights. 1672 1673 This class can be used to create an HTML table (or a complete HTML file 1674 containing the table) showing a side by side, line by line comparison 1675 of text with inter-line and intra-line change highlights. The table can 1676 be generated in either full or contextual difference mode. 1677 1678 The following methods are provided for HTML generation: 1679 1680 make_table -- generates HTML for a single side by side table 1681 make_file -- generates complete HTML file with a single side by side table 1682 1683 See tools/scripts/diff.py for an example usage of this class. 1684 """ 1685 1686 _file_template = _file_template 1687 _styles = _styles 1688 _table_template = _table_template 1689 _legend = _legend 1690 _default_prefix = 0 1691 1692 def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None, 1693 charjunk=IS_CHARACTER_JUNK): 1694 """HtmlDiff instance initializer 1695 1696 Arguments: 1697 tabsize -- tab stop spacing, defaults to 8. 1698 wrapcolumn -- column number where lines are broken and wrapped, 1699 defaults to None where lines are not wrapped. 1700 linejunk,charjunk -- keyword arguments passed into ndiff() (used to by 1701 HtmlDiff() to generate the side by side HTML differences). See 1702 ndiff() documentation for argument default values and descriptions. 1703 """ 1704 self._tabsize = tabsize 1705 self._wrapcolumn = wrapcolumn 1706 self._linejunk = linejunk 1707 self._charjunk = charjunk 1708 1709 def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False, 1710 numlines=5): 1711 """Returns HTML file of side by side comparison with change highlights 1712 1713 Arguments: 1714 fromlines -- list of "from" lines 1715 tolines -- list of "to" lines 1716 fromdesc -- "from" file column header string 1717 todesc -- "to" file column header string 1718 context -- set to True for contextual differences (defaults to False 1719 which shows full differences). 1720 numlines -- number of context lines. When context is set True, 1721 controls number of lines displayed before and after the change. 1722 When context is False, controls the number of lines to place 1723 the "next" link anchors before the next change (so click of 1724 "next" link jumps to just before the change). 1725 """ 1726 1727 return self._file_template % dict( 1728 styles = self._styles, 1729 legend = self._legend, 1730 table = self.make_table(fromlines,tolines,fromdesc,todesc, 1731 context=context,numlines=numlines)) 1732 1733 def _tab_newline_replace(self,fromlines,tolines): 1734 """Returns from/to line lists with tabs expanded and newlines removed. 1735 1736 Instead of tab characters being replaced by the number of spaces 1737 needed to fill in to the next tab stop, this function will fill 1738 the space with tab characters. This is done so that the difference 1739 algorithms can identify changes in a file when tabs are replaced by 1740 spaces and vice versa. At the end of the HTML generation, the tab 1741 characters will be replaced with a nonbreakable space. 1742 """ 1743 def expand_tabs(line): 1744 # hide real spaces 1745 line = line.replace(' ','\0') 1746 # expand tabs into spaces 1747 line = line.expandtabs(self._tabsize) 1748 # replace spaces from expanded tabs back into tab characters 1749 # (we'll replace them with markup after we do differencing) 1750 line = line.replace(' ','\t') 1751 return line.replace('\0',' ').rstrip('\n') 1752 fromlines = [expand_tabs(line) for line in fromlines] 1753 tolines = [expand_tabs(line) for line in tolines] 1754 return fromlines,tolines 1755 1756 def _split_line(self,data_list,line_num,text): 1757 """Builds list of text lines by splitting text lines at wrap point 1758 1759 This function will determine if the input text line needs to be 1760 wrapped (split) into separate lines. If so, the first wrap point 1761 will be determined and the first line appended to the output 1762 text line list. This function is used recursively to handle 1763 the second part of the split line to further split it. 1764 """ 1765 # if blank line or context separator, just add it to the output list 1766 if not line_num: 1767 data_list.append((line_num,text)) 1768 return 1769 1770 # if line text doesn't need wrapping, just add it to the output list 1771 size = len(text) 1772 max = self._wrapcolumn 1773 if (size <= max) or ((size -(text.count('\0')*3)) <= max): 1774 data_list.append((line_num,text)) 1775 return 1776 1777 # scan text looking for the wrap point, keeping track if the wrap 1778 # point is inside markers 1779 i = 0 1780 n = 0 1781 mark = '' 1782 while n < max and i < size: 1783 if text[i] == '\0': 1784 i += 1 1785 mark = text[i] 1786 i += 1 1787 elif text[i] == '\1': 1788 i += 1 1789 mark = '' 1790 else: 1791 i += 1 1792 n += 1 1793 1794 # wrap point is inside text, break it up into separate lines 1795 line1 = text[:i] 1796 line2 = text[i:] 1797 1798 # if wrap point is inside markers, place end marker at end of first 1799 # line and start marker at beginning of second line because each 1800 # line will have its own table tag markup around it. 1801 if mark: 1802 line1 = line1 + '\1' 1803 line2 = '\0' + mark + line2 1804 1805 # tack on first line onto the output list 1806 data_list.append((line_num,line1)) 1807 1808 # use this routine again to wrap the remaining text 1809 self._split_line(data_list,'>',line2) 1810 1811 def _line_wrapper(self,diffs): 1812 """Returns iterator that splits (wraps) mdiff text lines""" 1813 1814 # pull from/to data and flags from mdiff iterator 1815 for fromdata,todata,flag in diffs: 1816 # check for context separators and pass them through 1817 if flag is None: 1818 yield fromdata,todata,flag 1819 continue 1820 (fromline,fromtext),(toline,totext) = fromdata,todata 1821 # for each from/to line split it at the wrap column to form 1822 # list of text lines. 1823 fromlist,tolist = [],[] 1824 self._split_line(fromlist,fromline,fromtext) 1825 self._split_line(tolist,toline,totext) 1826 # yield from/to line in pairs inserting blank lines as 1827 # necessary when one side has more wrapped lines 1828 while fromlist or tolist: 1829 if fromlist: 1830 fromdata = fromlist.pop(0) 1831 else: 1832 fromdata = ('',' ') 1833 if tolist: 1834 todata = tolist.pop(0) 1835 else: 1836 todata = ('',' ') 1837 yield fromdata,todata,flag 1838 1839 def _collect_lines(self,diffs): 1840 """Collects mdiff output into separate lists 1841 1842 Before storing the mdiff from/to data into a list, it is converted 1843 into a single line of text with HTML markup. 1844 """ 1845 1846 fromlist,tolist,flaglist = [],[],[] 1847 # pull from/to data and flags from mdiff style iterator 1848 for fromdata,todata,flag in diffs: 1849 try: 1850 # store HTML markup of the lines into the lists 1851 fromlist.append(self._format_line(0,flag,*fromdata)) 1852 tolist.append(self._format_line(1,flag,*todata)) 1853 except TypeError: 1854 # exceptions occur for lines where context separators go 1855 fromlist.append(None) 1856 tolist.append(None) 1857 flaglist.append(flag) 1858 return fromlist,tolist,flaglist 1859 1860 def _format_line(self,side,flag,linenum,text): 1861 """Returns HTML markup of "from" / "to" text lines 1862 1863 side -- 0 or 1 indicating "from" or "to" text 1864 flag -- indicates if difference on line 1865 linenum -- line number (used for line number column) 1866 text -- line text to be marked up 1867 """ 1868 try: 1869 linenum = '%d' % linenum 1870 id = ' id="%s%s"' % (self._prefix[side],linenum) 1871 except TypeError: 1872 # handle blank lines where linenum is '>' or '' 1873 id = '' 1874 # replace those things that would get confused with HTML symbols 1875 text=text.replace("&","&").replace(">",">").replace("<","<") 1876 1877 # make space non-breakable so they don't get compressed or line wrapped 1878 text = text.replace(' ',' ').rstrip() 1879 1880 return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \ 1881 % (id,linenum,text) 1882 1883 def _make_prefix(self): 1884 """Create unique anchor prefixes""" 1885 1886 # Generate a unique anchor prefix so multiple tables 1887 # can exist on the same HTML page without conflicts. 1888 fromprefix = "from%d_" % HtmlDiff._default_prefix 1889 toprefix = "to%d_" % HtmlDiff._default_prefix 1890 HtmlDiff._default_prefix += 1 1891 # store prefixes so line format method has access 1892 self._prefix = [fromprefix,toprefix] 1893 1894 def _convert_flags(self,fromlist,tolist,flaglist,context,numlines): 1895 """Makes list of "next" links""" 1896 1897 # all anchor names will be generated using the unique "to" prefix 1898 toprefix = self._prefix[1] 1899 1900 # process change flags, generating middle column of next anchors/links 1901 next_id = ['']*len(flaglist) 1902 next_href = ['']*len(flaglist) 1903 num_chg, in_change = 0, False 1904 last = 0 1905 for i,flag in enumerate(flaglist): 1906 if flag: 1907 if not in_change: 1908 in_change = True 1909 last = i 1910 # at the beginning of a change, drop an anchor a few lines 1911 # (the context lines) before the change for the previous 1912 # link 1913 i = max([0,i-numlines]) 1914 next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg) 1915 # at the beginning of a change, drop a link to the next 1916 # change 1917 num_chg += 1 1918 next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % ( 1919 toprefix,num_chg) 1920 else: 1921 in_change = False 1922 # check for cases where there is no content to avoid exceptions 1923 if not flaglist: 1924 flaglist = [False] 1925 next_id = [''] 1926 next_href = [''] 1927 last = 0 1928 if context: 1929 fromlist = ['<td></td><td> No Differences Found </td>'] 1930 tolist = fromlist 1931 else: 1932 fromlist = tolist = ['<td></td><td> Empty File </td>'] 1933 # if not a change on first line, drop a link 1934 if not flaglist[0]: 1935 next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix 1936 # redo the last link to link to the top 1937 next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix) 1938 1939 return fromlist,tolist,flaglist,next_href,next_id 1940 1941 def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, 1942 numlines=5): 1943 """Returns HTML table of side by side comparison with change highlights 1944 1945 Arguments: 1946 fromlines -- list of "from" lines 1947 tolines -- list of "to" lines 1948 fromdesc -- "from" file column header string 1949 todesc -- "to" file column header string 1950 context -- set to True for contextual differences (defaults to False 1951 which shows full differences). 1952 numlines -- number of context lines. When context is set True, 1953 controls number of lines displayed before and after the change. 1954 When context is False, controls the number of lines to place 1955 the "next" link anchors before the next change (so click of 1956 "next" link jumps to just before the change). 1957 """ 1958 1959 # make unique anchor prefixes so that multiple tables may exist 1960 # on the same page without conflict. 1961 self._make_prefix() 1962 1963 # change tabs to spaces before it gets more difficult after we insert 1964 # markup 1965 fromlines,tolines = self._tab_newline_replace(fromlines,tolines) 1966 1967 # create diffs iterator which generates side by side from/to data 1968 if context: 1969 context_lines = numlines 1970 else: 1971 context_lines = None 1972 diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, 1973 charjunk=self._charjunk) 1974 1975 # set up iterator to wrap lines that exceed desired width 1976 if self._wrapcolumn: 1977 diffs = self._line_wrapper(diffs) 1978 1979 # collect up from/to lines and flags into lists (also format the lines) 1980 fromlist,tolist,flaglist = self._collect_lines(diffs) 1981 1982 # process change flags, generating middle column of next anchors/links 1983 fromlist,tolist,flaglist,next_href,next_id = self._convert_flags( 1984 fromlist,tolist,flaglist,context,numlines) 1985 1986 s = [] 1987 fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \ 1988 '<td class="diff_next">%s</td>%s</tr>\n' 1989 for i in range(len(flaglist)): 1990 if flaglist[i] is None: 1991 # mdiff yields None on separator lines skip the bogus ones 1992 # generated for the first line 1993 if i > 0: 1994 s.append(' </tbody> \n <tbody>\n') 1995 else: 1996 s.append( fmt % (next_id[i],next_href[i],fromlist[i], 1997 next_href[i],tolist[i])) 1998 if fromdesc or todesc: 1999 header_row = '<thead><tr>%s%s%s%s</tr></thead>' % ( 2000 '<th class="diff_next"><br /></th>', 2001 '<th colspan="2" class="diff_header">%s</th>' % fromdesc, 2002 '<th class="diff_next"><br /></th>', 2003 '<th colspan="2" class="diff_header">%s</th>' % todesc) 2004 else: 2005 header_row = '' 2006 2007 table = self._table_template % dict( 2008 data_rows=''.join(s), 2009 header_row=header_row, 2010 prefix=self._prefix[1]) 2011 2012 return table.replace('\0+','<span class="diff_add">'). \ 2013 replace('\0-','<span class="diff_sub">'). \ 2014 replace('\0^','<span class="diff_chg">'). \ 2015 replace('\1','</span>'). \ 2016 replace('\t',' ') 2017 2018 del re 2019 2020 def restore(delta, which): 2021 r""" 2022 Generate one of the two sequences that generated a delta. 2023 2024 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract 2025 lines originating from file 1 or 2 (parameter `which`), stripping off line 2026 prefixes. 2027 2028 Examples: 2029 2030 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), 2031 ... 'ore\ntree\nemu\n'.splitlines(1)) 2032 >>> diff = list(diff) 2033 >>> print ''.join(restore(diff, 1)), 2034 one 2035 two 2036 three 2037 >>> print ''.join(restore(diff, 2)), 2038 ore 2039 tree 2040 emu 2041 """ 2042 try: 2043 tag = {1: "- ", 2: "+ "}[int(which)] 2044 except KeyError: 2045 raise ValueError, ('unknown delta choice (must be 1 or 2): %r' 2046 % which) 2047 prefixes = (" ", tag) 2048 for line in delta: 2049 if line[:2] in prefixes: 2050 yield line[2:] 2051 2052 def _test(): 2053 import doctest, difflib 2054 return doctest.testmod(difflib) 2055 2056 if __name__ == "__main__": 2057 _test() 2058