# Python 2 standard library -- Lib/difflib.py (excerpt: header through SequenceMatcher)
      1 """
      2 Module difflib -- helpers for computing deltas between objects.
      3 
      4 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
      5     Use SequenceMatcher to return list of the best "good enough" matches.
      6 
      7 Function context_diff(a, b):
      8     For two lists of strings, return a delta in context diff format.
      9 
     10 Function ndiff(a, b):
     11     Return a delta: the difference between `a` and `b` (lists of strings).
     12 
     13 Function restore(delta, which):
     14     Return one of the two sequences that generated an ndiff delta.
     15 
     16 Function unified_diff(a, b):
     17     For two lists of strings, return a delta in unified diff format.
     18 
     19 Class SequenceMatcher:
     20     A flexible class for comparing pairs of sequences of any type.
     21 
     22 Class Differ:
     23     For producing human-readable deltas from sequences of lines of text.
     24 
     25 Class HtmlDiff:
     26     For producing HTML side by side comparison with change highlights.
     27 """
     28 
     29 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
     30            'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
     31            'unified_diff', 'HtmlDiff', 'Match']
     32 
     33 import heapq
     34 from collections import namedtuple as _namedtuple
     35 from functools import reduce
     36 
     37 Match = _namedtuple('Match', 'a b size')
     38 
     39 def _calculate_ratio(matches, length):
     40     if length:
     41         return 2.0 * matches / length
     42     return 1.0
     43 
     44 class SequenceMatcher:
     45 
     46     """
     47     SequenceMatcher is a flexible class for comparing pairs of sequences of
     48     any type, so long as the sequence elements are hashable.  The basic
     49     algorithm predates, and is a little fancier than, an algorithm
     50     published in the late 1980's by Ratcliff and Obershelp under the
     51     hyperbolic name "gestalt pattern matching".  The basic idea is to find
     52     the longest contiguous matching subsequence that contains no "junk"
     53     elements (R-O doesn't address junk).  The same idea is then applied
     54     recursively to the pieces of the sequences to the left and to the right
     55     of the matching subsequence.  This does not yield minimal edit
     56     sequences, but does tend to yield matches that "look right" to people.
     57 
     58     SequenceMatcher tries to compute a "human-friendly diff" between two
     59     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
     60     longest *contiguous* & junk-free matching subsequence.  That's what
     61     catches peoples' eyes.  The Windows(tm) windiff has another interesting
     62     notion, pairing up elements that appear uniquely in each sequence.
     63     That, and the method here, appear to yield more intuitive difference
     64     reports than does diff.  This method appears to be the least vulnerable
     65     to synching up on blocks of "junk lines", though (like blank lines in
     66     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
     67     because this is the only method of the 3 that has a *concept* of
     68     "junk" <wink>.
     69 
     70     Example, comparing two strings, and considering blanks to be "junk":
     71 
     72     >>> s = SequenceMatcher(lambda x: x == " ",
     73     ...                     "private Thread currentThread;",
     74     ...                     "private volatile Thread currentThread;")
     75     >>>
     76 
     77     .ratio() returns a float in [0, 1], measuring the "similarity" of the
     78     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
     79     sequences are close matches:
     80 
     81     >>> print round(s.ratio(), 3)
     82     0.866
     83     >>>
     84 
     85     If you're only interested in where the sequences match,
     86     .get_matching_blocks() is handy:
     87 
     88     >>> for block in s.get_matching_blocks():
     89     ...     print "a[%d] and b[%d] match for %d elements" % block
     90     a[0] and b[0] match for 8 elements
     91     a[8] and b[17] match for 21 elements
     92     a[29] and b[38] match for 0 elements
     93 
     94     Note that the last tuple returned by .get_matching_blocks() is always a
     95     dummy, (len(a), len(b), 0), and this is the only case in which the last
     96     tuple element (number of elements matched) is 0.
     97 
     98     If you want to know how to change the first sequence into the second,
     99     use .get_opcodes():
    100 
    101     >>> for opcode in s.get_opcodes():
    102     ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
    103      equal a[0:8] b[0:8]
    104     insert a[8:8] b[8:17]
    105      equal a[8:29] b[17:38]
    106 
    107     See the Differ class for a fancy human-friendly file differencer, which
    108     uses SequenceMatcher both to compare sequences of lines, and to compare
    109     sequences of characters within similar (near-matching) lines.
    110 
    111     See also function get_close_matches() in this module, which shows how
    112     simple code building on SequenceMatcher can be used to do useful work.
    113 
    114     Timing:  Basic R-O is cubic time worst case and quadratic time expected
    115     case.  SequenceMatcher is quadratic time for the worst case and has
    116     expected-case behavior dependent in a complicated way on how many
    117     elements the sequences have in common; best case time is linear.
    118 
    119     Methods:
    120 
    121     __init__(isjunk=None, a='', b='')
    122         Construct a SequenceMatcher.
    123 
    124     set_seqs(a, b)
    125         Set the two sequences to be compared.
    126 
    127     set_seq1(a)
    128         Set the first sequence to be compared.
    129 
    130     set_seq2(b)
    131         Set the second sequence to be compared.
    132 
    133     find_longest_match(alo, ahi, blo, bhi)
    134         Find longest matching block in a[alo:ahi] and b[blo:bhi].
    135 
    136     get_matching_blocks()
    137         Return list of triples describing matching subsequences.
    138 
    139     get_opcodes()
    140         Return list of 5-tuples describing how to turn a into b.
    141 
    142     ratio()
    143         Return a measure of the sequences' similarity (float in [0,1]).
    144 
    145     quick_ratio()
    146         Return an upper bound on .ratio() relatively quickly.
    147 
    148     real_quick_ratio()
    149         Return an upper bound on ratio() very quickly.
    150     """
    151 
    152     def __init__(self, isjunk=None, a='', b='', autojunk=True):
    153         """Construct a SequenceMatcher.
    154 
    155         Optional arg isjunk is None (the default), or a one-argument
    156         function that takes a sequence element and returns true iff the
    157         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
    158         no elements are considered to be junk.  For example, pass
    159             lambda x: x in " \\t"
    160         if you're comparing lines as sequences of characters, and don't
    161         want to synch up on blanks or hard tabs.
    162 
    163         Optional arg a is the first of two sequences to be compared.  By
    164         default, an empty string.  The elements of a must be hashable.  See
    165         also .set_seqs() and .set_seq1().
    166 
    167         Optional arg b is the second of two sequences to be compared.  By
    168         default, an empty string.  The elements of b must be hashable. See
    169         also .set_seqs() and .set_seq2().
    170 
    171         Optional arg autojunk should be set to False to disable the
    172         "automatic junk heuristic" that treats popular elements as junk
    173         (see module documentation for more information).
    174         """
    175 
    176         # Members:
    177         # a
    178         #      first sequence
    179         # b
    180         #      second sequence; differences are computed as "what do
    181         #      we need to do to 'a' to change it into 'b'?"
    182         # b2j
    183         #      for x in b, b2j[x] is a list of the indices (into b)
    184         #      at which x appears; junk elements do not appear
    185         # fullbcount
    186         #      for x in b, fullbcount[x] == the number of times x
    187         #      appears in b; only materialized if really needed (used
    188         #      only for computing quick_ratio())
    189         # matching_blocks
    190         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
    191         #      ascending & non-overlapping in i and in j; terminated by
    192         #      a dummy (len(a), len(b), 0) sentinel
    193         # opcodes
    194         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
    195         #      one of
    196         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
    197         #          'delete'    a[i1:i2] should be deleted
    198         #          'insert'    b[j1:j2] should be inserted
    199         #          'equal'     a[i1:i2] == b[j1:j2]
    200         # isjunk
    201         #      a user-supplied function taking a sequence element and
    202         #      returning true iff the element is "junk" -- this has
    203         #      subtle but helpful effects on the algorithm, which I'll
    204         #      get around to writing up someday <0.9 wink>.
    205         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
    206         # isbjunk
    207         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
    208         #      it's really the __contains__ method of a hidden dict.
    209         #      DOES NOT WORK for x in a!
    210         # isbpopular
    211         #      for x in b, isbpopular(x) is true iff b is reasonably long
    212         #      (at least 200 elements) and x accounts for more than 1 + 1% of
    213         #      its elements (when autojunk is enabled).
    214         #      DOES NOT WORK for x in a!
    215 
    216         self.isjunk = isjunk
    217         self.a = self.b = None
    218         self.autojunk = autojunk
    219         self.set_seqs(a, b)
    220 
    221     def set_seqs(self, a, b):
    222         """Set the two sequences to be compared.
    223 
    224         >>> s = SequenceMatcher()
    225         >>> s.set_seqs("abcd", "bcde")
    226         >>> s.ratio()
    227         0.75
    228         """
    229 
    230         self.set_seq1(a)
    231         self.set_seq2(b)
    232 
    233     def set_seq1(self, a):
    234         """Set the first sequence to be compared.
    235 
    236         The second sequence to be compared is not changed.
    237 
    238         >>> s = SequenceMatcher(None, "abcd", "bcde")
    239         >>> s.ratio()
    240         0.75
    241         >>> s.set_seq1("bcde")
    242         >>> s.ratio()
    243         1.0
    244         >>>
    245 
    246         SequenceMatcher computes and caches detailed information about the
    247         second sequence, so if you want to compare one sequence S against
    248         many sequences, use .set_seq2(S) once and call .set_seq1(x)
    249         repeatedly for each of the other sequences.
    250 
    251         See also set_seqs() and set_seq2().
    252         """
    253 
    254         if a is self.a:
    255             return
    256         self.a = a
    257         self.matching_blocks = self.opcodes = None
    258 
    259     def set_seq2(self, b):
    260         """Set the second sequence to be compared.
    261 
    262         The first sequence to be compared is not changed.
    263 
    264         >>> s = SequenceMatcher(None, "abcd", "bcde")
    265         >>> s.ratio()
    266         0.75
    267         >>> s.set_seq2("abcd")
    268         >>> s.ratio()
    269         1.0
    270         >>>
    271 
    272         SequenceMatcher computes and caches detailed information about the
    273         second sequence, so if you want to compare one sequence S against
    274         many sequences, use .set_seq2(S) once and call .set_seq1(x)
    275         repeatedly for each of the other sequences.
    276 
    277         See also set_seqs() and set_seq1().
    278         """
    279 
    280         if b is self.b:
    281             return
    282         self.b = b
    283         self.matching_blocks = self.opcodes = None
    284         self.fullbcount = None
    285         self.__chain_b()
    286 
    287     # For each element x in b, set b2j[x] to a list of the indices in
    288     # b where x appears; the indices are in increasing order; note that
    289     # the number of times x appears in b is len(b2j[x]) ...
    290     # when self.isjunk is defined, junk elements don't show up in this
    291     # map at all, which stops the central find_longest_match method
    292     # from starting any matching block at a junk element ...
    293     # also creates the fast isbjunk function ...
    294     # b2j also does not contain entries for "popular" elements, meaning
    295     # elements that account for more than 1 + 1% of the total elements, and
    296     # when the sequence is reasonably large (>= 200 elements); this can
    297     # be viewed as an adaptive notion of semi-junk, and yields an enormous
    298     # speedup when, e.g., comparing program files with hundreds of
    299     # instances of "return NULL;" ...
    300     # note that this is only called when b changes; so for cross-product
    301     # kinds of matches, it's best to call set_seq2 once, then set_seq1
    302     # repeatedly
    303 
    304     def __chain_b(self):
    305         # Because isjunk is a user-defined (not C) function, and we test
    306         # for junk a LOT, it's important to minimize the number of calls.
    307         # Before the tricks described here, __chain_b was by far the most
    308         # time-consuming routine in the whole module!  If anyone sees
    309         # Jim Roskind, thank him again for profile.py -- I never would
    310         # have guessed that.
    311         # The first trick is to build b2j ignoring the possibility
    312         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
    313         # out the junk later is much cheaper than building b2j "right"
    314         # from the start.
    315         b = self.b
    316         self.b2j = b2j = {}
    317 
    318         for i, elt in enumerate(b):
    319             indices = b2j.setdefault(elt, [])
    320             indices.append(i)
    321 
    322         # Purge junk elements
    323         junk = set()
    324         isjunk = self.isjunk
    325         if isjunk:
    326             for elt in list(b2j.keys()):  # using list() since b2j is modified
    327                 if isjunk(elt):
    328                     junk.add(elt)
    329                     del b2j[elt]
    330 
    331         # Purge popular elements that are not junk
    332         popular = set()
    333         n = len(b)
    334         if self.autojunk and n >= 200:
    335             ntest = n // 100 + 1
    336             for elt, idxs in list(b2j.items()):
    337                 if len(idxs) > ntest:
    338                     popular.add(elt)
    339                     del b2j[elt]
    340 
    341         # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
    342         # Sicne the number of *unique* junk elements is probably small, the
    343         # memory burden of keeping this set alive is likely trivial compared to
    344         # the size of b2j.
    345         self.isbjunk = junk.__contains__
    346         self.isbpopular = popular.__contains__
    347 
    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=0, b=4, size=5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=1, b=0, size=4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        Match(a=0, b=0, size=0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        # Bind instance attributes to locals once; all four are read in
        # the hot loops below.
        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        # (a dynamic program over match run-lengths; quadratic time in
        # the worst case -- see the class docstring's Timing note)
        j2len = {}
        # Shared empty list used as the b2j.get() default so a miss does
        # not allocate a fresh list on every lookup.
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            # (hoist the bound dict.get out of the inner loop)
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                # Run length ending at (i, j) extends the run ending at
                # (i-1, j-1) by one (or starts a new run of length 1).
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return Match(besti, bestj, bestsize)
    457 
    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.  New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'.  IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box.  So, now we maintain a list
        # (`queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now.  Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent.  Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        non_adjacent.append( (la, lb, 0) )
        # Materialize a real list before caching.  Caching the bare
        # map() object is safe only where map() returns a list (Python
        # 2); under Python 3 it is a one-shot iterator, so a second call
        # to this method would hand back an already-exhausted result.
        # Under Python 2 this is merely a cheap list copy.
        self.matching_blocks = list(map(Match._make, non_adjacent))
        return self.matching_blocks
    528 
    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple has the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0; every later tuple's i1 equals the preceding
        tuple's i2, and likewise j1 equals the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        self.opcodes = answer = []
        i = j = 0
        for ai, bj, size in self.get_matching_blocks():
            # Invariant: the opcodes emitted so far turn a[:i] into
            # b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  First describe the
            # (possibly empty) mismatched gap a[i:ai] vs b[j:bj], then
            # the matching run, then advance past the match.
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            else:
                tag = ''
            if tag:
                answer.append((tag, i, ai, j, bj))
            i, j = ai + size, bj + size
            # The block list ends with a (len(a), len(b), 0) sentinel;
            # the size check keeps it from emitting an empty 'equal'.
            if size:
                answer.append(('equal', ai, i, bj, j))
        return answer
    583 
    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with up to n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = map(str, range(1,40))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            # Two empty sequences: fabricate one no-op opcode so the
            # loop below still yields nothing (the single-'equal'-group
            # case is filtered out at the end).
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        # NOTE(review): get_opcodes() returns the cached self.opcodes
        # list, and the assignments below rewrite its first/last entries
        # in place -- apparently benign here, but confirm before relying
        # on that cache elsewhere.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                # Close the group with up to n trailing context lines...
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                # ...and open the next with up to n leading ones.
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1 ,j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group
    633 
    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        # Total matched elements = sum of the size field (last item) of
        # every (i, j, size) triple.  A generator with sum() replaces
        # the previous reduce()-plus-lambda form: same result, clearer,
        # and without a Python function call per triple.
        matches = sum(triple[-1] for triple in self.get_matching_blocks())
        return _calculate_ratio(matches, len(self.a) + len(self.b))
    659 
    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # Viewing a and b as multisets, the number of matches can be no
        # larger than the cardinality of their intersection; that count
        # ignores element order, so it is clearly an upper bound.
        if self.fullbcount is None:
            # Build (and cache on self) a histogram of the elements of b.
            counts = {}
            for elt in self.b:
                counts[elt] = counts.get(elt, 0) + 1
            self.fullbcount = counts
        fullbcount = self.fullbcount
        # remaining[x] is the number of occurrences of x in b not yet
        # claimed by elements of a seen so far.
        remaining = {}
        matches = 0
        for elt in self.a:
            if elt in remaining:
                left = remaining[elt]
            else:
                left = fullbcount.get(elt, 0)
            remaining[elt] = left - 1
            if left > 0:
                matches += 1
        length = len(self.a) + len(self.b)
        if length:
            return 2.0 * matches / length
        return 1.0
    688 
    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        total = la + lb
        if not total:
            # Both sequences empty: treated as identical.
            return 1.0
        # Even a perfect matching cannot pair more elements than the
        # shorter sequence contains.
        return 2.0 * min(la, lb) / total
    700 
    701 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    702     """Use SequenceMatcher to return list of the best "good enough" matches.
    703 
    704     word is a sequence for which close matches are desired (typically a
    705     string).
    706 
    707     possibilities is a list of sequences against which to match word
    708     (typically a list of strings).
    709 
    710     Optional arg n (default 3) is the maximum number of close matches to
    711     return.  n must be > 0.
    712 
    713     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    714     that don't score at least that similar to word are ignored.
    715 
    716     The best (no more than n) matches among the possibilities are returned
    717     in a list, sorted by similarity score, most similar first.
    718 
    719     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    720     ['apple', 'ape']
    721     >>> import keyword as _keyword
    722     >>> get_close_matches("wheel", _keyword.kwlist)
    723     ['while']
    724     >>> get_close_matches("apple", _keyword.kwlist)
    725     []
    726     >>> get_close_matches("accept", _keyword.kwlist)
    727     ['except']
    728     """
    729 
    730     if not n >  0:
    731         raise ValueError("n must be > 0: %r" % (n,))
    732     if not 0.0 <= cutoff <= 1.0:
    733         raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    734     result = []
    735     s = SequenceMatcher()
    736     s.set_seq2(word)
    737     for x in possibilities:
    738         s.set_seq1(x)
    739         if s.real_quick_ratio() >= cutoff and \
    740            s.quick_ratio() >= cutoff and \
    741            s.ratio() >= cutoff:
    742             result.append((s.ratio(), x))
    743 
    744     # Move the best scorers to head of list
    745     result = heapq.nlargest(n, result)
    746     # Strip scores for the best n matches
    747     return [x for score, x in result]
    748 
    749 def _count_leading(line, ch):
    750     """
    751     Return number of `ch` characters at the start of `line`.
    752 
    753     Example:
    754 
    755     >>> _count_leading('   abc', ' ')
    756     3
    757     """
    758 
    759     i, n = 0, len(line)
    760     while i < n and line[i] == ch:
    761         i += 1
    762     return i
    763 
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Stored as-is; passed to SequenceMatcher (linejunk) and used for
        # intraline comparisons (charjunk) in compare()/_fancy_replace().
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level comparison drives the delta; each opcode range is
        # rendered by a helper generator and its lines re-yielded here.
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                # Replaced ranges get intraline analysis for '?' guide lines.
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        # Each line of x[lo:hi] is prefixed with the two-letter code
        # built from `tag` ('-', '+' or ' ') plus a space.
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """Dump a replaced range as plain '-' and '+' runs, with no
        intraline ('?') markup; the shorter block is emitted first."""
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        # (it starts just below cutoff, so a pair must strictly beat it
        # to be remembered as a candidate synch point)
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """Diff the (possibly empty) leftover ranges on either side of a
        synch point: recurse into _fancy_replace when both ranges are
        non-empty, otherwise dump whichever side has lines."""
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        # `common` counts leading tabs shared by both lines (and unmarked
        # in both tag strings); those are re-emitted literally on the '?'
        # lines so the guide characters stay visually aligned.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        common = min(common, _count_leading(btags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
   1086 
   1087 # With respect to junk, an earlier version of ndiff simply refused to
   1088 # *start* a match with a junk element.  The result was cases like this:
   1089 #     before: private Thread currentThread;
   1090 #     after:  private volatile Thread currentThread;
   1091 # If you consider whitespace to be junk, the longest contiguous match
   1092 # not starting with junk is "e Thread currentThread".  So ndiff reported
   1093 # that "e volatil" was inserted between the 't' and the 'e' in "private".
   1094 # While an accurate view, to people that's absurd.  The current version
   1095 # looks for matching blocks that are entirely junk-free, then extends the
   1096 # longest one of those as far as possible but only with matching junk.
   1097 # So now "currentThread" is matched, then extended to suck up the
   1098 # preceding blank; then "private" is matched, and extended to suck up the
   1099 # following blank; then "Thread" is matched; and finally ndiff reports
   1100 # that "volatile " was inserted before "Thread".  The only quibble
   1101 # remaining is that perhaps it was really the case that " volatile"
   1102 # was inserted after "private".  I can live with that <wink>.
   1103 
   1104 import re
   1105 
   1106 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
   1107     r"""
   1108     Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
   1109 
   1110     Examples:
   1111 
   1112     >>> IS_LINE_JUNK('\n')
   1113     True
   1114     >>> IS_LINE_JUNK('  #   \n')
   1115     True
   1116     >>> IS_LINE_JUNK('hello\n')
   1117     False
   1118     """
   1119 
   1120     return pat(line) is not None
   1121 
   1122 def IS_CHARACTER_JUNK(ch, ws=" \t"):
   1123     r"""
   1124     Return 1 for ignorable character: iff `ch` is a space or tab.
   1125 
   1126     Examples:
   1127 
   1128     >>> IS_CHARACTER_JUNK(' ')
   1129     True
   1130     >>> IS_CHARACTER_JUNK('\t')
   1131     True
   1132     >>> IS_CHARACTER_JUNK('\n')
   1133     False
   1134     >>> IS_CHARACTER_JUNK('x')
   1135     False
   1136     """
   1137 
   1138     return ch in ws
   1139 
   1140 
   1141 ########################################################################
   1142 ###  Unified Diff
   1143 ########################################################################
   1144 
   1145 def _format_range_unified(start, stop):
   1146     'Convert range to the "ed" format'
   1147     # Per the diff spec at http://www.unix.org/single_unix_specification/
   1148     beginning = start + 1     # lines start numbering with one
   1149     length = stop - start
   1150     if length == 1:
   1151         return '{}'.format(beginning)
   1152     if not length:
   1153         beginning -= 1        # empty ranges begin at line just before the range
   1154     return '{},{}'.format(beginning, length)
   1155 
   1156 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
   1157                  tofiledate='', n=3, lineterm='\n'):
   1158     r"""
   1159     Compare two sequences of lines; generate the delta as a unified diff.
   1160 
   1161     Unified diffs are a compact way of showing line changes and a few
   1162     lines of context.  The number of context lines is set by 'n' which
   1163     defaults to three.
   1164 
   1165     By default, the diff control lines (those with ---, +++, or @@) are
   1166     created with a trailing newline.  This is helpful so that inputs
   1167     created from file.readlines() result in diffs that are suitable for
   1168     file.writelines() since both the inputs and outputs have trailing
   1169     newlines.
   1170 
   1171     For inputs that do not have trailing newlines, set the lineterm
   1172     argument to "" so that the output will be uniformly newline free.
   1173 
   1174     The unidiff format normally has a header for filenames and modification
   1175     times.  Any or all of these may be specified using strings for
   1176     'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
   1177     The modification times are normally expressed in the ISO 8601 format.
   1178 
   1179     Example:
   1180 
   1181     >>> for line in unified_diff('one two three four'.split(),
   1182     ...             'zero one tree four'.split(), 'Original', 'Current',
   1183     ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
   1184     ...             lineterm=''):
   1185     ...     print line                  # doctest: +NORMALIZE_WHITESPACE
   1186     --- Original        2005-01-26 23:30:50
   1187     +++ Current         2010-04-02 10:20:52
   1188     @@ -1,4 +1,4 @@
   1189     +zero
   1190      one
   1191     -two
   1192     -three
   1193     +tree
   1194      four
   1195     """
   1196 
   1197     started = False
   1198     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
   1199         if not started:
   1200             started = True
   1201             fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
   1202             todate = '\t{}'.format(tofiledate) if tofiledate else ''
   1203             yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
   1204             yield '+++ {}{}{}'.format(tofile, todate, lineterm)
   1205 
   1206         first, last = group[0], group[-1]
   1207         file1_range = _format_range_unified(first[1], last[2])
   1208         file2_range = _format_range_unified(first[3], last[4])
   1209         yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
   1210 
   1211         for tag, i1, i2, j1, j2 in group:
   1212             if tag == 'equal':
   1213                 for line in a[i1:i2]:
   1214                     yield ' ' + line
   1215                 continue
   1216             if tag in ('replace', 'delete'):
   1217                 for line in a[i1:i2]:
   1218                     yield '-' + line
   1219             if tag in ('replace', 'insert'):
   1220                 for line in b[j1:j2]:
   1221                     yield '+' + line
   1222 
   1223 
   1224 ########################################################################
   1225 ###  Context Diff
   1226 ########################################################################
   1227 
   1228 def _format_range_context(start, stop):
   1229     'Convert range to the "ed" format'
   1230     # Per the diff spec at http://www.unix.org/single_unix_specification/
   1231     beginning = start + 1     # lines start numbering with one
   1232     length = stop - start
   1233     if not length:
   1234         beginning -= 1        # empty ranges begin at line just before the range
   1235     if length <= 1:
   1236         return '{}'.format(beginning)
   1237     return '{},{}'.format(beginning, beginning + length - 1)
   1238 
   1239 # See http://www.unix.org/single_unix_specification/
   1240 def context_diff(a, b, fromfile='', tofile='',
   1241                  fromfiledate='', tofiledate='', n=3, lineterm='\n'):
   1242     r"""
   1243     Compare two sequences of lines; generate the delta as a context diff.
   1244 
   1245     Context diffs are a compact way of showing line changes and a few
   1246     lines of context.  The number of context lines is set by 'n' which
   1247     defaults to three.
   1248 
   1249     By default, the diff control lines (those with *** or ---) are
   1250     created with a trailing newline.  This is helpful so that inputs
   1251     created from file.readlines() result in diffs that are suitable for
   1252     file.writelines() since both the inputs and outputs have trailing
   1253     newlines.
   1254 
   1255     For inputs that do not have trailing newlines, set the lineterm
   1256     argument to "" so that the output will be uniformly newline free.
   1257 
   1258     The context diff format normally has a header for filenames and
   1259     modification times.  Any or all of these may be specified using
   1260     strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
   1261     The modification times are normally expressed in the ISO 8601 format.
   1262     If not specified, the strings default to blanks.
   1263 
   1264     Example:
   1265 
   1266     >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
   1267     ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
   1268     *** Original
   1269     --- Current
   1270     ***************
   1271     *** 1,4 ****
   1272       one
   1273     ! two
   1274     ! three
   1275       four
   1276     --- 1,4 ----
   1277     + zero
   1278       one
   1279     ! tree
   1280       four
   1281     """
   1282 
   1283     prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
   1284     started = False
   1285     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
   1286         if not started:
   1287             started = True
   1288             fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
   1289             todate = '\t{}'.format(tofiledate) if tofiledate else ''
   1290             yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
   1291             yield '--- {}{}{}'.format(tofile, todate, lineterm)
   1292 
   1293         first, last = group[0], group[-1]
   1294         yield '***************' + lineterm
   1295 
   1296         file1_range = _format_range_context(first[1], last[2])
   1297         yield '*** {} ****{}'.format(file1_range, lineterm)
   1298 
   1299         if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group):
   1300             for tag, i1, i2, _, _ in group:
   1301                 if tag != 'insert':
   1302                     for line in a[i1:i2]:
   1303                         yield prefix[tag] + line
   1304 
   1305         file2_range = _format_range_context(first[3], last[4])
   1306         yield '--- {} ----{}'.format(file2_range, lineterm)
   1307 
   1308         if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group):
   1309             for tag, _, _, j1, j2 in group:
   1310                 if tag != 'delete':
   1311                     for line in b[j1:j2]:
   1312                         yield prefix[tag] + line
   1313 
   1314 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
   1315     r"""
   1316     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
   1317 
   1318     Optional keyword parameters `linejunk` and `charjunk` are for filter
   1319     functions (or None):
   1320 
   1321     - linejunk: A function that should accept a single string argument, and
   1322       return true iff the string is junk.  The default is None, and is
   1323       recommended; as of Python 2.3, an adaptive notion of "noise" lines is
   1324       used that does a good job on its own.
   1325 
   1326     - charjunk: A function that should accept a string of length 1. The
   1327       default is module-level function IS_CHARACTER_JUNK, which filters out
   1328       whitespace characters (a blank or tab; note: bad idea to include newline
   1329       in this!).
   1330 
   1331     Tools/scripts/ndiff.py is a command-line front-end to this function.
   1332 
   1333     Example:
   1334 
   1335     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
   1336     ...              'ore\ntree\nemu\n'.splitlines(1))
   1337     >>> print ''.join(diff),
   1338     - one
   1339     ?  ^
   1340     + ore
   1341     ?  ^
   1342     - two
   1343     - three
   1344     ?  -
   1345     + tree
   1346     + emu
   1347     """
   1348     return Differ(linejunk, charjunk).compare(a, b)
   1349 
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    import re

    # regular expression for finding intraline change indices
    # NOTE(review): not a raw string -- '\+', '\-', '\^' only survive because
    # Python 2 leaves unrecognized escape sequences intact; a raw string
    # r'(\++|\-+|\^+)' would be safer and is what later versions use.
    change_re = re.compile('(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- indice into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        # The mutable default argument is deliberate (see docstring): it keeps
        # the per-side line counters alive across calls.
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        if format_key is None:
            # [2:] strips the two-character ndiff prefix ('  ', '- ', '+ ').
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.
            # Iterate in reverse so earlier insertions don't shift the
            # still-unprocessed (begin,end) indices.
            for key,(begin,end) in sub_info[::-1]:
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with what the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            while len(lines) < 4:
                try:
                    # NOTE(review): .next() is the Python 2 iterator protocol.
                    lines.append(diff_lines_iterator.next())
                except StopIteration:
                    # 'X' is a sentinel -- no real ndiff line starts with it.
                    lines.append('X')
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith(('--?+', '--+', '- ')):
                # in delete block and see an intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(('+ ', '+-')):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (a copy is passed to the first call so the second call pops
                # the same physical line from the real list)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                # NOTE(review): Python 2 idiom for ending a generator; under
                # PEP 479 (Python 3.7+) this would have to be a bare return.
                raise StopIteration
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching pair from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                from_line, to_line, found_diff =line_iterator.next()
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        while True:
            yield line_pair_iterator.next()
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                from_line, to_line, found_diff = line_pair_iterator.next()
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator.
            if index > context:
                # The circular buffer wrapped, so some lines were discarded:
                # emit a (None, None, None) separator row first.
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            while(lines_to_write):
                from_line, to_line, found_diff = line_pair_iterator.next()
                # If another change within the context, extend the context
                if found_diff:
                    lines_to_write = context-1
                else:
                    lines_to_write -= 1
                yield from_line, to_line, found_diff
   1612 
   1613 
# Skeleton of the complete XHTML 1.0 document emitted by HtmlDiff.make_file().
# Filled in via %-interpolation with the keys 'styles', 'table' and 'legend'.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS rules for the classes referenced by the generated table
# (header, next-change navigation column, add/change/delete highlights).
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton of the side-by-side diff table produced by HtmlDiff.make_table().
# Interpolated with 'prefix' (unique anchor prefix), 'header_row' and
# 'data_rows'.  The six <colgroup> elements match the six columns per row:
# next-anchor, line number, text -- once for each of the two sides.
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table explaining the highlight colors and the f/n/t
# navigation links; appended to the file output by make_file().
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
   1669 
   1670 class HtmlDiff(object):
   1671     """For producing HTML side by side comparison with change highlights.
   1672 
   1673     This class can be used to create an HTML table (or a complete HTML file
   1674     containing the table) showing a side by side, line by line comparison
   1675     of text with inter-line and intra-line change highlights.  The table can
   1676     be generated in either full or contextual difference mode.
   1677 
   1678     The following methods are provided for HTML generation:
   1679 
   1680     make_table -- generates HTML for a single side by side table
   1681     make_file -- generates complete HTML file with a single side by side table
   1682 
   1683     See tools/scripts/diff.py for an example usage of this class.
   1684     """
   1685 
   1686     _file_template = _file_template
   1687     _styles = _styles
   1688     _table_template = _table_template
   1689     _legend = _legend
   1690     _default_prefix = 0
   1691 
   1692     def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
   1693                  charjunk=IS_CHARACTER_JUNK):
   1694         """HtmlDiff instance initializer
   1695 
   1696         Arguments:
   1697         tabsize -- tab stop spacing, defaults to 8.
   1698         wrapcolumn -- column number where lines are broken and wrapped,
   1699             defaults to None where lines are not wrapped.
   1700         linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
   1701             HtmlDiff() to generate the side by side HTML differences).  See
   1702             ndiff() documentation for argument default values and descriptions.
   1703         """
   1704         self._tabsize = tabsize
   1705         self._wrapcolumn = wrapcolumn
   1706         self._linejunk = linejunk
   1707         self._charjunk = charjunk
   1708 
   1709     def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
   1710                   numlines=5):
   1711         """Returns HTML file of side by side comparison with change highlights
   1712 
   1713         Arguments:
   1714         fromlines -- list of "from" lines
   1715         tolines -- list of "to" lines
   1716         fromdesc -- "from" file column header string
   1717         todesc -- "to" file column header string
   1718         context -- set to True for contextual differences (defaults to False
   1719             which shows full differences).
   1720         numlines -- number of context lines.  When context is set True,
   1721             controls number of lines displayed before and after the change.
   1722             When context is False, controls the number of lines to place
   1723             the "next" link anchors before the next change (so click of
   1724             "next" link jumps to just before the change).
   1725         """
   1726 
   1727         return self._file_template % dict(
   1728             styles = self._styles,
   1729             legend = self._legend,
   1730             table = self.make_table(fromlines,tolines,fromdesc,todesc,
   1731                                     context=context,numlines=numlines))
   1732 
   1733     def _tab_newline_replace(self,fromlines,tolines):
   1734         """Returns from/to line lists with tabs expanded and newlines removed.
   1735 
   1736         Instead of tab characters being replaced by the number of spaces
   1737         needed to fill in to the next tab stop, this function will fill
   1738         the space with tab characters.  This is done so that the difference
   1739         algorithms can identify changes in a file when tabs are replaced by
   1740         spaces and vice versa.  At the end of the HTML generation, the tab
   1741         characters will be replaced with a nonbreakable space.
   1742         """
   1743         def expand_tabs(line):
   1744             # hide real spaces
   1745             line = line.replace(' ','\0')
   1746             # expand tabs into spaces
   1747             line = line.expandtabs(self._tabsize)
   1748             # replace spaces from expanded tabs back into tab characters
   1749             # (we'll replace them with markup after we do differencing)
   1750             line = line.replace(' ','\t')
   1751             return line.replace('\0',' ').rstrip('\n')
   1752         fromlines = [expand_tabs(line) for line in fromlines]
   1753         tolines = [expand_tabs(line) for line in tolines]
   1754         return fromlines,tolines
   1755 
   1756     def _split_line(self,data_list,line_num,text):
   1757         """Builds list of text lines by splitting text lines at wrap point
   1758 
   1759         This function will determine if the input text line needs to be
   1760         wrapped (split) into separate lines.  If so, the first wrap point
   1761         will be determined and the first line appended to the output
   1762         text line list.  This function is used recursively to handle
   1763         the second part of the split line to further split it.
   1764         """
   1765         # if blank line or context separator, just add it to the output list
   1766         if not line_num:
   1767             data_list.append((line_num,text))
   1768             return
   1769 
   1770         # if line text doesn't need wrapping, just add it to the output list
   1771         size = len(text)
   1772         max = self._wrapcolumn
   1773         if (size <= max) or ((size -(text.count('\0')*3)) <= max):
   1774             data_list.append((line_num,text))
   1775             return
   1776 
   1777         # scan text looking for the wrap point, keeping track if the wrap
   1778         # point is inside markers
   1779         i = 0
   1780         n = 0
   1781         mark = ''
   1782         while n < max and i < size:
   1783             if text[i] == '\0':
   1784                 i += 1
   1785                 mark = text[i]
   1786                 i += 1
   1787             elif text[i] == '\1':
   1788                 i += 1
   1789                 mark = ''
   1790             else:
   1791                 i += 1
   1792                 n += 1
   1793 
   1794         # wrap point is inside text, break it up into separate lines
   1795         line1 = text[:i]
   1796         line2 = text[i:]
   1797 
   1798         # if wrap point is inside markers, place end marker at end of first
   1799         # line and start marker at beginning of second line because each
   1800         # line will have its own table tag markup around it.
   1801         if mark:
   1802             line1 = line1 + '\1'
   1803             line2 = '\0' + mark + line2
   1804 
   1805         # tack on first line onto the output list
   1806         data_list.append((line_num,line1))
   1807 
   1808         # use this routine again to wrap the remaining text
   1809         self._split_line(data_list,'>',line2)
   1810 
   1811     def _line_wrapper(self,diffs):
   1812         """Returns iterator that splits (wraps) mdiff text lines"""
   1813 
   1814         # pull from/to data and flags from mdiff iterator
   1815         for fromdata,todata,flag in diffs:
   1816             # check for context separators and pass them through
   1817             if flag is None:
   1818                 yield fromdata,todata,flag
   1819                 continue
   1820             (fromline,fromtext),(toline,totext) = fromdata,todata
   1821             # for each from/to line split it at the wrap column to form
   1822             # list of text lines.
   1823             fromlist,tolist = [],[]
   1824             self._split_line(fromlist,fromline,fromtext)
   1825             self._split_line(tolist,toline,totext)
   1826             # yield from/to line in pairs inserting blank lines as
   1827             # necessary when one side has more wrapped lines
   1828             while fromlist or tolist:
   1829                 if fromlist:
   1830                     fromdata = fromlist.pop(0)
   1831                 else:
   1832                     fromdata = ('',' ')
   1833                 if tolist:
   1834                     todata = tolist.pop(0)
   1835                 else:
   1836                     todata = ('',' ')
   1837                 yield fromdata,todata,flag
   1838 
   1839     def _collect_lines(self,diffs):
   1840         """Collects mdiff output into separate lists
   1841 
   1842         Before storing the mdiff from/to data into a list, it is converted
   1843         into a single line of text with HTML markup.
   1844         """
   1845 
   1846         fromlist,tolist,flaglist = [],[],[]
   1847         # pull from/to data and flags from mdiff style iterator
   1848         for fromdata,todata,flag in diffs:
   1849             try:
   1850                 # store HTML markup of the lines into the lists
   1851                 fromlist.append(self._format_line(0,flag,*fromdata))
   1852                 tolist.append(self._format_line(1,flag,*todata))
   1853             except TypeError:
   1854                 # exceptions occur for lines where context separators go
   1855                 fromlist.append(None)
   1856                 tolist.append(None)
   1857             flaglist.append(flag)
   1858         return fromlist,tolist,flaglist
   1859 
   1860     def _format_line(self,side,flag,linenum,text):
   1861         """Returns HTML markup of "from" / "to" text lines
   1862 
   1863         side -- 0 or 1 indicating "from" or "to" text
   1864         flag -- indicates if difference on line
   1865         linenum -- line number (used for line number column)
   1866         text -- line text to be marked up
   1867         """
   1868         try:
   1869             linenum = '%d' % linenum
   1870             id = ' id="%s%s"' % (self._prefix[side],linenum)
   1871         except TypeError:
   1872             # handle blank lines where linenum is '>' or ''
   1873             id = ''
   1874         # replace those things that would get confused with HTML symbols
   1875         text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
   1876 
   1877         # make space non-breakable so they don't get compressed or line wrapped
   1878         text = text.replace(' ','&nbsp;').rstrip()
   1879 
   1880         return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
   1881                % (id,linenum,text)
   1882 
   1883     def _make_prefix(self):
   1884         """Create unique anchor prefixes"""
   1885 
   1886         # Generate a unique anchor prefix so multiple tables
   1887         # can exist on the same HTML page without conflicts.
   1888         fromprefix = "from%d_" % HtmlDiff._default_prefix
   1889         toprefix = "to%d_" % HtmlDiff._default_prefix
   1890         HtmlDiff._default_prefix += 1
   1891         # store prefixes so line format method has access
   1892         self._prefix = [fromprefix,toprefix]
   1893 
   1894     def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
   1895         """Makes list of "next" links"""
   1896 
   1897         # all anchor names will be generated using the unique "to" prefix
   1898         toprefix = self._prefix[1]
   1899 
   1900         # process change flags, generating middle column of next anchors/links
   1901         next_id = ['']*len(flaglist)
   1902         next_href = ['']*len(flaglist)
   1903         num_chg, in_change = 0, False
   1904         last = 0
   1905         for i,flag in enumerate(flaglist):
   1906             if flag:
   1907                 if not in_change:
   1908                     in_change = True
   1909                     last = i
   1910                     # at the beginning of a change, drop an anchor a few lines
   1911                     # (the context lines) before the change for the previous
   1912                     # link
   1913                     i = max([0,i-numlines])
   1914                     next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
   1915                     # at the beginning of a change, drop a link to the next
   1916                     # change
   1917                     num_chg += 1
   1918                     next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
   1919                          toprefix,num_chg)
   1920             else:
   1921                 in_change = False
   1922         # check for cases where there is no content to avoid exceptions
   1923         if not flaglist:
   1924             flaglist = [False]
   1925             next_id = ['']
   1926             next_href = ['']
   1927             last = 0
   1928             if context:
   1929                 fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
   1930                 tolist = fromlist
   1931             else:
   1932                 fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
   1933         # if not a change on first line, drop a link
   1934         if not flaglist[0]:
   1935             next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
   1936         # redo the last link to link to the top
   1937         next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
   1938 
   1939         return fromlist,tolist,flaglist,next_href,next_id
   1940 
   1941     def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
   1942                    numlines=5):
   1943         """Returns HTML table of side by side comparison with change highlights
   1944 
   1945         Arguments:
   1946         fromlines -- list of "from" lines
   1947         tolines -- list of "to" lines
   1948         fromdesc -- "from" file column header string
   1949         todesc -- "to" file column header string
   1950         context -- set to True for contextual differences (defaults to False
   1951             which shows full differences).
   1952         numlines -- number of context lines.  When context is set True,
   1953             controls number of lines displayed before and after the change.
   1954             When context is False, controls the number of lines to place
   1955             the "next" link anchors before the next change (so click of
   1956             "next" link jumps to just before the change).
   1957         """
   1958 
   1959         # make unique anchor prefixes so that multiple tables may exist
   1960         # on the same page without conflict.
   1961         self._make_prefix()
   1962 
   1963         # change tabs to spaces before it gets more difficult after we insert
   1964         # markup
   1965         fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
   1966 
   1967         # create diffs iterator which generates side by side from/to data
   1968         if context:
   1969             context_lines = numlines
   1970         else:
   1971             context_lines = None
   1972         diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
   1973                       charjunk=self._charjunk)
   1974 
   1975         # set up iterator to wrap lines that exceed desired width
   1976         if self._wrapcolumn:
   1977             diffs = self._line_wrapper(diffs)
   1978 
   1979         # collect up from/to lines and flags into lists (also format the lines)
   1980         fromlist,tolist,flaglist = self._collect_lines(diffs)
   1981 
   1982         # process change flags, generating middle column of next anchors/links
   1983         fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
   1984             fromlist,tolist,flaglist,context,numlines)
   1985 
   1986         s = []
   1987         fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
   1988               '<td class="diff_next">%s</td>%s</tr>\n'
   1989         for i in range(len(flaglist)):
   1990             if flaglist[i] is None:
   1991                 # mdiff yields None on separator lines skip the bogus ones
   1992                 # generated for the first line
   1993                 if i > 0:
   1994                     s.append('        </tbody>        \n        <tbody>\n')
   1995             else:
   1996                 s.append( fmt % (next_id[i],next_href[i],fromlist[i],
   1997                                            next_href[i],tolist[i]))
   1998         if fromdesc or todesc:
   1999             header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
   2000                 '<th class="diff_next"><br /></th>',
   2001                 '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
   2002                 '<th class="diff_next"><br /></th>',
   2003                 '<th colspan="2" class="diff_header">%s</th>' % todesc)
   2004         else:
   2005             header_row = ''
   2006 
   2007         table = self._table_template % dict(
   2008             data_rows=''.join(s),
   2009             header_row=header_row,
   2010             prefix=self._prefix[1])
   2011 
   2012         return table.replace('\0+','<span class="diff_add">'). \
   2013                      replace('\0-','<span class="diff_sub">'). \
   2014                      replace('\0^','<span class="diff_chg">'). \
   2015                      replace('\1','</span>'). \
   2016                      replace('\t','&nbsp;')
   2017 
# Remove "re" from the module namespace: it is presumably imported earlier
# in the file for internal use only (it is not listed in __all__), and this
# keeps it from leaking to "from difflib import *" users.  TODO confirm the
# module-level "import re" against the top of the file.
del re
   2019 
   2020 def restore(delta, which):
   2021     r"""
   2022     Generate one of the two sequences that generated a delta.
   2023 
   2024     Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
   2025     lines originating from file 1 or 2 (parameter `which`), stripping off line
   2026     prefixes.
   2027 
   2028     Examples:
   2029 
   2030     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
   2031     ...              'ore\ntree\nemu\n'.splitlines(1))
   2032     >>> diff = list(diff)
   2033     >>> print ''.join(restore(diff, 1)),
   2034     one
   2035     two
   2036     three
   2037     >>> print ''.join(restore(diff, 2)),
   2038     ore
   2039     tree
   2040     emu
   2041     """
   2042     try:
   2043         tag = {1: "- ", 2: "+ "}[int(which)]
   2044     except KeyError:
   2045         raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
   2046                            % which)
   2047     prefixes = ("  ", tag)
   2048     for line in delta:
   2049         if line[:2] in prefixes:
   2050             yield line[2:]
   2051 
   2052 def _test():
   2053     import doctest, difflib
   2054     return doctest.testmod(difflib)
   2055 
   2056 if __name__ == "__main__":
   2057     _test()
   2058