# [code-listing site navigation residue] Home | History | Annotate | Download | only in Lib
      1 """
      2 Module difflib -- helpers for computing deltas between objects.
      3 
      4 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
      5     Use SequenceMatcher to return list of the best "good enough" matches.
      6 
      7 Function context_diff(a, b):
      8     For two lists of strings, return a delta in context diff format.
      9 
     10 Function ndiff(a, b):
     11     Return a delta: the difference between `a` and `b` (lists of strings).
     12 
     13 Function restore(delta, which):
     14     Return one of the two sequences that generated an ndiff delta.
     15 
     16 Function unified_diff(a, b):
     17     For two lists of strings, return a delta in unified diff format.
     18 
     19 Class SequenceMatcher:
     20     A flexible class for comparing pairs of sequences of any type.
     21 
     22 Class Differ:
     23     For producing human-readable deltas from sequences of lines of text.
     24 
     25 Class HtmlDiff:
     26     For producing HTML side by side comparison with change highlights.
     27 """
     28 
     29 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
     30            'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
     31            'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
     32 
     33 from heapq import nlargest as _nlargest
     34 from collections import namedtuple as _namedtuple
     35 
# Result triple shared by SequenceMatcher.find_longest_match() and
# .get_matching_blocks(): Match(a=i, b=j, size=k) means seq_a[i:i+k] == seq_b[j:j+k].
Match = _namedtuple('Match', 'a b size')
     37 
     38 def _calculate_ratio(matches, length):
     39     if length:
     40         return 2.0 * matches / length
     41     return 1.0
     42 
     43 class SequenceMatcher:
     44 
     45     """
     46     SequenceMatcher is a flexible class for comparing pairs of sequences of
     47     any type, so long as the sequence elements are hashable.  The basic
     48     algorithm predates, and is a little fancier than, an algorithm
     49     published in the late 1980's by Ratcliff and Obershelp under the
     50     hyperbolic name "gestalt pattern matching".  The basic idea is to find
     51     the longest contiguous matching subsequence that contains no "junk"
     52     elements (R-O doesn't address junk).  The same idea is then applied
     53     recursively to the pieces of the sequences to the left and to the right
     54     of the matching subsequence.  This does not yield minimal edit
     55     sequences, but does tend to yield matches that "look right" to people.
     56 
     57     SequenceMatcher tries to compute a "human-friendly diff" between two
     58     sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
     59     longest *contiguous* & junk-free matching subsequence.  That's what
     60     catches peoples' eyes.  The Windows(tm) windiff has another interesting
     61     notion, pairing up elements that appear uniquely in each sequence.
     62     That, and the method here, appear to yield more intuitive difference
     63     reports than does diff.  This method appears to be the least vulnerable
     64     to synching up on blocks of "junk lines", though (like blank lines in
     65     ordinary text files, or maybe "<P>" lines in HTML files).  That may be
     66     because this is the only method of the 3 that has a *concept* of
     67     "junk" <wink>.
     68 
     69     Example, comparing two strings, and considering blanks to be "junk":
     70 
     71     >>> s = SequenceMatcher(lambda x: x == " ",
     72     ...                     "private Thread currentThread;",
     73     ...                     "private volatile Thread currentThread;")
     74     >>>
     75 
     76     .ratio() returns a float in [0, 1], measuring the "similarity" of the
     77     sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
     78     sequences are close matches:
     79 
     80     >>> print(round(s.ratio(), 3))
     81     0.866
     82     >>>
     83 
     84     If you're only interested in where the sequences match,
     85     .get_matching_blocks() is handy:
     86 
     87     >>> for block in s.get_matching_blocks():
     88     ...     print("a[%d] and b[%d] match for %d elements" % block)
     89     a[0] and b[0] match for 8 elements
     90     a[8] and b[17] match for 21 elements
     91     a[29] and b[38] match for 0 elements
     92 
     93     Note that the last tuple returned by .get_matching_blocks() is always a
     94     dummy, (len(a), len(b), 0), and this is the only case in which the last
     95     tuple element (number of elements matched) is 0.
     96 
     97     If you want to know how to change the first sequence into the second,
     98     use .get_opcodes():
     99 
    100     >>> for opcode in s.get_opcodes():
    101     ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
    102      equal a[0:8] b[0:8]
    103     insert a[8:8] b[8:17]
    104      equal a[8:29] b[17:38]
    105 
    106     See the Differ class for a fancy human-friendly file differencer, which
    107     uses SequenceMatcher both to compare sequences of lines, and to compare
    108     sequences of characters within similar (near-matching) lines.
    109 
    110     See also function get_close_matches() in this module, which shows how
    111     simple code building on SequenceMatcher can be used to do useful work.
    112 
    113     Timing:  Basic R-O is cubic time worst case and quadratic time expected
    114     case.  SequenceMatcher is quadratic time for the worst case and has
    115     expected-case behavior dependent in a complicated way on how many
    116     elements the sequences have in common; best case time is linear.
    117 
    118     Methods:
    119 
    120     __init__(isjunk=None, a='', b='')
    121         Construct a SequenceMatcher.
    122 
    123     set_seqs(a, b)
    124         Set the two sequences to be compared.
    125 
    126     set_seq1(a)
    127         Set the first sequence to be compared.
    128 
    129     set_seq2(b)
    130         Set the second sequence to be compared.
    131 
    132     find_longest_match(alo, ahi, blo, bhi)
    133         Find longest matching block in a[alo:ahi] and b[blo:bhi].
    134 
    135     get_matching_blocks()
    136         Return list of triples describing matching subsequences.
    137 
    138     get_opcodes()
    139         Return list of 5-tuples describing how to turn a into b.
    140 
    141     ratio()
    142         Return a measure of the sequences' similarity (float in [0,1]).
    143 
    144     quick_ratio()
    145         Return an upper bound on .ratio() relatively quickly.
    146 
    147     real_quick_ratio()
    148         Return an upper bound on ratio() very quickly.
    149     """
    150 
    151     def __init__(self, isjunk=None, a='', b='', autojunk=True):
    152         """Construct a SequenceMatcher.
    153 
    154         Optional arg isjunk is None (the default), or a one-argument
    155         function that takes a sequence element and returns true iff the
    156         element is junk.  None is equivalent to passing "lambda x: 0", i.e.
    157         no elements are considered to be junk.  For example, pass
    158             lambda x: x in " \\t"
    159         if you're comparing lines as sequences of characters, and don't
    160         want to synch up on blanks or hard tabs.
    161 
    162         Optional arg a is the first of two sequences to be compared.  By
    163         default, an empty string.  The elements of a must be hashable.  See
    164         also .set_seqs() and .set_seq1().
    165 
    166         Optional arg b is the second of two sequences to be compared.  By
    167         default, an empty string.  The elements of b must be hashable. See
    168         also .set_seqs() and .set_seq2().
    169 
    170         Optional arg autojunk should be set to False to disable the
    171         "automatic junk heuristic" that treats popular elements as junk
    172         (see module documentation for more information).
    173         """
    174 
    175         # Members:
    176         # a
    177         #      first sequence
    178         # b
    179         #      second sequence; differences are computed as "what do
    180         #      we need to do to 'a' to change it into 'b'?"
    181         # b2j
    182         #      for x in b, b2j[x] is a list of the indices (into b)
    183         #      at which x appears; junk and popular elements do not appear
    184         # fullbcount
    185         #      for x in b, fullbcount[x] == the number of times x
    186         #      appears in b; only materialized if really needed (used
    187         #      only for computing quick_ratio())
    188         # matching_blocks
    189         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
    190         #      ascending & non-overlapping in i and in j; terminated by
    191         #      a dummy (len(a), len(b), 0) sentinel
    192         # opcodes
    193         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
    194         #      one of
    195         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
    196         #          'delete'    a[i1:i2] should be deleted
    197         #          'insert'    b[j1:j2] should be inserted
    198         #          'equal'     a[i1:i2] == b[j1:j2]
    199         # isjunk
    200         #      a user-supplied function taking a sequence element and
    201         #      returning true iff the element is "junk" -- this has
    202         #      subtle but helpful effects on the algorithm, which I'll
    203         #      get around to writing up someday <0.9 wink>.
    204         #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
    205         # bjunk
    206         #      the items in b for which isjunk is True.
    207         # bpopular
    208         #      nonjunk items in b treated as junk by the heuristic (if used).
    209 
    210         self.isjunk = isjunk
    211         self.a = self.b = None
    212         self.autojunk = autojunk
    213         self.set_seqs(a, b)
    214 
    215     def set_seqs(self, a, b):
    216         """Set the two sequences to be compared.
    217 
    218         >>> s = SequenceMatcher()
    219         >>> s.set_seqs("abcd", "bcde")
    220         >>> s.ratio()
    221         0.75
    222         """
    223 
    224         self.set_seq1(a)
    225         self.set_seq2(b)
    226 
    227     def set_seq1(self, a):
    228         """Set the first sequence to be compared.
    229 
    230         The second sequence to be compared is not changed.
    231 
    232         >>> s = SequenceMatcher(None, "abcd", "bcde")
    233         >>> s.ratio()
    234         0.75
    235         >>> s.set_seq1("bcde")
    236         >>> s.ratio()
    237         1.0
    238         >>>
    239 
    240         SequenceMatcher computes and caches detailed information about the
    241         second sequence, so if you want to compare one sequence S against
    242         many sequences, use .set_seq2(S) once and call .set_seq1(x)
    243         repeatedly for each of the other sequences.
    244 
    245         See also set_seqs() and set_seq2().
    246         """
    247 
    248         if a is self.a:
    249             return
    250         self.a = a
    251         self.matching_blocks = self.opcodes = None
    252 
    253     def set_seq2(self, b):
    254         """Set the second sequence to be compared.
    255 
    256         The first sequence to be compared is not changed.
    257 
    258         >>> s = SequenceMatcher(None, "abcd", "bcde")
    259         >>> s.ratio()
    260         0.75
    261         >>> s.set_seq2("abcd")
    262         >>> s.ratio()
    263         1.0
    264         >>>
    265 
    266         SequenceMatcher computes and caches detailed information about the
    267         second sequence, so if you want to compare one sequence S against
    268         many sequences, use .set_seq2(S) once and call .set_seq1(x)
    269         repeatedly for each of the other sequences.
    270 
    271         See also set_seqs() and set_seq1().
    272         """
    273 
    274         if b is self.b:
    275             return
    276         self.b = b
    277         self.matching_blocks = self.opcodes = None
    278         self.fullbcount = None
    279         self.__chain_b()
    280 
    281     # For each element x in b, set b2j[x] to a list of the indices in
    282     # b where x appears; the indices are in increasing order; note that
    283     # the number of times x appears in b is len(b2j[x]) ...
    284     # when self.isjunk is defined, junk elements don't show up in this
    285     # map at all, which stops the central find_longest_match method
    286     # from starting any matching block at a junk element ...
    287     # b2j also does not contain entries for "popular" elements, meaning
    288     # elements that account for more than 1 + 1% of the total elements, and
    289     # when the sequence is reasonably large (>= 200 elements); this can
    290     # be viewed as an adaptive notion of semi-junk, and yields an enormous
    291     # speedup when, e.g., comparing program files with hundreds of
    292     # instances of "return NULL;" ...
    293     # note that this is only called when b changes; so for cross-product
    294     # kinds of matches, it's best to call set_seq2 once, then set_seq1
    295     # repeatedly
    296 
    297     def __chain_b(self):
    298         # Because isjunk is a user-defined (not C) function, and we test
    299         # for junk a LOT, it's important to minimize the number of calls.
    300         # Before the tricks described here, __chain_b was by far the most
    301         # time-consuming routine in the whole module!  If anyone sees
    302         # Jim Roskind, thank him again for profile.py -- I never would
    303         # have guessed that.
    304         # The first trick is to build b2j ignoring the possibility
    305         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
    306         # out the junk later is much cheaper than building b2j "right"
    307         # from the start.
    308         b = self.b
    309         self.b2j = b2j = {}
    310 
    311         for i, elt in enumerate(b):
    312             indices = b2j.setdefault(elt, [])
    313             indices.append(i)
    314 
    315         # Purge junk elements
    316         self.bjunk = junk = set()
    317         isjunk = self.isjunk
    318         if isjunk:
    319             for elt in b2j.keys():
    320                 if isjunk(elt):
    321                     junk.add(elt)
    322             for elt in junk: # separate loop avoids separate list of keys
    323                 del b2j[elt]
    324 
    325         # Purge popular elements that are not junk
    326         self.bpopular = popular = set()
    327         n = len(b)
    328         if self.autojunk and n >= 200:
    329             ntest = n // 100 + 1
    330             for elt, idxs in b2j.items():
    331                 if len(idxs) > ntest:
    332                     popular.add(elt)
    333             for elt in popular: # ditto; as fast for 1% deletion
    334                 del b2j[elt]
    335 
    336     def find_longest_match(self, alo, ahi, blo, bhi):
    337         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
    338 
    339         If isjunk is not defined:
    340 
    341         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
    342             alo <= i <= i+k <= ahi
    343             blo <= j <= j+k <= bhi
    344         and for all (i',j',k') meeting those conditions,
    345             k >= k'
    346             i <= i'
    347             and if i == i', j <= j'
    348 
    349         In other words, of all maximal matching blocks, return one that
    350         starts earliest in a, and of all those maximal matching blocks that
    351         start earliest in a, return the one that starts earliest in b.
    352 
    353         >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
    354         >>> s.find_longest_match(0, 5, 0, 9)
    355         Match(a=0, b=4, size=5)
    356 
    357         If isjunk is defined, first the longest matching block is
    358         determined as above, but with the additional restriction that no
    359         junk element appears in the block.  Then that block is extended as
    360         far as possible by matching (only) junk elements on both sides.  So
    361         the resulting block never matches on junk except as identical junk
    362         happens to be adjacent to an "interesting" match.
    363 
    364         Here's the same example as before, but considering blanks to be
    365         junk.  That prevents " abcd" from matching the " abcd" at the tail
    366         end of the second sequence directly.  Instead only the "abcd" can
    367         match, and matches the leftmost "abcd" in the second sequence:
    368 
    369         >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
    370         >>> s.find_longest_match(0, 5, 0, 9)
    371         Match(a=1, b=0, size=4)
    372 
    373         If no blocks match, return (alo, blo, 0).
    374 
    375         >>> s = SequenceMatcher(None, "ab", "c")
    376         >>> s.find_longest_match(0, 2, 0, 1)
    377         Match(a=0, b=0, size=0)
    378         """
    379 
    380         # CAUTION:  stripping common prefix or suffix would be incorrect.
    381         # E.g.,
    382         #    ab
    383         #    acab
    384         # Longest matching block is "ab", but if common prefix is
    385         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
    386         # strip, so ends up claiming that ab is changed to acab by
    387         # inserting "ca" in the middle.  That's minimal but unintuitive:
    388         # "it's obvious" that someone inserted "ac" at the front.
    389         # Windiff ends up at the same place as diff, but by pairing up
    390         # the unique 'b's and then matching the first two 'a's.
    391 
    392         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
    393         besti, bestj, bestsize = alo, blo, 0
    394         # find longest junk-free match
    395         # during an iteration of the loop, j2len[j] = length of longest
    396         # junk-free match ending with a[i-1] and b[j]
    397         j2len = {}
    398         nothing = []
    399         for i in range(alo, ahi):
    400             # look at all instances of a[i] in b; note that because
    401             # b2j has no junk keys, the loop is skipped if a[i] is junk
    402             j2lenget = j2len.get
    403             newj2len = {}
    404             for j in b2j.get(a[i], nothing):
    405                 # a[i] matches b[j]
    406                 if j < blo:
    407                     continue
    408                 if j >= bhi:
    409                     break
    410                 k = newj2len[j] = j2lenget(j-1, 0) + 1
    411                 if k > bestsize:
    412                     besti, bestj, bestsize = i-k+1, j-k+1, k
    413             j2len = newj2len
    414 
    415         # Extend the best by non-junk elements on each end.  In particular,
    416         # "popular" non-junk elements aren't in b2j, which greatly speeds
    417         # the inner loop above, but also means "the best" match so far
    418         # doesn't contain any junk *or* popular non-junk elements.
    419         while besti > alo and bestj > blo and \
    420               not isbjunk(b[bestj-1]) and \
    421               a[besti-1] == b[bestj-1]:
    422             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
    423         while besti+bestsize < ahi and bestj+bestsize < bhi and \
    424               not isbjunk(b[bestj+bestsize]) and \
    425               a[besti+bestsize] == b[bestj+bestsize]:
    426             bestsize += 1
    427 
    428         # Now that we have a wholly interesting match (albeit possibly
    429         # empty!), we may as well suck up the matching junk on each
    430         # side of it too.  Can't think of a good reason not to, and it
    431         # saves post-processing the (possibly considerable) expense of
    432         # figuring out what to do with it.  In the case of an empty
    433         # interesting match, this is clearly the right thing to do,
    434         # because no other kind of match is possible in the regions.
    435         while besti > alo and bestj > blo and \
    436               isbjunk(b[bestj-1]) and \
    437               a[besti-1] == b[bestj-1]:
    438             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
    439         while besti+bestsize < ahi and bestj+bestsize < bhi and \
    440               isbjunk(b[bestj+bestsize]) and \
    441               a[besti+bestsize] == b[bestj+bestsize]:
    442             bestsize = bestsize + 1
    443 
    444         return Match(besti, bestj, bestsize)
    445 
    446     def get_matching_blocks(self):
    447         """Return list of triples describing matching subsequences.
    448 
    449         Each triple is of the form (i, j, n), and means that
    450         a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
    451         i and in j.  New in Python 2.5, it's also guaranteed that if
    452         (i, j, n) and (i', j', n') are adjacent triples in the list, and
    453         the second is not the last triple in the list, then i+n != i' or
    454         j+n != j'.  IOW, adjacent triples never describe adjacent equal
    455         blocks.
    456 
    457         The last triple is a dummy, (len(a), len(b), 0), and is the only
    458         triple with n==0.
    459 
    460         >>> s = SequenceMatcher(None, "abxcd", "abcd")
    461         >>> list(s.get_matching_blocks())
    462         [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
    463         """
    464 
    465         if self.matching_blocks is not None:
    466             return self.matching_blocks
    467         la, lb = len(self.a), len(self.b)
    468 
    469         # This is most naturally expressed as a recursive algorithm, but
    470         # at least one user bumped into extreme use cases that exceeded
    471         # the recursion limit on their box.  So, now we maintain a list
    472         # ('queue`) of blocks we still need to look at, and append partial
    473         # results to `matching_blocks` in a loop; the matches are sorted
    474         # at the end.
    475         queue = [(0, la, 0, lb)]
    476         matching_blocks = []
    477         while queue:
    478             alo, ahi, blo, bhi = queue.pop()
    479             i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
    480             # a[alo:i] vs b[blo:j] unknown
    481             # a[i:i+k] same as b[j:j+k]
    482             # a[i+k:ahi] vs b[j+k:bhi] unknown
    483             if k:   # if k is 0, there was no matching block
    484                 matching_blocks.append(x)
    485                 if alo < i and blo < j:
    486                     queue.append((alo, i, blo, j))
    487                 if i+k < ahi and j+k < bhi:
    488                     queue.append((i+k, ahi, j+k, bhi))
    489         matching_blocks.sort()
    490 
    491         # It's possible that we have adjacent equal blocks in the
    492         # matching_blocks list now.  Starting with 2.5, this code was added
    493         # to collapse them.
    494         i1 = j1 = k1 = 0
    495         non_adjacent = []
    496         for i2, j2, k2 in matching_blocks:
    497             # Is this block adjacent to i1, j1, k1?
    498             if i1 + k1 == i2 and j1 + k1 == j2:
    499                 # Yes, so collapse them -- this just increases the length of
    500                 # the first block by the length of the second, and the first
    501                 # block so lengthened remains the block to compare against.
    502                 k1 += k2
    503             else:
    504                 # Not adjacent.  Remember the first block (k1==0 means it's
    505                 # the dummy we started with), and make the second block the
    506                 # new block to compare against.
    507                 if k1:
    508                     non_adjacent.append((i1, j1, k1))
    509                 i1, j1, k1 = i2, j2, k2
    510         if k1:
    511             non_adjacent.append((i1, j1, k1))
    512 
    513         non_adjacent.append( (la, lb, 0) )
    514         self.matching_blocks = list(map(Match._make, non_adjacent))
    515         return self.matching_blocks
    516 
    517     def get_opcodes(self):
    518         """Return list of 5-tuples describing how to turn a into b.
    519 
    520         Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
    521         has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
    522         tuple preceding it, and likewise for j1 == the previous j2.
    523 
    524         The tags are strings, with these meanings:
    525 
    526         'replace':  a[i1:i2] should be replaced by b[j1:j2]
    527         'delete':   a[i1:i2] should be deleted.
    528                     Note that j1==j2 in this case.
    529         'insert':   b[j1:j2] should be inserted at a[i1:i1].
    530                     Note that i1==i2 in this case.
    531         'equal':    a[i1:i2] == b[j1:j2]
    532 
    533         >>> a = "qabxcd"
    534         >>> b = "abycdf"
    535         >>> s = SequenceMatcher(None, a, b)
    536         >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
    537         ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    538         ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
    539          delete a[0:1] (q) b[0:0] ()
    540           equal a[1:3] (ab) b[0:2] (ab)
    541         replace a[3:4] (x) b[2:3] (y)
    542           equal a[4:6] (cd) b[3:5] (cd)
    543          insert a[6:6] () b[5:6] (f)
    544         """
    545 
    546         if self.opcodes is not None:
    547             return self.opcodes
    548         i = j = 0
    549         self.opcodes = answer = []
    550         for ai, bj, size in self.get_matching_blocks():
    551             # invariant:  we've pumped out correct diffs to change
    552             # a[:i] into b[:j], and the next matching block is
    553             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
    554             # out a diff to change a[i:ai] into b[j:bj], pump out
    555             # the matching block, and move (i,j) beyond the match
    556             tag = ''
    557             if i < ai and j < bj:
    558                 tag = 'replace'
    559             elif i < ai:
    560                 tag = 'delete'
    561             elif j < bj:
    562                 tag = 'insert'
    563             if tag:
    564                 answer.append( (tag, i, ai, j, bj) )
    565             i, j = ai+size, bj+size
    566             # the list of matching blocks is terminated by a
    567             # sentinel with size 0
    568             if size:
    569                 answer.append( ('equal', ai, i, bj, j) )
    570         return answer
    571 
    572     def get_grouped_opcodes(self, n=3):
    573         """ Isolate change clusters by eliminating ranges with no changes.
    574 
    575         Return a generator of groups with up to n lines of context.
    576         Each group is in the same format as returned by get_opcodes().
    577 
    578         >>> from pprint import pprint
    579         >>> a = list(map(str, range(1,40)))
    580         >>> b = a[:]
    581         >>> b[8:8] = ['i']     # Make an insertion
    582         >>> b[20] += 'x'       # Make a replacement
    583         >>> b[23:28] = []      # Make a deletion
    584         >>> b[30] += 'y'       # Make another replacement
    585         >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
    586         [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
    587          [('equal', 16, 19, 17, 20),
    588           ('replace', 19, 20, 20, 21),
    589           ('equal', 20, 22, 21, 23),
    590           ('delete', 22, 27, 23, 23),
    591           ('equal', 27, 30, 23, 26)],
    592          [('equal', 31, 34, 27, 30),
    593           ('replace', 34, 35, 30, 31),
    594           ('equal', 35, 38, 31, 34)]]
    595         """
    596 
    597         codes = self.get_opcodes()
    598         if not codes:
    599             codes = [("equal", 0, 1, 0, 1)]
    600         # Fixup leading and trailing groups if they show no changes.
    601         if codes[0][0] == 'equal':
    602             tag, i1, i2, j1, j2 = codes[0]
    603             codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
    604         if codes[-1][0] == 'equal':
    605             tag, i1, i2, j1, j2 = codes[-1]
    606             codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
    607 
    608         nn = n + n
    609         group = []
    610         for tag, i1, i2, j1, j2 in codes:
    611             # End the current group and start a new one whenever
    612             # there is a large range with no changes.
    613             if tag == 'equal' and i2-i1 > nn:
    614                 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
    615                 yield group
    616                 group = []
    617                 i1, j1 = max(i1, i2-n), max(j1, j2-n)
    618             group.append((tag, i1, i2, j1 ,j2))
    619         if group and not (len(group)==1 and group[0][0] == 'equal'):
    620             yield group
    621 
    622     def ratio(self):
    623         """Return a measure of the sequences' similarity (float in [0,1]).
    624 
    625         Where T is the total number of elements in both sequences, and
    626         M is the number of matches, this is 2.0*M / T.
    627         Note that this is 1 if the sequences are identical, and 0 if
    628         they have nothing in common.
    629 
    630         .ratio() is expensive to compute if you haven't already computed
    631         .get_matching_blocks() or .get_opcodes(), in which case you may
    632         want to try .quick_ratio() or .real_quick_ratio() first to get an
    633         upper bound.
    634 
    635         >>> s = SequenceMatcher(None, "abcd", "bcde")
    636         >>> s.ratio()
    637         0.75
    638         >>> s.quick_ratio()
    639         0.75
    640         >>> s.real_quick_ratio()
    641         1.0
    642         """
    643 
    644         matches = sum(triple[-1] for triple in self.get_matching_blocks())
    645         return _calculate_ratio(matches, len(self.a) + len(self.b))
    646 
    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # Treat a and b as multisets and count the cardinality of their
        # intersection.  Order is ignored, so this is clearly an upper
        # bound on the number of in-order matches.
        if self.fullbcount is None:
            # Lazily build and cache the element counts for b.
            counts = {}
            for element in self.b:
                counts[element] = counts.get(element, 0) + 1
            self.fullbcount = counts
        b_counts = self.fullbcount
        # remaining[x] is the number of occurrences of x in b not yet
        # consumed by elements of a ... roughly.
        remaining = {}
        matched = 0
        for element in self.a:
            if element in remaining:
                left = remaining[element]
            else:
                left = b_counts.get(element, 0)
            remaining[element] = left - 1
            if left > 0:
                matched += 1
        return _calculate_ratio(matched, len(self.a) + len(self.b))
    675 
    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        len_a = len(self.a)
        len_b = len(self.b)
        # There can never be more matches than elements in the shorter
        # sequence.
        return _calculate_ratio(min(len_a, len_b), len_a + len_b)
    687 
    688 def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    689     """Use SequenceMatcher to return list of the best "good enough" matches.
    690 
    691     word is a sequence for which close matches are desired (typically a
    692     string).
    693 
    694     possibilities is a list of sequences against which to match word
    695     (typically a list of strings).
    696 
    697     Optional arg n (default 3) is the maximum number of close matches to
    698     return.  n must be > 0.
    699 
    700     Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    701     that don't score at least that similar to word are ignored.
    702 
    703     The best (no more than n) matches among the possibilities are returned
    704     in a list, sorted by similarity score, most similar first.
    705 
    706     >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    707     ['apple', 'ape']
    708     >>> import keyword as _keyword
    709     >>> get_close_matches("wheel", _keyword.kwlist)
    710     ['while']
    711     >>> get_close_matches("Apple", _keyword.kwlist)
    712     []
    713     >>> get_close_matches("accept", _keyword.kwlist)
    714     ['except']
    715     """
    716 
    717     if not n >  0:
    718         raise ValueError("n must be > 0: %r" % (n,))
    719     if not 0.0 <= cutoff <= 1.0:
    720         raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    721     result = []
    722     s = SequenceMatcher()
    723     s.set_seq2(word)
    724     for x in possibilities:
    725         s.set_seq1(x)
    726         if s.real_quick_ratio() >= cutoff and \
    727            s.quick_ratio() >= cutoff and \
    728            s.ratio() >= cutoff:
    729             result.append((s.ratio(), x))
    730 
    731     # Move the best scorers to head of list
    732     result = _nlargest(n, result)
    733     # Strip scores for the best n matches
    734     return [x for score, x in result]
    735 
    736 def _count_leading(line, ch):
    737     """
    738     Return number of `ch` characters at the start of `line`.
    739 
    740     Example:
    741 
    742     >>> _count_leading('   abc', ' ')
    743     3
    744     """
    745 
    746     i, n = 0, len(line)
    747     while i < n and line[i] == ch:
    748         i += 1
    749     return i
    750 
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(keepends=True)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(keepends=True)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print(''.join(result), end="")
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; the underlying SequenceMatcher class has
          an adaptive notion of "noise" lines that's better than any static
          definition the author has ever been able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Both filters are simply stored; they are forwarded to the
        # SequenceMatcher instances created in compare()/_fancy_replace().
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
        ...                                'ore\ntree\nemu\n'.splitlines(True))),
        ...       end="")
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level comparison drives the delta; only 'replace' runs get
        # the expensive intraline treatment in _fancy_replace().
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))

            yield from g

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        # Emits "<tag> <line>" for every line in x[lo:hi].
        for i in range(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """Emit a replace block as plain '-'/'+' runs, with no intraline marks."""
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            yield from g

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print(''.join(results), end="")
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        # (best_ratio starts just below cutoff so a pair scoring exactly
        # cutoff still qualifies, since comparisons below use strict '>')
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in range(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError('unknown tag %r' % (tag,))
            yield from self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """Dispatch a gap between synch points to _fancy_replace or _dump."""
        # An empty generator (the initial []) is used when both ranges are
        # empty, so the trailing 'yield from' always has something to drain.
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        yield from g

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print(repr(line))
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        # Count leading tabs shared by both lines (and not flagged by
        # either tag string), then re-emit that many tabs in the '?' guide
        # lines so the markers stay visually aligned with the text.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        common = min(common, _count_leading(btags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
   1066 
   1067 # With respect to junk, an earlier version of ndiff simply refused to
   1068 # *start* a match with a junk element.  The result was cases like this:
   1069 #     before: private Thread currentThread;
   1070 #     after:  private volatile Thread currentThread;
   1071 # If you consider whitespace to be junk, the longest contiguous match
   1072 # not starting with junk is "e Thread currentThread".  So ndiff reported
   1073 # that "e volatil" was inserted between the 't' and the 'e' in "private".
   1074 # While an accurate view, to people that's absurd.  The current version
   1075 # looks for matching blocks that are entirely junk-free, then extends the
   1076 # longest one of those as far as possible but only with matching junk.
   1077 # So now "currentThread" is matched, then extended to suck up the
   1078 # preceding blank; then "private" is matched, and extended to suck up the
   1079 # following blank; then "Thread" is matched; and finally ndiff reports
   1080 # that "volatile " was inserted before "Thread".  The only quibble
   1081 # remaining is that perhaps it was really the case that " volatile"
   1082 # was inserted after "private".  I can live with that <wink>.
   1083 
   1084 import re
   1085 
   1086 def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
   1087     r"""
   1088     Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
   1089 
   1090     Examples:
   1091 
   1092     >>> IS_LINE_JUNK('\n')
   1093     True
   1094     >>> IS_LINE_JUNK('  #   \n')
   1095     True
   1096     >>> IS_LINE_JUNK('hello\n')
   1097     False
   1098     """
   1099 
   1100     return pat(line) is not None
   1101 
   1102 def IS_CHARACTER_JUNK(ch, ws=" \t"):
   1103     r"""
   1104     Return 1 for ignorable character: iff `ch` is a space or tab.
   1105 
   1106     Examples:
   1107 
   1108     >>> IS_CHARACTER_JUNK(' ')
   1109     True
   1110     >>> IS_CHARACTER_JUNK('\t')
   1111     True
   1112     >>> IS_CHARACTER_JUNK('\n')
   1113     False
   1114     >>> IS_CHARACTER_JUNK('x')
   1115     False
   1116     """
   1117 
   1118     return ch in ws
   1119 
   1120 
   1121 ########################################################################
   1122 ###  Unified Diff
   1123 ########################################################################
   1124 
   1125 def _format_range_unified(start, stop):
   1126     'Convert range to the "ed" format'
   1127     # Per the diff spec at http://www.unix.org/single_unix_specification/
   1128     beginning = start + 1     # lines start numbering with one
   1129     length = stop - start
   1130     if length == 1:
   1131         return '{}'.format(beginning)
   1132     if not length:
   1133         beginning -= 1        # empty ranges begin at line just before the range
   1134     return '{},{}'.format(beginning, length)
   1135 
   1136 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
   1137                  tofiledate='', n=3, lineterm='\n'):
   1138     r"""
   1139     Compare two sequences of lines; generate the delta as a unified diff.
   1140 
   1141     Unified diffs are a compact way of showing line changes and a few
   1142     lines of context.  The number of context lines is set by 'n' which
   1143     defaults to three.
   1144 
   1145     By default, the diff control lines (those with ---, +++, or @@) are
   1146     created with a trailing newline.  This is helpful so that inputs
   1147     created from file.readlines() result in diffs that are suitable for
   1148     file.writelines() since both the inputs and outputs have trailing
   1149     newlines.
   1150 
   1151     For inputs that do not have trailing newlines, set the lineterm
   1152     argument to "" so that the output will be uniformly newline free.
   1153 
   1154     The unidiff format normally has a header for filenames and modification
   1155     times.  Any or all of these may be specified using strings for
   1156     'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
   1157     The modification times are normally expressed in the ISO 8601 format.
   1158 
   1159     Example:
   1160 
   1161     >>> for line in unified_diff('one two three four'.split(),
   1162     ...             'zero one tree four'.split(), 'Original', 'Current',
   1163     ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
   1164     ...             lineterm=''):
   1165     ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
   1166     --- Original        2005-01-26 23:30:50
   1167     +++ Current         2010-04-02 10:20:52
   1168     @@ -1,4 +1,4 @@
   1169     +zero
   1170      one
   1171     -two
   1172     -three
   1173     +tree
   1174      four
   1175     """
   1176 
   1177     _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
   1178     started = False
   1179     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
   1180         if not started:
   1181             started = True
   1182             fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
   1183             todate = '\t{}'.format(tofiledate) if tofiledate else ''
   1184             yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
   1185             yield '+++ {}{}{}'.format(tofile, todate, lineterm)
   1186 
   1187         first, last = group[0], group[-1]
   1188         file1_range = _format_range_unified(first[1], last[2])
   1189         file2_range = _format_range_unified(first[3], last[4])
   1190         yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
   1191 
   1192         for tag, i1, i2, j1, j2 in group:
   1193             if tag == 'equal':
   1194                 for line in a[i1:i2]:
   1195                     yield ' ' + line
   1196                 continue
   1197             if tag in {'replace', 'delete'}:
   1198                 for line in a[i1:i2]:
   1199                     yield '-' + line
   1200             if tag in {'replace', 'insert'}:
   1201                 for line in b[j1:j2]:
   1202                     yield '+' + line
   1203 
   1204 
   1205 ########################################################################
   1206 ###  Context Diff
   1207 ########################################################################
   1208 
   1209 def _format_range_context(start, stop):
   1210     'Convert range to the "ed" format'
   1211     # Per the diff spec at http://www.unix.org/single_unix_specification/
   1212     beginning = start + 1     # lines start numbering with one
   1213     length = stop - start
   1214     if not length:
   1215         beginning -= 1        # empty ranges begin at line just before the range
   1216     if length <= 1:
   1217         return '{}'.format(beginning)
   1218     return '{},{}'.format(beginning, beginning + length - 1)
   1219 
   1220 # See http://www.unix.org/single_unix_specification/
   1221 def context_diff(a, b, fromfile='', tofile='',
   1222                  fromfiledate='', tofiledate='', n=3, lineterm='\n'):
   1223     r"""
   1224     Compare two sequences of lines; generate the delta as a context diff.
   1225 
   1226     Context diffs are a compact way of showing line changes and a few
   1227     lines of context.  The number of context lines is set by 'n' which
   1228     defaults to three.
   1229 
   1230     By default, the diff control lines (those with *** or ---) are
   1231     created with a trailing newline.  This is helpful so that inputs
   1232     created from file.readlines() result in diffs that are suitable for
   1233     file.writelines() since both the inputs and outputs have trailing
   1234     newlines.
   1235 
   1236     For inputs that do not have trailing newlines, set the lineterm
   1237     argument to "" so that the output will be uniformly newline free.
   1238 
   1239     The context diff format normally has a header for filenames and
   1240     modification times.  Any or all of these may be specified using
   1241     strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
   1242     The modification times are normally expressed in the ISO 8601 format.
   1243     If not specified, the strings default to blanks.
   1244 
   1245     Example:
   1246 
   1247     >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
   1248     ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
   1249     ...       end="")
   1250     *** Original
   1251     --- Current
   1252     ***************
   1253     *** 1,4 ****
   1254       one
   1255     ! two
   1256     ! three
   1257       four
   1258     --- 1,4 ----
   1259     + zero
   1260       one
   1261     ! tree
   1262       four
   1263     """
   1264 
   1265     _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
   1266     prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
   1267     started = False
   1268     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
   1269         if not started:
   1270             started = True
   1271             fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
   1272             todate = '\t{}'.format(tofiledate) if tofiledate else ''
   1273             yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
   1274             yield '--- {}{}{}'.format(tofile, todate, lineterm)
   1275 
   1276         first, last = group[0], group[-1]
   1277         yield '***************' + lineterm
   1278 
   1279         file1_range = _format_range_context(first[1], last[2])
   1280         yield '*** {} ****{}'.format(file1_range, lineterm)
   1281 
   1282         if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
   1283             for tag, i1, i2, _, _ in group:
   1284                 if tag != 'insert':
   1285                     for line in a[i1:i2]:
   1286                         yield prefix[tag] + line
   1287 
   1288         file2_range = _format_range_context(first[3], last[4])
   1289         yield '--- {} ----{}'.format(file2_range, lineterm)
   1290 
   1291         if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
   1292             for tag, _, _, j1, j2 in group:
   1293                 if tag != 'delete':
   1294                     for line in b[j1:j2]:
   1295                         yield prefix[tag] + line
   1296 
   1297 def _check_types(a, b, *args):
   1298     # Checking types is weird, but the alternative is garbled output when
   1299     # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
   1300     # without this check, passing filenames as bytes results in output like
   1301     #   --- b'oldfile.txt'
   1302     #   +++ b'newfile.txt'
   1303     # because of how str.format() incorporates bytes objects.
   1304     if a and not isinstance(a[0], str):
   1305         raise TypeError('lines to compare must be str, not %s (%r)' %
   1306                         (type(a[0]).__name__, a[0]))
   1307     if b and not isinstance(b[0], str):
   1308         raise TypeError('lines to compare must be str, not %s (%r)' %
   1309                         (type(b[0]).__name__, b[0]))
   1310     for arg in args:
   1311         if not isinstance(arg, str):
   1312             raise TypeError('all arguments must be str, not: %r' % (arg,))
   1313 
   1314 def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
   1315                fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
   1316     r"""
   1317     Compare `a` and `b`, two sequences of lines represented as bytes rather
   1318     than str. This is a wrapper for `dfunc`, which is typically either
   1319     unified_diff() or context_diff(). Inputs are losslessly converted to
   1320     strings so that `dfunc` only has to worry about strings, and encoded
   1321     back to bytes on return. This is necessary to compare files with
   1322     unknown or inconsistent encoding. All other inputs (except `n`) must be
   1323     bytes rather than str.
   1324     """
   1325     def decode(s):
   1326         try:
   1327             return s.decode('ascii', 'surrogateescape')
   1328         except AttributeError as err:
   1329             msg = ('all arguments must be bytes, not %s (%r)' %
   1330                    (type(s).__name__, s))
   1331             raise TypeError(msg) from err
   1332     a = list(map(decode, a))
   1333     b = list(map(decode, b))
   1334     fromfile = decode(fromfile)
   1335     tofile = decode(tofile)
   1336     fromfiledate = decode(fromfiledate)
   1337     tofiledate = decode(tofiledate)
   1338     lineterm = decode(lineterm)
   1339 
   1340     lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
   1341     for line in lines:
   1342         yield line.encode('ascii', 'surrogateescape')
   1343 
   1344 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
   1345     r"""
   1346     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
   1347 
   1348     Optional keyword parameters `linejunk` and `charjunk` are for filter
   1349     functions, or can be None:
   1350 
   1351     - linejunk: A function that should accept a single string argument and
   1352       return true iff the string is junk.  The default is None, and is
   1353       recommended; the underlying SequenceMatcher class has an adaptive
   1354       notion of "noise" lines.
   1355 
   1356     - charjunk: A function that accepts a character (string of length
   1357       1), and returns true iff the character is junk. The default is
   1358       the module-level function IS_CHARACTER_JUNK, which filters out
   1359       whitespace characters (a blank or tab; note: it's a bad idea to
   1360       include newline in this!).
   1361 
   1362     Tools/scripts/ndiff.py is a command-line front-end to this function.
   1363 
   1364     Example:
   1365 
   1366     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
   1367     ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
   1368     >>> print(''.join(diff), end="")
   1369     - one
   1370     ?  ^
   1371     + ore
   1372     ?  ^
   1373     - two
   1374     - three
   1375     ?  -
   1376     + tree
   1377     + emu
   1378     """
   1379     return Differ(linejunk, charjunk).compare(a, b)
   1380 
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    import re

    # regular expression for finding intraline change indices
    # (runs of '+', '-' or '^' on an ndiff '?' hint line)
    change_re = re.compile(r'(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- indice into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        # NOTE: the mutable default for num_lines is deliberate; because
        # _make_line is re-defined on every _mdiff() call, the counter is
        # fresh ([0,0]) per diff but persists across calls within one diff.
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.
            # Iterate in reverse so earlier (begin,end) indices stay valid
            # while the inserted marks grow the string.
            for key,(begin,end) in reversed(sub_info):
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with what the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            # 'X' is the exhaustion sentinel: no real ndiff line starts
            # with 'X', so s.startswith('X') means the diff has run out.
            while len(lines) < 4:
                lines.append(next(diff_lines_iterator, 'X'))
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith(('--?+', '--+', '- ')):
                # in delete block and see an intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(('+ ', '+-')):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (pass a copy to the first call so the shared line is still
                # in `lines` when the second call consumes it)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                return
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching pair from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                try:
                    from_line, to_line, found_diff = next(line_iterator)
                except StopIteration:
                    return
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        yield from line_pair_iterator
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                try:
                    from_line, to_line, found_diff = next(line_pair_iterator)
                except StopIteration:
                    return
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator.
            # (index > context means some lines were skipped, so a
            # (None, None, None) separator row is emitted first)
            if index > context:
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            while(lines_to_write):
                from_line, to_line, found_diff = next(line_pair_iterator)
                # If another change within the context, extend the context
                if found_diff:
                    lines_to_write = context-1
                else:
                    lines_to_write -= 1
                yield from_line, to_line, found_diff
   1645 
   1646 
# Template for the complete XHTML document emitted by HtmlDiff.make_file();
# filled in with %(charset)s, %(styles)s, %(table)s and %(legend)s.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=%(charset)s" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS rules placed inside the <style> element of _file_template.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Template for the side-by-side comparison table produced by
# HtmlDiff.make_table(); filled in with %(prefix)s, %(header_row)s and
# %(data_rows)s.
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static HTML legend explaining the highlight colors and the f/n/t
# navigation links; appended after the table by make_file().
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
   1702 
   1703 class HtmlDiff(object):
   1704     """For producing HTML side by side comparison with change highlights.
   1705 
   1706     This class can be used to create an HTML table (or a complete HTML file
   1707     containing the table) showing a side by side, line by line comparison
   1708     of text with inter-line and intra-line change highlights.  The table can
   1709     be generated in either full or contextual difference mode.
   1710 
   1711     The following methods are provided for HTML generation:
   1712 
   1713     make_table -- generates HTML for a single side by side table
   1714     make_file -- generates complete HTML file with a single side by side table
   1715 
   1716     See tools/scripts/diff.py for an example usage of this class.
   1717     """
   1718 
   1719     _file_template = _file_template
   1720     _styles = _styles
   1721     _table_template = _table_template
   1722     _legend = _legend
   1723     _default_prefix = 0
   1724 
   1725     def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
   1726                  charjunk=IS_CHARACTER_JUNK):
   1727         """HtmlDiff instance initializer
   1728 
   1729         Arguments:
   1730         tabsize -- tab stop spacing, defaults to 8.
   1731         wrapcolumn -- column number where lines are broken and wrapped,
   1732             defaults to None where lines are not wrapped.
   1733         linejunk,charjunk -- keyword arguments passed into ndiff() (used by
   1734             HtmlDiff() to generate the side by side HTML differences).  See
   1735             ndiff() documentation for argument default values and descriptions.
   1736         """
   1737         self._tabsize = tabsize
   1738         self._wrapcolumn = wrapcolumn
   1739         self._linejunk = linejunk
   1740         self._charjunk = charjunk
   1741 
   1742     def make_file(self, fromlines, tolines, fromdesc='', todesc='',
   1743                   context=False, numlines=5, *, charset='utf-8'):
   1744         """Returns HTML file of side by side comparison with change highlights
   1745 
   1746         Arguments:
   1747         fromlines -- list of "from" lines
   1748         tolines -- list of "to" lines
   1749         fromdesc -- "from" file column header string
   1750         todesc -- "to" file column header string
   1751         context -- set to True for contextual differences (defaults to False
   1752             which shows full differences).
   1753         numlines -- number of context lines.  When context is set True,
   1754             controls number of lines displayed before and after the change.
   1755             When context is False, controls the number of lines to place
   1756             the "next" link anchors before the next change (so click of
   1757             "next" link jumps to just before the change).
   1758         charset -- charset of the HTML document
   1759         """
   1760 
   1761         return (self._file_template % dict(
   1762             styles=self._styles,
   1763             legend=self._legend,
   1764             table=self.make_table(fromlines, tolines, fromdesc, todesc,
   1765                                   context=context, numlines=numlines),
   1766             charset=charset
   1767         )).encode(charset, 'xmlcharrefreplace').decode(charset)
   1768 
   1769     def _tab_newline_replace(self,fromlines,tolines):
   1770         """Returns from/to line lists with tabs expanded and newlines removed.
   1771 
   1772         Instead of tab characters being replaced by the number of spaces
   1773         needed to fill in to the next tab stop, this function will fill
   1774         the space with tab characters.  This is done so that the difference
   1775         algorithms can identify changes in a file when tabs are replaced by
   1776         spaces and vice versa.  At the end of the HTML generation, the tab
   1777         characters will be replaced with a nonbreakable space.
   1778         """
   1779         def expand_tabs(line):
   1780             # hide real spaces
   1781             line = line.replace(' ','\0')
   1782             # expand tabs into spaces
   1783             line = line.expandtabs(self._tabsize)
   1784             # replace spaces from expanded tabs back into tab characters
   1785             # (we'll replace them with markup after we do differencing)
   1786             line = line.replace(' ','\t')
   1787             return line.replace('\0',' ').rstrip('\n')
   1788         fromlines = [expand_tabs(line) for line in fromlines]
   1789         tolines = [expand_tabs(line) for line in tolines]
   1790         return fromlines,tolines
   1791 
   1792     def _split_line(self,data_list,line_num,text):
   1793         """Builds list of text lines by splitting text lines at wrap point
   1794 
   1795         This function will determine if the input text line needs to be
   1796         wrapped (split) into separate lines.  If so, the first wrap point
   1797         will be determined and the first line appended to the output
   1798         text line list.  This function is used recursively to handle
   1799         the second part of the split line to further split it.
   1800         """
   1801         # if blank line or context separator, just add it to the output list
   1802         if not line_num:
   1803             data_list.append((line_num,text))
   1804             return
   1805 
   1806         # if line text doesn't need wrapping, just add it to the output list
   1807         size = len(text)
   1808         max = self._wrapcolumn
   1809         if (size <= max) or ((size -(text.count('\0')*3)) <= max):
   1810             data_list.append((line_num,text))
   1811             return
   1812 
   1813         # scan text looking for the wrap point, keeping track if the wrap
   1814         # point is inside markers
   1815         i = 0
   1816         n = 0
   1817         mark = ''
   1818         while n < max and i < size:
   1819             if text[i] == '\0':
   1820                 i += 1
   1821                 mark = text[i]
   1822                 i += 1
   1823             elif text[i] == '\1':
   1824                 i += 1
   1825                 mark = ''
   1826             else:
   1827                 i += 1
   1828                 n += 1
   1829 
   1830         # wrap point is inside text, break it up into separate lines
   1831         line1 = text[:i]
   1832         line2 = text[i:]
   1833 
   1834         # if wrap point is inside markers, place end marker at end of first
   1835         # line and start marker at beginning of second line because each
   1836         # line will have its own table tag markup around it.
   1837         if mark:
   1838             line1 = line1 + '\1'
   1839             line2 = '\0' + mark + line2
   1840 
   1841         # tack on first line onto the output list
   1842         data_list.append((line_num,line1))
   1843 
   1844         # use this routine again to wrap the remaining text
   1845         self._split_line(data_list,'>',line2)
   1846 
   1847     def _line_wrapper(self,diffs):
   1848         """Returns iterator that splits (wraps) mdiff text lines"""
   1849 
   1850         # pull from/to data and flags from mdiff iterator
   1851         for fromdata,todata,flag in diffs:
   1852             # check for context separators and pass them through
   1853             if flag is None:
   1854                 yield fromdata,todata,flag
   1855                 continue
   1856             (fromline,fromtext),(toline,totext) = fromdata,todata
   1857             # for each from/to line split it at the wrap column to form
   1858             # list of text lines.
   1859             fromlist,tolist = [],[]
   1860             self._split_line(fromlist,fromline,fromtext)
   1861             self._split_line(tolist,toline,totext)
   1862             # yield from/to line in pairs inserting blank lines as
   1863             # necessary when one side has more wrapped lines
   1864             while fromlist or tolist:
   1865                 if fromlist:
   1866                     fromdata = fromlist.pop(0)
   1867                 else:
   1868                     fromdata = ('',' ')
   1869                 if tolist:
   1870                     todata = tolist.pop(0)
   1871                 else:
   1872                     todata = ('',' ')
   1873                 yield fromdata,todata,flag
   1874 
   1875     def _collect_lines(self,diffs):
   1876         """Collects mdiff output into separate lists
   1877 
   1878         Before storing the mdiff from/to data into a list, it is converted
   1879         into a single line of text with HTML markup.
   1880         """
   1881 
   1882         fromlist,tolist,flaglist = [],[],[]
   1883         # pull from/to data and flags from mdiff style iterator
   1884         for fromdata,todata,flag in diffs:
   1885             try:
   1886                 # store HTML markup of the lines into the lists
   1887                 fromlist.append(self._format_line(0,flag,*fromdata))
   1888                 tolist.append(self._format_line(1,flag,*todata))
   1889             except TypeError:
   1890                 # exceptions occur for lines where context separators go
   1891                 fromlist.append(None)
   1892                 tolist.append(None)
   1893             flaglist.append(flag)
   1894         return fromlist,tolist,flaglist
   1895 
   1896     def _format_line(self,side,flag,linenum,text):
   1897         """Returns HTML markup of "from" / "to" text lines
   1898 
   1899         side -- 0 or 1 indicating "from" or "to" text
   1900         flag -- indicates if difference on line
   1901         linenum -- line number (used for line number column)
   1902         text -- line text to be marked up
   1903         """
   1904         try:
   1905             linenum = '%d' % linenum
   1906             id = ' id="%s%s"' % (self._prefix[side],linenum)
   1907         except TypeError:
   1908             # handle blank lines where linenum is '>' or ''
   1909             id = ''
   1910         # replace those things that would get confused with HTML symbols
   1911         text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
   1912 
   1913         # make space non-breakable so they don't get compressed or line wrapped
   1914         text = text.replace(' ','&nbsp;').rstrip()
   1915 
   1916         return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
   1917                % (id,linenum,text)
   1918 
   1919     def _make_prefix(self):
   1920         """Create unique anchor prefixes"""
   1921 
   1922         # Generate a unique anchor prefix so multiple tables
   1923         # can exist on the same HTML page without conflicts.
   1924         fromprefix = "from%d_" % HtmlDiff._default_prefix
   1925         toprefix = "to%d_" % HtmlDiff._default_prefix
   1926         HtmlDiff._default_prefix += 1
   1927         # store prefixes so line format method has access
   1928         self._prefix = [fromprefix,toprefix]
   1929 
   1930     def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
   1931         """Makes list of "next" links"""
   1932 
   1933         # all anchor names will be generated using the unique "to" prefix
   1934         toprefix = self._prefix[1]
   1935 
   1936         # process change flags, generating middle column of next anchors/links
   1937         next_id = ['']*len(flaglist)
   1938         next_href = ['']*len(flaglist)
   1939         num_chg, in_change = 0, False
   1940         last = 0
   1941         for i,flag in enumerate(flaglist):
   1942             if flag:
   1943                 if not in_change:
   1944                     in_change = True
   1945                     last = i
   1946                     # at the beginning of a change, drop an anchor a few lines
   1947                     # (the context lines) before the change for the previous
   1948                     # link
   1949                     i = max([0,i-numlines])
   1950                     next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
   1951                     # at the beginning of a change, drop a link to the next
   1952                     # change
   1953                     num_chg += 1
   1954                     next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
   1955                          toprefix,num_chg)
   1956             else:
   1957                 in_change = False
   1958         # check for cases where there is no content to avoid exceptions
   1959         if not flaglist:
   1960             flaglist = [False]
   1961             next_id = ['']
   1962             next_href = ['']
   1963             last = 0
   1964             if context:
   1965                 fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
   1966                 tolist = fromlist
   1967             else:
   1968                 fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
   1969         # if not a change on first line, drop a link
   1970         if not flaglist[0]:
   1971             next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
   1972         # redo the last link to link to the top
   1973         next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
   1974 
   1975         return fromlist,tolist,flaglist,next_href,next_id
   1976 
   1977     def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
   1978                    numlines=5):
   1979         """Returns HTML table of side by side comparison with change highlights
   1980 
   1981         Arguments:
   1982         fromlines -- list of "from" lines
   1983         tolines -- list of "to" lines
   1984         fromdesc -- "from" file column header string
   1985         todesc -- "to" file column header string
   1986         context -- set to True for contextual differences (defaults to False
   1987             which shows full differences).
   1988         numlines -- number of context lines.  When context is set True,
   1989             controls number of lines displayed before and after the change.
   1990             When context is False, controls the number of lines to place
   1991             the "next" link anchors before the next change (so click of
   1992             "next" link jumps to just before the change).
   1993         """
   1994 
   1995         # make unique anchor prefixes so that multiple tables may exist
   1996         # on the same page without conflict.
   1997         self._make_prefix()
   1998 
   1999         # change tabs to spaces before it gets more difficult after we insert
   2000         # markup
   2001         fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
   2002 
   2003         # create diffs iterator which generates side by side from/to data
   2004         if context:
   2005             context_lines = numlines
   2006         else:
   2007             context_lines = None
   2008         diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
   2009                       charjunk=self._charjunk)
   2010 
   2011         # set up iterator to wrap lines that exceed desired width
   2012         if self._wrapcolumn:
   2013             diffs = self._line_wrapper(diffs)
   2014 
   2015         # collect up from/to lines and flags into lists (also format the lines)
   2016         fromlist,tolist,flaglist = self._collect_lines(diffs)
   2017 
   2018         # process change flags, generating middle column of next anchors/links
   2019         fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
   2020             fromlist,tolist,flaglist,context,numlines)
   2021 
   2022         s = []
   2023         fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
   2024               '<td class="diff_next">%s</td>%s</tr>\n'
   2025         for i in range(len(flaglist)):
   2026             if flaglist[i] is None:
   2027                 # mdiff yields None on separator lines skip the bogus ones
   2028                 # generated for the first line
   2029                 if i > 0:
   2030                     s.append('        </tbody>        \n        <tbody>\n')
   2031             else:
   2032                 s.append( fmt % (next_id[i],next_href[i],fromlist[i],
   2033                                            next_href[i],tolist[i]))
   2034         if fromdesc or todesc:
   2035             header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
   2036                 '<th class="diff_next"><br /></th>',
   2037                 '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
   2038                 '<th class="diff_next"><br /></th>',
   2039                 '<th colspan="2" class="diff_header">%s</th>' % todesc)
   2040         else:
   2041             header_row = ''
   2042 
   2043         table = self._table_template % dict(
   2044             data_rows=''.join(s),
   2045             header_row=header_row,
   2046             prefix=self._prefix[1])
   2047 
   2048         return table.replace('\0+','<span class="diff_add">'). \
   2049                      replace('\0-','<span class="diff_sub">'). \
   2050                      replace('\0^','<span class="diff_chg">'). \
   2051                      replace('\1','</span>'). \
   2052                      replace('\t','&nbsp;')
   2053 
# `re` is presumably imported higher up for module-internal use (not visible
# in this chunk); delete it so it does not leak into the module namespace --
# it is deliberately absent from __all__.
del re
   2055 
   2056 def restore(delta, which):
   2057     r"""
   2058     Generate one of the two sequences that generated a delta.
   2059 
   2060     Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
   2061     lines originating from file 1 or 2 (parameter `which`), stripping off line
   2062     prefixes.
   2063 
   2064     Examples:
   2065 
   2066     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
   2067     ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
   2068     >>> diff = list(diff)
   2069     >>> print(''.join(restore(diff, 1)), end="")
   2070     one
   2071     two
   2072     three
   2073     >>> print(''.join(restore(diff, 2)), end="")
   2074     ore
   2075     tree
   2076     emu
   2077     """
   2078     try:
   2079         tag = {1: "- ", 2: "+ "}[int(which)]
   2080     except KeyError:
   2081         raise ValueError('unknown delta choice (must be 1 or 2): %r'
   2082                            % which)
   2083     prefixes = ("  ", tag)
   2084     for line in delta:
   2085         if line[:2] in prefixes:
   2086             yield line[2:]
   2087 
   2088 def _test():
   2089     import doctest, difflib
   2090     return doctest.testmod(difflib)
   2091 
   2092 if __name__ == "__main__":
   2093     _test()
   2094