Home | History | Annotate | Download | only in jdiff
      1 package jdiff;
      2 
      3 import java.io.*;
      4 import java.util.*;
      5 
      6 /** A class to compare vectors of objects.  The result of comparison
      7     is a list of <code>change</code> objects which form an
      8     edit script.  The objects compared are traditionally lines
      9     of text from two files.  Comparison options such as "ignore
     10     whitespace" are implemented by modifying the <code>equals</code>
     11     and <code>hashcode</code> methods for the objects compared.
     12 <p>
     13    The basic algorithm is described in: </br>
     14    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
     15    Algorithmica Vol. 1 No. 2, 1986, p 251.
     16 <p>
     17    This class outputs different results from GNU diff 1.15 on some
     18    inputs.  Our results are actually better (smaller change list, smaller
     19    total size of changes), but it would be nice to know why.  Perhaps
     20    there is a memory overwrite bug in GNU diff 1.15.
     21 
     22   @author Stuart D. Gathman, translated from GNU diff 1.15
     23     Copyright (C) 2000  Business Management Systems, Inc.
     24 <p>
     25     This program is free software; you can redistribute it and/or modify
     26     it under the terms of the GNU General Public License as published by
     27     the Free Software Foundation; either version 1, or (at your option)
     28     any later version.
     29 <p>
     30     This program is distributed in the hope that it will be useful,
     31     but WITHOUT ANY WARRANTY; without even the implied warranty of
     32     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     33     GNU General Public License for more details.
     34 <p>
     35     You should have received a copy of the <a href=COPYING.txt>
     36     GNU General Public License</a>
     37     along with this program; if not, write to the Free Software
     38     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     39 
     40  */
     41 
     42 public class DiffMyers
     43 {
     44 
     45   /** Prepare to find differences between two arrays.  Each element of
     46       the arrays is translated to an "equivalence number" based on
     47       the result of <code>equals</code>.  The original Object arrays
     48       are no longer needed for computing the differences.  They will
     49       be needed again later to print the results of the comparison as
     50       an edit script, if desired.
     51    */
     52   public DiffMyers(Object[] a,Object[] b)
     53   {
     54     Hashtable h = new Hashtable(a.length + b.length);
     55     filevec[0] = new file_data(a,h);
     56     filevec[1] = new file_data(b,h);
     57   }
     58 
     59   /** 1 more than the maximum equivalence value used for this or its
     60      sibling file. */
     61   private int equiv_max = 1;
     62 
     63   /** When set to true, the comparison uses a heuristic to speed it up.
     64     With this heuristic, for files with a constant small density
     65     of changes, the algorithm is linear in the file size.  */
     66   public boolean heuristic = false;
     67 
     68   /** When set to true, the algorithm returns a guarranteed minimal
     69       set of changes.  This makes things slower, sometimes much slower. */
     70   public boolean no_discards = false;
     71 
     72   private int[] xvec, yvec;        /* Vectors being compared. */
     73   private int[] fdiag;                /* Vector, indexed by diagonal, containing
     74                                    the X coordinate of the point furthest
     75                                    along the given diagonal in the forward
     76                                    search of the edit matrix. */
     77   private int[] bdiag;                /* Vector, indexed by diagonal, containing
     78                                    the X coordinate of the point furthest
     79                                    along the given diagonal in the backward
     80                                    search of the edit matrix. */
     81   private int fdiagoff, bdiagoff;
     82   private final file_data[] filevec = new file_data[2];
     83   private int cost;
     84 
     85   /** Find the midpoint of the shortest edit script for a specified
     86      portion of the two files.
     87 
     88      We scan from the beginnings of the files, and simultaneously from the ends,
     89      doing a breadth-first search through the space of edit-sequence.
     90      When the two searches meet, we have found the midpoint of the shortest
     91      edit sequence.
     92 
     93      The value returned is the number of the diagonal on which the midpoint lies.
     94      The diagonal number equals the number of inserted lines minus the number
     95      of deleted lines (counting only lines before the midpoint).
     96      The edit cost is stored into COST; this is the total number of
     97      lines inserted or deleted (counting only lines before the midpoint).
     98 
     99      This function assumes that the first lines of the specified portions
    100      of the two files do not match, and likewise that the last lines do not
    101      match.  The caller must trim matching lines from the beginning and end
    102      of the portions it is going to specify.
    103 
    104      Note that if we return the "wrong" diagonal value, or if
    105      the value of bdiag at that diagonal is "wrong",
    106      the worst this can do is cause suboptimal diff output.
    107      It cannot cause incorrect diff output.  */
    108 
    109   private int diag (int xoff, int xlim, int yoff, int ylim)
    110   {
    111     final int[] fd = fdiag;        // Give the compiler a chance.
    112     final int[] bd = bdiag;        // Additional help for the compiler.
    113     final int[] xv = xvec;                // Still more help for the compiler.
    114     final int[] yv = yvec;                // And more and more . . .
    115     final int dmin = xoff - ylim;        // Minimum valid diagonal.
    116     final int dmax = xlim - yoff;        // Maximum valid diagonal.
    117     final int fmid = xoff - yoff;        // Center diagonal of top-down search.
    118     final int bmid = xlim - ylim;        // Center diagonal of bottom-up search.
    119     int fmin = fmid, fmax = fmid;        // Limits of top-down search.
    120     int bmin = bmid, bmax = bmid;        // Limits of bottom-up search.
    121     /* True if southeast corner is on an odd
    122                                      diagonal with respect to the northwest. */
    123     final boolean odd = (fmid - bmid & 1) != 0;
    124 
    125     fd[fdiagoff + fmid] = xoff;
    126     bd[bdiagoff + bmid] = xlim;
    127 
    128     for (int c = 1;; ++c)
    129       {
    130         int d;                        /* Active diagonal. */
    131         boolean big_snake = false;
    132 
    133         /* Extend the top-down search by an edit step in each diagonal. */
    134         if (fmin > dmin)
    135           fd[fdiagoff + --fmin - 1] = -1;
    136         else
    137           ++fmin;
    138         if (fmax < dmax)
    139           fd[fdiagoff + ++fmax + 1] = -1;
    140         else
    141           --fmax;
    142         for (d = fmax; d >= fmin; d -= 2)
    143           {
    144             int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
    145 
    146             if (tlo >= thi)
    147               x = tlo + 1;
    148             else
    149               x = thi;
    150             oldx = x;
    151             y = x - d;
    152             while (x < xlim && y < ylim && xv[x] == yv[y]) {
    153               ++x; ++y;
    154             }
    155             if (x - oldx > 20)
    156               big_snake = true;
    157             fd[fdiagoff + d] = x;
    158             if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
    159               {
    160                 cost = 2 * c - 1;
    161                 return d;
    162               }
    163           }
    164 
    165         /* Similar extend the bottom-up search. */
    166         if (bmin > dmin)
    167           bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
    168         else
    169           ++bmin;
    170         if (bmax < dmax)
    171           bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
    172         else
    173           --bmax;
    174         for (d = bmax; d >= bmin; d -= 2)
    175           {
    176             int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
    177 
    178             if (tlo < thi)
    179               x = tlo;
    180             else
    181               x = thi - 1;
    182             oldx = x;
    183             y = x - d;
    184             while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
    185               --x; --y;
    186             }
    187             if (oldx - x > 20)
    188               big_snake = true;
    189             bd[bdiagoff + d] = x;
    190             if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
    191               {
    192                 cost = 2 * c;
    193                 return d;
    194               }
    195           }
    196 
    197         /* Heuristic: check occasionally for a diagonal that has made
    198            lots of progress compared with the edit distance.
    199            If we have any such, find the one that has made the most
    200            progress and return it as if it had succeeded.
    201 
    202            With this heuristic, for files with a constant small density
    203            of changes, the algorithm is linear in the file size.  */
    204 
    205         if (c > 200 && big_snake && heuristic)
    206           {
    207             int best = 0;
    208             int bestpos = -1;
    209 
    210             for (d = fmax; d >= fmin; d -= 2)
    211               {
    212                 int dd = d - fmid;
    213                 if ((fd[fdiagoff + d] - xoff)*2 - dd > 12 * (c + (dd > 0 ? dd : -dd)))
    214                   {
    215                     if (fd[fdiagoff + d] * 2 - dd > best
    216                         && fd[fdiagoff + d] - xoff > 20
    217                         && fd[fdiagoff + d] - d - yoff > 20)
    218                       {
    219                         int k;
    220                         int x = fd[fdiagoff + d];
    221 
    222                         /* We have a good enough best diagonal;
    223                            now insist that it end with a significant snake.  */
    224                         for (k = 1; k <= 20; k++)
    225                           if (xvec[x - k] != yvec[x - d - k])
    226                             break;
    227 
    228                         if (k == 21)
    229                           {
    230                             best = fd[fdiagoff + d] * 2 - dd;
    231                             bestpos = d;
    232                           }
    233                       }
    234                   }
    235               }
    236             if (best > 0)
    237               {
    238                 cost = 2 * c - 1;
    239                 return bestpos;
    240               }
    241 
    242             best = 0;
    243             for (d = bmax; d >= bmin; d -= 2)
    244               {
    245                 int dd = d - bmid;
    246                 if ((xlim - bd[bdiagoff + d])*2 + dd > 12 * (c + (dd > 0 ? dd : -dd)))
    247                   {
    248                     if ((xlim - bd[bdiagoff + d]) * 2 + dd > best
    249                         && xlim - bd[bdiagoff + d] > 20
    250                         && ylim - (bd[bdiagoff + d] - d) > 20)
    251                       {
    252                         /* We have a good enough best diagonal;
    253                            now insist that it end with a significant snake.  */
    254                         int k;
    255                         int x = bd[bdiagoff + d];
    256 
    257                         for (k = 0; k < 20; k++)
    258                           if (xvec[x + k] != yvec[x - d + k])
    259                             break;
    260                         if (k == 20)
    261                           {
    262                             best = (xlim - bd[bdiagoff + d]) * 2 + dd;
    263                             bestpos = d;
    264                           }
    265                       }
    266                   }
    267               }
    268             if (best > 0)
    269               {
    270                 cost = 2 * c - 1;
    271                 return bestpos;
    272               }
    273           }
    274       }
    275   }
    276 
    277   /** Compare in detail contiguous subsequences of the two files
    278      which are known, as a whole, to match each other.
    279 
    280      The results are recorded in the vectors filevec[N].changed_flag, by
    281      storing a 1 in the element for each line that is an insertion or deletion.
    282 
    283      The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
    284 
    285      Note that XLIM, YLIM are exclusive bounds.
    286      All line numbers are origin-0 and discarded lines are not counted.  */
    287 
    288   private void compareseq (int xoff, int xlim, int yoff, int ylim) {
    289     /* Slide down the bottom initial diagonal. */
    290     while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
    291       ++xoff; ++yoff;
    292     }
    293     /* Slide up the top initial diagonal. */
    294     while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
    295       --xlim; --ylim;
    296     }
    297 
    298     /* Handle simple cases. */
    299     if (xoff == xlim)
    300       while (yoff < ylim)
    301         filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
    302     else if (yoff == ylim)
    303       while (xoff < xlim)
    304         filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
    305     else
    306       {
    307         /* Find a point of correspondence in the middle of the files.  */
    308 
    309         int d = diag (xoff, xlim, yoff, ylim);
    310         int c = cost;
    311         int f = fdiag[fdiagoff + d];
    312         int b = bdiag[bdiagoff + d];
    313 
    314         if (c == 1)
    315           {
    316             /* This should be impossible, because it implies that
    317                one of the two subsequences is empty,
    318                and that case was handled above without calling `diag'.
    319                Let's verify that this is true.  */
    320             throw new IllegalArgumentException("Empty subsequence");
    321           }
    322         else
    323           {
    324             /* Use that point to split this problem into two subproblems.  */
    325             compareseq (xoff, b, yoff, b - d);
    326             /* This used to use f instead of b,
    327                but that is incorrect!
    328                It is not necessarily the case that diagonal d
    329                has a snake from b to f.  */
    330             compareseq (b, xlim, b - d, ylim);
    331           }
    332       }
    333   }
    334 
    335   /** Discard lines from one file that have no matches in the other file.
    336    */
    337 
    338   private void discard_confusing_lines() {
    339     filevec[0].discard_confusing_lines(filevec[1]);
    340     filevec[1].discard_confusing_lines(filevec[0]);
    341   }
    342 
    343   private boolean inhibit = false;
    344 
    345   /** Adjust inserts/deletes of blank lines to join changes
    346      as much as possible.
    347    */
    348 
    349   private void shift_boundaries() {
    350     if (inhibit)
    351       return;
    352     filevec[0].shift_boundaries(filevec[1]);
    353     filevec[1].shift_boundaries(filevec[0]);
    354   }
    355 
    356   /** Scan the tables of which lines are inserted and deleted,
    357      producing an edit script in reverse order.  */
    358 
    359   private change build_reverse_script() {
    360     change script = null;
    361     final boolean[] changed0 = filevec[0].changed_flag;
    362     final boolean[] changed1 = filevec[1].changed_flag;
    363     final int len0 = filevec[0].buffered_lines;
    364     final int len1 = filevec[1].buffered_lines;
    365 
    366     /* Note that changedN[len0] does exist, and contains 0.  */
    367 
    368     int i0 = 0, i1 = 0;
    369 
    370     while (i0 < len0 || i1 < len1)
    371       {
    372         if (changed0[1+i0] || changed1[1+i1])
    373           {
    374             int line0 = i0, line1 = i1;
    375 
    376             /* Find # lines changed here in each file.  */
    377             while (changed0[1+i0]) ++i0;
    378             while (changed1[1+i1]) ++i1;
    379 
    380             /* Record this change.  */
    381             script = new change(line0, line1, i0 - line0, i1 - line1, script);
    382           }
    383 
    384         /* We have reached lines in the two files that match each other.  */
    385         i0++; i1++;
    386       }
    387 
    388     return script;
    389   }
    390 
    391   /** Scan the tables of which lines are inserted and deleted,
    392      producing an edit script in forward order.  */
    393 
    394   private change build_script() {
    395     change script = null;
    396     final boolean[] changed0 = filevec[0].changed_flag;
    397     final boolean[] changed1 = filevec[1].changed_flag;
    398     final int len0 = filevec[0].buffered_lines;
    399     final int len1 = filevec[1].buffered_lines;
    400     int i0 = len0, i1 = len1;
    401 
    402     /* Note that changedN[-1] does exist, and contains 0.  */
    403 
    404     while (i0 >= 0 || i1 >= 0)
    405       {
    406         if (changed0[i0] || changed1[i1])
    407           {
    408             int line0 = i0, line1 = i1;
    409 
    410             /* Find # lines changed here in each file.  */
    411             while (changed0[i0]) --i0;
    412             while (changed1[i1]) --i1;
    413 
    414             /* Record this change.  */
    415             script = new change(i0, i1, line0 - i0, line1 - i1, script);
    416           }
    417 
    418         /* We have reached lines in the two files that match each other.  */
    419         i0--; i1--;
    420       }
    421 
    422     return script;
    423   }
    424 
    425   /* Report the differences of two files.  DEPTH is the current directory
    426      depth. */
    427   public change diff_2(final boolean reverse) {
    428 
    429     /* Some lines are obviously insertions or deletions
    430        because they don't match anything.  Detect them now,
    431        and avoid even thinking about them in the main comparison algorithm.  */
    432 
    433     discard_confusing_lines ();
    434 
    435     /* Now do the main comparison algorithm, considering just the
    436        undiscarded lines.  */
    437 
    438     xvec = filevec[0].undiscarded;
    439     yvec = filevec[1].undiscarded;
    440 
    441     int diags =
    442       filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
    443     fdiag = new int[diags];
    444     fdiagoff = filevec[1].nondiscarded_lines + 1;
    445     bdiag = new int[diags];
    446     bdiagoff = filevec[1].nondiscarded_lines + 1;
    447 
    448     compareseq (0, filevec[0].nondiscarded_lines,
    449                 0, filevec[1].nondiscarded_lines);
    450     fdiag = null;
    451     bdiag = null;
    452 
    453     /* Modify the results slightly to make them prettier
    454        in cases where that can validly be done.  */
    455 
    456     shift_boundaries ();
    457 
    458     /* Get the results of comparison in the form of a chain
    459        of `struct change's -- an edit script.  */
    460 
    461     if (reverse)
    462       return build_reverse_script();
    463     else
    464       return build_script();
    465   }
    466 
    467   /** The result of comparison is an "edit script": a chain of change objects.
    468      Each change represents one place where some lines are deleted
    469      and some are inserted.
    470 
    471      LINE0 and LINE1 are the first affected lines in the two files (origin 0).
    472      DELETED is the number of lines deleted here from file 0.
    473      INSERTED is the number of lines inserted here in file 1.
    474 
    475      If DELETED is 0 then LINE0 is the number of the line before
    476      which the insertion was done; vice versa for INSERTED and LINE1.  */
    477 
    478   public static class change {
    479     /** Previous or next edit command. */
    480     public change link;
    481     /** # lines of file 1 changed here.  */
    482     public int inserted;
    483     /** # lines of file 0 changed here.  */
    484     public int deleted;
    485     /** Line number of 1st deleted line.  */
    486     public final int line0;
    487     /** Line number of 1st inserted line.  */
    488     public final int line1;
    489 
    490     /** Cons an additional entry onto the front of an edit script OLD.
    491        LINE0 and LINE1 are the first affected lines in the two files (origin 0).
    492        DELETED is the number of lines deleted here from file 0.
    493        INSERTED is the number of lines inserted here in file 1.
    494 
    495        If DELETED is 0 then LINE0 is the number of the line before
    496        which the insertion was done; vice versa for INSERTED and LINE1.  */
    497     change(int line0, int line1, int deleted, int inserted, change old) {
    498       this.line0 = line0;
    499       this.line1 = line1;
    500       this.inserted = inserted;
    501       this.deleted = deleted;
    502       this.link = old;
    503       //System.err.println(line0+","+line1+","+inserted+","+deleted);
    504     }
    505   }
    506 
    507   /** Data on one input file being compared.
    508    */
    509 
    510   class file_data {
    511 
    512     /** Allocate changed array for the results of comparison.  */
    513     void clear() {
    514       /* Allocate a flag for each line of each file, saying whether that line
    515          is an insertion or deletion.
    516          Allocate an extra element, always zero, at each end of each vector.
    517        */
    518       changed_flag = new boolean[buffered_lines + 2];
    519     }
    520 
    521     /** Return equiv_count[I] as the number of lines in this file
    522        that fall in equivalence class I.
    523          @return the array of equivalence class counts.
    524      */
    525     int[] equivCount() {
    526       int[] equiv_count = new int[equiv_max];
    527       for (int i = 0; i < buffered_lines; ++i)
    528         ++equiv_count[equivs[i]];
    529       return equiv_count;
    530     }
    531 
    532     /** Discard lines that have no matches in another file.
    533 
    534        A line which is discarded will not be considered by the actual
    535        comparison algorithm; it will be as if that line were not in the file.
    536        The file's `realindexes' table maps virtual line numbers
    537        (which don't count the discarded lines) into real line numbers;
    538        this is how the actual comparison algorithm produces results
    539        that are comprehensible when the discarded lines are counted.
    540 <p>
    541        When we discard a line, we also mark it as a deletion or insertion
    542        so that it will be printed in the output.
    543       @param f the other file
    544      */
    545     void discard_confusing_lines(file_data f) {
    546       clear();
    547     /* Set up table of which lines are going to be discarded. */
    548       final byte[] discarded = discardable(f.equivCount());
    549 
    550     /* Don't really discard the provisional lines except when they occur
    551        in a run of discardables, with nonprovisionals at the beginning
    552        and end.  */
    553       filterDiscards(discarded);
    554 
    555     /* Actually discard the lines. */
    556       discard(discarded);
    557     }
    558 
    559     /** Mark to be discarded each line that matches no line of another file.
    560        If a line matches many lines, mark it as provisionally discardable.
    561        @see equivCount()
    562        @param counts The count of each equivalence number for the other file.
    563        @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
    564                for each line
    565      */
    566 
    567     private byte[] discardable(final int[] counts) {
    568       final int end = buffered_lines;
    569       final byte[] discards = new byte[end];
    570       final int[] equivs = this.equivs;
    571       int many = 5;
    572       int tem = end / 64;
    573 
    574       /* Multiply MANY by approximate square root of number of lines.
    575          That is the threshold for provisionally discardable lines.  */
    576       while ((tem = tem >> 2) > 0)
    577         many *= 2;
    578 
    579       for (int i = 0; i < end; i++)
    580         {
    581           int nmatch;
    582           if (equivs[i] == 0)
    583             continue;
    584           nmatch = counts[equivs[i]];
    585           if (nmatch == 0)
    586             discards[i] = 1;
    587           else if (nmatch > many)
    588             discards[i] = 2;
    589         }
    590       return discards;
    591     }
    592 
    593     /** Don't really discard the provisional lines except when they occur
    594        in a run of discardables, with nonprovisionals at the beginning
    595        and end.  */
    596 
    597     private void filterDiscards(final byte[] discards) {
    598         final int end = buffered_lines;
    599 
    600         for (int i = 0; i < end; i++)
    601           {
    602             /* Cancel provisional discards not in middle of run of discards.  */
    603             if (discards[i] == 2)
    604               discards[i] = 0;
    605             else if (discards[i] != 0)
    606               {
    607                 /* We have found a nonprovisional discard.  */
    608                 int j;
    609                 int length;
    610                 int provisional = 0;
    611 
    612                 /* Find end of this run of discardable lines.
    613                    Count how many are provisionally discardable.  */
    614                 for (j = i; j < end; j++)
    615                   {
    616                     if (discards[j] == 0)
    617                       break;
    618                     if (discards[j] == 2)
    619                       ++provisional;
    620                   }
    621 
    622                 /* Cancel provisional discards at end, and shrink the run.  */
    623                 while (j > i && discards[j - 1] == 2) {
    624                   discards[--j] = 0; --provisional;
    625                 }
    626 
    627                 /* Now we have the length of a run of discardable lines
    628                    whose first and last are not provisional.  */
    629                 length = j - i;
    630 
    631                 /* If 1/4 of the lines in the run are provisional,
    632                    cancel discarding of all provisional lines in the run.  */
    633                 if (provisional * 4 > length)
    634                   {
    635                     while (j > i)
    636                       if (discards[--j] == 2)
    637                         discards[j] = 0;
    638                   }
    639                 else
    640                   {
    641                     int consec;
    642                     int minimum = 1;
    643                     int tem = length / 4;
    644 
    645                     /* MINIMUM is approximate square root of LENGTH/4.
    646                        A subrun of two or more provisionals can stand
    647                        when LENGTH is at least 16.
    648                        A subrun of 4 or more can stand when LENGTH >= 64.  */
    649                     while ((tem = tem >> 2) > 0)
    650                       minimum *= 2;
    651                     minimum++;
    652 
    653                     /* Cancel any subrun of MINIMUM or more provisionals
    654                        within the larger run.  */
    655                     for (j = 0, consec = 0; j < length; j++)
    656                       if (discards[i + j] != 2)
    657                         consec = 0;
    658                       else if (minimum == ++consec)
    659                         /* Back up to start of subrun, to cancel it all.  */
    660                         j -= consec;
    661                       else if (minimum < consec)
    662                         discards[i + j] = 0;
    663 
    664                     /* Scan from beginning of run
    665                        until we find 3 or more nonprovisionals in a row
    666                        or until the first nonprovisional at least 8 lines in.
    667                        Until that point, cancel any provisionals.  */
    668                     for (j = 0, consec = 0; j < length; j++)
    669                       {
    670                         if (j >= 8 && discards[i + j] == 1)
    671                           break;
    672                         if (discards[i + j] == 2) {
    673                           consec = 0; discards[i + j] = 0;
    674                         }
    675                         else if (discards[i + j] == 0)
    676                           consec = 0;
    677                         else
    678                           consec++;
    679                         if (consec == 3)
    680                           break;
    681                       }
    682 
    683                     /* I advances to the last line of the run.  */
    684                     i += length - 1;
    685 
    686                     /* Same thing, from end.  */
    687                     for (j = 0, consec = 0; j < length; j++)
    688                       {
    689                         if (j >= 8 && discards[i - j] == 1)
    690                           break;
    691                         if (discards[i - j] == 2) {
    692                           consec = 0; discards[i - j] = 0;
    693                         }
    694                         else if (discards[i - j] == 0)
    695                           consec = 0;
    696                         else
    697                           consec++;
    698                         if (consec == 3)
    699                           break;
    700                       }
    701                   }
    702               }
    703           }
    704       }
    705 
    706     /** Actually discard the lines.
    707       @param discards flags lines to be discarded
    708      */
    709     private void discard(final byte[] discards) {
    710       final int end = buffered_lines;
    711       int j = 0;
    712       for (int i = 0; i < end; ++i)
    713         if (no_discards || discards[i] == 0)
    714           {
    715             undiscarded[j] = equivs[i];
    716             realindexes[j++] = i;
    717           }
    718         else
    719           changed_flag[1+i] = true;
    720       nondiscarded_lines = j;
    721     }
    722 
    723     file_data(Object[] data,Hashtable h) {
    724       buffered_lines = data.length;
    725 
    726       equivs = new int[buffered_lines];
    727       undiscarded = new int[buffered_lines];
    728       realindexes = new int[buffered_lines];
    729 
    730       for (int i = 0; i < data.length; ++i) {
    731         Integer ir = (Integer)h.get(data[i]);
    732         if (ir == null)
    733           h.put(data[i],new Integer(equivs[i] = equiv_max++));
    734         else
    735           equivs[i] = ir.intValue();
    736       }
    737     }
    738 
    739     /** Adjust inserts/deletes of blank lines to join changes
    740        as much as possible.
    741 
    742        We do something when a run of changed lines include a blank
    743        line at one end and have an excluded blank line at the other.
    744        We are free to choose which blank line is included.
    745        `compareseq' always chooses the one at the beginning,
    746        but usually it is cleaner to consider the following blank line
    747        to be the "change".  The only exception is if the preceding blank line
    748        would join this change to other changes.
    749       @param f the file being compared against
    750     */
    751 
    752     void shift_boundaries(file_data f) {
    753       final boolean[] changed = changed_flag;
    754       final boolean[] other_changed = f.changed_flag;
    755       int i = 0;
    756       int j = 0;
    757       int i_end = buffered_lines;
    758       int preceding = -1;
    759       int other_preceding = -1;
    760 
    761       for (;;)
    762         {
    763           int start, end, other_start;
    764 
    765           /* Scan forwards to find beginning of another run of changes.
    766              Also keep track of the corresponding point in the other file.  */
    767 
    768           while (i < i_end && !changed[1+i])
    769             {
    770               while (other_changed[1+j++])
    771                 /* Non-corresponding lines in the other file
    772                    will count as the preceding batch of changes.  */
    773                 other_preceding = j;
    774               i++;
    775             }
    776 
    777           if (i == i_end)
    778             break;
    779 
    780           start = i;
    781           other_start = j;
    782 
    783           for (;;)
    784             {
    785               /* Now find the end of this run of changes.  */
    786 
    787               while (i < i_end && changed[1+i]) i++;
    788               end = i;
    789 
    790               /* If the first changed line matches the following unchanged one,
    791                  and this run does not follow right after a previous run,
    792                  and there are no lines deleted from the other file here,
    793                  then classify the first changed line as unchanged
    794                  and the following line as changed in its place.  */
    795 
    796               /* You might ask, how could this run follow right after another?
    797                  Only because the previous run was shifted here.  */
    798 
    799               if (end != i_end
    800                   && equivs[start] == equivs[end]
    801                   && !other_changed[1+j]
    802                   && end != i_end
    803                   && !((preceding >= 0 && start == preceding)
    804                        || (other_preceding >= 0
    805                            && other_start == other_preceding)))
    806                 {
    807                   changed[1+end++] = true;
    808                   changed[1+start++] = false;
    809                   ++i;
    810                   /* Since one line-that-matches is now before this run
    811                      instead of after, we must advance in the other file
    812                      to keep in synch.  */
    813                   ++j;
    814                 }
    815               else
    816                 break;
    817             }
    818 
    819           preceding = i;
    820           other_preceding = j;
    821         }
    822     }
    823 
    824     /** Number of elements (lines) in this file. */
    825     final int buffered_lines;
    826 
    827     /** Vector, indexed by line number, containing an equivalence code for
    828        each line.  It is this vector that is actually compared with that
    829        of another file to generate differences. */
    830     private final int[]            equivs;
    831 
    832     /** Vector, like the previous one except that
    833        the elements for discarded lines have been squeezed out.  */
    834     final int[]           undiscarded;
    835 
    836     /** Vector mapping virtual line numbers (not counting discarded lines)
    837        to real ones (counting those lines).  Both are origin-0.  */
    838     final int[]           realindexes;
    839 
    840     /** Total number of nondiscarded lines. */
    841     int                    nondiscarded_lines;
    842 
    843     /** Array, indexed by real origin-1 line number,
    844        containing true for a line that is an insertion or a deletion.
    845        The results of comparison are stored here.  */
    846     boolean[]            changed_flag;
    847 
    848   }
    849 
    850 }
    851