001// License: GPL.
002package org.openstreetmap.josm.tools;
003
004//// Taken from http://www.bmsi.com/java/#diff
005
006// http://www.bmsi.com/java/DiffPrint.java could also be useful
007
008/*
009 * $Log: Diff.java,v $
010 * Revision 1.15  2013/04/01 16:27:31  stuart
011 * Fix DiffPrint unified format with test cases.
012 * Begin porting some diff-2.7 features.
013 *
014 * Revision 1.14  2010/03/03 21:21:25  stuart
015 * Test new direct equivalence API
016 *
017 * Revision 1.13  2009/12/07 17:43:17  stuart
018 * Compute equiv_max for int[] ctor
019 *
020 * Revision 1.12  2009/12/07 17:34:46  stuart
021 * Ctor with int[].
022 *
023 * Revision 1.11  2009/11/15 01:11:54  stuart
024 * Diff doesn't need to be generic
025 *
026 * Revision 1.10  2009/11/15 00:54:03  stuart
027 * Update to Java 5 containers
028 *
029 * Revision 1.7  2009/01/19 03:05:26  stuart
030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
031 *
032 * Revision 1.6  2003/03/06 22:51:32  stuart
033 * Convert to CVS
034 *
035 * Revision 1.5  2002/07/19  19:14:40  stuart
036 * fix reverseScript, make change ctor public, update docs
037 *
038 * Revision 1.4  2002/04/09  17:53:39  stuart
039 * More flexible interface for diff() function.
040 *
041 * Revision 1.3  2000/03/03 21:58:03  stuart
042 * move discard_confusing_lines and shift_boundaries to class file_data
043 *
044 * Revision 1.2  2000/03/02  16:37:38  stuart
045 * Add GPL and copyright
046 *
047 */
048
049import java.util.HashMap;
050import java.util.Map;
051
052/** A class to compare vectors of objects.  The result of comparison
053    is a list of <code>change</code> objects which form an
054    edit script.  The objects compared are traditionally lines
055    of text from two files.  Comparison options such as "ignore
056    whitespace" are implemented by modifying the <code>equals</code>
057    and <code>hashcode</code> methods for the objects compared.
058<p>
059   The basic algorithm is described in: <br>
060   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
061   Algorithmica Vol. 1 No. 2, 1986, p 251.
062<p>
063   This class outputs different results from GNU diff 1.15 on some
064   inputs.  Our results are actually better (smaller change list, smaller
065   total size of changes), but it would be nice to know why.  Perhaps
066   there is a memory overwrite bug in GNU diff 1.15.
067
068  @author Stuart D. Gathman, translated from GNU diff 1.15
069    Copyright (C) 2000  Business Management Systems, Inc.
070<p>
071    This program is free software; you can redistribute it and/or modify
072    it under the terms of the GNU General Public License as published by
073    the Free Software Foundation; either version 1, or (at your option)
074    any later version.
075<p>
076    This program is distributed in the hope that it will be useful,
077    but WITHOUT ANY WARRANTY; without even the implied warranty of
078    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
079    GNU General Public License for more details.
080<p>
081    You should have received a copy of the <a href=COPYING.txt>
082    GNU General Public License</a>
083    along with this program; if not, write to the Free Software
084    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
085
086 */
087
088public class Diff {
089
090    /** Prepare to find differences between two arrays.  Each element of
091      the arrays is translated to an "equivalence number" based on
092      the result of <code>equals</code>.  The original Object arrays
093      are no longer needed for computing the differences.  They will
094      be needed again later to print the results of the comparison as
095      an edit script, if desired.
096     */
097    public Diff(Object[] a, Object[] b) {
098        Map<Object,Integer> h = new HashMap<>(a.length + b.length);
099        filevec = new FileData[] { new FileData(a,h),new FileData(b,h) };
100    }
101
102    /** 1 more than the maximum equivalence value used for this or its
103     sibling file. */
104    private int equiv_max = 1;
105
106    /** When set to true, the comparison uses a heuristic to speed it up.
107    With this heuristic, for files with a constant small density
108    of changes, the algorithm is linear in the file size.  */
109    public boolean heuristic = false;
110
111    /** When set to true, the algorithm returns a guarranteed minimal
112      set of changes.  This makes things slower, sometimes much slower. */
113    public boolean no_discards = false;
114
115    private int[] xvec, yvec; /* Vectors being compared. */
116    private int[] fdiag;      /* Vector, indexed by diagonal, containing
117                   the X coordinate of the point furthest
118                   along the given diagonal in the forward
119                   search of the edit matrix. */
120    private int[] bdiag;      /* Vector, indexed by diagonal, containing
121                   the X coordinate of the point furthest
122                   along the given diagonal in the backward
123                   search of the edit matrix. */
124    private int fdiagoff, bdiagoff;
125    private final FileData[] filevec;
126    private int cost;
127    /** Snakes bigger than this are considered "big". */
128    private static final int SNAKE_LIMIT = 20;
129
130    /** Find the midpoint of the shortest edit script for a specified
131     portion of the two files.
132
133     We scan from the beginnings of the files, and simultaneously from the ends,
134     doing a breadth-first search through the space of edit-sequence.
135     When the two searches meet, we have found the midpoint of the shortest
136     edit sequence.
137
138     The value returned is the number of the diagonal on which the midpoint lies.
139     The diagonal number equals the number of inserted lines minus the number
140     of deleted lines (counting only lines before the midpoint).
141     The edit cost is stored into COST; this is the total number of
142     lines inserted or deleted (counting only lines before the midpoint).
143
144     This function assumes that the first lines of the specified portions
145     of the two files do not match, and likewise that the last lines do not
146     match.  The caller must trim matching lines from the beginning and end
147     of the portions it is going to specify.
148
149     Note that if we return the "wrong" diagonal value, or if
150     the value of bdiag at that diagonal is "wrong",
151     the worst this can do is cause suboptimal diff output.
152     It cannot cause incorrect diff output.  */
153
154    private int diag (int xoff, int xlim, int yoff, int ylim) {
155        final int[] fd = fdiag; // Give the compiler a chance.
156        final int[] bd = bdiag; // Additional help for the compiler.
157        final int[] xv = xvec;      // Still more help for the compiler.
158        final int[] yv = yvec;      // And more and more . . .
159        final int dmin = xoff - ylim;   // Minimum valid diagonal.
160        final int dmax = xlim - yoff;   // Maximum valid diagonal.
161        final int fmid = xoff - yoff;   // Center diagonal of top-down search.
162        final int bmid = xlim - ylim;   // Center diagonal of bottom-up search.
163        int fmin = fmid, fmax = fmid;   // Limits of top-down search.
164        int bmin = bmid, bmax = bmid;   // Limits of bottom-up search.
165        /* True if southeast corner is on an odd
166                     diagonal with respect to the northwest. */
167        final boolean odd = (fmid - bmid & 1) != 0;
168
169        fd[fdiagoff + fmid] = xoff;
170        bd[bdiagoff + bmid] = xlim;
171
172        for (int c = 1;; ++c)
173        {
174            int d;          /* Active diagonal. */
175            boolean big_snake = false;
176
177            /* Extend the top-down search by an edit step in each diagonal. */
178            if (fmin > dmin) {
179                fd[fdiagoff + --fmin - 1] = -1;
180            } else {
181                ++fmin;
182            }
183            if (fmax < dmax) {
184                fd[fdiagoff + ++fmax + 1] = -1;
185            } else {
186                --fmax;
187            }
188            for (d = fmax; d >= fmin; d -= 2)
189            {
190                int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
191
192                if (tlo >= thi) {
193                    x = tlo + 1;
194                } else {
195                    x = thi;
196                }
197                oldx = x;
198                y = x - d;
199                while (x < xlim && y < ylim && xv[x] == yv[y]) {
200                    ++x; ++y;
201                }
202                if (x - oldx > SNAKE_LIMIT) {
203                    big_snake = true;
204                }
205                fd[fdiagoff + d] = x;
206                if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
207                {
208                    cost = 2 * c - 1;
209                    return d;
210                }
211            }
212
213            /* Similar extend the bottom-up search. */
214            if (bmin > dmin) {
215                bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
216            } else {
217                ++bmin;
218            }
219            if (bmax < dmax) {
220                bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
221            } else {
222                --bmax;
223            }
224            for (d = bmax; d >= bmin; d -= 2)
225            {
226                int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
227
228                if (tlo < thi) {
229                    x = tlo;
230                } else {
231                    x = thi - 1;
232                }
233                oldx = x;
234                y = x - d;
235                while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
236                    --x; --y;
237                }
238                if (oldx - x > SNAKE_LIMIT) {
239                    big_snake = true;
240                }
241                bd[bdiagoff + d] = x;
242                if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
243                {
244                    cost = 2 * c;
245                    return d;
246                }
247            }
248
249            /* Heuristic: check occasionally for a diagonal that has made
250       lots of progress compared with the edit distance.
251       If we have any such, find the one that has made the most
252       progress and return it as if it had succeeded.
253
254       With this heuristic, for files with a constant small density
255       of changes, the algorithm is linear in the file size.  */
256
257            if (c > 200 && big_snake && heuristic)
258            {
259                int best = 0;
260                int bestpos = -1;
261
262                for (d = fmax; d >= fmin; d -= 2)
263                {
264                    int dd = d - fmid;
265                    int x = fd[fdiagoff + d];
266                    int y = x - d;
267                    int v = (x - xoff) * 2 - dd;
268                    if (v > 12 * (c + (dd < 0 ? -dd : dd)))
269                    {
270                        if (v > best
271                                && xoff + SNAKE_LIMIT <= x && x < xlim
272                                && yoff + SNAKE_LIMIT <= y && y < ylim)
273                        {
274                            /* We have a good enough best diagonal.
275                               now insist that it end with a significant snake.  */
276                            int k;
277
278                            for (k = 1; xvec[x - k] == yvec[y - k]; k++)
279                                if (k == SNAKE_LIMIT)
280                                {
281                                    best = v;
282                                    bestpos = d;
283                                    break;
284                                }
285                        }
286                    }
287                }
288                if (best > 0)
289                {
290                    cost = 2 * c - 1;
291                    return bestpos;
292                }
293
294                best = 0;
295                for (d = bmax; d >= bmin; d -= 2)
296                {
297                    int dd = d - bmid;
298                    int x = bd[bdiagoff + d];
299                    int y = x - d;
300                    int v = (xlim - x) * 2 + dd;
301                    if (v > 12 * (c + (dd < 0 ? -dd : dd)))
302                    {
303                        if (v > best
304                                && xoff < x && x <= xlim - SNAKE_LIMIT
305                                && yoff < y && y <= ylim - SNAKE_LIMIT)
306                        {
307                            /* We have a good enough best diagonal.
308                               now insist that it end with a significant snake.  */
309                            int k;
310
311                            for (k = 0; xvec[x + k] == yvec[y + k]; k++)
312                                if (k == SNAKE_LIMIT)
313                                {
314                                    best = v;
315                                    bestpos = d;
316                                    break;
317                                }
318                        }
319                    }
320                }
321                if (best > 0)
322                {
323                    cost = 2 * c - 1;
324                    return bestpos;
325                }
326            }
327        }
328    }
329
330    /** Compare in detail contiguous subsequences of the two files
331     which are known, as a whole, to match each other.
332
333     The results are recorded in the vectors filevec[N].changed_flag, by
334     storing a 1 in the element for each line that is an insertion or deletion.
335
336     The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
337
338     Note that XLIM, YLIM are exclusive bounds.
339     All line numbers are origin-0 and discarded lines are not counted.  */
340
341    private void compareseq (int xoff, int xlim, int yoff, int ylim) {
342        /* Slide down the bottom initial diagonal. */
343        while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
344            ++xoff; ++yoff;
345        }
346        /* Slide up the top initial diagonal. */
347        while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
348            --xlim; --ylim;
349        }
350
351        /* Handle simple cases. */
352        if (xoff == xlim) {
353            while (yoff < ylim) {
354                filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
355            }
356        } else if (yoff == ylim) {
357            while (xoff < xlim) {
358                filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
359            }
360        } else {
361            /* Find a point of correspondence in the middle of the files.  */
362
363            int d = diag (xoff, xlim, yoff, ylim);
364            int c = cost;
365            int b = bdiag[bdiagoff + d];
366
367            if (c == 1)
368                /* This should be impossible, because it implies that
369                   one of the two subsequences is empty,
370                   and that case was handled above without calling `diag'.
371                   Let's verify that this is true.  */
372                throw new IllegalArgumentException("Empty subsequence");
373            else
374            {
375                /* Use that point to split this problem into two subproblems.  */
376                compareseq (xoff, b, yoff, b - d);
377                /* This used to use f instead of b,
378                   but that is incorrect!
379                   It is not necessarily the case that diagonal d
380                   has a snake from b to f.  */
381                compareseq (b, xlim, b - d, ylim);
382            }
383        }
384    }
385
386    /** Discard lines from one file that have no matches in the other file.
387     */
388    private void discard_confusing_lines() {
389        filevec[0].discard_confusing_lines(filevec[1]);
390        filevec[1].discard_confusing_lines(filevec[0]);
391    }
392
393    private boolean inhibit = false;
394
395    /** Adjust inserts/deletes of blank lines to join changes
396        as much as possible.
397     */
398    private void shift_boundaries() {
399        if (inhibit)
400            return;
401        filevec[0].shift_boundaries(filevec[1]);
402        filevec[1].shift_boundaries(filevec[0]);
403    }
404
405    public interface ScriptBuilder {
406        /** Scan the tables of which lines are inserted and deleted,
407            producing an edit script.
408            @param changed0 true for lines in first file which do not match 2nd
409            @param len0 number of lines in first file
410            @param changed1 true for lines in 2nd file which do not match 1st
411            @param len1 number of lines in 2nd file
412            @return a linked list of changes - or null
413         */
414        public Change build_script(
415                boolean[] changed0,int len0,
416                boolean[] changed1,int len1
417        );
418    }
419
420    /** Scan the tables of which lines are inserted and deleted,
421     producing an edit script in reverse order.  */
422
423    static class ReverseScript implements ScriptBuilder {
424        @Override
425        public  Change build_script(
426                final boolean[] changed0,int len0,
427                final boolean[] changed1,int len1)
428        {
429            Change script = null;
430            int i0 = 0, i1 = 0;
431            while (i0 < len0 || i1 < len1) {
432                if (changed0[1+i0] || changed1[1+i1]) {
433                    int line0 = i0, line1 = i1;
434
435                    /* Find # lines changed here in each file.  */
436                    while (changed0[1+i0]) {
437                        ++i0;
438                    }
439                    while (changed1[1+i1]) {
440                        ++i1;
441                    }
442
443                    /* Record this change.  */
444                    script = new Change(line0, line1, i0 - line0, i1 - line1, script);
445                }
446
447                /* We have reached lines in the two files that match each other.  */
448                i0++; i1++;
449            }
450
451            return script;
452        }
453    }
454
455    static class ForwardScript implements ScriptBuilder {
456        /** Scan the tables of which lines are inserted and deleted,
457            producing an edit script in forward order.  */
458        @Override
459        public Change build_script(
460                final boolean[] changed0,int len0,
461                final boolean[] changed1,int len1)
462        {
463            Change script = null;
464            int i0 = len0, i1 = len1;
465
466            while (i0 >= 0 || i1 >= 0)
467            {
468                if (changed0[i0] || changed1[i1])
469                {
470                    int line0 = i0, line1 = i1;
471
472                    /* Find # lines changed here in each file.  */
473                    while (changed0[i0]) {
474                        --i0;
475                    }
476                    while (changed1[i1]) {
477                        --i1;
478                    }
479
480                    /* Record this change.  */
481                    script = new Change(i0, i1, line0 - i0, line1 - i1, script);
482                }
483
484                /* We have reached lines in the two files that match each other.  */
485                i0--; i1--;
486            }
487
488            return script;
489        }
490    }
491
492    /** Standard ScriptBuilders. */
493    public static final ScriptBuilder
494    forwardScript = new ForwardScript(),
495    reverseScript = new ReverseScript();
496
497    /** Report the differences of two files.  DEPTH is the current directory
498        depth. */
499    public final Change diff_2(final boolean reverse) {
500        return diff(reverse ? reverseScript : forwardScript);
501    }
502
503    /** Get the results of comparison as an edit script.  The script
504     is described by a list of changes.  The standard ScriptBuilder
505     implementations provide for forward and reverse edit scripts.
506     Alternate implementations could, for instance, list common elements
507     instead of differences.
508     @param bld an object to build the script from change flags
509     @return the head of a list of changes
510     */
511    public Change diff(final ScriptBuilder bld) {
512
513        /* Some lines are obviously insertions or deletions
514       because they don't match anything.  Detect them now,
515       and avoid even thinking about them in the main comparison algorithm.  */
516
517        discard_confusing_lines ();
518
519        /* Now do the main comparison algorithm, considering just the
520       undiscarded lines.  */
521
522        xvec = filevec[0].undiscarded;
523        yvec = filevec[1].undiscarded;
524
525        int diags =
526            filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
527        fdiag = new int[diags];
528        fdiagoff = filevec[1].nondiscarded_lines + 1;
529        bdiag = new int[diags];
530        bdiagoff = filevec[1].nondiscarded_lines + 1;
531
532        compareseq (0, filevec[0].nondiscarded_lines,
533                0, filevec[1].nondiscarded_lines);
534        fdiag = null;
535        bdiag = null;
536
537        /* Modify the results slightly to make them prettier
538       in cases where that can validly be done.  */
539
540        shift_boundaries ();
541
542        /* Get the results of comparison in the form of a chain
543       of `struct change's -- an edit script.  */
544        return bld.build_script(
545                filevec[0].changed_flag,
546                filevec[0].buffered_lines,
547                filevec[1].changed_flag,
548                filevec[1].buffered_lines
549        );
550
551    }
552
553    /** The result of comparison is an "edit script": a chain of change objects.
554     Each change represents one place where some lines are deleted
555     and some are inserted.
556
557     LINE0 and LINE1 are the first affected lines in the two files (origin 0).
558     DELETED is the number of lines deleted here from file 0.
559     INSERTED is the number of lines inserted here in file 1.
560
561     If DELETED is 0 then LINE0 is the number of the line before
562     which the insertion was done; vice versa for INSERTED and LINE1.  */
563
564    public static class Change {
565        /** Previous or next edit command. */
566        public Change link;
567        /** # lines of file 1 changed here.  */
568        public final int inserted;
569        /** # lines of file 0 changed here.  */
570        public final int deleted;
571        /** Line number of 1st deleted line.  */
572        public final int line0;
573        /** Line number of 1st inserted line.  */
574        public final int line1;
575
576        /** Cons an additional entry onto the front of an edit script OLD.
577       LINE0 and LINE1 are the first affected lines in the two files (origin 0).
578       DELETED is the number of lines deleted here from file 0.
579       INSERTED is the number of lines inserted here in file 1.
580
581       If DELETED is 0 then LINE0 is the number of the line before
582       which the insertion was done; vice versa for INSERTED and LINE1.  */
583        public Change(int line0, int line1, int deleted, int inserted, Change old) {
584            this.line0 = line0;
585            this.line1 = line1;
586            this.inserted = inserted;
587            this.deleted = deleted;
588            this.link = old;
589        }
590
591        @Override
592        public String toString() {
593            String s = String.format("%d -%d +%d %d",line0,deleted,inserted,line1);
594            return (link != null) ? s = s + '\n' + link : s;
595        }
596    }
597
598    /** Data on one input file being compared.
599     */
600
601    class FileData {
602
603        /** Allocate changed array for the results of comparison.  */
604        void clear() {
605            /* Allocate a flag for each line of each file, saying whether that line
606               is an insertion or deletion.
607               Allocate an extra element, always zero, at each end of each vector.
608             */
609            changed_flag = new boolean[buffered_lines + 2];
610        }
611
612        /** Return equiv_count[I] as the number of lines in this file
613         that fall in equivalence class I.
614         @return the array of equivalence class counts.
615         */
616        int[] equivCount() {
617            int[] equiv_count = new int[equiv_max];
618            for (int i = 0; i < buffered_lines; ++i) {
619                ++equiv_count[equivs[i]];
620            }
621            return equiv_count;
622        }
623
624        /** Discard lines that have no matches in another file.
625
626       A line which is discarded will not be considered by the actual
627       comparison algorithm; it will be as if that line were not in the file.
628       The file's `realindexes' table maps virtual line numbers
629       (which don't count the discarded lines) into real line numbers;
630       this is how the actual comparison algorithm produces results
631       that are comprehensible when the discarded lines are counted.
632<p>
633       When we discard a line, we also mark it as a deletion or insertion
634       so that it will be printed in the output.
635      @param f the other file
636         */
637        void discard_confusing_lines(FileData f) {
638            clear();
639            /* Set up table of which lines are going to be discarded. */
640            final byte[] discarded = discardable(f.equivCount());
641
642            /* Don't really discard the provisional lines except when they occur
643       in a run of discardables, with nonprovisionals at the beginning
644       and end.  */
645            filterDiscards(discarded);
646
647            /* Actually discard the lines. */
648            discard(discarded);
649        }
650
651        /**
652         * Mark to be discarded each line that matches no line of another file.
653         * If a line matches many lines, mark it as provisionally discardable.
654         * @see #equivCount()
655         * @param counts The count of each equivalence number for the other file.
656         * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
657         *  for each line
658         */
659        private byte[] discardable(final int[] counts) {
660            final int end = buffered_lines;
661            final byte[] discards = new byte[end];
662            final int[] equivs = this.equivs;
663            int many = 5;
664            int tem = end / 64;
665
666            /* Multiply MANY by approximate square root of number of lines.
667               That is the threshold for provisionally discardable lines.  */
668            while ((tem = tem >> 2) > 0) {
669                many *= 2;
670            }
671
672            for (int i = 0; i < end; i++)
673            {
674                int nmatch;
675                if (equivs[i] == 0) {
676                    continue;
677                }
678                nmatch = counts[equivs[i]];
679                if (nmatch == 0) {
680                    discards[i] = 1;
681                } else if (nmatch > many) {
682                    discards[i] = 2;
683                }
684            }
685            return discards;
686        }
687
688        /**
689         * Don't really discard the provisional lines except when they occur
690         * in a run of discardables, with nonprovisionals at the beginning
691         * and end.
692         */
693        private void filterDiscards(final byte[] discards) {
694            final int end = buffered_lines;
695
696            for (int i = 0; i < end; i++)
697            {
698                /* Cancel provisional discards not in middle of run of discards.  */
699                if (discards[i] == 2) {
700                    discards[i] = 0;
701                } else if (discards[i] != 0)
702                {
703                    /* We have found a nonprovisional discard.  */
704                    int j;
705                    int length;
706                    int provisional = 0;
707
708                    /* Find end of this run of discardable lines.
709                       Count how many are provisionally discardable.  */
710                    for (j = i; j < end; j++)
711                    {
712                        if (discards[j] == 0) {
713                            break;
714                        }
715                        if (discards[j] == 2) {
716                            ++provisional;
717                        }
718                    }
719
720                    /* Cancel provisional discards at end, and shrink the run.  */
721                    while (j > i && discards[j - 1] == 2) {
722                        discards[--j] = 0; --provisional;
723                    }
724
725                    /* Now we have the length of a run of discardable lines
726                       whose first and last are not provisional.  */
727                    length = j - i;
728
729                    /* If 1/4 of the lines in the run are provisional,
730                       cancel discarding of all provisional lines in the run.  */
731                    if (provisional * 4 > length) {
732                        while (j > i)
733                            if (discards[--j] == 2) {
734                                discards[j] = 0;
735                            }
736                    } else {
737                        int consec;
738                        int minimum = 1;
739                        int tem = length / 4;
740
741                        /* MINIMUM is approximate square root of LENGTH/4.
742                           A subrun of two or more provisionals can stand
743                           when LENGTH is at least 16.
744                           A subrun of 4 or more can stand when LENGTH >= 64.  */
745                        while ((tem = tem >> 2) > 0) {
746                            minimum *= 2;
747                        }
748                        minimum++;
749
750                        /* Cancel any subrun of MINIMUM or more provisionals
751                           within the larger run.  */
752                        for (j = 0, consec = 0; j < length; j++)
753                            if (discards[i + j] != 2) {
754                                consec = 0;
755                            } else if (minimum == ++consec) {
756                                /* Back up to start of subrun, to cancel it all.  */
757                                j -= consec;
758                            } else if (minimum < consec) {
759                                discards[i + j] = 0;
760                            }
761
762                        /* Scan from beginning of run
763                           until we find 3 or more nonprovisionals in a row
764                           or until the first nonprovisional at least 8 lines in.
765                           Until that point, cancel any provisionals.  */
766                        for (j = 0, consec = 0; j < length; j++)
767                        {
768                            if (j >= 8 && discards[i + j] == 1) {
769                                break;
770                            }
771                            if (discards[i + j] == 2) {
772                                consec = 0; discards[i + j] = 0;
773                            }
774                            else if (discards[i + j] == 0) {
775                                consec = 0;
776                            } else {
777                                consec++;
778                            }
779                            if (consec == 3) {
780                                break;
781                            }
782                        }
783
784                        /* I advances to the last line of the run.  */
785                        i += length - 1;
786
787                        /* Same thing, from end.  */
788                        for (j = 0, consec = 0; j < length; j++)
789                        {
790                            if (j >= 8 && discards[i - j] == 1) {
791                                break;
792                            }
793                            if (discards[i - j] == 2) {
794                                consec = 0; discards[i - j] = 0;
795                            }
796                            else if (discards[i - j] == 0) {
797                                consec = 0;
798                            } else {
799                                consec++;
800                            }
801                            if (consec == 3) {
802                                break;
803                            }
804                        }
805                    }
806                }
807            }
808        }
809
810        /** Actually discard the lines.
811            @param discards flags lines to be discarded
812         */
813        private void discard(final byte[] discards) {
814            final int end = buffered_lines;
815            int j = 0;
816            for (int i = 0; i < end; ++i)
817                if (no_discards || discards[i] == 0)
818                {
819                    undiscarded[j] = equivs[i];
820                    realindexes[j++] = i;
821                } else {
822                    changed_flag[1+i] = true;
823                }
824            nondiscarded_lines = j;
825        }
826
827        FileData(int length) {
828            buffered_lines = length;
829            equivs = new int[length];
830            undiscarded = new int[buffered_lines];
831            realindexes = new int[buffered_lines];
832        }
833
834        FileData(Object[] data, Map<Object,Integer> h) {
835            this(data.length);
836            // FIXME: diff 2.7 removes common prefix and common suffix
837            for (int i = 0; i < data.length; ++i) {
838                Integer ir = h.get(data[i]);
839                if (ir == null) {
840                    h.put(data[i],equivs[i] = equiv_max++);
841                } else {
842                    equivs[i] = ir.intValue();
843                }
844            }
845        }
846
847        /** Adjust inserts/deletes of blank lines to join changes
848       as much as possible.
849
850       We do something when a run of changed lines include a blank
851       line at one end and have an excluded blank line at the other.
852       We are free to choose which blank line is included.
853       `compareseq' always chooses the one at the beginning,
854       but usually it is cleaner to consider the following blank line
855       to be the "change".  The only exception is if the preceding blank line
856       would join this change to other changes.
857      @param f the file being compared against
858         */
859
860        void shift_boundaries(FileData f) {
861            final boolean[] changed = changed_flag;
862            final boolean[] other_changed = f.changed_flag;
863            int i = 0;
864            int j = 0;
865            int i_end = buffered_lines;
866            int preceding = -1;
867            int other_preceding = -1;
868
869            for (;;)
870            {
871                int start, end, other_start;
872
873                /* Scan forwards to find beginning of another run of changes.
874                   Also keep track of the corresponding point in the other file.  */
875
876                while (i < i_end && !changed[1+i])
877                {
878                    while (other_changed[1+j++]) {
879                        /* Non-corresponding lines in the other file
880                           will count as the preceding batch of changes.  */
881                        other_preceding = j;
882                    }
883                    i++;
884                }
885
886                if (i == i_end) {
887                    break;
888                }
889
890                start = i;
891                other_start = j;
892
893                for (;;)
894                {
895                    /* Now find the end of this run of changes.  */
896
897                    while (i < i_end && changed[1+i]) {
898                        i++;
899                    }
900                    end = i;
901
902                    /* If the first changed line matches the following unchanged one,
903                       and this run does not follow right after a previous run,
904                       and there are no lines deleted from the other file here,
905                       then classify the first changed line as unchanged
906                       and the following line as changed in its place.  */
907
908                    /* You might ask, how could this run follow right after another?
909                       Only because the previous run was shifted here.  */
910
911                    if (end != i_end
912                            && equivs[start] == equivs[end]
913                                                       && !other_changed[1+j]
914                                                                         && !((preceding >= 0 && start == preceding)
915                                                                                 || (other_preceding >= 0
916                                                                                         && other_start == other_preceding)))
917                    {
918                        changed[1+end++] = true;
919                        changed[1+start++] = false;
920                        ++i;
921                        /* Since one line-that-matches is now before this run
922                           instead of after, we must advance in the other file
923                           to keep in synch.  */
924                        ++j;
925                    } else {
926                        break;
927                    }
928                }
929
930                preceding = i;
931                other_preceding = j;
932            }
933        }
934
935        /** Number of elements (lines) in this file. */
936        final int buffered_lines;
937
938        /** Vector, indexed by line number, containing an equivalence code for
939           each line.  It is this vector that is actually compared with that
940           of another file to generate differences. */
941        private final int[]     equivs;
942
943        /** Vector, like the previous one except that
944           the elements for discarded lines have been squeezed out.  */
945        final int[]    undiscarded;
946
947        /** Vector mapping virtual line numbers (not counting discarded lines)
948           to real ones (counting those lines).  Both are origin-0.  */
949        final int[]    realindexes;
950
951        /** Total number of nondiscarded lines. */
952        int         nondiscarded_lines;
953
954        /** Array, indexed by real origin-1 line number,
955           containing true for a line that is an insertion or a deletion.
956           The results of comparison are stored here.  */
957        boolean[]       changed_flag;
958    }
959}