MyersDiff.java

  1. /*
  2.  * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
  3.  * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de> and others
  4.  *
  5.  * This program and the accompanying materials are made available under the
  6.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  7.  * https://www.eclipse.org/org/documents/edl-v10.php.
  8.  *
  9.  * SPDX-License-Identifier: BSD-3-Clause
  10.  */

  11. package org.eclipse.jgit.diff;

  12. import java.text.MessageFormat;

  13. import org.eclipse.jgit.errors.DiffInterruptedException;
  14. import org.eclipse.jgit.internal.JGitText;
  15. import org.eclipse.jgit.util.IntList;
  16. import org.eclipse.jgit.util.LongList;

  17. /**
  18.  * Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
  19.  * by Eugene Myers.
  20.  * <p>
  21.  * The basic idea is to put the line numbers of text A as columns ("x") and the
  22.  * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
  23.  * from the upper left corner to the lower right corner, where you can always go
  24.  * horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
  25.  * line x in text A is identical to line y in text B.
  26.  * <p>
  27.  * Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
  28.  * D-path is an edit path starting at the upper left corner and containing
  29.  * exactly D non-diagonal elements ("differences"). The furthest reaching D-path
  30.  * on diagonal k is the one that contains the most (diagonal) elements which
  31.  * ends on diagonal k (where k = y - x).
  32.  * <p>
  33.  * Example:
  34.  *
  35.  * <pre>
  36.  *    H E L L O   W O R L D
  37.  *    ____
  38.  *  L     \___
  39.  *  O         \___
  40.  *  W             \________
  41.  * </pre>
  42.  * <p>
  43.  * Since every D-path has exactly D horizontal or vertical elements, it can only
  44.  * end on the diagonals -D, -D+2, ..., D-2, D.
  45.  * <p>
  46.  * Since every furthest reaching D-path contains at least one furthest reaching
  47.  * (D-1)-path (except for D=0), we can construct them recursively.
  48.  * <p>
  49.  * Since we are really interested in the shortest edit path, we can start
  50.  * looking for a 0-path, then a 1-path, and so on, until we find a path that
  51.  * ends in the lower right corner.
  52.  * <p>
  53.  * To save space, we do not need to store all paths (which has quadratic space
  54.  * requirements), but generate the D-paths simultaneously from both sides. When
  55.  * the ends meet, we will have found "the middle" of the path. From the end
  56.  * points of that diagonal part, we can generate the rest recursively.
  57.  * <p>
  58.  * This only requires linear space.
  59.  * <p>
  60.  * The overall (runtime) complexity is:
  61.  *
  62.  * <pre>
  63.  *     O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
  64.  *     = O(N * D^2 * 5 / 4) = O(N * D^2),
  65.  * </pre>
  66.  * <p>
  67.  * (With each step, we have to find the middle parts of twice as many regions as
  68.  * before, but the regions (as well as the D) are halved.)
  69.  * <p>
  70.  * So the overall runtime complexity stays the same with linear space, albeit
  71.  * with a larger constant factor.
  72.  *
  73.  * @param <S>
  74.  *            type of sequence.
  75.  */
  76. @SuppressWarnings("hiding")
  77. public class MyersDiff<S extends Sequence> {
  78.     /** Singleton instance of MyersDiff. */
  79.     public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
  80.         @SuppressWarnings("unused")
  81.         @Override
  82.         public <S extends Sequence> void diffNonCommon(EditList edits,
  83.                 HashedSequenceComparator<S> cmp, HashedSequence<S> a,
  84.                 HashedSequence<S> b, Edit region) {
  85.             new MyersDiff<>(edits, cmp, a, b, region);
  86.         }
  87.     };

  88.     /**
  89.      * The list of edits found during the last call to
  90.      * {@link #calculateEdits(Edit)}
  91.      */
  92.     protected EditList edits;

  93.     /** Comparison function for sequences. */
  94.     protected HashedSequenceComparator<S> cmp;

  95.     /**
  96.      * The first text to be compared. Referred to as "Text A" in the comments
  97.      */
  98.     protected HashedSequence<S> a;

  99.     /**
  100.      * The second text to be compared. Referred to as "Text B" in the comments
  101.      */
  102.     protected HashedSequence<S> b;

  103.     private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
  104.             HashedSequence<S> a, HashedSequence<S> b, Edit region) {
  105.         this.edits = edits;
  106.         this.cmp = cmp;
  107.         this.a = a;
  108.         this.b = b;
  109.         calculateEdits(region);
  110.     }

  111.     // TODO: use ThreadLocal for future multi-threaded operations
  112.     MiddleEdit middle = new MiddleEdit();

  113.     /**
  114.      * Entrypoint into the algorithm this class is all about. This method triggers that the
  115.      * differences between A and B are calculated in form of a list of edits.
  116.      * @param r portion of the sequences to examine.
  117.      */
  118.     private void calculateEdits(Edit r) {
  119.         middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
  120.         if (middle.beginA >= middle.endA &&
  121.                 middle.beginB >= middle.endB)
  122.             return;

  123.         calculateEdits(middle.beginA, middle.endA,
  124.                 middle.beginB, middle.endB);
  125.     }

  126.     /**
  127.      * Calculates the differences between a given part of A against another
  128.      * given part of B
  129.      *
  130.      * @param beginA
  131.      *            start of the part of A which should be compared
  132.      *            (0&lt;=beginA&lt;sizeof(A))
  133.      * @param endA
  134.      *            end of the part of A which should be compared
  135.      *            (beginA&lt;=endA&lt;sizeof(A))
  136.      * @param beginB
  137.      *            start of the part of B which should be compared
  138.      *            (0&lt;=beginB&lt;sizeof(B))
  139.      * @param endB
  140.      *            end of the part of B which should be compared
  141.      *            (beginB&lt;=endB&lt;sizeof(B))
  142.      */
  143.     protected void calculateEdits(int beginA, int endA,
  144.             int beginB, int endB) {
  145.         Edit edit = middle.calculate(beginA, endA, beginB, endB);

  146.         if (beginA < edit.beginA || beginB < edit.beginB) {
  147.             int k = edit.beginB - edit.beginA;
  148.             int x = middle.backward.snake(k, edit.beginA);
  149.             calculateEdits(beginA, x, beginB, k + x);
  150.         }

  151.         if (edit.getType() != Edit.Type.EMPTY)
  152.             edits.add(edits.size(), edit);

  153.         // after middle
  154.         if (endA > edit.endA || endB > edit.endB) {
  155.             int k = edit.endB - edit.endA;
  156.             int x = middle.forward.snake(k, edit.endA);
  157.             calculateEdits(x, endA, k + x, endB);
  158.         }
  159.     }

  160.     /**
  161.      * A class to help bisecting the sequences a and b to find minimal
  162.      * edit paths.
  163.      *
  164.      * As the arrays are reused for space efficiency, you will need one
  165.      * instance per thread.
  166.      *
  167.      * The entry function is the calculate() method.
  168.      */
  169.     class MiddleEdit {
  170.         void initialize(int beginA, int endA, int beginB, int endB) {
  171.             this.beginA = beginA; this.endA = endA;
  172.             this.beginB = beginB; this.endB = endB;

  173.             // strip common parts on either end
  174.             int k = beginB - beginA;
  175.             this.beginA = forward.snake(k, beginA);
  176.             this.beginB = k + this.beginA;

  177.             k = endB - endA;
  178.             this.endA = backward.snake(k, endA);
  179.             this.endB = k + this.endA;
  180.         }

  181.         /*
  182.          * This function calculates the "middle" Edit of the shortest
  183.          * edit path between the given subsequences of a and b.
  184.          *
  185.          * Once a forward path and a backward path meet, we found the
  186.          * middle part.  From the last snake end point on both of them,
  187.          * we construct the Edit.
  188.          *
  189.          * It is assumed that there is at least one edit in the range.
  190.          */
  191.         // TODO: measure speed impact when this is synchronized
  192.         Edit calculate(int beginA, int endA, int beginB, int endB) {
  193.             if (beginA == endA || beginB == endB)
  194.                 return new Edit(beginA, endA, beginB, endB);
  195.             this.beginA = beginA; this.endA = endA;
  196.             this.beginB = beginB; this.endB = endB;

  197.             /*
  198.              * Following the conventions in Myers' paper, "k" is
  199.              * the difference between the index into "b" and the
  200.              * index into "a".
  201.              */
  202.             int minK = beginB - endA;
  203.             int maxK = endB - beginA;

  204.             forward.initialize(beginB - beginA, beginA, minK, maxK);
  205.             backward.initialize(endB - endA, endA, minK, maxK);

  206.             for (int d = 1; ; d++)
  207.                 if (forward.calculate(d) ||
  208.                         backward.calculate(d))
  209.                     return edit;
  210.         }

  211.         /*
  212.          * For each d, we need to hold the d-paths for the diagonals
  213.          * k = -d, -d + 2, ..., d - 2, d.  These are stored in the
  214.          * forward (and backward) array.
  215.          *
  216.          * As we allow subsequences, too, this needs some refinement:
  217.          * the forward paths start on the diagonal forwardK =
  218.          * beginB - beginA, and backward paths start on the diagonal
  219.          * backwardK = endB - endA.
  220.          *
  221.          * So, we need to hold the forward d-paths for the diagonals
  222.          * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
  223.          * the analogue for the backward d-paths.  This means that
  224.          * we can turn (k, d) into the forward array index using this
  225.          * formula:
  226.          *
  227.          *  i = (d + k - forwardK) / 2
  228.          *
  229.          * There is a further complication: the edit paths should not
  230.          * leave the specified subsequences, so k is bounded by
  231.          * minK = beginB - endA and maxK = endB - beginA.  However,
  232.          * (k - forwardK) _must_ be odd whenever d is odd, and it
  233.          * _must_ be even when d is even.
  234.          *
  235.          * The values in the "forward" and "backward" arrays are
  236.          * positions ("x") in the sequence a, to get the corresponding
  237.          * positions ("y") in the sequence b, you have to calculate
  238.          * the appropriate k and then y:
  239.          *
  240.          *  k = forwardK - d + i * 2
  241.          *  y = k + x
  242.          *
  243.          * (substitute backwardK for forwardK if you want to get the
  244.          * y position for an entry in the "backward" array.
  245.          */
  246.         EditPaths forward = new ForwardEditPaths();
  247.         EditPaths backward = new BackwardEditPaths();

  248.         /* Some variables which are shared between methods */
  249.         protected int beginA, endA, beginB, endB;
  250.         protected Edit edit;

  251.         abstract class EditPaths {
  252.             private IntList x = new IntList();
  253.             private LongList snake = new LongList();
  254.             int beginK, endK, middleK;
  255.             int prevBeginK, prevEndK;
  256.             /* if we hit one end early, no need to look further */
  257.             int minK, maxK; // TODO: better explanation

  258.             final int getIndex(int d, int k) {
  259. // TODO: remove
  260. if (((d + k - middleK) % 2) != 0)
  261.     throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
  262.                 return (d + k - middleK) / 2;
  263.             }

  264.             final int getX(int d, int k) {
  265. // TODO: remove
  266. if (k < beginK || k > endK)
  267.     throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
  268.                 return x.get(getIndex(d, k));
  269.             }

  270.             final long getSnake(int d, int k) {
  271. // TODO: remove
  272. if (k < beginK || k > endK)
  273.     throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
  274.                 return snake.get(getIndex(d, k));
  275.             }

  276.             private int forceKIntoRange(int k) {
  277.                 /* if k is odd, so must be the result */
  278.                 if (k < minK)
  279.                     return minK + ((k ^ minK) & 1);
  280.                 else if (k > maxK)
  281.                     return maxK - ((k ^ maxK) & 1);
  282.                 return k;
  283.             }

  284.             void initialize(int k, int x, int minK, int maxK) {
  285.                 this.minK = minK;
  286.                 this.maxK = maxK;
  287.                 beginK = endK = middleK = k;
  288.                 this.x.clear();
  289.                 this.x.add(x);
  290.                 snake.clear();
  291.                 snake.add(newSnake(k, x));
  292.             }

  293.             abstract int snake(int k, int x);
  294.             abstract int getLeft(int x);
  295.             abstract int getRight(int x);
  296.             abstract boolean isBetter(int left, int right);
  297.             abstract void adjustMinMaxK(int k, int x);
  298.             abstract boolean meets(int d, int k, int x, long snake);

  299.             final long newSnake(int k, int x) {
  300.                 long y = k + x;
  301.                 long ret = ((long) x) << 32;
  302.                 return ret | y;
  303.             }

  304.             final int snake2x(long snake) {
  305.                 return (int) (snake >>> 32);
  306.             }

  307.             final int snake2y(long snake) {
  308.                 return (int) snake;
  309.             }

  310.             final boolean makeEdit(long snake1, long snake2) {
  311.                 int x1 = snake2x(snake1), x2 = snake2x(snake2);
  312.                 int y1 = snake2y(snake1), y2 = snake2y(snake2);
  313.                 /*
  314.                  * Check for incompatible partial edit paths:
  315.                  * when there are ambiguities, we might have
  316.                  * hit incompatible (i.e. non-overlapping)
  317.                  * forward/backward paths.
  318.                  *
  319.                  * In that case, just pretend that we have
  320.                  * an empty edit at the end of one snake; this
  321.                  * will force a decision which path to take
  322.                  * in the next recursion step.
  323.                  */
  324.                 if (x1 > x2 || y1 > y2) {
  325.                     x1 = x2;
  326.                     y1 = y2;
  327.                 }
  328.                 edit = new Edit(x1, x2, y1, y2);
  329.                 return true;
  330.             }

  331.             boolean calculate(int d) {
  332.                 prevBeginK = beginK;
  333.                 prevEndK = endK;
  334.                 beginK = forceKIntoRange(middleK - d);
  335.                 endK = forceKIntoRange(middleK + d);
  336.                 // TODO: handle i more efficiently
  337.                 // TODO: walk snake(k, getX(d, k)) only once per (d, k)
  338.                 // TODO: move end points out of the loop to avoid conditionals inside the loop
  339.                 // go backwards so that we can avoid temp vars
  340.                 for (int k = endK; k >= beginK; k -= 2) {
  341.                     if (Thread.interrupted()) {
  342.                         throw new DiffInterruptedException();
  343.                     }
  344.                     int left = -1, right = -1;
  345.                     long leftSnake = -1L, rightSnake = -1L;
  346.                     // TODO: refactor into its own function
  347.                     if (k > prevBeginK) {
  348.                         int i = getIndex(d - 1, k - 1);
  349.                         left = x.get(i);
  350.                         int end = snake(k - 1, left);
  351.                         leftSnake = left != end ?
  352.                             newSnake(k - 1, end) :
  353.                             snake.get(i);
  354.                         if (meets(d, k - 1, end, leftSnake))
  355.                             return true;
  356.                         left = getLeft(end);
  357.                     }
  358.                     if (k < prevEndK) {
  359.                         int i = getIndex(d - 1, k + 1);
  360.                         right = x.get(i);
  361.                         int end = snake(k + 1, right);
  362.                         rightSnake = right != end ?
  363.                             newSnake(k + 1, end) :
  364.                             snake.get(i);
  365.                         if (meets(d, k + 1, end, rightSnake))
  366.                             return true;
  367.                         right = getRight(end);
  368.                     }
  369.                     int newX;
  370.                     long newSnake;
  371.                     if (k >= prevEndK ||
  372.                             (k > prevBeginK &&
  373.                              isBetter(left, right))) {
  374.                         newX = left;
  375.                         newSnake = leftSnake;
  376.                     }
  377.                     else {
  378.                         newX = right;
  379.                         newSnake = rightSnake;
  380.                     }
  381.                     if (meets(d, k, newX, newSnake))
  382.                         return true;
  383.                     adjustMinMaxK(k, newX);
  384.                     int i = getIndex(d, k);
  385.                     x.set(i, newX);
  386.                     snake.set(i, newSnake);
  387.                 }
  388.                 return false;
  389.             }
  390.         }

  391.         class ForwardEditPaths extends EditPaths {
  392.             @Override
  393.             final int snake(int k, int x) {
  394.                 for (; x < endA && k + x < endB; x++)
  395.                     if (!cmp.equals(a, x, b, k + x))
  396.                         break;
  397.                 return x;
  398.             }

  399.             @Override
  400.             final int getLeft(int x) {
  401.                 return x;
  402.             }

  403.             @Override
  404.             final int getRight(int x) {
  405.                 return x + 1;
  406.             }

  407.             @Override
  408.             final boolean isBetter(int left, int right) {
  409.                 return left > right;
  410.             }

  411.             @Override
  412.             final void adjustMinMaxK(int k, int x) {
  413.                 if (x >= endA || k + x >= endB) {
  414.                     if (k > backward.middleK)
  415.                         maxK = k;
  416.                     else
  417.                         minK = k;
  418.                 }
  419.             }

  420.             @Override
  421.             final boolean meets(int d, int k, int x, long snake) {
  422.                 if (k < backward.beginK || k > backward.endK)
  423.                     return false;
  424.                 // TODO: move out of loop
  425.                 if (((d - 1 + k - backward.middleK) % 2) != 0)
  426.                     return false;
  427.                 if (x < backward.getX(d - 1, k))
  428.                     return false;
  429.                 makeEdit(snake, backward.getSnake(d - 1, k));
  430.                 return true;
  431.             }
  432.         }

  433.         class BackwardEditPaths extends EditPaths {
  434.             @Override
  435.             final int snake(int k, int x) {
  436.                 for (; x > beginA && k + x > beginB; x--)
  437.                     if (!cmp.equals(a, x - 1, b, k + x - 1))
  438.                         break;
  439.                 return x;
  440.             }

  441.             @Override
  442.             final int getLeft(int x) {
  443.                 return x - 1;
  444.             }

  445.             @Override
  446.             final int getRight(int x) {
  447.                 return x;
  448.             }

  449.             @Override
  450.             final boolean isBetter(int left, int right) {
  451.                 return left < right;
  452.             }

  453.             @Override
  454.             final void adjustMinMaxK(int k, int x) {
  455.                 if (x <= beginA || k + x <= beginB) {
  456.                     if (k > forward.middleK)
  457.                         maxK = k;
  458.                     else
  459.                         minK = k;
  460.                 }
  461.             }

  462.             @Override
  463.             final boolean meets(int d, int k, int x, long snake) {
  464.                 if (k < forward.beginK || k > forward.endK)
  465.                     return false;
  466.                 // TODO: move out of loop
  467.                 if (((d + k - forward.middleK) % 2) != 0)
  468.                     return false;
  469.                 if (x > forward.getX(d, k))
  470.                     return false;
  471.                 makeEdit(forward.getSnake(d, k), snake);
  472.                 return true;
  473.             }
  474.         }
  475.     }

  476.     /**
  477.      * Main method
  478.      *
  479.      * @param args
  480.      *            two filenames specifying the contents to be diffed
  481.      */
  482.     public static void main(String[] args) {
  483.         if (args.length != 2) {
  484.             System.err.println(JGitText.get().need2Arguments);
  485.             System.exit(1);
  486.         }
  487.         try {
  488.             RawText a = new RawText(new java.io.File(args[0]));
  489.             RawText b = new RawText(new java.io.File(args[1]));
  490.             EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
  491.             System.out.println(r.toString());
  492.         } catch (Exception e) {
  493.             e.printStackTrace();
  494.         }
  495.     }
  496. }