RenameDetector.java

/*
 * Copyright (C) 2010, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.diff;

import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;

import org.eclipse.jgit.api.errors.CanceledException;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.lib.AbbreviatedObjectId;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.ProgressMonitor;
import org.eclipse.jgit.lib.Repository;

/**
 * Detect and resolve object renames.
 */
public class RenameDetector {
	private static final int EXACT_RENAME_SCORE = 100;

	private static final Comparator<DiffEntry> DIFF_COMPARATOR = new Comparator<>() {

		@Override
		public int compare(DiffEntry a, DiffEntry b) {
			int cmp = nameOf(a).compareTo(nameOf(b));
			if (cmp == 0)
				cmp = sortOf(a.getChangeType()) - sortOf(b.getChangeType());
			return cmp;
		}

		private String nameOf(DiffEntry ent) {
			// Sort by the new name, unless the change is a delete. On
			// deletes the new name is /dev/null, so we sort instead by
			// the old name.
			//
			if (ent.changeType == ChangeType.DELETE)
				return ent.oldPath;
			return ent.newPath;
		}

		private int sortOf(ChangeType changeType) {
			// Sort deletes before adds so that a major type change for
			// a file path (such as symlink to regular file) will first
			// remove the path, then add it back with the new type.
			//
			switch (changeType) {
			case DELETE:
				return 1;
			case ADD:
				return 2;
			default:
				return 10;
			}
		}
	};

	private List<DiffEntry> entries;

	private List<DiffEntry> deleted;

	private List<DiffEntry> added;

	private boolean done;

	private final ObjectReader objectReader;

	/** Similarity score required to pair an add/delete as a rename. */
	private int renameScore = 60;

	/**
	 * Similarity score required to keep modified file pairs together. Any
	 * modified file pairs with a similarity score below this will be broken
	 * apart.
	 */
	private int breakScore = -1;

	/** Limit in the number of files to consider for renames. */
	private int renameLimit;

	/**
	 * File size threshold (in bytes) for detecting renames. Files larger
	 * than this size will not be processed for renames.
	 */
	private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;

	/**
	 * Skip detecting content renames for binary files. Content renames are
	 * those that are not exact, that is with a slight content modification
	 * between the two files.
	 */
	private boolean skipContentRenamesForBinaryFiles = false;

	/** Set if the number of adds or deletes was over the limit. */
	private boolean overRenameLimit;

	/**
	 * Create a new rename detector for the given repository
	 *
	 * @param repo
	 *            the repository to use for rename detection
	 */
	public RenameDetector(Repository repo) {
		this(repo.newObjectReader(), repo.getConfig().get(DiffConfig.KEY));
	}

	/**
	 * Create a new rename detector with a specified reader and diff config.
	 *
	 * @param reader
	 *            reader to obtain objects from the repository with.
	 * @param cfg
	 *            diff config specifying rename detection options.
	 * @since 3.0
	 */
	public RenameDetector(ObjectReader reader, DiffConfig cfg) {
		objectReader = reader.newReader();
		renameLimit = cfg.getRenameLimit();
		reset();
	}

	/**
	 * Get rename score
	 *
	 * @return minimum score required to pair an add/delete as a rename. The
	 *         score ranges are within the bounds of (0, 100).
	 */
	public int getRenameScore() {
		return renameScore;
	}

	/**
	 * Set the minimum score required to pair an add/delete as a rename.
	 * <p>
	 * When comparing two files together their score must be greater than or
	 * equal to the rename score for them to be considered a rename match. The
	 * score is computed based on content similarity, so a score of 60 implies
	 * that approximately 60% of the bytes in the files are identical.
	 *
	 * @param score
	 *            new rename score, must be within [0, 100].
	 * @throws java.lang.IllegalArgumentException
	 *             the score was not within [0, 100].
	 */
	public void setRenameScore(int score) {
		if (score < 0 || score > 100)
			throw new IllegalArgumentException(
					JGitText.get().similarityScoreMustBeWithinBounds);
		renameScore = score;
	}

	/**
	 * Get break score
	 *
	 * @return the similarity score required to keep modified file pairs
	 *         together. Any modify pairs that score below this will be broken
	 *         apart into separate add/deletes. Values less than or equal to
	 *         zero indicate that no modifies will be broken apart. Values over
	 *         100 cause all modify pairs to be broken.
	 */
	public int getBreakScore() {
		return breakScore;
	}

	/**
	 * Set break score
	 *
	 * @param breakScore
	 *            the similarity score required to keep modified file pairs
	 *            together. Any modify pairs that score below this will be
	 *            broken apart into separate add/deletes. Values less than or
	 *            equal to zero indicate that no modifies will be broken apart.
	 *            Values over 100 cause all modify pairs to be broken.
	 */
	public void setBreakScore(int breakScore) {
		this.breakScore = breakScore;
	}

	/**
	 * Get rename limit
	 *
	 * @return limit on number of paths to perform inexact rename detection
	 */
	public int getRenameLimit() {
		return renameLimit;
	}

	/**
	 * Set the limit on the number of files to perform inexact rename detection.
	 * <p>
	 * The rename detector has to build a square matrix of the rename limit on
	 * each side, then perform that many file compares to determine similarity.
	 * If 1000 files are added, and 1000 files are deleted, a 1000*1000 matrix
	 * must be allocated, and 1,000,000 file compares may need to be performed.
	 *
	 * @param limit
	 *            new file limit. 0 means no limit; a negative number means no
	 *            inexact rename detection will be performed, only exact rename
	 *            detection.
	 */
	public void setRenameLimit(int limit) {
		renameLimit = limit;
	}

	/**
	 * Get file size threshold for detecting renames. Files larger
	 * than this size will not be processed for rename detection.
	 *
	 * @return threshold in bytes of the file size.
	 * @since 5.12
	 */
	public int getBigFileThreshold() { return bigFileThreshold; }

	/**
	 * Set the file size threshold for detecting renames. Files larger than this
	 * threshold will be skipped during rename detection computation.
	 *
	 * @param threshold file size threshold in bytes.
	 * @since 5.12
	 */
	public void setBigFileThreshold(int threshold) {
		this.bigFileThreshold = threshold;
	}

	/**
	 * Get skipping detecting content renames for binary files.
	 *
	 * @return true if content renames should be skipped for binary files, false otherwise.
	 * @since 5.12
	 */
	public boolean getSkipContentRenamesForBinaryFiles() {
		return skipContentRenamesForBinaryFiles;
	}

	/**
	 * Sets skipping detecting content renames for binary files.
	 *
	 * @param value true if content renames should be skipped for binary files, false otherwise.
	 * @since 5.12
	 */
	public void setSkipContentRenamesForBinaryFiles(boolean value) {
		this.skipContentRenamesForBinaryFiles = value;
	}

	/**
	 * Check if the detector is over the rename limit.
	 * <p>
	 * This method can be invoked either before or after {@code getEntries} has
	 * been used to perform rename detection.
	 *
	 * @return true if the detector has more file additions or removals than the
	 *         rename limit is currently set to. In such configurations the
	 *         detector will skip expensive computation.
	 */
	public boolean isOverRenameLimit() {
		if (done)
			return overRenameLimit;
		int cnt = Math.max(added.size(), deleted.size());
		return getRenameLimit() != 0 && getRenameLimit() < cnt;
	}

	/**
	 * Add entries to be considered for rename detection.
	 *
	 * @param entriesToAdd
	 *            one or more entries to add.
	 * @throws java.lang.IllegalStateException
	 *             if {@code getEntries} was already invoked.
	 */
	public void addAll(Collection<DiffEntry> entriesToAdd) {
		if (done)
			throw new IllegalStateException(JGitText.get().renamesAlreadyFound);

		for (DiffEntry entry : entriesToAdd) {
			switch (entry.getChangeType()) {
			case ADD:
				added.add(entry);
				break;

			case DELETE:
				deleted.add(entry);
				break;

			case MODIFY:
				if (sameType(entry.getOldMode(), entry.getNewMode())) {
					entries.add(entry);
				} else {
					List<DiffEntry> tmp = DiffEntry.breakModify(entry);
					deleted.add(tmp.get(0));
					added.add(tmp.get(1));
				}
				break;

			case COPY:
			case RENAME:
			default:
				entries.add(entry);
			}
		}
	}

	/**
	 * Add an entry to be considered for rename detection.
	 *
	 * @param entry
	 *            to add.
	 * @throws java.lang.IllegalStateException
	 *             if {@code getEntries} was already invoked.
	 */
	public void add(DiffEntry entry) {
		addAll(Collections.singletonList(entry));
	}

	/**
	 * Detect renames in the current file set.
	 * <p>
	 * This convenience function runs without a progress monitor.
	 * </p>
	 *
	 * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
	 *         representing all files that have been changed.
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 */
	public List<DiffEntry> compute() throws IOException {
		try {
			return compute(NullProgressMonitor.INSTANCE);
		} catch (CanceledException e) {
			// Won't happen with a NullProgressMonitor
			return Collections.emptyList();
		}
	}

	/**
	 * Detect renames in the current file set.
	 *
	 * @param pm
	 *            report progress during the detection phases.
	 * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
	 *         representing all files that have been changed.
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws CanceledException
	 *             if rename detection was cancelled
	 */
	public List<DiffEntry> compute(ProgressMonitor pm)
			throws IOException, CanceledException {
		if (!done) {
			try {
				return compute(objectReader, pm);
			} finally {
				objectReader.close();
			}
		}
		return Collections.unmodifiableList(entries);
	}

	/**
	 * Detect renames in the current file set.
	 *
	 * @param reader
	 *            reader to obtain objects from the repository with.
	 * @param pm
	 *            report progress during the detection phases.
	 * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
	 *         representing all files that have been changed.
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws CanceledException
	 *             if rename detection was cancelled
	 */
	public List<DiffEntry> compute(ObjectReader reader, ProgressMonitor pm)
			throws IOException, CanceledException {
		final ContentSource cs = ContentSource.create(reader);
		return compute(new ContentSource.Pair(cs, cs), pm);
	}

	/**
	 * Detect renames in the current file set.
	 *
	 * @param reader
	 *            reader to obtain objects from the repository with.
	 * @param pm
	 *            report progress during the detection phases.
	 * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
	 *         representing all files that have been changed.
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws CanceledException
	 *             if rename detection was cancelled
	 */
	public List<DiffEntry> compute(ContentSource.Pair reader, ProgressMonitor pm)
			throws IOException, CanceledException {
		if (!done) {
			done = true;

			if (pm == null)
				pm = NullProgressMonitor.INSTANCE;

			if (0 < breakScore)
				breakModifies(reader, pm);

			if (!added.isEmpty() && !deleted.isEmpty())
				findExactRenames(pm);

			if (!added.isEmpty() && !deleted.isEmpty())
				findContentRenames(reader, pm);

			if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
				rejoinModifies(pm);

			entries.addAll(added);
			added = null;

			entries.addAll(deleted);
			deleted = null;

			Collections.sort(entries, DIFF_COMPARATOR);
		}
		return Collections.unmodifiableList(entries);
	}

	/**
	 * Reset this rename detector for another rename detection pass.
	 */
	public void reset() {
		entries = new ArrayList<>();
		deleted = new ArrayList<>();
		added = new ArrayList<>();
		done = false;
	}

	private void advanceOrCancel(ProgressMonitor pm) throws CanceledException {
		if (pm.isCancelled()) {
			throw new CanceledException(JGitText.get().renameCancelled);
		}
		pm.update(1);
	}

	private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
			throws IOException, CanceledException {
		ArrayList<DiffEntry> newEntries = new ArrayList<>(entries.size());

		pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());

		for (int i = 0; i < entries.size(); i++) {
			DiffEntry e = entries.get(i);
			if (e.getChangeType() == ChangeType.MODIFY) {
				int score = calculateModifyScore(reader, e);
				if (score < breakScore) {
					List<DiffEntry> tmp = DiffEntry.breakModify(e);
					DiffEntry del = tmp.get(0);
					del.score = score;
					deleted.add(del);
					added.add(tmp.get(1));
				} else {
					newEntries.add(e);
				}
			} else {
				newEntries.add(e);
			}
			advanceOrCancel(pm);
		}

		entries = newEntries;
	}

	private void rejoinModifies(ProgressMonitor pm) throws CanceledException {
		HashMap<String, DiffEntry> nameMap = new HashMap<>();
		ArrayList<DiffEntry> newAdded = new ArrayList<>(added.size());

		pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size()
				+ deleted.size());

		for (DiffEntry src : deleted) {
			nameMap.put(src.oldPath, src);
			advanceOrCancel(pm);
		}

		for (DiffEntry dst : added) {
			DiffEntry src = nameMap.remove(dst.newPath);
			if (src != null) {
				if (sameType(src.oldMode, dst.newMode)) {
					entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst,
							src.score));
				} else {
					nameMap.put(src.oldPath, src);
					newAdded.add(dst);
				}
			} else {
				newAdded.add(dst);
			}
			advanceOrCancel(pm);
		}

		added = newAdded;
		deleted = new ArrayList<>(nameMap.values());
	}

	private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
			throws IOException {
		try {
			SimilarityIndex src = new SimilarityIndex();
			src.hash(reader.open(OLD, d));
			src.sort();

			SimilarityIndex dst = new SimilarityIndex();
			dst.hash(reader.open(NEW, d));
			dst.sort();
			return src.score(dst, 100);
		} catch (TableFullException tableFull) {
			// If either table overflowed while being constructed, don't allow
			// the pair to be broken. Returning 1 higher than breakScore will
			// ensure its not similar, but not quite dissimilar enough to break.
			//
			overRenameLimit = true;
			return breakScore + 1;
		}
	}

	private void findContentRenames(ContentSource.Pair reader,
			ProgressMonitor pm)
			throws IOException, CanceledException {
		int cnt = Math.max(added.size(), deleted.size());
		if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
			SimilarityRenameDetector d;

			d = new SimilarityRenameDetector(reader, deleted, added);
			d.setRenameScore(getRenameScore());
			d.setBigFileThreshold(getBigFileThreshold());
			d.setSkipBinaryFiles(getSkipContentRenamesForBinaryFiles());
			d.compute(pm);
			overRenameLimit |= d.isTableOverflow();
			deleted = d.getLeftOverSources();
			added = d.getLeftOverDestinations();
			entries.addAll(d.getMatches());
		} else {
			overRenameLimit = true;
		}
	}

	@SuppressWarnings("unchecked")
	private void findExactRenames(ProgressMonitor pm)
			throws CanceledException {
		pm.beginTask(JGitText.get().renamesFindingExact, //
				added.size() + added.size() + deleted.size()
						+ added.size() * deleted.size());

		HashMap<AbbreviatedObjectId, Object> deletedMap = populateMap(deleted, pm);
		HashMap<AbbreviatedObjectId, Object> addedMap = populateMap(added, pm);

		ArrayList<DiffEntry> uniqueAdds = new ArrayList<>(added.size());
		ArrayList<List<DiffEntry>> nonUniqueAdds = new ArrayList<>();

		for (Object o : addedMap.values()) {
			if (o instanceof DiffEntry)
				uniqueAdds.add((DiffEntry) o);
			else
				nonUniqueAdds.add((List<DiffEntry>) o);
		}

		ArrayList<DiffEntry> left = new ArrayList<>(added.size());

		for (DiffEntry a : uniqueAdds) {
			Object del = deletedMap.get(a.newId);
			if (del instanceof DiffEntry) {
				// We have one add to one delete: pair them if they are the same
				// type
				DiffEntry e = (DiffEntry) del;
				if (sameType(e.oldMode, a.newMode)) {
					e.changeType = ChangeType.RENAME;
					entries.add(exactRename(e, a));
				} else {
					left.add(a);
				}
			} else if (del != null) {
				// We have one add to many deletes: find the delete with the
				// same type and closest name to the add, then pair them
				List<DiffEntry> list = (List<DiffEntry>) del;
				DiffEntry best = bestPathMatch(a, list);
				if (best != null) {
					best.changeType = ChangeType.RENAME;
					entries.add(exactRename(best, a));
				} else {
					left.add(a);
				}
			} else {
				left.add(a);
			}
			advanceOrCancel(pm);
		}

		for (List<DiffEntry> adds : nonUniqueAdds) {
			Object o = deletedMap.get(adds.get(0).newId);
			if (o instanceof DiffEntry) {
				// We have many adds to one delete: find the add with the same
				// type and closest name to the delete, then pair them. Mark the
				// rest as copies of the delete.
				DiffEntry d = (DiffEntry) o;
				DiffEntry best = bestPathMatch(d, adds);
				if (best != null) {
					d.changeType = ChangeType.RENAME;
					entries.add(exactRename(d, best));
					for (DiffEntry a : adds) {
						if (a != best) {
							if (sameType(d.oldMode, a.newMode)) {
								entries.add(exactCopy(d, a));
							} else {
								left.add(a);
							}
						}
					}
				} else {
					left.addAll(adds);
				}
			} else if (o != null) {
				// We have many adds to many deletes: score all the adds against
				// all the deletes by path name, take the best matches, pair
				// them as renames, then call the rest copies
				List<DiffEntry> dels = (List<DiffEntry>) o;
				long[] matrix = new long[dels.size() * adds.size()];
				int mNext = 0;
				for (int delIdx = 0; delIdx < dels.size(); delIdx++) {
					String deletedName = dels.get(delIdx).oldPath;

					for (int addIdx = 0; addIdx < adds.size(); addIdx++) {
						String addedName = adds.get(addIdx).newPath;

						int score = SimilarityRenameDetector.nameScore(addedName, deletedName);
						matrix[mNext] = SimilarityRenameDetector.encode(score, delIdx, addIdx);
						mNext++;
						if (pm.isCancelled()) {
							throw new CanceledException(
									JGitText.get().renameCancelled);
						}
					}
				}

				Arrays.sort(matrix);

				for (--mNext; mNext >= 0; mNext--) {
					long ent = matrix[mNext];
					int delIdx = SimilarityRenameDetector.srcFile(ent);
					int addIdx = SimilarityRenameDetector.dstFile(ent);
					DiffEntry d = dels.get(delIdx);
					DiffEntry a = adds.get(addIdx);

					if (a == null) {
						advanceOrCancel(pm);
						continue; // was already matched earlier
					}

					ChangeType type;
					if (d.changeType == ChangeType.DELETE) {
						// First use of this source file. Tag it as a rename so we
						// later know it is already been used as a rename, other
						// matches (if any) will claim themselves as copies instead.
						//
						d.changeType = ChangeType.RENAME;
						type = ChangeType.RENAME;
					} else {
						type = ChangeType.COPY;
					}

					entries.add(DiffEntry.pair(type, d, a, 100));
					adds.set(addIdx, null); // Claim the destination was matched.
					advanceOrCancel(pm);
				}
			} else {
				left.addAll(adds);
			}
			advanceOrCancel(pm);
		}
		added = left;

		deleted = new ArrayList<>(deletedMap.size());
		for (Object o : deletedMap.values()) {
			if (o instanceof DiffEntry) {
				DiffEntry e = (DiffEntry) o;
				if (e.changeType == ChangeType.DELETE)
					deleted.add(e);
			} else {
				List<DiffEntry> list = (List<DiffEntry>) o;
				for (DiffEntry e : list) {
					if (e.changeType == ChangeType.DELETE)
						deleted.add(e);
				}
			}
		}
		pm.endTask();
	}

	/**
	 * Find the best match by file path for a given DiffEntry from a list of
	 * DiffEntrys. The returned DiffEntry will be of the same type as <src>. If
	 * no DiffEntry can be found that has the same type, this method will return
	 * null.
	 *
	 * @param src
	 *            the DiffEntry to try to find a match for
	 * @param list
	 *            a list of DiffEntrys to search through
	 * @return the DiffEntry from <list> who's file path best matches <src>
	 */
	private static DiffEntry bestPathMatch(DiffEntry src, List<DiffEntry> list) {
		DiffEntry best = null;
		int score = -1;

		for (DiffEntry d : list) {
			if (sameType(mode(d), mode(src))) {
				int tmp = SimilarityRenameDetector
						.nameScore(path(d), path(src));
				if (tmp > score) {
					best = d;
					score = tmp;
				}
			}
		}

		return best;
	}

	@SuppressWarnings("unchecked")
	private HashMap<AbbreviatedObjectId, Object> populateMap(
			List<DiffEntry> diffEntries, ProgressMonitor pm)
			throws CanceledException {
		HashMap<AbbreviatedObjectId, Object> map = new HashMap<>();
		for (DiffEntry de : diffEntries) {
			Object old = map.put(id(de), de);
			if (old instanceof DiffEntry) {
				ArrayList<DiffEntry> list = new ArrayList<>(2);
				list.add((DiffEntry) old);
				list.add(de);
				map.put(id(de), list);
			} else if (old != null) {
				// Must be a list of DiffEntries
				((List<DiffEntry>) old).add(de);
				map.put(id(de), old);
			}
			advanceOrCancel(pm);
		}
		return map;
	}

	private static String path(DiffEntry de) {
		return de.changeType == ChangeType.DELETE ? de.oldPath : de.newPath;
	}

	private static FileMode mode(DiffEntry de) {
		return de.changeType == ChangeType.DELETE ? de.oldMode : de.newMode;
	}

	private static AbbreviatedObjectId id(DiffEntry de) {
		return de.changeType == ChangeType.DELETE ? de.oldId : de.newId;
	}

	static boolean sameType(FileMode a, FileMode b) {
		// Files have to be of the same type in order to rename them.
		// We would never want to rename a file to a gitlink, or a
		// symlink to a file.
		//
		int aType = a.getBits() & FileMode.TYPE_MASK;
		int bType = b.getBits() & FileMode.TYPE_MASK;
		return aType == bType;
	}

	private static DiffEntry exactRename(DiffEntry src, DiffEntry dst) {
		return DiffEntry.pair(ChangeType.RENAME, src, dst, EXACT_RENAME_SCORE);
	}

	private static DiffEntry exactCopy(DiffEntry src, DiffEntry dst) {
		return DiffEntry.pair(ChangeType.COPY, src, dst, EXACT_RENAME_SCORE);
	}
}