NameConflictTreeWalk.java

/*
 * Copyright (C) 2008, Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.treewalk;

import java.io.IOException;

import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.CorruptObjectException;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;

/**
 * Specialized TreeWalk to detect directory-file (D/F) name conflicts.
 * <p>
 * Due to the way a Git tree is organized the standard
 * {@link org.eclipse.jgit.treewalk.TreeWalk} won't easily find a D/F conflict
 * when merging two or more trees together. In the standard TreeWalk the file
 * will be returned first, and then much later the directory will be returned.
 * This makes it impossible for the application to efficiently detect and handle
 * the conflict.
 * <p>
 * Using this walk implementation causes the directory to report earlier than
 * usual, at the same time as the non-directory entry. This permits the
 * application to handle the D/F conflict in a single step. The directory is
 * returned only once, so it does not get returned later in the iteration.
 * <p>
 * When a D/F conflict is detected
 * {@link org.eclipse.jgit.treewalk.TreeWalk#isSubtree()} will return true and
 * {@link org.eclipse.jgit.treewalk.TreeWalk#enterSubtree()} will recurse into
 * the subtree, no matter which iterator originally supplied the subtree.
 * <p>
 * Because conflicted directories report early, using this walk implementation
 * to populate a {@link org.eclipse.jgit.dircache.DirCacheBuilder} may cause the
 * automatic resorting to run and fix the entry ordering.
 * <p>
 * This walk implementation requires more CPU to implement a look-ahead and a
 * look-behind to merge a D/F pair together, or to skip a previously reported
 * directory. In typical Git repositories the look-ahead cost is 0 and the
 * look-behind doesn't trigger, as users tend not to create trees which contain
 * both "foo" as a directory and "foo.c" as a file.
 * <p>
 * In the worst-case however several thousand look-ahead steps per walk step may
 * be necessary, making the overhead quite significant. Since this worst-case
 * should never happen this walk implementation has made the time/space tradeoff
 * in favor of more-time/less-space, as that better suits the typical case.
 */
public class NameConflictTreeWalk extends TreeWalk {
	private static final int TREE_MODE = FileMode.TREE.getBits();

	private boolean fastMinHasMatch;

	private AbstractTreeIterator dfConflict;

	/**
	 * Create a new tree walker for a given repository.
	 *
	 * @param repo
	 *            the repository the walker will obtain data from.
	 */
	public NameConflictTreeWalk(Repository repo) {
		super(repo);
	}

	/**
	 * Create a new tree walker for a given repository.
	 *
	 * @param repo
	 *            the repository the walker will obtain data from.
	 * @param or
	 *            the reader the walker will obtain tree data from.
	 * @since 4.3
	 */
	public NameConflictTreeWalk(@Nullable Repository repo, ObjectReader or) {
		super(repo, or);
	}

	/**
	 * Create a new tree walker for a given repository.
	 *
	 * @param or
	 *            the reader the walker will obtain tree data from.
	 */
	public NameConflictTreeWalk(ObjectReader or) {
		super(or);
	}

	@Override
	AbstractTreeIterator min() throws CorruptObjectException {
		for (;;) {
			final AbstractTreeIterator minRef = fastMin();
			if (fastMinHasMatch)
				return minRef;

			if (isTree(minRef)) {
				if (skipEntry(minRef)) {
					for (AbstractTreeIterator t : trees) {
						if (t.matches == minRef) {
							t.next(1);
							t.matches = null;
						}
					}
					continue;
				}
				return minRef;
			}

			return combineDF(minRef);
		}
	}

	private AbstractTreeIterator fastMin() {
		fastMinHasMatch = true;

		int i = 0;
		AbstractTreeIterator minRef = trees[i];
		while (minRef.eof() && ++i < trees.length)
			minRef = trees[i];
		if (minRef.eof())
			return minRef;

		boolean hasConflict = false;
		minRef.matches = minRef;
		while (++i < trees.length) {
			final AbstractTreeIterator t = trees[i];
			if (t.eof())
				continue;

			final int cmp = t.pathCompare(minRef);
			if (cmp < 0) {
				if (fastMinHasMatch && isTree(minRef) && !isTree(t)
						&& nameEqual(minRef, t)) {
					// We used to be at a tree, but now we are at a file
					// with the same name. Allow the file to match the
					// tree anyway.
					//
					t.matches = minRef;
					hasConflict = true;
				} else {
					fastMinHasMatch = false;
					t.matches = t;
					minRef = t;
				}
			} else if (cmp == 0) {
				// Exact name/mode match is best.
				//
				t.matches = minRef;
			} else if (fastMinHasMatch && isTree(t) && !isTree(minRef)
					&& !isGitlink(minRef) && nameEqual(t, minRef)) {
				// The minimum is a file (non-tree) but the next entry
				// of this iterator is a tree whose name matches our file.
				// This is a classic D/F conflict and commonly occurs like
				// this, with no gaps in between the file and directory.
				//
				// Use the tree as the minimum instead (see combineDF).
				//

				for (int k = 0; k < i; k++) {
					final AbstractTreeIterator p = trees[k];
					if (p.matches == minRef)
						p.matches = t;
				}
				t.matches = t;
				minRef = t;
				hasConflict = true;
			} else
				fastMinHasMatch = false;
		}

		if (hasConflict && fastMinHasMatch && dfConflict == null)
			dfConflict = minRef;
		return minRef;
	}

	private static boolean nameEqual(final AbstractTreeIterator a,
			final AbstractTreeIterator b) {
		return a.pathCompare(b, TREE_MODE) == 0;
	}

	private boolean isGitlink(AbstractTreeIterator p) {
		return FileMode.GITLINK.equals(p.mode);
	}

	private static boolean isTree(AbstractTreeIterator p) {
		return FileMode.TREE.equals(p.mode);
	}

	private boolean skipEntry(AbstractTreeIterator minRef)
			throws CorruptObjectException {
		// A tree D/F may have been handled earlier. We need to
		// not report this path if it has already been reported.
		//
		for (AbstractTreeIterator t : trees) {
			if (t.matches == minRef || t.first())
				continue;

			int stepsBack = 0;
			for (;;) {
				stepsBack++;
				t.back(1);

				final int cmp = t.pathCompare(minRef, 0);
				if (cmp == 0) {
					// We have already seen this "$path" before. Skip it.
					//
					t.next(stepsBack);
					return true;
				} else if (cmp < 0 || t.first()) {
					// We cannot find "$path" in t; it will never appear.
					//
					t.next(stepsBack);
					break;
				}
			}
		}

		// We have never seen the current path before.
		//
		return false;
	}

	private AbstractTreeIterator combineDF(AbstractTreeIterator minRef)
			throws CorruptObjectException {
		// Look for a possible D/F conflict forward in the tree(s)
		// as there may be a "$path/" which matches "$path". Make
		// such entries match this entry.
		//
		AbstractTreeIterator treeMatch = null;
		for (AbstractTreeIterator t : trees) {
			if (t.matches == minRef || t.eof())
				continue;

			for (;;) {
				final int cmp = t.pathCompare(minRef, TREE_MODE);
				if (cmp < 0) {
					// The "$path/" may still appear later.
					//
					t.matchShift++;
					t.next(1);
					if (t.eof()) {
						t.back(t.matchShift);
						t.matchShift = 0;
						break;
					}
				} else if (cmp == 0) {
					// We have a conflict match here.
					//
					t.matches = minRef;
					treeMatch = t;
					break;
				} else {
					// A conflict match is not possible.
					//
					if (t.matchShift != 0) {
						t.back(t.matchShift);
						t.matchShift = 0;
					}
					break;
				}
			}
		}

		if (treeMatch != null) {
			// If we do have a conflict use one of the directory
			// matching iterators instead of the file iterator.
			// This way isSubtree is true and isRecursive works.
			//
			for (AbstractTreeIterator t : trees)
				if (t.matches == minRef)
					t.matches = treeMatch;

			if (dfConflict == null && !isGitlink(minRef)) {
				dfConflict = treeMatch;
			}

			return treeMatch;
		}

		return minRef;
	}

	@Override
	void popEntriesEqual() throws CorruptObjectException {
		final AbstractTreeIterator ch = currentHead;
		for (AbstractTreeIterator t : trees) {
			if (t.matches == ch) {
				if (t.matchShift == 0)
					t.next(1);
				else {
					t.back(t.matchShift);
					t.matchShift = 0;
				}
				t.matches = null;
			}
		}

		if (ch == dfConflict)
			dfConflict = null;
	}

	@Override
	void skipEntriesEqual() throws CorruptObjectException {
		final AbstractTreeIterator ch = currentHead;
		for (AbstractTreeIterator t : trees) {
			if (t.matches == ch) {
				if (t.matchShift == 0)
					t.skip();
				else {
					t.back(t.matchShift);
					t.matchShift = 0;
				}
				t.matches = null;
			}
		}

		if (ch == dfConflict)
			dfConflict = null;
	}

	@Override
	void stopWalk() throws IOException {
		if (!needsStopWalk()) {
			return;
		}

		// Name conflicts make aborting early difficult. Multiple paths may
		// exist between the file and directory versions of a name. To ensure
		// the directory version is skipped over (as it was previously visited
		// during the file version step) requires popping up the stack and
		// finishing out each subtree that the walker dove into. Siblings in
		// parents do not need to be recursed into, bounding the cost.
		for (;;) {
			AbstractTreeIterator t = min();
			if (t.eof()) {
				if (depth > 0) {
					exitSubtree();
					popEntriesEqual();
					continue;
				}
				return;
			}
			currentHead = t;
			skipEntriesEqual();
		}
	}

	private boolean needsStopWalk() {
		for (AbstractTreeIterator t : trees) {
			if (t.needsStopWalk()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * True if the current entry is covered by a directory/file conflict.
	 *
	 * This means that for some prefix of the current entry's path, this walk
	 * has detected a directory/file conflict. Also true if the current entry
	 * itself is a directory/file conflict.
	 *
	 * Example: If this TreeWalk points to foo/bar/a.txt and this method returns
	 * true then you know that either for path foo or for path foo/bar files and
	 * folders were detected.
	 *
	 * @return <code>true</code> if the current entry is covered by a
	 *         directory/file conflict, <code>false</code> otherwise
	 */
	public boolean isDirectoryFileConflict() {
		return dfConflict != null;
	}
}