PackIndexWriter.java

/*
 * Copyright (C) 2008, Robin Rosenberg <robin.rosenberg@dewire.com>
 * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org> and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.internal.storage.file;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.security.DigestOutputStream;
import java.text.MessageFormat;
import java.util.List;

import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.transport.PackedObjectInfo;
import org.eclipse.jgit.util.NB;

/**
 * Creates a table of contents to support random access by
 * {@link org.eclipse.jgit.internal.storage.file.Pack}.
 * <p>
 * Pack index files (the <code>.idx</code> suffix in a pack file pair) provides
 * random access to any object in the pack by associating an ObjectId to the
 * byte offset within the pack where the object's data can be read.
 */
public abstract class PackIndexWriter {
	/** Magic constant indicating post-version 1 format. */
	protected static final byte[] TOC = { -1, 't', 'O', 'c' };

	/**
	 * Create a new writer for the oldest (most widely understood) format.
	 * <p>
	 * This method selects an index format that can accurate describe the
	 * supplied objects and that will be the most compatible format with older
	 * Git implementations.
	 * <p>
	 * Index version 1 is widely recognized by all Git implementations, but
	 * index version 2 (and later) is not as well recognized as it was
	 * introduced more than a year later. Index version 1 can only be used if
	 * the resulting pack file is under 4 gigabytes in size; packs larger than
	 * that limit must use index version 2.
	 *
	 * @param dst
	 *            the stream the index data will be written to. If not already
	 *            buffered it will be automatically wrapped in a buffered
	 *            stream. Callers are always responsible for closing the stream.
	 * @param objs
	 *            the objects the caller needs to store in the index. Entries
	 *            will be examined until a format can be conclusively selected.
	 * @return a new writer to output an index file of the requested format to
	 *         the supplied stream.
	 * @throws java.lang.IllegalArgumentException
	 *             no recognized pack index version can support the supplied
	 *             objects. This is likely a bug in the implementation.
	 * @see #oldestPossibleFormat(List)
	 */
	public static PackIndexWriter createOldestPossible(final OutputStream dst,
			final List<? extends PackedObjectInfo> objs) {
		return createVersion(dst, oldestPossibleFormat(objs));
	}

	/**
	 * Return the oldest (most widely understood) index format.
	 * <p>
	 * This method selects an index format that can accurate describe the
	 * supplied objects and that will be the most compatible format with older
	 * Git implementations.
	 * <p>
	 * Index version 1 is widely recognized by all Git implementations, but
	 * index version 2 (and later) is not as well recognized as it was
	 * introduced more than a year later. Index version 1 can only be used if
	 * the resulting pack file is under 4 gigabytes in size; packs larger than
	 * that limit must use index version 2.
	 *
	 * @param objs
	 *            the objects the caller needs to store in the index. Entries
	 *            will be examined until a format can be conclusively selected.
	 * @return the index format.
	 * @throws java.lang.IllegalArgumentException
	 *             no recognized pack index version can support the supplied
	 *             objects. This is likely a bug in the implementation.
	 */
	public static int oldestPossibleFormat(
			final List<? extends PackedObjectInfo> objs) {
		for (PackedObjectInfo oe : objs) {
			if (!PackIndexWriterV1.canStore(oe))
				return 2;
		}
		return 1;
	}


	/**
	 * Create a new writer instance for a specific index format version.
	 *
	 * @param dst
	 *            the stream the index data will be written to. If not already
	 *            buffered it will be automatically wrapped in a buffered
	 *            stream. Callers are always responsible for closing the stream.
	 * @param version
	 *            index format version number required by the caller. Exactly
	 *            this formatted version will be written.
	 * @return a new writer to output an index file of the requested format to
	 *         the supplied stream.
	 * @throws java.lang.IllegalArgumentException
	 *             the version requested is not supported by this
	 *             implementation.
	 */
	public static PackIndexWriter createVersion(final OutputStream dst,
			final int version) {
		switch (version) {
		case 1:
			return new PackIndexWriterV1(dst);
		case 2:
			return new PackIndexWriterV2(dst);
		default:
			throw new IllegalArgumentException(MessageFormat.format(
					JGitText.get().unsupportedPackIndexVersion,
					Integer.valueOf(version)));
		}
	}

	/** The index data stream we are responsible for creating. */
	protected final DigestOutputStream out;

	/** A temporary buffer for use during IO to {link #out}. */
	protected final byte[] tmp;

	/** The entries this writer must pack. */
	protected List<? extends PackedObjectInfo> entries;

	/** SHA-1 checksum for the entire pack data. */
	protected byte[] packChecksum;

	/**
	 * Create a new writer instance.
	 *
	 * @param dst
	 *            the stream this instance outputs to. If not already buffered
	 *            it will be automatically wrapped in a buffered stream.
	 */
	protected PackIndexWriter(OutputStream dst) {
		out = new DigestOutputStream(dst instanceof BufferedOutputStream ? dst
				: new BufferedOutputStream(dst),
				Constants.newMessageDigest());
		tmp = new byte[4 + Constants.OBJECT_ID_LENGTH];
	}

	/**
	 * Write all object entries to the index stream.
	 * <p>
	 * After writing the stream passed to the factory is flushed but remains
	 * open. Callers are always responsible for closing the output stream.
	 *
	 * @param toStore
	 *            sorted list of objects to store in the index. The caller must
	 *            have previously sorted the list using
	 *            {@link org.eclipse.jgit.transport.PackedObjectInfo}'s native
	 *            {@link java.lang.Comparable} implementation.
	 * @param packDataChecksum
	 *            checksum signature of the entire pack data content. This is
	 *            traditionally the last 20 bytes of the pack file's own stream.
	 * @throws java.io.IOException
	 *             an error occurred while writing to the output stream, or this
	 *             index format cannot store the object data supplied.
	 */
	public void write(final List<? extends PackedObjectInfo> toStore,
			final byte[] packDataChecksum) throws IOException {
		entries = toStore;
		packChecksum = packDataChecksum;
		writeImpl();
		out.flush();
	}

	/**
	 * Writes the index file to {@link #out}.
	 * <p>
	 * Implementations should go something like:
	 *
	 * <pre>
	 * writeFanOutTable();
	 * for (final PackedObjectInfo po : entries)
	 * 	writeOneEntry(po);
	 * writeChecksumFooter();
	 * </pre>
	 *
	 * <p>
	 * Where the logic for <code>writeOneEntry</code> is specific to the index
	 * format in use. Additional headers/footers may be used if necessary and
	 * the {@link #entries} collection may be iterated over more than once if
	 * necessary. Implementors therefore have complete control over the data.
	 *
	 * @throws java.io.IOException
	 *             an error occurred while writing to the output stream, or this
	 *             index format cannot store the object data supplied.
	 */
	protected abstract void writeImpl() throws IOException;

	/**
	 * Output the version 2 (and later) TOC header, with version number.
	 * <p>
	 * Post version 1 all index files start with a TOC header that makes the
	 * file an invalid version 1 file, and then includes the version number.
	 * This header is necessary to recognize a version 1 from a version 2
	 * formatted index.
	 *
	 * @param version
	 *            version number of this index format being written.
	 * @throws java.io.IOException
	 *             an error occurred while writing to the output stream.
	 */
	protected void writeTOC(int version) throws IOException {
		out.write(TOC);
		NB.encodeInt32(tmp, 0, version);
		out.write(tmp, 0, 4);
	}

	/**
	 * Output the standard 256 entry first-level fan-out table.
	 * <p>
	 * The fan-out table is 4 KB in size, holding 256 32-bit unsigned integer
	 * counts. Each count represents the number of objects within this index
	 * whose {@link org.eclipse.jgit.lib.ObjectId#getFirstByte()} matches the
	 * count's position in the fan-out table.
	 *
	 * @throws java.io.IOException
	 *             an error occurred while writing to the output stream.
	 */
	protected void writeFanOutTable() throws IOException {
		final int[] fanout = new int[256];
		for (PackedObjectInfo po : entries)
			fanout[po.getFirstByte() & 0xff]++;
		for (int i = 1; i < 256; i++)
			fanout[i] += fanout[i - 1];
		for (int n : fanout) {
			NB.encodeInt32(tmp, 0, n);
			out.write(tmp, 0, 4);
		}
	}

	/**
	 * Output the standard two-checksum index footer.
	 * <p>
	 * The standard footer contains two checksums (20 byte SHA-1 values):
	 * <ol>
	 * <li>Pack data checksum - taken from the last 20 bytes of the pack file.</li>
	 * <li>Index data checksum - checksum of all index bytes written, including
	 * the pack data checksum above.</li>
	 * </ol>
	 *
	 * @throws java.io.IOException
	 *             an error occurred while writing to the output stream.
	 */
	protected void writeChecksumFooter() throws IOException {
		out.write(packChecksum);
		out.on(false);
		out.write(out.getMessageDigest().digest());
	}
}