QuotedString.java

/*
 * Copyright (C) 2008, 2019 Google Inc. and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.eclipse.jgit.util;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.util.Arrays;
import java.util.regex.Pattern;

import org.eclipse.jgit.lib.Constants;

/**
 * Utility functions related to quoted string handling.
 */
public abstract class QuotedString {
	/** Quoting style that obeys the rules Git applies to file names */
	public static final GitPathStyle GIT_PATH = new GitPathStyle(true);

	/**
	 * Quoting style that obeys the rules Git applies to file names when
	 * {@code core.quotePath = false}.
	 *
	 * @since 5.6
	 */
	public static final QuotedString GIT_PATH_MINIMAL = new GitPathStyle(false);

	/**
	 * Quoting style used by the Bourne shell.
	 * <p>
	 * Quotes are unconditionally inserted during {@link #quote(String)}. This
	 * protects shell meta-characters like <code>$</code> or <code>~</code> from
	 * being recognized as special.
	 */
	public static final BourneStyle BOURNE = new BourneStyle();

	/** Bourne style, but permits <code>~user</code> at the start of the string. */
	public static final BourneUserPathStyle BOURNE_USER_PATH = new BourneUserPathStyle();

	/**
	 * Quote an input string by the quoting rules.
	 * <p>
	 * If the input string does not require any quoting, the same String
	 * reference is returned to the caller.
	 * <p>
	 * Otherwise a quoted string is returned, including the opening and closing
	 * quotation marks at the start and end of the string. If the style does not
	 * permit raw Unicode characters then the string will first be encoded in
	 * UTF-8, with unprintable sequences possibly escaped by the rules.
	 *
	 * @param in
	 *            any non-null Unicode string.
	 * @return a quoted string. See above for details.
	 */
	public abstract String quote(String in);

	/**
	 * Strip the quoting from a previously quoted string.
	 * <p>
	 * The input is converted to bytes with {@link Constants#encode(String)}
	 * and the whole buffer is then passed to
	 * {@link #dequote(byte[], int, int)}.
	 * <p>
	 * Implementations must keep this method the inverse of
	 * {@link #quote(String)}, so that
	 *
	 * <pre>
	 * a.equals(dequote(quote(a)));
	 * </pre>
	 *
	 * holds for any <code>a</code>.
	 *
	 * @param in
	 *            a Unicode string to remove quoting from.
	 * @return the cleaned string.
	 * @see #dequote(byte[], int, int)
	 */
	public String dequote(String in) {
		final byte[] encoded = Constants.encode(in);
		final int end = encoded.length;
		return dequote(encoded, 0, end);
	}

	/**
	 * Decode a previously quoted input, scanning a UTF-8 encoded buffer.
	 * <p>
	 * This method must match quote such that:
	 *
	 * <pre>
	 * a.equals(dequote(Constants.encode(quote(a))));
	 * </pre>
	 *
	 * is true for any <code>a</code>.
	 * <p>
	 * This method removes any opening/closing quotation marks added by
	 * {@link #quote(String)}.
	 *
	 * @param in
	 *            the input buffer to parse.
	 * @param offset
	 *            first position within <code>in</code> to scan.
	 * @param end
	 *            one position past the last byte of <code>in</code> to scan.
	 * @return the cleaned string.
	 */
	public abstract String dequote(byte[] in, int offset, int end);

	/**
	 * Quoting style used by the Bourne shell.
	 * <p>
	 * {@link #quote(String)} always wraps its input in single quotes, so shell
	 * meta-characters such as <code>$</code> or <code>~</code> are not
	 * recognized as special.
	 */
	public static class BourneStyle extends QuotedString {
		/**
		 * Wrap the input in single quotes.
		 * <p>
		 * A single quote or an exclamation mark is never left inside the
		 * quoted run: the run is closed, the character is written with a
		 * leading backslash, and a new quoted run is opened.
		 *
		 * @param in
		 *            any non-null Unicode string.
		 * @return the quoted string; quotes are added even when the input
		 *         contains nothing that needs protection.
		 */
		@Override
		public String quote(String in) {
			final int length = in.length();
			final StringBuilder quoted = new StringBuilder();
			quoted.append('\'');
			int pending = 0; // first character not yet copied to the output
			for (int pos = 0; pos < length; pos++) {
				final char ch = in.charAt(pos);
				if (ch != '\'' && ch != '!') {
					continue;
				}
				// Flush the safe run, then emit '\c' between two quoted runs.
				quoted.append(in, pending, pos);
				quoted.append('\'').append('\\').append(ch).append('\'');
				pending = pos + 1;
			}
			quoted.append(in, pending, length);
			quoted.append('\'');
			return quoted.toString();
		}

		/**
		 * Remove Bourne shell quoting from a byte range.
		 * <p>
		 * Single quotes toggle the "inside quotes" state and are dropped.
		 * Outside of quotes a backslash is dropped and the byte after it is
		 * copied as is. Inside quotes, or when it is the last byte of the
		 * range, a backslash is kept literally.
		 *
		 * @param in
		 *            the input buffer to parse.
		 * @param ip
		 *            first position within <code>in</code> to scan.
		 * @param ie
		 *            one position past the last byte of <code>in</code> to
		 *            scan.
		 * @return the cleaned bytes decoded as UTF-8.
		 */
		@Override
		public String dequote(byte[] in, int ip, int ie) {
			final byte[] out = new byte[ie - ip];
			int len = 0;
			boolean insideQuotes = false;
			int pos = ip;
			while (pos < ie) {
				final byte cur = in[pos++];
				if (cur == '\'') {
					insideQuotes = !insideQuotes;
				} else if (cur == '\\' && !insideQuotes && pos < ie) {
					// Escaped byte outside of quotes: keep only the byte.
					out[len++] = in[pos++];
				} else {
					// Ordinary byte, or a backslash that stays literal.
					out[len++] = cur;
				}
			}
			return RawParseUtils.decode(UTF_8, out, 0, len);
		}
	}

	/**
	 * Bourne style, but permits <code>~user</code> at the start of the string.
	 * <p>
	 * A leading <code>~/</code> or <code>~user/</code> is left outside of the
	 * quotes so the shell can still expand it; only the remainder of the path
	 * is quoted.
	 */
	public static class BourneUserPathStyle extends BourneStyle {
		/** A bare <code>~user</code> without any path component. */
		private static final Pattern USER_ONLY = Pattern
				.compile("^~[A-Za-z0-9_-]+$"); //$NON-NLS-1$

		/**
		 * <code>~/path</code> or <code>~user/path</code>. DOTALL is required
		 * so that a path containing line terminators is still recognized;
		 * without it such a path would be quoted including its tilde prefix.
		 */
		private static final Pattern USER_PATH = Pattern
				.compile("^~[A-Za-z0-9_-]*/.*$", Pattern.DOTALL); //$NON-NLS-1$

		/**
		 * Quote a path, keeping a leading tilde prefix unquoted.
		 *
		 * @param in
		 *            any non-null Unicode string.
		 * @return <code>in</code> with a trailing slash appended if it is just
		 *         <code>~user</code>; the unquoted tilde prefix followed by
		 *         the Bourne-quoted remainder if it starts with
		 *         <code>~/</code> or <code>~user/</code> (the input itself
		 *         when nothing follows the slash); otherwise the fully
		 *         Bourne-quoted input.
		 */
		@Override
		public String quote(String in) {
			if (USER_ONLY.matcher(in).matches()) {
				// If the string is just "~user" we can assume they
				// mean "~user/".
				//
				return in + "/"; //$NON-NLS-1$
			}

			if (USER_PATH.matcher(in).matches()) {
				// If the string is of "~/path" or "~user/path"
				// we must not escape ~/ or ~user/ from the shell.
				//
				final int i = in.indexOf('/') + 1;
				if (i == in.length())
					return in;
				return in.substring(0, i) + super.quote(in.substring(i));
			}

			return super.quote(in);
		}
	}

	/**
	 * Quoting style that obeys the rules Git applies to file names.
	 * <p>
	 * Depending on the flag passed at construction, bytes outside of the 7-bit
	 * range are either written as octal escapes or copied through unchanged.
	 */
	public static final class GitPathStyle extends QuotedString {
		/**
		 * Escaping rule for each byte value 0..127: {@code 0} copies the byte
		 * as is, a positive value is the character written after a backslash,
		 * and a negative value requests a three digit octal escape.
		 */
		private static final byte[] ESCAPE_RULES = buildEscapeRules();

		/** Whether bytes of value 128 and above are written as octal escapes. */
		private final boolean escapeHighBytes;

		private GitPathStyle(boolean doQuote) {
			escapeHighBytes = doQuote;
		}

		/** Build the per-byte escaping rules used by {@link #quote(String)}. */
		private static byte[] buildEscapeRules() {
			final byte[] rules = new byte[128];
			// Anything not listed below gets an octal escape.
			Arrays.fill(rules, (byte) -1);

			for (char ch = '0'; ch <= '9'; ch++) {
				rules[ch] = 0;
			}
			for (char ch = 'a'; ch <= 'z'; ch++) {
				rules[ch] = 0;
			}
			for (char ch = 'A'; ch <= 'Z'; ch++) {
				rules[ch] = 0;
			}
			final char[] verbatim = { ' ', '$', '%', '&', '*', '+', ',', '-',
					'.', '/', ':', ';', '=', '?', '@', '_', '^', '|', '~' };
			for (char ch : verbatim) {
				rules[ch] = 0;
			}

			rules['\u0007'] = 'a';
			rules['\b'] = 'b';
			rules['\f'] = 'f';
			rules['\n'] = 'n';
			rules['\r'] = 'r';
			rules['\t'] = 't';
			rules['\u000B'] = 'v';
			rules['\\'] = '\\';
			rules['"'] = '"';
			return rules;
		}

		/**
		 * Quote a file name.
		 * <p>
		 * The empty string is returned as a pair of double quotes. For any
		 * other input, the same String reference is returned when no byte of
		 * its encoded form had to be escaped; otherwise the escaped bytes are
		 * returned enclosed in double quotes.
		 *
		 * @param instr
		 *            any non-null Unicode string.
		 * @return the input itself, or its quoted form.
		 */
		@Override
		public String quote(String instr) {
			if (instr.isEmpty()) {
				return "\"\""; //$NON-NLS-1$
			}
			final byte[] raw = Constants.encode(instr);
			// Worst case is four output bytes per input byte plus two quotes.
			final byte[] buf = new byte[4 * raw.length + 2];
			int len = 0;
			boolean unchanged = true;
			buf[len++] = '"';
			for (int i = 0; i < raw.length; i++) {
				final int c = raw[i] & 0xff;
				final int rule;
				if (c < ESCAPE_RULES.length) {
					rule = ESCAPE_RULES[c];
				} else {
					rule = escapeHighBytes ? -1 : 0;
				}

				if (rule == 0) {
					buf[len++] = (byte) c;
				} else if (rule > 0) {
					unchanged = false;
					buf[len++] = '\\';
					buf[len++] = (byte) rule;
				} else {
					unchanged = false;
					buf[len++] = '\\';
					buf[len++] = (byte) ('0' + ((c >> 6) & 03));
					buf[len++] = (byte) ('0' + ((c >> 3) & 07));
					buf[len++] = (byte) ('0' + (c & 07));
				}
			}
			if (unchanged) {
				return instr;
			}
			buf[len++] = '"';
			return new String(buf, 0, len, UTF_8);
		}

		/**
		 * Remove Git path quoting from a byte range.
		 * <p>
		 * Escapes are only interpreted when the range is at least two bytes
		 * long and both starts and ends with a double quote; any other range
		 * is decoded unchanged.
		 *
		 * @param in
		 *            the input buffer to parse.
		 * @param inPtr
		 *            first position within <code>in</code> to scan.
		 * @param inEnd
		 *            one position past the last byte of <code>in</code> to
		 *            scan.
		 * @return the cleaned bytes decoded as UTF-8.
		 */
		@Override
		public String dequote(byte[] in, int inPtr, int inEnd) {
			final boolean quoted = inEnd - inPtr >= 2 && in[inPtr] == '"'
					&& in[inEnd - 1] == '"';
			if (!quoted) {
				return RawParseUtils.decode(UTF_8, in, inPtr, inEnd);
			}
			return dq(in, inPtr + 1, inEnd - 1);
		}

		/**
		 * Translate the escapes found between the enclosing double quotes.
		 *
		 * @param in
		 *            the input buffer to parse.
		 * @param inPtr
		 *            position just after the opening quote.
		 * @param inEnd
		 *            position of the closing quote.
		 * @return the unescaped bytes decoded as UTF-8.
		 */
		private static String dq(byte[] in, int inPtr, int inEnd) {
			final byte[] out = new byte[inEnd - inPtr];
			int len = 0;
			int pos = inPtr;
			while (pos < inEnd) {
				final byte cur = in[pos++];
				if (cur != '\\') {
					out[len++] = cur;
					continue;
				}
				if (pos == inEnd) {
					// Lone trailing backslash. Treat it as a literal.
					out[len++] = '\\';
					break;
				}

				final byte code = in[pos++];
				final int simple = simpleEscape(code);
				if (simple >= 0) {
					out[len++] = (byte) simple;
				} else if ('0' <= code && code <= '3') {
					// Octal escape: the digit just read plus at most two
					// more octal digits.
					int value = code - '0';
					final int limit = Math.min(inEnd, pos + 2);
					while (pos < limit && '0' <= in[pos] && in[pos] <= '7') {
						value = (value << 3) | (in[pos] - '0');
						pos++;
					}
					out[len++] = (byte) value;
				} else {
					// Any other code is taken literally, backslash included.
					out[len++] = '\\';
					out[len++] = code;
				}
			}
			return RawParseUtils.decode(UTF_8, out, 0, len);
		}

		/**
		 * Map the byte following a backslash to the byte it stands for.
		 *
		 * @param code
		 *            the byte read after the backslash.
		 * @return the unescaped byte value, or {@code -1} if
		 *         <code>code</code> is not a single-character escape.
		 */
		private static int simpleEscape(byte code) {
			switch (code) {
			case 'a':
				return 0x07; // BEL
			case 'b':
				return '\b';
			case 'f':
				return '\f';
			case 'n':
				return '\n';
			case 'r':
				return '\r';
			case 't':
				return '\t';
			case 'v':
				return 0x0B; // VT
			case '\\':
			case '"':
				return code;
			default:
				return -1;
			}
		}
	}
}