Strings.java

/*
 * Copyright (C) 2014, 2017 Andrey Loskutov <loskutov@gmx.de> and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
package org.eclipse.jgit.ignore.internal;

import static java.lang.Character.isLetter;

import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.eclipse.jgit.errors.InvalidPatternException;
import org.eclipse.jgit.ignore.FastIgnoreRule;
import org.eclipse.jgit.internal.JGitText;

/**
 * Various {@link java.lang.String} related utility methods, written mostly to
 * avoid generation of new String objects (e.g. via splitting Strings etc).
 */
public class Strings {

	static char getPathSeparator(Character pathSeparator) {
		return pathSeparator == null ? FastIgnoreRule.PATH_SEPARATOR
				: pathSeparator.charValue();
	}

	/**
	 * Strip trailing characters
	 *
	 * @param pattern
	 *            non null
	 * @param c
	 *            character to remove
	 * @return new string with all trailing characters removed
	 */
	public static String stripTrailing(String pattern, char c) {
		for (int i = pattern.length() - 1; i >= 0; i--) {
			char charAt = pattern.charAt(i);
			if (charAt != c) {
				if (i == pattern.length() - 1) {
					return pattern;
				}
				return pattern.substring(0, i + 1);
			}
		}
		return ""; //$NON-NLS-1$
	}

	/**
	 * Strip trailing whitespace characters
	 *
	 * @param pattern
	 *            non null
	 * @return new string with all trailing whitespace removed
	 */
	public static String stripTrailingWhitespace(String pattern) {
		for (int i = pattern.length() - 1; i >= 0; i--) {
			char charAt = pattern.charAt(i);
			if (!Character.isWhitespace(charAt)) {
				if (i == pattern.length() - 1) {
					return pattern;
				}
				return pattern.substring(0, i + 1);
			}
		}
		return ""; //$NON-NLS-1$
	}

	/**
	 * Check if pattern is a directory pattern ending with a path separator
	 *
	 * @param pattern
	 *            non null
	 * @return {@code true} if the last character, which is not whitespace, is a
	 *         path separator
	 */
	public static boolean isDirectoryPattern(String pattern) {
		for (int i = pattern.length() - 1; i >= 0; i--) {
			char charAt = pattern.charAt(i);
			if (!Character.isWhitespace(charAt)) {
				return charAt == FastIgnoreRule.PATH_SEPARATOR;
			}
		}
		return false;
	}

	static int count(String s, char c, boolean ignoreFirstLast) {
		int start = 0;
		int count = 0;
		int length = s.length();
		while (start < length) {
			start = s.indexOf(c, start);
			if (start == -1) {
				break;
			}
			if (!ignoreFirstLast || (start != 0 && start != length - 1)) {
				count++;
			}
			start++;
		}
		return count;
	}

	/**
	 * Splits given string to substrings by given separator
	 *
	 * @param pattern
	 *            non null
	 * @param slash
	 *            separator char
	 * @return list of substrings
	 */
	public static List<String> split(String pattern, char slash) {
		int count = count(pattern, slash, true);
		if (count < 1)
			throw new IllegalStateException(
					"Pattern must have at least two segments: " + pattern); //$NON-NLS-1$
		List<String> segments = new ArrayList<>(count);
		int right = 0;
		while (true) {
			int left = right;
			right = pattern.indexOf(slash, right);
			if (right == -1) {
				if (left < pattern.length())
					segments.add(pattern.substring(left));
				break;
			}
			if (right - left > 0)
				if (left == 1)
					// leading slash should remain by the first pattern
					segments.add(pattern.substring(left - 1, right));
				else if (right == pattern.length() - 1)
					// trailing slash should remain too
					segments.add(pattern.substring(left, right + 1));
				else
					segments.add(pattern.substring(left, right));
			right++;
		}
		return segments;
	}

	static boolean isWildCard(String pattern) {
		return pattern.indexOf('*') != -1 || isComplexWildcard(pattern);
	}

	private static boolean isComplexWildcard(String pattern) {
		int idx1 = pattern.indexOf('[');
		if (idx1 != -1) {
			return true;
		}
		if (pattern.indexOf('?') != -1) {
			return true;
		}
		// check if the backslash escapes one of the glob special characters
		// if not, backslash is not part of a regex and treated literally
		int backSlash = pattern.indexOf('\\');
		if (backSlash >= 0) {
			int nextIdx = backSlash + 1;
			if (pattern.length() == nextIdx) {
				return false;
			}
			char nextChar = pattern.charAt(nextIdx);
			if (escapedByBackslash(nextChar)) {
				return true;
			}
			return false;
		}
		return false;
	}

	private static boolean escapedByBackslash(char nextChar) {
		return nextChar == '?' || nextChar == '*' || nextChar == '[';
	}

	static PatternState checkWildCards(String pattern) {
		if (isComplexWildcard(pattern))
			return PatternState.COMPLEX;
		int startIdx = pattern.indexOf('*');
		if (startIdx < 0)
			return PatternState.NONE;

		if (startIdx == pattern.length() - 1)
			return PatternState.TRAILING_ASTERISK_ONLY;
		if (pattern.lastIndexOf('*') == 0)
			return PatternState.LEADING_ASTERISK_ONLY;

		return PatternState.COMPLEX;
	}

	enum PatternState {
		LEADING_ASTERISK_ONLY, TRAILING_ASTERISK_ONLY, COMPLEX, NONE
	}

	static final List<String> POSIX_CHAR_CLASSES = Arrays.asList(
			"alnum", "alpha", "blank", "cntrl", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
			// [:alnum:] [:alpha:] [:blank:] [:cntrl:]
			"digit", "graph", "lower", "print", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
			// [:digit:] [:graph:] [:lower:] [:print:]
			"punct", "space", "upper", "xdigit", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
			// [:punct:] [:space:] [:upper:] [:xdigit:]
			"word" //$NON-NLS-1$
	// [:word:] XXX I don't see it in
	// http://man7.org/linux/man-pages/man7/glob.7.html
	// but this was in org.eclipse.jgit.fnmatch.GroupHead.java ???
			);

	private static final String DL = "\\p{javaDigit}\\p{javaLetter}"; //$NON-NLS-1$

	static final List<String> JAVA_CHAR_CLASSES = Arrays
			.asList("\\p{Alnum}", "\\p{javaLetter}", "\\p{Blank}", "\\p{Cntrl}", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
					// [:alnum:] [:alpha:] [:blank:] [:cntrl:]
					"\\p{javaDigit}", "[\\p{Graph}" + DL + "]", "\\p{Ll}", "[\\p{Print}" + DL + "]", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
					// [:digit:] [:graph:] [:lower:] [:print:]
					"\\p{Punct}", "\\p{Space}", "\\p{Lu}", "\\p{XDigit}", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
					// [:punct:] [:space:] [:upper:] [:xdigit:]
					"[" + DL + "_]" //$NON-NLS-1$ //$NON-NLS-2$
							// [:word:]
			);

	// Collating symbols [[.a.]] or equivalence class expressions [[=a=]] are
	// not supported by CLI git (at least not by 1.9.1)
	static final Pattern UNSUPPORTED = Pattern
			.compile("\\[\\[[.=]\\w+[.=]\\]\\]"); //$NON-NLS-1$

	/**
	 * Conversion from glob to Java regex following two sources: <li>
	 * http://man7.org/linux/man-pages/man7/glob.7.html <li>
	 * org.eclipse.jgit.fnmatch.FileNameMatcher.java Seems that there are
	 * various ways to define what "glob" can be.
	 *
	 * @param pattern
	 *            non null pattern
	 *
	 * @return Java regex pattern corresponding to given glob pattern
	 * @throws InvalidPatternException
	 */
	static Pattern convertGlob(String pattern) throws InvalidPatternException {
		if (UNSUPPORTED.matcher(pattern).find())
			throw new InvalidPatternException(
					"Collating symbols [[.a.]] or equivalence class expressions [[=a=]] are not supported", //$NON-NLS-1$
					pattern);

		StringBuilder sb = new StringBuilder(pattern.length());

		int in_brackets = 0;
		boolean seenEscape = false;
		boolean ignoreLastBracket = false;
		boolean in_char_class = false;
		// 6 is the length of the longest posix char class "xdigit"
		char[] charClass = new char[6];

		for (int i = 0; i < pattern.length(); i++) {
			final char c = pattern.charAt(i);
			switch (c) {

			case '*':
				if (seenEscape || in_brackets > 0)
					sb.append(c);
				else
					sb.append('.').append(c);
				break;

			case '(': // fall-through
			case ')': // fall-through
			case '{': // fall-through
			case '}': // fall-through
			case '+': // fall-through
			case '$': // fall-through
			case '^': // fall-through
			case '|':
				if (seenEscape || in_brackets > 0)
					sb.append(c);
				else
					sb.append('\\').append(c);
				break;

			case '.':
				if (seenEscape)
					sb.append(c);
				else
					sb.append('\\').append('.');
				break;

			case '?':
				if (seenEscape || in_brackets > 0)
					sb.append(c);
				else
					sb.append('.');
				break;

			case ':':
				if (in_brackets > 0)
					if (lookBehind(sb) == '['
							&& isLetter(lookAhead(pattern, i)))
						in_char_class = true;
				sb.append(':');
				break;

			case '-':
				if (in_brackets > 0) {
					if (lookAhead(pattern, i) == ']')
						sb.append('\\').append(c);
					else
						sb.append(c);
				} else
					sb.append('-');
				break;

			case '\\':
				if (in_brackets > 0) {
					char lookAhead = lookAhead(pattern, i);
					if (lookAhead == ']' || lookAhead == '[')
						ignoreLastBracket = true;
				} else {
					//
					char lookAhead = lookAhead(pattern, i);
					if (lookAhead != '\\' && lookAhead != '['
							&& lookAhead != '?' && lookAhead != '*'
							&& lookAhead != ' ' && lookBehind(sb) != '\\') {
						break;
					}
				}
				sb.append(c);
				break;

			case '[':
				if (in_brackets > 0) {
					if (!seenEscape) {
						sb.append('\\');
					}
					sb.append('[');
					ignoreLastBracket = true;
				} else {
					if (!seenEscape) {
						in_brackets++;
						ignoreLastBracket = false;
					}
					sb.append('[');
				}
				break;

			case ']':
				if (seenEscape) {
					sb.append(']');
					ignoreLastBracket = true;
					break;
				}
				if (in_brackets <= 0) {
					sb.append('\\').append(']');
					ignoreLastBracket = true;
					break;
				}
				char lookBehind = lookBehind(sb);
				if ((lookBehind == '[' && !ignoreLastBracket)
						|| lookBehind == '^') {
					sb.append('\\');
					sb.append(']');
					ignoreLastBracket = true;
				} else {
					ignoreLastBracket = false;
					if (!in_char_class) {
						in_brackets--;
						sb.append(']');
					} else {
						in_char_class = false;
						String charCl = checkPosixCharClass(charClass);
						// delete last \[:: chars and set the pattern
						if (charCl != null) {
							sb.setLength(sb.length() - 4);
							sb.append(charCl);
						}
						reset(charClass);
					}
				}
				break;

			case '!':
				if (in_brackets > 0) {
					if (lookBehind(sb) == '[')
						sb.append('^');
					else
						sb.append(c);
				} else
					sb.append(c);
				break;

			default:
				if (in_char_class)
					setNext(charClass, c);
				else
					sb.append(c);
				break;
			} // end switch

			seenEscape = c == '\\';

		} // end for

		if (in_brackets > 0)
			throw new InvalidPatternException("Not closed bracket?", pattern); //$NON-NLS-1$
		try {
			return Pattern.compile(sb.toString(), Pattern.DOTALL);
		} catch (PatternSyntaxException e) {
			throw new InvalidPatternException(
					MessageFormat.format(JGitText.get().invalidIgnoreRule,
							pattern),
					pattern, e);
		}
	}

	/**
	 * @param buffer
	 * @return zero of the buffer is empty, otherwise the last character from
	 *         buffer
	 */
	private static char lookBehind(StringBuilder buffer) {
		return buffer.length() > 0 ? buffer.charAt(buffer.length() - 1) : 0;
	}

	/**
	 * @param pattern
	 * @param i
	 *            current pointer in the pattern
	 * @return zero of the index is out of range, otherwise the next character
	 *         from given position
	 */
	private static char lookAhead(String pattern, int i) {
		int idx = i + 1;
		return idx >= pattern.length() ? 0 : pattern.charAt(idx);
	}

	private static void setNext(char[] buffer, char c) {
		for (int i = 0; i < buffer.length; i++)
			if (buffer[i] == 0) {
				buffer[i] = c;
				break;
			}
	}

	private static void reset(char[] buffer) {
		for (int i = 0; i < buffer.length; i++)
			buffer[i] = 0;
	}

	private static String checkPosixCharClass(char[] buffer) {
		for (int i = 0; i < POSIX_CHAR_CLASSES.size(); i++) {
			String clazz = POSIX_CHAR_CLASSES.get(i);
			boolean match = true;
			for (int j = 0; j < clazz.length(); j++)
				if (buffer[j] != clazz.charAt(j)) {
					match = false;
					break;
				}
			if (match)
				return JAVA_CHAR_CLASSES.get(i);
		}
		return null;
	}

	static String deleteBackslash(String s) {
		if (s.indexOf('\\') < 0) {
			return s;
		}
		StringBuilder sb = new StringBuilder(s.length());
		for (int i = 0; i < s.length(); i++) {
			char ch = s.charAt(i);
			if (ch == '\\') {
				if (i + 1 == s.length()) {
					continue;
				}
				char next = s.charAt(i + 1);
				if (next == '\\') {
					sb.append(ch);
					i++;
					continue;
				}
				if (!escapedByBackslash(next)) {
					continue;
				}
			}
			sb.append(ch);
		}
		return sb.toString();
	}

}