.
After going through all the related answers on SO, I realized that this case needs a small lexer, so I wrote a general-purpose implementation. It takes one delimiter, any number of quote patterns, and any number of left/right bracket pairs, each given as a regular expression; delimiters that occur inside quotes or brackets do not split the string.
/**
 * Splits {@code string} around matches of {@code regex}, except where the match lies inside
 * a quoted section or inside an open bracket pair.
 *
 * <p>All patterns are regular expressions. A quote is closed by the next match of the same
 * quote pattern that opened it; while a quote is open, bracket patterns are ignored. A right
 * bracket closes the innermost open left bracket only when both have the same array index;
 * a right bracket that does not match (or appears with no bracket open) is kept as ordinary
 * text. Quote and bracket text is preserved in the returned segments. Trailing empty segments
 * are removed.
 *
 * @param string             the text to split
 * @param regex              the delimiter pattern
 * @param quotesRegex        quote patterns; each one both opens and closes a quoted section
 * @param leftBracketsRegex  opening bracket patterns
 * @param rightBracketsRegex closing bracket patterns; element {@code i} closes
 *                           {@code leftBracketsRegex[i]}
 * @return the segments of {@code string} between top-level delimiter matches
 * @throws IllegalArgumentException if the bracket arrays differ in length, if a pattern ends
 *         in a single trailing backslash, or if a pattern is not a valid regular expression
 */
public static List<String> split(String string, String regex, String[] quotesRegex, String[] leftBracketsRegex,
        String[] rightBracketsRegex) {
    if (leftBracketsRegex.length != rightBracketsRegex.length) {
        throw new IllegalArgumentException("Bracket count mismatch, left: " + leftBracketsRegex.length + ", right: "
                + rightBracketsRegex.length);
    }

    // Layout: [delimiter, quotes..., left brackets..., right brackets...]
    String[] delimiters = new String[1 + quotesRegex.length + leftBracketsRegex.length + rightBracketsRegex.length];
    delimiters[0] = regex;
    System.arraycopy(quotesRegex, 0, delimiters, 1, quotesRegex.length);
    System.arraycopy(leftBracketsRegex, 0, delimiters, 1 + quotesRegex.length, leftBracketsRegex.length);
    System.arraycopy(rightBracketsRegex, 0, delimiters, 1 + quotesRegex.length + leftBracketsRegex.length,
            rightBracketsRegex.length);

    // Build "(?:(d0)|(d1)|...)" and remember the capturing-group number that wraps each
    // delimiter. Caller-supplied patterns may contain capturing groups of their own, so the
    // wrapper of delimiter i is not necessarily group i + 1.
    int[] wrapperGroup = new int[delimiters.length];
    int nextGroup = 1;
    StringBuilder delimitersRegexBuilder = new StringBuilder("(?:");
    for (int i = 0; i < delimiters.length; ++i) {
        String delimiter = delimiters[i];
        // A lone trailing backslash would escape the ")" that closes the wrapper group.
        if (delimiter.endsWith("\\") && !delimiter.endsWith("\\\\")) {
            throw new IllegalArgumentException("Delimiter contains trailing single \\: " + delimiter);
        }
        if (i > 0) {
            delimitersRegexBuilder.append("|");
        }
        delimitersRegexBuilder
                .append("(")
                .append(delimiter)
                .append(")");
        wrapperGroup[i] = nextGroup;
        nextGroup += 1 + Pattern.compile(delimiter).matcher("").groupCount();
    }
    delimitersRegexBuilder.append(")");

    int pendingQuoteIndex = -1; // index of the quote pattern currently open, or -1
    Deque<Integer> bracketStack = new LinkedList<>(); // indices of the open left brackets
    StringBuilder pendingSegmentBuilder = new StringBuilder();
    List<String> segmentList = new ArrayList<>();
    Matcher matcher = Pattern.compile(delimitersRegexBuilder.toString()).matcher(string);
    int matcherIndex = 0;
    while (matcher.find()) {
        pendingSegmentBuilder.append(string, matcherIndex, matcher.start());

        // Exactly one alternative matched; find which delimiter it belongs to.
        int delimiterIndex = 0;
        for (int i = 0; i < wrapperGroup.length; ++i) {
            if (matcher.group(wrapperGroup[i]) != null) {
                delimiterIndex = i;
                break;
            }
        }

        if (delimiterIndex == 0) {
            if (pendingQuoteIndex == -1 && bracketStack.isEmpty()) {
                // Top-level delimiter: finish the current segment.
                segmentList.add(pendingSegmentBuilder.toString());
                pendingSegmentBuilder.setLength(0);
            } else {
                // Delimiter nested in a quote or bracket: part of the segment text.
                pendingSegmentBuilder.append(matcher.group());
            }
        } else {
            pendingSegmentBuilder.append(matcher.group());
            int quoteIndex = delimiterIndex - 1;
            if (quoteIndex < quotesRegex.length) {
                if (pendingQuoteIndex == -1) {
                    pendingQuoteIndex = quoteIndex;
                } else if (pendingQuoteIndex == quoteIndex) {
                    pendingQuoteIndex = -1;
                }
                // A different quote pattern inside an open quote is plain text.
            } else if (pendingQuoteIndex == -1) {
                int bracketIndex = quoteIndex - quotesRegex.length;
                if (bracketIndex < leftBracketsRegex.length) {
                    bracketStack.push(bracketIndex);
                } else {
                    bracketIndex -= leftBracketsRegex.length;
                    // peek() returns null on an empty stack; an unmatched right bracket is
                    // treated as ordinary text instead of failing.
                    Integer topBracket = bracketStack.peek();
                    if (topBracket != null && topBracket == bracketIndex) {
                        bracketStack.pop();
                    }
                }
            }
        }
        matcherIndex = matcher.end();
    }
    pendingSegmentBuilder.append(string, matcherIndex, string.length());
    segmentList.add(pendingSegmentBuilder.toString());

    // Drop trailing empty segments.
    while (!segmentList.isEmpty() && segmentList.get(segmentList.size() - 1).isEmpty()) {
        segmentList.remove(segmentList.size() - 1);
    }
    return segmentList;
}
source
share