java- truncateHTML. , HTML.
public static String truncateHTML(String text, int length, String suffix) {
if (text.replaceAll("<.*?>", "").length() <= length) {
return text;
}
String result = "";
boolean trimmed = false;
if (suffix == null) {
suffix = "...";
}
Pattern tagPattern = Pattern.compile("(<.+?>)?([^<>]*)");
Pattern emptyTagPattern = Pattern.compile("^<\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param).*>$");
Pattern closingTagPattern = Pattern.compile("^<\\s*/\\s*([a-zA-Z]+[1-6]?)\\s*>$");
Pattern openingTagPattern = Pattern.compile("^<\\s*([a-zA-Z]+[1-6]?).*?>$");
Pattern entityPattern = Pattern.compile("(&[0-9a-z]{2,8};|&#[0-9]{1,7};|[0-9a-f]{1,6};)");
Matcher tagMatcher = tagPattern.matcher(text);
int numTags = tagMatcher.groupCount();
int totalLength = suffix.length();
List<String> openTags = new ArrayList<String>();
boolean proposingChop = false;
while (tagMatcher.find()) {
String tagText = tagMatcher.group(1);
String plainText = tagMatcher.group(2);
if (proposingChop &&
tagText != null && tagText.length() != 0 &&
plainText != null && plainText.length() != 0) {
trimmed = true;
break;
}
if (tagText != null && tagText.length() > 0) {
boolean foundMatch = false;
Matcher matcher = emptyTagPattern.matcher(tagText);
if (matcher.find()) {
foundMatch = true;
}
if (!foundMatch) {
matcher = closingTagPattern.matcher(tagText);
if (matcher.find()) {
foundMatch = true;
String tagName = matcher.group(1);
openTags.remove(tagName.toLowerCase());
}
}
if (!foundMatch) {
matcher = openingTagPattern.matcher(tagText);
if (matcher.find()) {
String tagName = matcher.group(1);
openTags.add(0, tagName.toLowerCase());
}
}
result += tagText;
}
int contentLength = plainText.replaceAll("&[0-9a-z]{2,8};|&#[0-9]{1,7};|[0-9a-f]{1,6};", " ").length();
if (totalLength + contentLength > length) {
int numCharsRemaining = length - totalLength;
int entitiesLength = 0;
Matcher entityMatcher = entityPattern.matcher(plainText);
while (entityMatcher.find()) {
String entity = entityMatcher.group(1);
if (numCharsRemaining > 0) {
numCharsRemaining--;
entitiesLength += entity.length();
} else {
break;
}
}
int proposedChopPosition = numCharsRemaining + entitiesLength;
int endOfWordPosition = plainText.indexOf(" ", proposedChopPosition-1);
if (endOfWordPosition == -1) {
endOfWordPosition = plainText.length();
}
int endOfWordOffset = endOfWordPosition - proposedChopPosition;
if (endOfWordOffset > 6) {
endOfWordOffset = 0;
}
proposedChopPosition = numCharsRemaining + entitiesLength + endOfWordOffset;
if (plainText.length() >= proposedChopPosition) {
result += plainText.substring(0, proposedChopPosition);
proposingChop = true;
if (proposedChopPosition < plainText.length()) {
trimmed = true;
break;
}
} else {
result += plainText;
}
} else {
result += plainText;
totalLength += contentLength;
}
if(totalLength >= length) {
trimmed = true;
break;
}
}
for (String openTag : openTags) {
result += "</" + openTag + ">";
}
if (trimmed) {
result += suffix;
}
return result;
}