How to replace words with span tag using jsoup?
Suppose I have the following html:
<html> <head> </head> <body> <div id="wrapper" > <div class="s2">I am going <a title="some title" href="">by flying</a> <p>mr tt</p> </div> </div> </body> </html> Any word in a text node that is 4 or more characters long (for example, the word "going") should be replaced with HTML content (not text), <span>going</span>, in the original HTML, without changing anything else.
If I try something like element.html(replacement), the problem is that when the current element is <div class="s2">, it will also wipe out <a title="some title"
In this case, you must traverse your document as suggested by this answer. Here is how you can do it using the Jsoup APIs:
NodeTraversor and NodeVisitor allow you to navigate the DOM; Node.replaceWith(...) allows you to replace a node in the DOM.
Here is the code:
/**
 * Demo for wrapping every "long" word found in the text nodes of an HTML
 * document in a {@code <span>} while leaving the surrounding markup alone.
 *
 * <p>Text nodes are collected first and replaced afterwards, so the tree is
 * never modified while it is being traversed.
 *
 * <p>NOTE(review): the replacement is a {@link DataNode} holding raw markup, so
 * the spans show up in the serialized output but are presumably not real
 * {@code Element}s in the DOM - confirm if later DOM queries must see them.
 */
public class JsoupReplacer {

    /** A word is wrapped when it has at least this many characters ("equal to or greater than 4"). */
    private static final int MIN_WORD_LENGTH = 4;

    public static void main(String[] args) {
        so6527876();
    }

    /** Parses the sample document, wraps the long words and prints the result. */
    public static void so6527876() {
        String html = "<html>" + "<head>" + "</head>" + "<body>" + " <div id=\"wrapper\" >"
                + " <div class=\"s2\">I am going <a title=\"some title\" href=\"\">by flying</a>"
                + " <p>mr tt</p>" + " </div> " + " </div>" + "</body> " + "</html>";
        Document doc = Jsoup.parse(html);

        // Pass 1: remember every text node that holds at least one long word.
        final List<TextNode> nodesToChange = new ArrayList<TextNode>();
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
            @Override
            public void tail(Node node, int depth) {
                if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    if (containsLongWord(textNode.getWholeText())) {
                        nodesToChange.add(textNode);
                    }
                }
            }

            @Override
            public void head(Node node, int depth) {
                // nothing to do on the way down
            }
        });
        nd.traverse(doc.body());

        // Pass 2: swap the collected nodes once the traversal is finished.
        for (TextNode textNode : nodesToChange) {
            textNode.replaceWith(buildElementForText(textNode));
        }

        System.out.println("result: ");
        System.out.println();
        System.out.println(doc);
    }

    /**
     * Builds the replacement for a text node: the same text with every long
     * word wrapped in {@code <span>...</span>}.
     *
     * @param textNode the text node to convert
     * @return a data node carrying the generated markup
     */
    private static Node buildElementForText(TextNode textNode) {
        return new DataNode(wrapLongWords(textNode.getWholeText()), textNode.baseUri());
    }

    /** Returns true when {@code text} has a run of at least MIN_WORD_LENGTH non-whitespace characters. */
    private static boolean containsLongWord(String text) {
        int run = 0;
        for (int i = 0; i < text.length(); i++) {
            run = Character.isWhitespace(text.charAt(i)) ? 0 : run + 1;
            if (run >= MIN_WORD_LENGTH) {
                return true;
            }
        }
        return false;
    }

    /**
     * Rewrites {@code text} as markup. Whitespace runs are copied verbatim,
     * words are HTML-escaped, and words of at least MIN_WORD_LENGTH characters
     * are wrapped in a span. Walking the text run by run (instead of calling
     * String.replaceAll per word) avoids regex metacharacter problems, never
     * touches a substring of another word and never wraps a word twice.
     */
    private static String wrapLongWords(String text) {
        StringBuilder out = new StringBuilder(text.length() + 16);
        int length = text.length();
        int pos = 0;
        while (pos < length) {
            int start = pos;
            boolean whitespace = Character.isWhitespace(text.charAt(pos));
            while (pos < length && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                pos++;
            }
            String run = text.substring(start, pos);
            if (whitespace) {
                out.append(run);
            } else if (run.length() >= MIN_WORD_LENGTH) {
                out.append("<span>").append(escapeHtml(run)).append("</span>");
            } else {
                out.append(escapeHtml(run));
            }
        }
        return out.toString();
    }

    /**
     * Escapes the characters that would otherwise be read as markup. Needed
     * because getWholeText() returns decoded text and the result is emitted raw.
     */
    private static String escapeHtml(String text) {
        StringBuilder out = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }
}
I think you need to cross the tree. The result of text () on the element will be the entire text of the element, including text in child elements. We hope you enjoy the following code:
import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

/**
 * Reads {@code test.html}, re-serializes its body by hand and wraps every word
 * of four or more characters found in a text node in {@code <span>...</span>}.
 * Elements, attributes and whitespace are written back as they were parsed.
 */
public class ScreenScrape {

    /** A word is wrapped when it has at least this many characters. */
    private static final int MIN_WORD_LENGTH = 4;

    public static void main(String[] args) throws IOException {
        // Name the encoding explicitly so the result does not depend on the platform default.
        String content = FileUtils.readFileToString(new File("test.html"), "UTF-8");
        Document doc = Jsoup.parse(content);
        Element body = doc.body();
        //System.out.println(body.toString());
        StringBuilder sb = new StringBuilder();
        traverse(body, sb);
        System.out.println(sb.toString());
    }

    /**
     * Appends the markup for {@code n} and all of its descendants to {@code sb}.
     *
     * @param n  the node to serialize
     * @param sb receives the generated markup
     */
    private static void traverse(Node n, StringBuilder sb) {
        if (n instanceof TextNode) {
            // Whole text (not text()) so that the spacing next to inline elements survives.
            sb.append(spanifyText(((TextNode) n).getWholeText()));
            return;
        }
        if (!(n instanceof Element)) {
            // Comments, script/style data and the like are copied through untouched.
            sb.append(n.outerHtml());
            return;
        }

        Element element = (Element) n;
        sb.append('<').append(element.nodeName());
        if (element.attributes().size() > 0) {
            sb.append(element.attributes().toString());
        }
        sb.append('>');

        if (element.tag().isEmpty()) {
            // Void elements such as <br> or <img> have no content and no end tag.
            return;
        }
        for (Node child : element.childNodes()) {
            traverse(child, sb);
        }
        sb.append("</").append(element.nodeName()).append('>');
    }

    /**
     * Converts plain text to markup. Whitespace runs are kept verbatim, words
     * are HTML-escaped, and words of at least MIN_WORD_LENGTH characters are
     * wrapped in a span. Empty input yields an empty string.
     *
     * @param text decoded text of a text node
     * @return the markup to emit for that text
     */
    private static String spanifyText(String text) {
        StringBuilder sb = new StringBuilder(text.length() + 16);
        int length = text.length();
        int pos = 0;
        while (pos < length) {
            int start = pos;
            boolean whitespace = Character.isWhitespace(text.charAt(pos));
            while (pos < length && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                pos++;
            }
            String run = text.substring(start, pos);
            if (whitespace) {
                sb.append(run);
            } else if (run.length() >= MIN_WORD_LENGTH) {
                sb.append("<span>").append(escapeHtml(run)).append("</span>");
            } else {
                sb.append(escapeHtml(run));
            }
        }
        return sb.toString();
    }

    /** Escapes the characters that would otherwise be read as markup. */
    private static String escapeHtml(String text) {
        StringBuilder out = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }
}
UPDATE
Using Jonathan's new Jsoup method List<TextNode> Element.textNodes() and combining it with MarcoS's suggested NodeTraversor / NodeVisitor approach, I came up with the following (although I modify the tree while traversing it - maybe a bad idea):
Document doc = Jsoup.parse(content);
Element body = doc.body();

// Wraps every word longer than three characters in a real <span> element.
// The work happens in tail(), i.e. after an element's children have been visited.
NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
    @Override
    public void tail(Node node, int depth) {
        if (!(node instanceof Element)) {
            return;
        }
        Element elem = (Element) node;
        for (TextNode tn : elem.textNodes()) {
            ArrayList<Node> pieces = split(tn.getWholeText(), elem.baseUri());
            if (pieces.isEmpty()) {
                continue; // no long word: leave the text node as it is
            }
            // Put the first piece where the text node was, then chain the rest behind it.
            Node currentNode = pieces.remove(0);
            tn.replaceWith(currentNode);
            for (Node n : pieces) {
                currentNode.after(n);
                currentNode = n;
            }
        }
    }

    @Override
    public void head(Node node, int depth) {
        // nothing to do on the way down
    }

    /**
     * Splits text into alternating plain text nodes and span elements.
     * Whitespace and short words are accumulated into text nodes exactly as
     * they appear, so the spacing around a wrapped word is preserved.
     * Returns an empty list when the text contains no long word.
     */
    private ArrayList<Node> split(String text, String baseUri) {
        ArrayList<Node> pieces = new ArrayList<Node>();
        StringBuilder plain = new StringBuilder();
        int length = text.length();
        int pos = 0;
        while (pos < length) {
            int start = pos;
            boolean whitespace = Character.isWhitespace(text.charAt(pos));
            while (pos < length && Character.isWhitespace(text.charAt(pos)) == whitespace) {
                pos++;
            }
            String run = text.substring(start, pos);
            if (!whitespace && run.length() > 3) {
                if (plain.length() > 0) {
                    pieces.add(new TextNode(plain.toString(), baseUri));
                    plain.setLength(0);
                }
                Element span = new Element(Tag.valueOf("span"), baseUri);
                span.appendText(run);
                pieces.add(span);
            } else {
                plain.append(run);
            }
        }
        // pieces is still empty here exactly when no long word was found.
        if (!pieces.isEmpty() && plain.length() > 0) {
            pieces.add(new TextNode(plain.toString(), baseUri));
        }
        return pieces;
    }
});
nd.traverse(body);
System.out.println(body.toString());
I replace hello hello (span tag)
// Highlights every occurrence of "hello" by wrapping it in a blue <span>.
// Only text nodes are rewritten, so child elements and their attributes stay
// intact; calling element.html(element.text()...) would flatten them.
Document doc = Jsoup.parse(content);
Element test = doc.body();
final String target = "hello";
Elements elemenets = test.getAllElements();
for (int i = 0; i < elemenets.size(); i++) {
    Element element = elemenets.get(i);
    for (TextNode textNode : element.textNodes()) {
        String text = textNode.getWholeText();
        int hit = text.indexOf(target);
        if (hit < 0) {
            continue;
        }
        String baseUri = element.baseUri();
        int from = 0;
        while (hit >= 0) {
            if (hit > from) {
                // Plain text in front of the match is kept as a text node.
                textNode.before(new TextNode(text.substring(from, hit), baseUri));
            }
            Element span = new Element(Tag.valueOf("span"), baseUri);
            span.attr("style", "color:blue");
            span.appendText(target);
            textNode.before(span);
            from = hit + target.length();
            hit = text.indexOf(target, from);
        }
        if (from < text.length()) {
            textNode.before(new TextNode(text.substring(from), baseUri));
        }
        // Everything has been re-inserted in front of the original node; drop it.
        textNode.remove();
    }
}