利用 Jsoup 高亮 HTML 页面中的关键词

代码

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demo that highlights a keyword inside an HTML document with jsoup by wrapping every
 * case-insensitive occurrence found in the document's text in a {@code <mark>} element.
 */
public class Test {
    /**
     * Parses {@code test.html} and highlights the keyword {@code "hello"} in its body.
     *
     * @param args unused
     * @throws IOException if parsing {@code test.html} fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        String keyword = "hello";
        Document document = Jsoup.parse(new File("test.html"));
        List<Node> childs = document.body().childNodes();
        recursion(childs, keyword);
        // NOTE: the highlighted tree only exists in memory at this point; serialize it
        // (e.g. document.html()) if the result needs to be written somewhere.
    }

    /**
     * Walks the given nodes depth-first and replaces every text node that contains the
     * keyword with a {@code <span>} in which each occurrence is wrapped in {@code <mark>}.
     *
     * <p>The keyword is matched literally (regex metacharacters have no special meaning)
     * and case-insensitively; the matched text keeps its original casing.
     *
     * @param nodes   nodes to scan; {@code null} is treated as nothing to do
     * @param keyword literal text to highlight; {@code null} or empty highlights nothing
     */
    private static void recursion(List<Node> nodes, String keyword) {
        // An empty pattern would match at every position and produce empty <mark> elements.
        if (nodes == null || keyword == null || keyword.isEmpty()) {
            return;
        }
        // Compile once for the whole traversal; quote so the keyword is never read as a regex.
        Pattern pattern = Pattern.compile(
                Pattern.quote(keyword), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
        highlight(nodes, pattern);
    }

    /** Recursively visits the nodes, highlighting text nodes and descending into the rest. */
    private static void highlight(List<Node> nodes, Pattern pattern) {
        // Index-based loop: highlightTextNode swaps the current entry of the parent's child
        // list in place, so no iterator is held across that change.
        for (int i = 0; i < nodes.size(); i++) {
            Node node = nodes.get(i);
            if (node instanceof TextNode) {
                highlightTextNode((TextNode) node, pattern);
            } else {
                highlight(node.childNodes(), pattern);
            }
        }
    }

    /** Replaces the text node with a span whose pattern matches are wrapped in mark elements. */
    private static void highlightTextNode(TextNode textNode, Pattern pattern) {
        // getWholeText() keeps the original whitespace, unlike text().
        String text = textNode.getWholeText();
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return;
        }
        Element span = new Element(Tag.valueOf("span"), "");
        int last = 0;
        do {
            // Append the pieces as text/element nodes rather than as an HTML string, so
            // characters such as '<' or '&' in the page text are not re-parsed as markup.
            if (matcher.start() > last) {
                span.appendText(text.substring(last, matcher.start()));
            }
            span.appendElement("mark").text(matcher.group());
            last = matcher.end();
        } while (matcher.find());
        if (last < text.length()) {
            span.appendText(text.substring(last));
        }
        textNode.replaceWith(span);
    }
}
posted @ 2022-07-10 16:59  小小爬虫  阅读(65)  评论(0)  编辑  收藏  举报