公司有一个虫库需要完善虫子的信息,于是用 jsoup 写了一个抓取百度百科词条内容的小程序:先引入下面的 Maven 依赖,然后是完整的 Java 代码。

 

        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.9.1</version>
        </dependency>
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ListIterator;

/**
 * Scrapes a Baidu Baike entry and converts it to simple HTML fragments.
 *
 * <p>The entry page is fetched with jsoup, a fixed set of non-content nodes is removed from the
 * document, and then the summary and every chapter are appended to a {@link StringBuilder} as
 * {@code <p>} elements carrying {@code ql-*} CSS classes. Everything that is collected is also
 * echoed to {@code System.err} for debugging.
 *
 * @version V1.0
 * @author zhang
 */
public class BaiKeiCap {

    /** Base URL of a Baidu Baike entry; the entry name is appended to it. */
    private static final String BAIKE_ITEM_URL = "https://baike.baidu.com/item/";

    /**
     * CSS classes of nodes that are stripped before text extraction. According to the original
     * author's notes these are, in order: images, image descriptions, the title prefix, the
     * "broadcast" and "edit" controls, and the paragraph sub-title decorations.
     */
    private static final String[] NOISE_CLASSES = {
        "sup--normal",
        "description",
        "title-prefix",
        "J-part-audio-text",
        "wiki-lemma-icons_edit-lemma",
        "anchor-list",
        "edit-icon",
        "lemma-anchor"
    };

    /** Message printed when the entry could not be fetched or has no summary. */
    private static final String NOT_FOUND_MESSAGE = "未搜索到内容";

    /** Separator used around headings in the stderr debug output. */
    private static final String BANNER =
            "==========================================================";

    public static void main(String[] args) throws IOException {
        start("肾毒蛾");
    }

    /**
     * Fetches the Baidu Baike entry for {@code insect} and renders it as HTML fragments.
     *
     * @param insect entry name appended to the Baike item URL
     * @return the collected HTML; empty when the name is blank, the page cannot be fetched, or the
     *     page has no {@code lemma-summary} element; possibly partial if a failure happens midway
     */
    private static StringBuilder start(String insect) {
        StringBuilder stringBuilder = new StringBuilder();
        if (insect == null || insect.trim().isEmpty()) {
            System.err.println(NOT_FOUND_MESSAGE);
            return stringBuilder;
        }
        try {
            Document document = Jsoup.connect(BAIKE_ITEM_URL + insect).get();
            removeNoise(document);
            if (!appendSummary(document, stringBuilder)) {
                // A page without a summary is treated as "entry not found", as before.
                System.err.println(NOT_FOUND_MESSAGE);
                return stringBuilder;
            }
            appendChapters(document, stringBuilder);
        } catch (Exception e) {
            // Best-effort boundary: keep returning whatever was collected, but do not hide
            // the cause of the failure.
            System.err.println(NOT_FOUND_MESSAGE);
            System.err.println(e);
        }
        return stringBuilder;
    }

    /** Detaches every node whose class is listed in {@link #NOISE_CLASSES} from the document. */
    private static void removeNoise(Document document) {
        for (String cssClass : NOISE_CLASSES) {
            document.getElementsByClass(cssClass).remove();
        }
    }

    /**
     * Appends the "简介" heading and the children of the first {@code lemma-summary} element.
     *
     * @return {@code false} (appending nothing) when the document has no summary element
     */
    private static boolean appendSummary(Document document, StringBuilder out) {
        Elements intro = document.getElementsByClass("lemma-summary");
        if (intro.isEmpty()) {
            return false;
        }
        System.err.println(
                "==========================================================简介==========================================================");
        out.append("<p class=\"ql-align-center\"><span class=\"ql-size-huge\">简介</span></p>");
        for (Element paragraph : intro.get(0).children()) {
            String text = paragraph.text();
            // Skip the "overview image source" footnote.
            if (text.contains("概述图参考来源")) {
                continue;
            }
            System.err.println(text + "<p>");
            appendParagraph(out, text);
        }
        return true;
    }

    /** Appends every {@code J-chapter} heading followed by the content of that chapter. */
    private static void appendChapters(Document document, StringBuilder out) {
        for (Element chapter : document.getElementsByClass("J-chapter")) {
            String chapterTitle = chapter.select(".title-text").text();
            System.err.println(BANNER + chapterTitle + BANNER);
            appendLargeHeading(out, chapterTitle);
            // The chapter body is the run of siblings that follows the chapter heading.
            getSub(chapter.nextElementSibling(), out);
        }
    }

    /**
     * Walks {@code element} and its following siblings, appending sub-titles and paragraphs, and
     * stops at the next chapter heading, at the {@code J-main-content-end-dom} marker, or when the
     * siblings run out.
     *
     * @param element first sibling after a chapter heading; may be {@code null}
     * @param stringBuilder destination for the generated HTML
     */
    private static void getSub(Element element, StringBuilder stringBuilder) {
        // Iterative on purpose: no stack growth on long sibling runs, and a missing end marker
        // simply ends the walk instead of raising a NullPointerException.
        for (Element current = element; current != null; current = current.nextElementSibling()) {
            if (current.hasClass("J-chapter")
                    || "J-main-content-end-dom".equals(current.attr("id"))) {
                return;
            }
            if (current.hasClass("para-title")
                    && current.hasAttr("label-module")
                    && current.hasAttr("data-index")) {
                String subTitle = current.text();
                System.err.println(BANNER + subTitle + BANNER);
                appendLargeHeading(stringBuilder, subTitle);
            }
            if (current.hasClass("para") && current.hasAttr("label-module")) {
                String text = current.text();
                System.err.println(text);
                appendParagraph(stringBuilder, text);
            }
        }
    }

    /** Appends a centered, large heading paragraph. */
    private static void appendLargeHeading(StringBuilder out, String title) {
        out.append("<p class=\"ql-align-center\"><span class=\"ql-size-large\">");
        out.append(escapeHtml(title));
        out.append("</span></p>");
    }

    /** Appends one body paragraph as a well-formed {@code <p>} element. */
    private static void appendParagraph(StringBuilder out, String text) {
        out.append("<p>  ").append(escapeHtml(text)).append("</p>");
    }

    /** Escapes the characters that would otherwise be parsed as markup inside element content. */
    private static String escapeHtml(String text) {
        StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}