公司有一个虫库需要完善虫子的信息,于是写了下面这个基于 jsoup 的百度百科抓取程序。
<dependency> <!-- jsoup HTML parser library @ https://jsoup.org/ --> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.9.1</version> </dependency>
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ListIterator; /** * @description 百度百科数据抓取 * @version V1.0 * @author zhang * @date * @update */ public class BaiKeiCap { public static void main(String[] args) throws IOException { start("肾毒蛾"); } private static StringBuilder start(String insect) { StringBuilder stringBuilder = new StringBuilder(); try { Document document = Jsoup.connect("https://baike.baidu.com/item/" + insect).get(); Elements description = document.getElementsByClass("description"); Elements supNormal = document.getElementsByClass("sup--normal"); Elements titlePrefix = document.getElementsByClass("title-prefix"); Elements audio = document.getElementsByClass("J-part-audio-text"); Elements lemma = document.getElementsByClass("wiki-lemma-icons_edit-lemma"); Elements anchorList = document.getElementsByClass("anchor-list"); Elements editIcon = document.getElementsByClass("edit-icon"); Elements lemmaAnchor = document.getElementsByClass("lemma-anchor"); // 去除图片 supNormal.remove(); // 去除图片描述 description.remove(); // 去除标题前缀 titlePrefix.remove(); // 移除编辑和播报 audio.remove(); lemma.remove(); // 移除段落小标题 anchorList.remove(); editIcon.remove(); lemmaAnchor.remove(); Elements intro = document.getElementsByClass("lemma-summary"); Elements introChildren = intro.get(0).children(); ListIterator<Element> elementListIterator = introChildren.listIterator(); System.err.println( "==========================================================简介=========================================================="); stringBuilder.append( "<p class=\"ql-align-center\"><span class=\"ql-size-huge\">简介</span></p>"); while (elementListIterator.hasNext()) { Element next = elementListIterator.next(); if (next.text().contains("概述图参考来源")) { continue; } System.err.println(next.text() + "<p>"); } // 获取到所有章节 Elements chapter = document.getElementsByClass("J-chapter"); 
ListIterator<Element> chapterIterator = chapter.listIterator(); // 遍历每个章节 while (chapterIterator.hasNext()) { // 获取他的兄弟元素 Element next = chapterIterator.next(); // 获取章节标题 String chapterTitle = next.select(".title-text").text(); System.err.println( "==========================================================" + chapterTitle + "=========================================================="); stringBuilder.append("<p class=\"ql-align-center\"><span class=\"ql-size-large\">"); stringBuilder.append(chapterTitle); stringBuilder.append("</span></p>"); getSub(next.nextElementSibling(), stringBuilder); } return stringBuilder; } catch (Exception e) { System.err.println("未搜索到内容"); return stringBuilder; } } private static void getSub(Element element, StringBuilder stringBuilder) { if (element.hasClass("J-chapter") || "J-main-content-end-dom".equals(element.attr("id"))) { return; } if (element.hasClass("para-title") && element.hasAttr("label-module") && element.hasAttr("data-index")) { System.err.println( "==========================================================" + element.text() + "=========================================================="); stringBuilder.append("<p class=\"ql-align-center\"><span class=\"ql-size-large\">"); stringBuilder.append(element.text()); stringBuilder.append("</span></p>"); } if (element.hasClass("para") && element.hasAttr("label-module")) { System.err.println(element.text()); stringBuilder.append("</p> " + element.text() + "</p>"); } element = element.nextElementSibling(); getSub(element, stringBuilder); } }