Crawler
1. JSoup and HtmlUnit can only fetch static, synchronously rendered data (a minimal JSoup sketch follows the links below).
2. For dynamic, asynchronously loaded data, use cdp4j:
https://blog.csdn.net/f1370335844/article/details/102815452
https://webfolder.io/cdp4j.html
https://github.com/chrisp-d/cdp4j
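For the static case (item 1), a minimal JSoup sketch; the URL and the `a[href]` selector here are placeholders for illustration, not from the original project:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupDemo {

    public static void main(String[] args) throws Exception {
        // Jsoup only sees the server-rendered HTML; anything injected later
        // by JavaScript/AJAX will be missing from the parsed document.
        Document doc = Jsoup.connect("https://example.com") // placeholder URL
                .userAgent("Mozilla/5.0")
                .timeout(10000)
                .get();
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("abs:href") + "  " + link.text());
        }
    }
}
```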
```xml
<!-- https://mvnrepository.com/artifact/io.webfolder/cdp4j -->
<dependency>
    <groupId>io.webfolder</groupId>
    <artifactId>cdp4j</artifactId>
    <version>3.0.13</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jvnet.winp/winp -->
<dependency>
    <groupId>org.jvnet.winp</groupId>
    <artifactId>winp</artifactId>
    <version>1.28</version>
</dependency>
```

(winp is what cdp4j uses on Windows to terminate the headless Chrome process.)
```java
package com.sy.crawl;

import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.command.Network;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import io.webfolder.cdp.type.network.CookieSameSite;

import java.util.ArrayList;

public class App {

    public static void main(String[] args) {
        ArrayList<String> command = new ArrayList<>();
        // Run Chrome headless so no browser window is shown
        command.add("--headless");
        Launcher launcher = new Launcher();
        try (SessionFactory factory = launcher.launch(command);
             Session session = factory.create()) {
            session.clearCookies();
            String url = "https://www.nmpa.gov.cn/xxgk/chpzhh/index.html";
            String domain = "www.nmpa.gov.cn";
            // Set the site's cookies before navigating
            final Network network = session.getCommand().getNetwork();
            network.setCookie("acw_tc", "3ccdc16316153796650355332e4a890fb765c5ae1d4cb290d77f09c29ea9a6",
                    url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            network.setCookie("neCYtZEjo8GmO", "5L2lDTmev0mDPOjikIb58I5VgBDfQB5S.y.C7OIhk9c_O06_fC1Z7FrQdKPDDnOTHhoPyRzKR7CtbqSrPZvNrEG",
                    url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            network.setCookie("neCYtZEjo8GmP", "53lPZVKrEBJlqqqmgor52QGf8NeM0Ymi0zCSDP5OMzY_pvxEgQlDh2urTvJDK.9rrZu7aC8nRgoG6aTe_LEzu6f0AYUYe1Yv3Ot7.5To72bKOUYpXgvi6ESQUOIhoY6oMYmRiPJHP8HflG9tO.YQyXnVju9SR0yVlc_9HOMAmzERSrbgJdVS0fp0nIqagq_BxN.6ivvJbUtnKXhv_lC6lBpghJ3Yz.ruNhg9m6Ptho7wsU46yjTPxBXL5Dx5JyUf8o82v7NvZru2EXnYdCQpN84JPzCYhBC8qhZQmPsnHVKTGnrVYN3l2KAupK8c7.VodW",
                    url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            session.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");
            session.navigate(url);
            session.waitDocumentReady();
            String content = session.getContent();
            System.out.println("================con=================");
            System.out.println(content);
            System.out.println("================con=================");
        } finally {
            launcher.kill();
        }
    }
}
```
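The try-with-resources block closes the Session and the SessionFactory, while launcher.kill() in the finally block terminates the headless Chrome process itself even if launching or navigation fails.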
The three cookies can be dropped, keeping only the userAgent (the variant below does exactly that).
Alternatively:
```java
package com.sy.crawl.utils;

import com.sy.crawl.constant.Constants;
import com.sy.crawl.entity.NoticeInfo;
import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * @author Alice on 2021-03-05
 */
public class CrawlTool {

    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlTool.class);

    public static List<NoticeInfo> parse(String content) {
        List<NoticeInfo> noticeInfoList = new ArrayList<>();
        try {
            // Jsoup.parse never returns null, so no null check is needed
            Document document = Jsoup.parse(content);
            Elements li = document.select(".yp-child-list .list ul li");
            if (!li.isEmpty()) {
                for (Element e : li) {
                    // Rebuild the absolute URL: drop the relative prefix, prepend the domain
                    String href = Constants.DOMAIN_NAME + e.select("a").attr("href").substring(5);
                    String title = e.select("a").attr("title");
                    // The span wraps the date, e.g. "(2021-03-05)"; keep only yyyy-MM-dd
                    String time = e.select("span").text().substring(1, 11);
                    LOGGER.info("Crawled item: {} {} {}", href, title, time);
                    NoticeInfo noticeInfo = new NoticeInfo();
                    noticeInfo.setTitle(title);
                    noticeInfo.setUrl(href);
                    noticeInfo.setPublishTime(string2Date(time));
                    noticeInfoList.add(noticeInfo);
                }
            } else {
                LOGGER.info("No list elements matched while parsing. content:{}.", content);
            }
        } catch (Exception e) {
            LOGGER.error("Exception while parsing content:{}.", content, e);
        }
        /* temp: fake data, added only for testing
        if (CollectionUtils.isEmpty(noticeInfoList)) {
            for (int i = 0; i < 5; i++) {
                NoticeInfo noticeInfo = new NoticeInfo();
                noticeInfo.setTitle("title " + i);
                noticeInfo.setUrl("href " + i);
                noticeInfo.setPublishTime(new Date());
                noticeInfoList.add(noticeInfo);
            }
        }*/
        return noticeInfoList;
    }

    private static Date string2Date(String time) throws ParseException {
        return new SimpleDateFormat("yyyy-MM-dd").parse(time);
    }

    public static List<NoticeInfo> crawler(String url) {
        return crawler(url, 1);
    }

    private static List<NoticeInfo> crawler(String url, int retriesLeft) {
        Launcher launcher = new Launcher();
        try {
            String content = connectCrawl(launcher, url);
            return parse(content);
        } catch (Exception e) {
            LOGGER.error("Exception while crawling url:{}.", url, e);
            // On failure, retry once and return the retry's result
            if (retriesLeft > 0) {
                return crawler(url, retriesLeft - 1);
            }
        } finally {
            launcher.kill();
        }
        // Return an empty list rather than null so callers can iterate safely
        return new ArrayList<>();
    }

    private static String connectCrawl(Launcher launcher, String url) throws Exception {
        ArrayList<String> command = new ArrayList<>();
        // Run Chrome headless so no browser window is shown
        command.add("--headless");
        LOGGER.info("Starting crawl. url:{}", url);
        // Close the session and factory when done
        try (SessionFactory factory = launcher.launch(command);
             Session session = factory.create()) {
            session.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");
            session.navigate(url);
            session.waitDocumentReady();
            return session.getContent();
        }
    }
}
```
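A minimal usage sketch; it assumes NoticeInfo exposes getTitle() and getUrl() getters matching the setters used in parse(), which is an assumption since the entity class is not shown:

```java
import com.sy.crawl.entity.NoticeInfo;
import com.sy.crawl.utils.CrawlTool;

import java.util.List;

public class CrawlRunner {

    public static void main(String[] args) {
        // The same NMPA listing page as in the App example above
        List<NoticeInfo> notices =
                CrawlTool.crawler("https://www.nmpa.gov.cn/xxgk/chpzhh/index.html");
        for (NoticeInfo n : notices) {
            // getTitle()/getUrl() are assumed counterparts of the setters in parse()
            System.out.println(n.getTitle() + " " + n.getUrl());
        }
    }
}
```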