Crawler

1. JSoup and HtmlUnit can only crawl static, synchronously loaded pages (a minimal JSoup sketch for this case follows the links below).

2. To crawl dynamic, asynchronously loaded pages, use cdp4j:

https://blog.csdn.net/f1370335844/article/details/102815452

https://webfolder.io/cdp4j.html

https://github.com/chrisp-d/cdp4j
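
For comparison, the static case from item 1 can be handled with JSoup alone. The sketch below is illustrative only: the URL and the CSS selector are placeholders, and it only works when the wanted content is already present in the initial HTML response (i.e. no JavaScript rendering is needed).

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class StaticCrawlDemo {

    public static void main(String[] args) throws Exception {
        // JSoup fetches the raw HTML over HTTP; JavaScript is never executed,
        // so only content present in the initial response can be selected.
        Document document = Jsoup.connect("https://example.com/list.html") // placeholder URL
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                .timeout(10_000)
                .get();
        for (Element a : document.select("ul li a")) { // placeholder selector
            System.out.println(a.attr("abs:href") + " " + a.text());
        }
    }
}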

 

        <!-- https://mvnrepository.com/artifact/io.webfolder/cdp4j -->
        <dependency>
            <groupId>io.webfolder</groupId>
            <artifactId>cdp4j</artifactId>
            <version>3.0.13</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jvnet.winp/winp -->
        <dependency>
            <groupId>org.jvnet.winp</groupId>
            <artifactId>winp</artifactId>
            <version>1.28</version>
        </dependency>

package com.sy.crawl;

import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.command.Network;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import io.webfolder.cdp.type.network.CookieParam;
import io.webfolder.cdp.type.network.CookieSameSite;

import java.util.ArrayList;

public class App {

    public static void main(String[] args) {
        ArrayList<String> command = new ArrayList<>();
        // run Chrome headless, no visible browser window
        command.add("--headless");

        Launcher launcher = new Launcher();
        try (SessionFactory factory = launcher.launch(command); Session session = factory.create()) {
            session.clearCookies();
            String url = "https://www.nmpa.gov.cn/xxgk/chpzhh/index.html";
            String domain = "www.nmpa.gov.cn";

            // set the cookies on the DevTools Network domain before navigating
            final Network network = session.getCommand().getNetwork();
            network.setCookie("acw_tc", "3ccdc16316153796650355332e4a890fb765c5ae1d4cb290d77f09c29ea9a6", url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            network.setCookie("neCYtZEjo8GmO", "5L2lDTmev0mDPOjikIb58I5VgBDfQB5S.y.C7OIhk9c_O06_fC1Z7FrQdKPDDnOTHhoPyRzKR7CtbqSrPZvNrEG", url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            network.setCookie("neCYtZEjo8GmP", "53lPZVKrEBJlqqqmgor52QGf8NeM0Ymi0zCSDP5OMzY_pvxEgQlDh2urTvJDK.9rrZu7aC8nRgoG6aTe_LEzu6f0AYUYe1Yv3Ot7.5To72bKOUYpXgvi6ESQUOIhoY6oMYmRiPJHP8HflG9tO.YQyXnVju9SR0yVlc_9HOMAmzERSrbgJdVS0fp0nIqagq_BxN.6ivvJbUtnKXhv_lC6lBpghJ3Yz.ruNhg9m6Ptho7wsU46yjTPxBXL5Dx5JyUf8o82v7NvZru2EXnYdCQpN84JPzCYhBC8qhZQmPsnHVKTGnrVYN3l2KAupK8c7.VodW", url, domain, "/", false, false, CookieSameSite.Strict, 0D);
            session.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");

            session.navigate(url);
            session.waitDocumentReady();
            String content = session.getContent();
            System.out.println("================con=================");
            System.out.println(content);
            System.out.println("================con=================");
        } finally {
            launcher.kill();
        }
    }

}

// The cookies can be omitted; keeping only the userAgent is enough.

 

Alternatively:

package com.sy.crawl.utils;

import com.sy.crawl.constant.Constants;
import com.sy.crawl.entity.NoticeInfo;
import io.webfolder.cdp.Launcher;
import io.webfolder.cdp.session.Session;
import io.webfolder.cdp.session.SessionFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * @author Alice on 2021-03-05
 */
public class CrawlTool {
    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlTool.class);

    public static List<NoticeInfo> parse(String content) {
        List<NoticeInfo> noticeInfoList = new ArrayList<>();
        try {
            Document document = Jsoup.parse(content);
            if (null != document) {
                Elements li = document.select(".yp-child-list .list ul li");
                if (!li.isEmpty()) {
                    for (Element e : li) {
                        String href = e.select("a").attr("href");
                        // drop the site-relative prefix and prepend the domain
                        href = Constants.DOMAIN_NAME + href.substring(5);
                        System.out.println(href);
                        String title = e.select("a").attr("title");
                        System.out.println(title);
                        String time = e.select("span").text();
                        // keep only the 10-character yyyy-MM-dd portion
                        time = time.substring(1, 11);
                        System.out.println(time);
                        LOGGER.info("Crawled entry: " + href + " " + title + " " + time);

                        NoticeInfo noticeInfo = new NoticeInfo();
                        noticeInfo.setTitle(title);
                        noticeInfo.setUrl(href);
                        noticeInfo.setPublishTime(String2Date(time));

                        noticeInfoList.add(noticeInfo);
                    }
                } else {
                    LOGGER.info("No matching elements found while parsing. content:{}.", content);
                }

            } else {
                LOGGER.info("Parsed document is null.");
            }

        } catch (Exception e) {
            LOGGER.error("Exception while parsing content:{}.", content, e);
        }

 /*       // temp: test data, added only for local testing
        if (CollectionUtils.isEmpty(noticeInfoList)) {
            for (int i = 0; i < 5; i++) {
                NoticeInfo noticeInfo = new NoticeInfo();
                noticeInfo.setTitle("title " + i);
                noticeInfo.setUrl("href " + i);
                noticeInfo.setPublishTime(new Date());
                noticeInfoList.add(noticeInfo);
            }
        }*/

        return noticeInfoList;
    }

    private static Date String2Date(String time) throws ParseException {
        return new SimpleDateFormat("yyyy-MM-dd").parse(time);
    }

    public static List<NoticeInfo> crawler(String url) {
        Launcher launcher = new Launcher();
        try {
            String content = connectCrawl(launcher, url);
            System.out.println(content);
            return parse(content);
        } catch (Exception e) {
            LOGGER.error("Crawler failed while connecting to url:{}.", url, e);
            // if an exception occurred, retry the connection and return the retried result
            return crawler(url);
        } finally {
            launcher.kill();
        }
    }

    private static String connectCrawl(Launcher launcher, String url) throws Exception {
        ArrayList<String> command = new ArrayList<>();
        // run Chrome headless, no visible browser window
        command.add("--headless");
        LOGGER.info("About to start crawling. url:{}", url);
        // close the session and factory when done; the caller kills the launcher
        try (SessionFactory factory = launcher.launch(command); Session session = factory.create()) {
            session.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");
            session.navigate(url);
            session.waitDocumentReady();
            return session.getContent();
        }
    }

}
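
Assuming the NoticeInfo entity (not shown in this post) exposes matching getters getTitle(), getUrl() and getPublishTime(), a quick way to exercise CrawlTool might look like the following sketch:

package com.sy.crawl;

import com.sy.crawl.entity.NoticeInfo;
import com.sy.crawl.utils.CrawlTool;

import java.util.List;

public class CrawlToolDemo {

    public static void main(String[] args) {
        // crawl the rendered NMPA announcement list and print the parsed entries
        String url = "https://www.nmpa.gov.cn/xxgk/chpzhh/index.html";
        List<NoticeInfo> noticeInfoList = CrawlTool.crawler(url);
        if (noticeInfoList != null) { // defensive check
            for (NoticeInfo noticeInfo : noticeInfoList) {
                // getTitle()/getUrl()/getPublishTime() are assumed getters on NoticeInfo
                System.out.println(noticeInfo.getTitle() + " " + noticeInfo.getUrl() + " " + noticeInfo.getPublishTime());
            }
        }
    }
}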

 
