java爬虫

 

 首先,创建一个Maven项目

一,导包

二,创建一个测试类

注意:url为网页地址

   

 模拟浏览器的头信息

 

 

需要爬的网页:

 

结果:

 

 

 

 

 

接下来爬取一个数据量比较大的网站

首先,爬下来放到redis中,然后存到mysql数据库

只需要写5个类,两个实体类,一个dao,一个放到redis中的类,一个存进mysql的类

配置pom.xml(这里只列出了 jsoup;后面的代码还用到了 Jedis、Jackson、c3p0 和 MariaDB JDBC 驱动,这些依赖同样需要加入 pom.xml)

<!-- jsoup: the HTML fetching/parsing library imported by PolicyDAO (org.jsoup.*).
     Artifact page: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>

package com.nf147.ojp;

/**
 * Mutable bean holding one scraped policy entry together with a reference
 * to its {@link PolicySource}.
 *
 * <p>All properties are plain read/write values; no validation is performed.
 */
public class Policy {

    private String title;

    private String url;

    private String content;

    private String basis;

    private String info;

    private PolicySource policySource;

    /** @return the policy title */
    public String getTitle() {
        return title;
    }

    /** @param title the policy title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the address of the policy page */
    public String getUrl() {
        return url;
    }

    /** @param url the address of the policy page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the policy content text */
    public String getContent() {
        return content;
    }

    /** @param content the policy content text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the policy basis text */
    public String getBasis() {
        return basis;
    }

    /** @param basis the policy basis text */
    public void setBasis(String basis) {
        this.basis = basis;
    }

    /** @return the related policy information text */
    public String getInfo() {
        return info;
    }

    /** @param info the related policy information text */
    public void setInfo(String info) {
        this.info = info;
    }

    /** @return the source document attached to this policy, possibly {@code null} */
    public PolicySource getPolicySource() {
        return policySource;
    }

    /** @param policySource the source document attached to this policy */
    public void setPolicySource(PolicySource policySource) {
        this.policySource = policySource;
    }

    /**
     * Renders every property in the form {@code Policy{title='...', ...}}.
     * Unset properties are rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Policy{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append(", basis='").append(basis).append('\'');
        text.append(", info='").append(info).append('\'');
        text.append(", policySource=").append(policySource);
        text.append('}');
        return text.toString();
    }
}
View Code

package com.nf147.ojp;

/**
 * Mutable bean describing the source document a policy refers to:
 * its address, its title and its (potentially very long) content.
 */
public class PolicySource {

    /** Maximum number of content characters included in {@link #toString()}. */
    private static final int PREVIEW_LENGTH = 20;

    private String url;
    private String title;
    private String content;

    /** @return the address of the source document */
    public String getUrl() {
        return url;
    }

    /** @param url the address of the source document */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title of the source document */
    public String getTitle() {
        return title;
    }

    /** @param title the title of the source document */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the full content of the source document */
    public String getContent() {
        return content;
    }

    /** @param content the full content of the source document */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders the bean as {@code PolicySource{url='...', title='...', content='...'}}.
     *
     * <p>Only the first {@value #PREVIEW_LENGTH} characters of the content are
     * shown. Unlike a bare {@code substring(0, 20)}, this never throws: unset
     * content is rendered as {@code null} and short content is shown in full.
     */
    @Override
    public String toString() {
        return "PolicySource{" +
                "url='" + url + '\'' +
                ", title='" + title + '\'' +
                ", content='" + preview(content) + '\'' +
                '}';
    }

    /** Returns at most the first {@link #PREVIEW_LENGTH} characters of {@code text}; null-safe. */
    private static String preview(String text) {
        if (text == null || text.length() <= PREVIEW_LENGTH) {
            return text;
        }
        return text.substring(0, PREVIEW_LENGTH);
    }
}
View Code

package com.nf147.ojp;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes policy data from www.zhsme.gov.cn with jsoup.
 *
 * <p>The crawl is done in three steps per policy: {@link #getListInfo(int)}
 * reads one list page, {@link #getPolicyInfo(Policy)} reads the detail page of
 * an entry, and {@link #getPolicyBasisInfo(Policy)} reads the source document
 * linked from that detail page. {@link #getSumPageNum()} reports how many list
 * pages exist.
 */
public class PolicyDAO {
    // Original author's note: 186 pages.

    /** Browser User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36";

    /** List page address up to (and excluding) the page number. */
    private static final String LIST_URL_PREFIX = "http://www.zhsme.gov.cn/policy/getPolicyList?pageNum=";

    /** Remaining (empty) filter parameters of the list page address. */
    private static final String LIST_URL_SUFFIX =
            "&NameOrWords=&areaSreachValue=&areaSreachId=&scaleSreachValue=&scaleSreachId=&levelSreachValue=&levelSreachId=&isShuangChuang=";

    /** Source document address up to (and excluding) the document id. */
    private static final String SOURCE_URL_PREFIX =
            "http://www.zhsme.gov.cn/policy/getTextPolicyByTextPolicyId?textPolicyId=";

    // Address of the official site; prepended to the href of each list entry.
    public String hostURL = "http://www.zhsme.gov.cn";

    /**
     * Fetches one list page and creates a {@link Policy} for every entry on it.
     *
     * <p>Each returned policy has its URL and title set and carries an empty
     * {@link PolicySource}; the remaining fields are filled by
     * {@link #getPolicyInfo(Policy)} and {@link #getPolicyBasisInfo(Policy)}.
     *
     * @param pageNum number of the list page to fetch
     * @return the entries found on that page, empty if none matched
     * @throws IOException if the page cannot be fetched
     */
    public List<Policy> getListInfo(int pageNum) throws IOException {
        Document doc = fetch(LIST_URL_PREFIX + pageNum + LIST_URL_SUFFIX);

        Elements titleLinks = doc.select(".list-content.list-content-1 h4 a");
        List<Policy> list = new ArrayList<>(titleLinks.size());
        for (Element link : titleLinks) {
            Policy item = new Policy();
            // Placeholder for the source document, filled in later.
            item.setPolicySource(new PolicySource());
            item.setUrl(hostURL + link.attr("href"));
            item.setTitle(link.attr("title"));
            list.add(item);
        }
        return list;
    }

    /**
     * Fetches the detail page of {@code policy} and fills in its basis, info,
     * content, and the title and URL of its source document.
     *
     * @param policy an entry produced by {@link #getListInfo(int)}; its URL and
     *               policy source must be set
     * @return the same {@code policy} instance, updated in place
     * @throws IOException if the page cannot be fetched, or if it contains no
     *                     quoted source document id in the link's onclick attribute
     */
    public Policy getPolicyInfo(Policy policy) throws IOException {
        Document content = fetch(policy.getUrl());

        // Policy basis.
        policy.setBasis(content.select(".policy-txt.clear p").text());
        // Related policy information.
        policy.setInfo(content.select(".policy-con p").text());

        // Link pointing at the source document.
        Elements sourceLink = content.select(".policy-txt.clear > a");
        policy.getPolicySource().setTitle(sourceLink.text());

        // Policy details.
        policy.setContent(content.select(".part-warp.part-one.clear").text());

        // The document id is the text between the first and last single quote
        // of the onclick attribute.
        String onclick = sourceLink.attr("onclick");
        int open = onclick.indexOf('\'');
        int close = onclick.lastIndexOf('\'');
        if (open < 0 || close <= open) {
            throw new IOException("No policy source id found on detail page " + policy.getUrl()
                    + " (onclick=\"" + onclick + "\")");
        }
        policy.getPolicySource().setUrl(SOURCE_URL_PREFIX + onclick.substring(open + 1, close));

        return policy;
    }

    /**
     * Fetches the source document of {@code policy} and stores the HTML of its
     * {@code .view-content} elements as the source content.
     *
     * @param policy a policy whose source URL was set by {@link #getPolicyInfo(Policy)}
     * @return the same {@code policy} instance, updated in place
     * @throws IOException if the page cannot be fetched
     */
    public Policy getPolicyBasisInfo(Policy policy) throws IOException {
        PolicySource policySource = policy.getPolicySource();
        Document policyBasis = fetch(policySource.getUrl());
        policySource.setContent(String.valueOf(policyBasis.select(".view-content")));
        return policy;
    }

    /**
     * Reads the total number of list pages from the pagination script embedded
     * in the first list page (the value between {@code totalPages} and
     * {@code visiblePages}).
     *
     * @return the total number of list pages
     * @throws IOException if the page cannot be fetched or the page count cannot
     *                     be located or parsed
     */
    public int getSumPageNum() throws IOException {
        Document document = fetch(LIST_URL_PREFIX + 1 + LIST_URL_SUFFIX);
        String script = String.valueOf(document.select("script[type=text/javascript]"));

        int start = script.indexOf("totalPages");
        int end = start < 0 ? -1 : script.indexOf("visiblePages", start);
        if (end < 0) {
            throw new IOException("Pagination settings (totalPages/visiblePages) not found on list page");
        }

        String numText = script.substring(start, end);
        int colon = numText.indexOf(':');
        int comma = colon < 0 ? -1 : numText.indexOf(',', colon + 1);
        if (comma < 0) {
            throw new IOException("Unexpected totalPages setting: " + numText);
        }

        // trim() tolerates whitespace around the number, e.g. "totalPages: 186,".
        String sumPageNum = numText.substring(colon + 1, comma).trim();
        try {
            return Integer.parseInt(sumPageNum);
        } catch (NumberFormatException e) {
            throw new IOException("totalPages is not a number: " + sumPageNum, e);
        }
    }

    /** Downloads and parses {@code url}, identifying as a desktop browser. */
    private Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .header("User-Agent", USER_AGENT)
                .get();
    }
}
View Code

package com.nf147.ojp;

import com.fasterxml.jackson.databind.ObjectMapper;
import redis.clients.jedis.Jedis;

import java.io.IOException;
import java.util.List;

/**
 * Crawler entry point: walks every list page reported by {@link PolicyDAO},
 * fetches the detail and source document of each entry, and stores each page
 * as a JSON array in Redis under the key {@code list-<pageNum>}.
 */
public class JsoupReptile {

    /** Maximum number of attempts for each remote fetch. */
    private static final int MAX_ATTEMPTS = 3;

    /** Pause after each successfully crawled page, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 200;

    /**
     * Runs the crawl from page 1 to the last page.
     *
     * <p>A page whose list or any detail page still fails after
     * {@value #MAX_ATTEMPTS} attempts is skipped entirely. A source document
     * that keeps failing does not skip the page; the entry is stored without
     * source content.
     *
     * @param args unused
     * @throws IOException          if the total page count cannot be read
     * @throws InterruptedException if the thread is interrupted while pausing
     *                              between pages
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        ObjectMapper mapper = new ObjectMapper();
        PolicyDAO policyDAO = new PolicyDAO();

        // try-with-resources closes the Redis connection even if the crawl aborts.
        try (Jedis jedis = new Jedis()) {
            // Last page to crawl.
            int maxPageNum = policyDAO.getSumPageNum();

            for (int nowPageNum = 1; nowPageNum <= maxPageNum; nowPageNum++) {
                try {
                    List<Policy> listInfo = crawlPage(policyDAO, nowPageNum);

                    // Write the page to Redis.
                    jedis.set("list-" + nowPageNum, mapper.writeValueAsString(listInfo));
                    System.out.println("已爬取第" + nowPageNum + "页");
                } catch (Exception e) {
                    System.out.println("已达到抓取失败次数上限,跳过第" + nowPageNum + "页");
                    System.err.println(e);
                    continue;
                }

                // Kept outside the try block so an interrupt stops the crawl
                // instead of being reported as a failed page.
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
        }
    }

    /** Fetches one list page and completes every entry on it with detail and source data. */
    private static List<Policy> crawlPage(PolicyDAO policyDAO, int pageNum) throws IOException {
        List<Policy> listInfo = fetchList(policyDAO, pageNum);
        for (int i = 0; i < listInfo.size(); i++) {
            Policy policyInfo = fetchDetail(policyDAO, listInfo.get(i));
            listInfo.set(i, fetchSource(policyDAO, policyInfo));
        }
        return listInfo;
    }

    /** Fetches a list page, retrying on failure; throws once all attempts are used up. */
    private static List<Policy> fetchList(PolicyDAO policyDAO, int pageNum) throws IOException {
        Exception lastFailure = null;
        for (int j = 1; j <= MAX_ATTEMPTS; j++) {
            try {
                return policyDAO.getListInfo(pageNum);
            } catch (Exception e) {
                lastFailure = e;
                System.out.println("抓取第" + pageNum + "页列表出现问题..正在进行第" + j + "重试");
            }
        }
        throw new IOException("Giving up on list page " + pageNum, lastFailure);
    }

    /** Fetches the detail page of an entry, retrying on failure; throws once all attempts are used up. */
    private static Policy fetchDetail(PolicyDAO policyDAO, Policy policy) throws IOException {
        Exception lastFailure = null;
        for (int j = 1; j <= MAX_ATTEMPTS; j++) {
            try {
                return policyDAO.getPolicyInfo(policy);
            } catch (Exception e) {
                lastFailure = e;
                System.out.println("抓取详情" + policy.getUrl() + "时出现错误,正在进行第" + j + "次尝试");
            }
        }
        throw new IOException("Giving up on detail page " + policy.getUrl(), lastFailure);
    }

    /**
     * Fetches the source document of an entry, retrying on failure. If every
     * attempt fails the entry is returned as it is, without source content.
     */
    private static Policy fetchSource(PolicyDAO policyDAO, Policy policyInfo) {
        for (int j = 1; j <= MAX_ATTEMPTS; j++) {
            try {
                return policyDAO.getPolicyBasisInfo(policyInfo);
            } catch (Exception e) {
                System.out.println("抓取源" + policyInfo.getPolicySource().getUrl() + "的时候出现错误,正在进行第" + j + "次尝试");
            }
        }
        return policyInfo;
    }

}
View Code

package com.nf147.ojp;

import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.mchange.v2.c3p0.ComboPooledDataSource;
import redis.clients.jedis.Jedis;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * Importer entry point: reads every {@code list-*} key from Redis, decodes its
 * JSON value into a list of {@link Policy} objects and inserts one row per
 * policy into the {@code zqy.policy} table.
 *
 * <p>Pages that failed during the crawl can be fetched again beforehand by
 * calling {@code PolicyDAO.getListInfo}, {@code getPolicyInfo} and
 * {@code getPolicyBasisInfo} for the missing page numbers and writing the
 * result back to Redis under {@code list-<pageNum>}.
 */
public class GetDataByRedis {

    /** Insert statement; one parameter per stored policy field. */
    private static final String INSERT_SQL =
            "INSERT INTO `zqy`.`policy` (`title`, `url`, `content`, `basis`, `info`, `policy_source_url`, `policy_source_title`, `policy_source_content`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)";

    /**
     * Copies all crawled pages from Redis into the database in one transaction
     * and prints the elapsed time and the number of rows written.
     *
     * <p>A key that cannot be read, decoded or inserted is reported on
     * {@code System.err} and the import continues with the next key; rows of
     * that key inserted before the failure remain part of the transaction.
     *
     * @param args unused
     * @throws SQLException if the connection cannot be opened, configured,
     *                      committed or closed
     */
    public static void main(String[] args) throws IOException, InterruptedException, SQLException {

        ObjectMapper mapper = new ObjectMapper();
        JavaType javaType = mapper.getTypeFactory().constructCollectionType(ArrayList.class, Policy.class);

        ComboPooledDataSource source = new ComboPooledDataSource("mysql");
        source.setJdbcUrl("jdbc:mariadb://localhost:3307/zqy");
        source.setUser("root");
        source.setPassword("123456");

        int i = 0;
        long startTime = System.currentTimeMillis();

        // All three resources are closed in reverse order, even on failure.
        // The statement is prepared once and reused for every row.
        try (Jedis jedis = new Jedis();
             Connection connection = source.getConnection();
             PreparedStatement prep = connection.prepareStatement(INSERT_SQL)) {

            // Disable auto-commit so the whole import is committed at once.
            connection.setAutoCommit(false);

            Set<String> keys = jedis.keys("list-*");
            for (String key : keys) {
                try {
                    List<Policy> list = mapper.readValue(jedis.get(key), javaType);

                    for (Policy policy : list) {
                        prep.setString(1, policy.getTitle());
                        prep.setString(2, policy.getUrl());
                        prep.setString(3, policy.getContent());
                        prep.setString(4, policy.getBasis());
                        prep.setString(5, policy.getInfo());
                        prep.setString(6, policy.getPolicySource().getUrl());
                        prep.setString(7, policy.getPolicySource().getTitle());
                        prep.setString(8, policy.getPolicySource().getContent());

                        prep.executeUpdate();
                        i++;
                    }
                } catch (Exception e) {
                    // Best effort: report the failing key and carry on with the rest.
                    System.err.println("Failed to import Redis key " + key + ": " + e);
                }
            }

            // Commit the transaction.
            connection.commit();
        } finally {
            // Shut down the connection pool.
            source.close();
        }

        long endTime = System.currentTimeMillis();

        System.out.println("花费时间" + (endTime - startTime) + "毫秒");
        System.out.println("共写入数据" + i + "条");

    }
}
View Code

 

数据库

-- Schema for the crawled policies.
-- utf8mb4 is declared explicitly so the scraped Chinese text is stored
-- correctly regardless of the server's default character set; the table
-- inherits it from the database.
create database zqy default character set utf8mb4;
use zqy;
create table policy
(
    id int primary key Auto_increment,
    title varchar(200),              -- Policy.title
    url varchar(500),                -- Policy.url
    content longText,                -- Policy.content
    basis varchar(500),              -- Policy.basis
    info varchar(500),               -- Policy.info
    policy_source_url varchar(500),  -- PolicySource.url
    policy_source_title text,        -- PolicySource.title
    policy_source_content longText   -- PolicySource.content
);

-- Number of imported rows.
select count(*) from policy;
View Code

 

然后运行程序,自己查看结果。

 

 


 

posted @ 2019-01-08 14:36  欧之衍  阅读(210)  评论(0)  编辑  收藏  举报