【爬虫】Java爬取省市县行政区域统计数据

 

前言

网上看了好几个Python爬虫来爬取省市县行政区域统计

官网除了省市县以外,还有区,街道,居委村委层级

https://zhuanlan.zhihu.com/p/512852193

所以自己用Java写一个完整爬取的,之前写过的一版不是很理想

这次换了更轻量的库来重构,逻辑也直观些

 

依赖库:

Hutool工具库,有Http工具包和DB操作的API

Jsoup解析HTML代码,爬虫标配

Lombok简化PO

放FastJson是考虑可能不用DB存放,直接写JSON文件,在这里没用到

    <dependencies>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.8.4</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.15</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.10</version>
        </dependency>

        <!-- 连接池https://mvnrepository.com/artifact/com.alibaba/druid -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.1.14</version>
        </dependency>

    </dependencies>

 

HutoolDb需要的配置文件:

## db.setting文件

url = jdbc:mysql://localhost:3308/my?serverTimezone=Asia/Shanghai
user = root
pass = 123456

## 可选配置
# 是否在日志中显示执行的SQL
showSql = true
# 是否格式化显示的SQL
formatSql = false
# 是否显示SQL参数
showParams = true
# 打印SQL的日志等级,默认debug,可以是info、warn、error
sqlLevel = debug

#----------------------------------------------------------------------------------------------------------------
## 连接池配置项
#————————————————
#版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
#原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633

## ---------------------------------------------------- Druid
# 初始化时建立物理连接的个数。初始化发生在显式调用init方法,或者第一次getConnection时
initialSize = 1
# 最大连接池数量
maxActive = 8
# 最小连接池数量
minIdle = 0
# 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。
maxWait = 0
# 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 应该是支持PSCache。
poolPreparedStatements = false
# 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100
maxOpenPreparedStatements = -1
# 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会起作用。
validationQuery = SELECT 1
# 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。
testOnBorrow = true
# 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能
testOnReturn = false
# 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。
testWhileIdle = false
# 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明
timeBetweenEvictionRunsMillis = 60000
# 物理连接初始化的时候执行的sql
connectionInitSqls = SELECT 1
# 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat  日志用的filter:log4j 防御sql注入的filter:wall
# filters = stat
# 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系
# proxyFilters =

 

表结构:

CREATE TABLE `region2021` (
  `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码',
  `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码',
  `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称',
  `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  `LEVEL` int DEFAULT NULL COMMENT '层级',
  `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型',
  `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间',
  PRIMARY KEY (`CODE`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;

  

代码部分:

封装的PO类:

package cn.cloud9.rdp.po;

import lombok.*;
import java.time.LocalDateTime;

/**
 * Smallest storage unit of the crawler: one administrative region as read
 * from a single table row of the statistics site.
 *
 * <p>Project: administrative-region crawler. Created 2022-06-29 10:30.
 *
 * @author Cloud9
 * @version 1.0
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
@Builder
public class RegionCell {
    /** Display name of the region. */
    private String name;
    /** Address of the page that lists the next level down; {@code null} when there is none. */
    private String url;
    /** Statistical division code of this region. */
    private String regionCode;
    /** Statistical division code of the parent region. */
    private String parentCode;
    /** Urban-rural classification code. */
    private String typeCode;
    /** Depth of the region in the administrative hierarchy. */
    private Integer level;
    /** Time at which this record was created. */
    private LocalDateTime genTime;
}

  

Main启动类:

本来是想用递归写逻辑的,但是每一层的逻辑不是完全一样,所以不采用递归

层级是可以确认的,最底层到village就没有了

package cn.cloud9.rdp;

import cn.cloud9.rdp.po.RegionCell;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.HttpCookie;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Crawls the administrative-division code pages below {@link #HEADER_URL}
 * and stores every region found in the {@link #TABLE_NAME} table.
 *
 * <p>The index page lists the provinces; every page below it lists the
 * children of one region. The crawl walks province, city, county, town and
 * village pages depth-first and writes one row per region.
 *
 * <p>Project: administrative-region crawler. Created 2022-06-29 09:55.
 *
 * @author Cloud9
 * @version 1.0
 */
public class MainApplication {

    public static final String HEADER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
    public static final String COOKIE_KEY = "SF_cookie_1";
    public static final String COOKIE_HEADER_KEY = "Cookie";
    public static final String USER_AGENT = "User-Agent";
    public static final String[] BROWSER_AGENTS = {
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"
    };
    public final static Random R = new Random();
    public static final String TABLE_NAME = "region2021";

    /**
     * CSS class of the table rows expected on the pages below the index page,
     * in crawl order. Rows of {@code ROW_CLASSES[i]} are stored with
     * {@code LEVEL = i + 2} (provinces are level 1).
     */
    private static final String[] ROW_CLASSES = {"citytr", "countytr", "towntr", "villagetr"};

    /**
     * Creates a fresh, mutable header map pre-filled with the {@code Host} header.
     *
     * @return a new map on every call; callers add their own headers to it
     */
    public static Map<String, String> getNewRequestHeader() {
        Map<String, String> header = new ConcurrentHashMap<>();
        header.put("Host", "www.stats.gov.cn");
        return header;
    }

    /**
     * Empties the target table, then crawls the whole region tree.
     *
     * @param args unused
     * @throws SQLException if the table cannot be truncated
     */
    public static void main(String[] args) throws SQLException {

        // Open the MySQL connection and start from an empty table.
        Db db = Db.use();
        db.execute("TRUNCATE TABLE " + TABLE_NAME);

        // First request: the index page listing the provinces.
        HttpResponse httpResponse = HttpUtil.createGet(HEADER_URL).execute();
        HttpCookie cookie = httpResponse.getCookie(COOKIE_KEY);
        System.out.println(cookie);

        int status = httpResponse.getStatus();
        String body = httpResponse.body();
        System.out.println("省 响应状态 " + status);
        System.out.println("省 响应内容 " + body);
        if (status != 200) {
            System.out.println("爬取异常,程序终止");
            return;
        }

        // The cookie may be absent; in that case the province requests go out without one.
        String cookieHeader = cookie == null ? null : cookie.toString();

        // Parse with the page address as base URI so relative hrefs can be resolved.
        Document doc = Jsoup.parse(body, HEADER_URL);
        // Provinces live in <tr class="provincetr">, several <a> tags per row.
        Elements provinceTrList = doc.getElementsByClass("provincetr");
        for (int trIndex = 0; trIndex < provinceTrList.size(); trIndex++) {
            Elements provinceAList = provinceTrList.get(trIndex).getElementsByTag("a");
            for (int aIndex = 0; aIndex < provinceAList.size(); aIndex++) {
                Element a = provinceAList.get(aIndex);
                RegionCell cell = RegionCell.builder()
                    .name(a.text())
                    .url(a.absUrl("href"))
                    // The index page carries no province code, so one is made up from
                    // the 1-based row index followed by the 1-based link index.
                    .regionCode(String.valueOf(trIndex + 1) + (aIndex + 1))
                    // Provinces have no parent.
                    .parentCode(String.valueOf(0))
                    .level(1)
                    .genTime(LocalDateTime.now())
                    .build();
                System.out.println(cell);

                saveCell(db, cell);
                crawlChildren(db, cell, cookieHeader, 0);
            }
        }

        System.out.println("爬取完毕");
    }

    /**
     * Fetches the page of {@code parent}, stores every row of the class
     * expected at {@code depth} and descends into each row that has a link.
     *
     * <p>A page whose rows carry a different class than the expected one
     * yields no children here. A failed or non-200 request is logged and the
     * subtree below {@code parent} is skipped.
     *
     * @param db           database handle used for storing rows
     * @param parent       region whose page is fetched
     * @param cookieHeader value for the {@code Cookie} header, or {@code null} for none
     * @param depth        index into {@link #ROW_CLASSES} for the rows of this page
     */
    private static void crawlChildren(Db db, RegionCell parent, String cookieHeader, int depth) {
        String pageUrl = parent.getUrl();
        if (depth >= ROW_CLASSES.length || pageUrl == null || pageUrl.isEmpty()) {
            return;
        }

        HttpResponse response = fetch(pageUrl, cookieHeader);
        if (response == null) {
            return;
        }
        int status = response.getStatus();
        System.out.println(parent.getName() + " | " + pageUrl + " | 请求状态" + status);
        if (status != 200) {
            return;
        }

        // Cookies of this response are handed to the requests one level down.
        String childCookieHeader = joinCookies(response.getCookies(), cookieHeader);
        Document page = Jsoup.parse(response.body(), pageUrl);
        boolean villageRows = depth == ROW_CLASSES.length - 1;

        for (Element row : page.getElementsByClass(ROW_CLASSES[depth])) {
            RegionCell child = parseRow(row, parent.getRegionCode(), depth + 2, villageRows);
            if (child == null) {
                continue;
            }
            System.out.println(child);
            saveCell(db, child);
            crawlChildren(db, child, childCookieHeader, depth + 1);
        }
    }

    /**
     * Turns one table row into a {@link RegionCell}.
     *
     * <p>Village rows are read as code / type code / name and never get a
     * link. All other rows are read as code / name; when the code cell wraps
     * an {@code <a>}, its href (resolved against the page address) becomes
     * the cell's url.
     *
     * @return the parsed cell, or {@code null} when the row has too few cells
     */
    private static RegionCell parseRow(Element row, String parentCode, int level, boolean villageRow) {
        Elements tds = row.getElementsByTag("td");
        int requiredCells = villageRow ? 3 : 2;
        if (tds.size() < requiredCells) {
            System.out.println("跳过无法解析的行: " + row.text());
            return null;
        }

        RegionCell.RegionCellBuilder builder = RegionCell.builder()
            .parentCode(parentCode)
            .level(level)
            .genTime(LocalDateTime.now());

        if (villageRow) {
            return builder
                .regionCode(tds.get(0).text())
                .typeCode(tds.get(1).text())
                .name(tds.get(2).text())
                .build();
        }

        Element codeTd = tds.get(0);
        Element nameTd = tds.get(1);
        // A region without a next level has plain text instead of <a> tags.
        Element codeLink = codeTd.getElementsByTag("a").first();
        Element nameLink = nameTd.getElementsByTag("a").first();

        builder.regionCode(codeLink != null ? codeLink.text() : codeTd.text());
        builder.name(nameLink != null ? nameLink.text() : nameTd.text());
        if (codeLink != null) {
            // Both cells carry the same href; the one on the code cell is used.
            String link = codeLink.absUrl("href");
            if (!link.isEmpty()) {
                builder.url(link);
            }
        }
        return builder.build();
    }

    /**
     * Writes one region to {@link #TABLE_NAME}, matching existing rows on
     * {@code CODE}. A failing statement is logged and does not stop the crawl.
     */
    private static void saveCell(Db db, RegionCell cell) {
        try {
            // NOTE(review): passing "CODE" is meant to make Hutool update an existing
            // row instead of always inserting - confirm against the Hutool version in use.
            db.insertOrUpdate(Entity
                .create(TABLE_NAME)
                .set("CODE", cell.getRegionCode())
                .set("PARENT_CODE", cell.getParentCode())
                .set("NAME", cell.getName())
                .set("LEVEL", cell.getLevel())
                .set("TYPE_CODE", cell.getTypeCode())
                .set("LINK", cell.getUrl())
                .set("GEN_TIME", cell.getGenTime()),
                "CODE"
            );
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Sends a GET request with the standard headers, the given cookies and a
     * randomly picked browser user agent.
     *
     * @return the response, or {@code null} when the request threw
     */
    private static HttpResponse fetch(String url, String cookieHeader) {
        Map<String, String> header = getNewRequestHeader();
        if (cookieHeader != null && !cookieHeader.isEmpty()) {
            header.put(COOKIE_HEADER_KEY, cookieHeader);
        }
        // Present a different browser on every request.
        header.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);
        try {
            return HttpUtil.createGet(url).addHeaders(header).execute();
        } catch (RuntimeException e) {
            System.out.println("请求失败,跳过: " + url);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Joins all cookies into one {@code Cookie} header value.
     *
     * @param cookies  cookies of the last response, may be {@code null} or empty
     * @param fallback value returned when there are no cookies to join
     */
    private static String joinCookies(List<HttpCookie> cookies, String fallback) {
        if (cookies == null || cookies.isEmpty()) {
            return fallback;
        }
        StringBuilder joined = new StringBuilder();
        for (HttpCookie each : cookies) {
            if (joined.length() > 0) {
                joined.append("; ");
            }
            joined.append(each);
        }
        return joined.toString();
    }
}

  

注意项:

在爬取居委村委层级时存在反爬限制,请求会被阻塞10分钟,程序不会报链接超时

这个问题暂时没找到解决办法,就是这样爬取的效率慢很多

我的思路是想,可不可以判断是否阻塞,如果阻塞就直接重新请求尝试

 

2022年7月6日22点49分更新:

对请求进行封装,使用递归不停止请求

    /**
     * Executes the request and keeps retrying until one attempt succeeds.
     *
     * <p>Retries run in a loop rather than by self-recursion, so a long run
     * of failures cannot exhaust the call stack. Every failure is printed
     * before the next attempt.
     *
     * @param httpRequest the prepared request; re-executed as-is on every retry
     * @return the response of the first attempt that does not throw
     */
    private static HttpResponse retryConn(HttpRequest httpRequest) {
        // The timeouts do not change between attempts, so apply them once.
        httpRequest
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT);
        while (true) {
            try {
                return httpRequest.execute();
            } catch (Exception exception) {
                exception.printStackTrace();
            }
        }
    }

 

所有的请求都这样改成异常递归执行

// HttpResponse response = HttpUtil.createGet(cell.getUrl()).addHeaders(header).execute();
HttpResponse response = retryConn(HttpUtil.createGet(cell.getLink()).addHeaders(header)); 

但是发现还是有爆栈的情况:

 

所以又对应地写了一份补数据的逻辑:

1、补数据直接采用递归实现

2、首先要查询已经爬取的数据,查询那些本该有子节点,但实际为空的记录

3、查询得到之后遍历记录记载的LINK,继续爬取

4、发现44和46两个省份的link规则不一样,要单独做调整(Ctrl + F 输44,搜下面的代码)

5、LEVEL的层级也不能是确定的1 - 2 - 3 - 4 - 5, 所以改用自连接LEFT JOIN,根据上级代码查询

6、逻辑可重复执行

package cn.cloud9.fix;

import cn.cloud9.po.RegionCell;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.druid.util.StringUtils;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Back-fills region rows that an earlier crawl failed to fetch.
 *
 * <p>For each level 1 to 4 it selects the rows of the year's table that carry a LINK
 * but have no child rows, requests that LINK again and recursively parses and stores
 * the city / county / town / village rows found on the page. Rows are written with
 * insert-or-update keyed on CODE, so the repair can be executed repeatedly.
 *
 * @author OnCloud9
 * @description
 * @project RegionReptile
 * @date 2022-07-04 21:21
 */
public class DataFixApplication {
    /** Timeout passed to Hutool's timeout setters (presumably milliseconds - confirm against Hutool docs). */
    private static final int TIMEOUT = 3000;

    /** Year being repaired; set by {@link #fixData(String)}. */
    private static String YEAR;
    /** Root URL of the selected year; set by {@link #fixData(String)}. */
    private static String HEADER_URL;
    /** Table of the selected year ("region" + year); set by {@link #fixData(String)}. */
    private static String TABLE_NAME;
    public static final String CODE = "CODE";
    public static final String PARENT_CODE = "PARENT_CODE";
    public static final String NAME = "NAME";
    public static final String LEVEL = "LEVEL";
    public static final String TYPE_CODE = "TYPE_CODE";
    public static final String LINK = "LINK";
    public static final String GEN_TIME = "GEN_TIME";
    public static final String REFERER = "Referer";
    public static final String USER_AGENT = "User-Agent";
    /** User-Agent values; one is picked at random for each request that sends headers. */
    public static final String[] BROWSER_AGENTS = {
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    };
    public final static Random R = new Random();

    private static Db db = Db.use();

    public static void main(String[] args) {
        // Left empty in the original; call fixData(year) here to run a repair.
    }

    /**
     * Repairs the table of the given year.
     *
     * <p>Levels 1 to 4 are processed in order: the rows lacking children are queried
     * and each one is crawled again. A completion message is printed only when none
     * of the four queries returned a row.
     *
     * @param year four-digit year; used in the table name and in the request URL
     * @throws IllegalArgumentException if {@code year} is not exactly four digits
     */
    public static void fixData(String year) {
        // The year is concatenated into a SQL table name, so only plain digits are accepted.
        if (year == null || !year.matches("\\d{4}")) {
            throw new IllegalArgumentException("year must be a 4-digit number, got: " + year);
        }
        YEAR = year;
        TABLE_NAME = "region" + YEAR;
        HEADER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + YEAR + "/";

        int totalLost = 0;
        for (int level = 1; level <= 4; level++) {
            // Query right before crawling: crawling a higher level can already fill lower ones.
            final List<RegionCell> regionCells = queryLostData(db, TABLE_NAME, level);
            totalLost += regionCells.size();
            regionCells.forEach(DataFixApplication::readDataRecursive);
        }
        if (totalLost == 0) {
            System.out.println(YEAR + "年数据补完!");
        }
    }

    /**
     * Requests the page behind {@code regionCell}'s link and stores every child row on it,
     * descending into each child that has a link of its own.
     *
     * <p>A request that throws or answers with a non-OK status is skipped, so the rest of
     * the repair keeps running.
     *
     * @param regionCell parent row; nothing happens when its link is empty
     */
    private static void readDataRecursive(RegionCell regionCell) {
        final String regionCellLink = regionCell.getLink();
        if (StringUtils.isEmpty(regionCellLink)) return;

        final HttpRequest httpRequest = HttpUtil
                .createGet(regionCellLink)
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT);
        // Headers are only attached when the parent carries cookies from an earlier response.
        if (!CollUtil.isEmpty(regionCell.getCookies())) {
            // All cookies go into ONE header value; a put() per cookie would keep only the last.
            final StringBuilder cookieHeader = new StringBuilder();
            regionCell.getCookies().forEach(cookie -> {
                if (cookieHeader.length() > 0) cookieHeader.append(' ');
                cookieHeader.append(cookie).append(';');
            });
            final Map<String, String> headers = new HashMap<>();
            headers.put("Cookie", cookieHeader.toString());
            headers.put(REFERER, regionCellLink);
            headers.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);
            headers.put("Host", "www.stats.gov.cn");
            headers.put("Upgrade-Insecure-Requests", "1");
            httpRequest.addHeaders(headers);
        }

        final HttpResponse httpResponse;
        try {
            httpResponse = httpRequest.execute();
        } catch (RuntimeException e) {
            // Hutool reports I/O and timeout problems as unchecked exceptions. Skip this
            // node; a later run selects it again because it still has no children.
            System.err.println("Request failed, skipping " + regionCellLink + ": " + e);
            return;
        }

        // Hand the response cookies to the requests for the child pages.
        final RegionCell newCell = RegionCell.builder()
                .cookies(httpResponse.getCookies())
                .build();

        if (!httpResponse.isOk()) return;
        final Document document = Jsoup.parse(httpResponse.body());
        readCityData(document, newCell, regionCell);
        readCountyData(document, newCell, regionCell);
        readTownData(document, newCell, regionCell);
        readVillageData(document, newCell, regionCell);
    }

    /**
     * Stores every "villagetr" row of the page as a level-5 row.
     *
     * @param document  parsed page
     * @param newCell   scratch cell that is refilled for each row
     * @param superCell parent row; its code becomes the parent code
     */
    private static void readVillageData(Document document, RegionCell newCell, RegionCell superCell) {
        final Elements villageTrList = document.getElementsByClass("villagetr");
        if (CollUtil.isEmpty(villageTrList)) return;
        villageTrList.forEach(villageTr -> {
            // Columns are read as: 0 = code, 1 = type code, 2 = name.
            final Elements villageTdList = villageTr.getElementsByTag("td");
            newCell.setCode(villageTdList.get(0).text());
            newCell.setParentCode(superCell.getCode());
            newCell.setName(villageTdList.get(2).text());
            newCell.setTypeCode(villageTdList.get(1).text());
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(5, newCell);
        });
    }

    /**
     * Stores every "towntr" row of the page as a level-4 row and descends into it.
     *
     * @param document  parsed page
     * @param newCell   scratch cell that is refilled for each row
     * @param superCell parent row; supplies the parent code and, for codes starting
     *                  with 44 or 46, the base path of the child link
     */
    private static void readTownData(Document document, RegionCell newCell, RegionCell superCell) {
        final Elements townTrList = document.getElementsByClass("towntr");
        if (CollUtil.isEmpty(townTrList)) return;
        townTrList.forEach(townTr -> {
            final Elements townTds = townTr.getElementsByTag("td");
            final Element townCodeTd = townTds.get(0);
            final String regionTownCode = anchorTextOrOwnText(townCodeTd);
            final String regionTownName = anchorTextOrOwnText(townTds.get(1));
            final String townNextHref = anchorHrefOrNull(townCodeTd);

            String link = null;
            if (townNextHref != null) {
                if (regionTownCode.startsWith("44") || regionTownCode.startsWith("46")) {
                    // These two prefixes link relative to the parent page's directory.
                    final String parentLink = superCell.getLink();
                    link = parentLink.substring(0, parentLink.lastIndexOf("/") + 1) + townNextHref;
                } else {
                    // The province and city directories are taken from the digits after the '/'.
                    final int index = townNextHref.indexOf("/");
                    final String provincePath = townNextHref.substring(index + 1, index + 3);
                    final String cityPath = "/" + townNextHref.substring(index + 3, index + 5) + "/";
                    link = HEADER_URL + provincePath + cityPath + townNextHref;
                }
            }
            // Always assign, so a row without an anchor does not inherit the previous row's link.
            newCell.setLink(link);

            newCell.setCode(regionTownCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionTownName);
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(4, newCell);
            readDataRecursive(newCell);
        });
    }

    /**
     * Stores every "countytr" row of the page as a level-3 row and descends into it.
     *
     * @param document  parsed page
     * @param newCell   scratch cell that is refilled for each row
     * @param superCell parent row; its code becomes the parent code
     */
    private static void readCountyData(Document document, RegionCell newCell, RegionCell superCell) {
        final Elements countyTrList = document.getElementsByClass("countytr");
        if (CollUtil.isEmpty(countyTrList)) return;
        countyTrList.forEach(countyTr -> {
            final Elements countyTds = countyTr.getElementsByTag("td");
            final Element countyCodeTd = countyTds.get(0);
            final String regionCountyCode = anchorTextOrOwnText(countyCodeTd);
            final String regionCountyName = anchorTextOrOwnText(countyTds.get(1));
            final String countyNextHref = anchorHrefOrNull(countyCodeTd);

            String link = null;
            if (countyNextHref != null) {
                // The province directory is the two digits following the '/' of the href.
                final int index = countyNextHref.indexOf('/');
                final String provincePath = countyNextHref.substring(index + 1, index + 3) + "/";
                link = HEADER_URL + provincePath + countyNextHref;
            }
            // Always assign, so a row without an anchor does not inherit the previous row's link.
            newCell.setLink(link);

            newCell.setCode(regionCountyCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionCountyName);
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(3, newCell);
            readDataRecursive(newCell);
        });
    }

    /**
     * Stores every "citytr" row of the page as a level-2 row and descends into it.
     *
     * @param document  parsed page
     * @param newCell   scratch cell that is refilled for each row
     * @param superCell parent row; its code becomes the parent code
     */
    private static void readCityData(Document document, RegionCell newCell, RegionCell superCell) {
        final Elements citytrList = document.getElementsByClass("citytr");
        if (CollUtil.isEmpty(citytrList)) return;
        citytrList.forEach(cityTr -> {
            final Elements cityTds = cityTr.getElementsByTag("td");
            final Element codeTd = cityTds.get(0);
            final String regionCityCode = anchorTextOrOwnText(codeTd);
            final String regionName = anchorTextOrOwnText(cityTds.get(1));
            final String cityNextHref = anchorHrefOrNull(codeTd);

            // Always assign, so a row without an anchor does not inherit the previous row's link.
            newCell.setLink(cityNextHref == null ? null : HEADER_URL + cityNextHref);

            newCell.setCode(regionCityCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionName);
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(2, newCell);
            readDataRecursive(newCell);
        });
    }

    /** Returns the text of the first {@code <a>} inside {@code td}, or the text of {@code td} itself when there is none. */
    private static String anchorTextOrOwnText(Element td) {
        final Elements anchors = td.getElementsByTag("a");
        return CollUtil.isEmpty(anchors) ? td.text() : anchors.get(0).text();
    }

    /** Returns the href of the first {@code <a>} inside {@code td}, or {@code null} when there is none. */
    private static String anchorHrefOrNull(Element td) {
        final Elements anchors = td.getElementsByTag("a");
        return CollUtil.isEmpty(anchors) ? null : anchors.get(0).attr("href");
    }

    /**
     * Inserts the cell into the year's table, or updates the existing row with the same CODE.
     * A {@link SQLException} is printed and otherwise ignored so the crawl continues.
     *
     * @param level value stored in the LEVEL column
     * @param cell  values for the remaining columns
     */
    private static void writeDataToDb(int level, RegionCell cell) {
        try {
            db.insertOrUpdate(Entity
                    .create(TABLE_NAME)
                    .set(CODE, cell.getCode())
                    .set(PARENT_CODE, cell.getParentCode())
                    .set(NAME, cell.getName())
                    .set(LEVEL, level)
                    .set(TYPE_CODE, cell.getTypeCode())
                    .set(LINK, cell.getLink())
                    .set(GEN_TIME, cell.getGenTime()),
                    CODE
            );
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Selects the rows of one level that have a link but no child rows. The statement is:
     * <pre>
     * SELECT
     *  SUPER.*
     * FROM
     *  (SELECT * FROM region2021 WHERE `LEVEL` = 4) AS SUPER
     *  LEFT JOIN (SELECT * FROM region2021) AS SUB ON SUPER.CODE = SUB.PARENT_CODE
     *  WHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL
     * </pre>
     * with the table name and the level substituted.
     *
     * @param db        database handle used for the query
     * @param tableName table to inspect; concatenated into the SQL, so it must be trusted
     * @param level     level of the parent rows to check
     * @return the parent rows that still lack children
     */
    @SneakyThrows
    private static List<RegionCell> queryLostData(
        Db db,
        String tableName,
        int level
    ) {
        String sql =
        "SELECT \n" +
        "\tSUPER.*\n" +
        "FROM \n" +
        "\t(SELECT * FROM " + tableName + " WHERE `LEVEL` = ? ) AS SUPER\n" +
        "\tLEFT JOIN (SELECT * FROM " + tableName + " ) AS SUB ON SUPER.CODE = SUB.PARENT_CODE\n" +
        "\tWHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL";
        return db.query(sql, RegionCell.class, level);
    }

}

  

 2022年7月9日06点08分更新

通过写补偿逻辑发现可以进一步优化代码结构:

1、常量统一存放

package cn.cloud9.constant;

import java.util.Random;

/**
 * Shared constants of the crawler: request timeout, path separator, column names,
 * HTTP header names, the User-Agent pool, the root URL template and a shared
 * {@link Random} instance.
 */
public interface Constant {
    // Value handed to the HTTP timeout setters (presumably milliseconds - confirm against Hutool docs).
    int TIMEOUT = 3000;
    // Separator used when assembling and splitting page URLs.
    String PATH_CHAR = "/";
    // Column names of the region table.
    String CODE = "CODE";
    String PARENT_CODE = "PARENT_CODE";
    String NAME = "NAME";
    String LEVEL = "LEVEL";
    String TYPE_CODE = "TYPE_CODE";
    String LINK = "LINK";
    String GEN_TIME = "GEN_TIME";
    // HTTP request header names.
    String REFERER = "Referer";
    String USER_AGENT = "User-Agent";
    // Pool of User-Agent strings; callers index into it with R.
    String[] BROWSER_AGENTS = {
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    };
    // Root URL template; the ${YEAR} placeholder is replaced with the requested year by the caller.
    String ROOT_PATH = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/${YEAR}/";
    // Shared random source.
    Random R = new Random();
}

  

2、调用的方法封装在Util中

package cn.cloud9.util;

import cn.cloud9.constant.Constant;
import cn.cloud9.po.RegionCell;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.druid.util.StringUtils;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.Map;

import static cn.cloud9.constant.Constant.*;

/**
 * @author OnCloud9
 * @description
 * @project RegionReptile-Remaster
 * @date 2022年07月07日 下午 10:01
 */
public class MyUtil {
    private static Db db = Db.use();
    /**
     * 分配新的请求头Header
     * @return
     */
    public static Map<String, String> getNewRequestHeader() {
        return new HashMap<String, String>(){{
            this.put("Host", "www.stats.gov.cn");
            this.put("Upgrade-Insecure-Requests", "1");
        }};
    }

    /**
     * 初始化表空间
     * @param tableName
     */
    @SneakyThrows
    public static void initialTableSpace(String tableName) {
        String SQL =
            "CREATE TABLE IF NOT EXISTS "+ tableName +" (\n" +
            "  `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码',\n" +
            "  `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码',\n" +
            "  `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称',\n" +
            "  `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT '网页地址',\n" +
            "  `LEVEL` int DEFAULT NULL COMMENT '层级',\n" +
            "  `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型',\n" +
            "  `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间',\n" +
            "  PRIMARY KEY (`CODE`) USING BTREE, \n" +
            "  KEY `IDX_LEVEL` (`LEVEL`) USING BTREE,\n" +
            "  KEY `IDX_PC` (`PARENT_CODE`) USING BTREE \n" +
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;";
        db.execute(SQL, null);
    }

    /**
     * 连接重试
     * @param httpRequest
     * @return
     */
    public static HttpResponse retryConn(HttpRequest httpRequest) {
        HttpResponse httpResponse = null;
        try {
            httpResponse = httpRequest
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT)
                .execute();
        } catch (Exception exception) {
            exception.printStackTrace();
            return retryConn(httpRequest);
        }
        return httpResponse;
    }

    /**
     * 写入操作封装, 不更新,
     * 有CODE主键发生冲突,异常后执行下一个
     * @param tableName
     * @param cell
     */
    @SneakyThrows
    public static void writeDataToDb(String tableName, RegionCell cell) {
        db.insert(Entity
            .create(tableName)
            .set(CODE, cell.getCode())
            .set(PARENT_CODE, cell.getParentCode())
            .set(NAME, cell.getName())
            .set(LEVEL, cell.getLevel())
            .set(TYPE_CODE, cell.getTypeCode())
            .set(LINK, cell.getLink())
            .set(GEN_TIME, cell.getGenTime()));
    }

    /**
     * 村,居委会读取
     * @param document
     * @param newCell
     * @param superCell
     * @param tableName
     */
    public static void readVillage(
        Document document,
        RegionCell newCell,
        RegionCell superCell,
        String tableName
    ) {
        Elements villageTrList = document.getElementsByClass("villagetr");
        if (CollUtil.isEmpty(villageTrList)) return;
        villageTrList.forEach(villageTr -> {
            Elements villageTdList = villageTr.getElementsByTag("td");
            newCell.setCode(villageTdList.get(0).text());
            newCell.setParentCode(superCell.getCode());
            newCell.setName(villageTdList.get(2).text());
            newCell.setTypeCode(villageTdList.get(1).text());
            newCell.setLevel(5);
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(tableName, newCell);
        });
    }

    /**
     * 街道,镇 读取
     * @param BASE_URL
     * @param document
     * @param newCell
     * @param superCell
     * @param tableName
     */
    public static void readTown (
        String BASE_URL,
        Document document,
        RegionCell newCell,
        RegionCell superCell,
        String tableName
    ) {
        Elements townTrList = document.getElementsByClass("towntr");
        if (CollUtil.isEmpty(townTrList)) return;
        townTrList.forEach(townTr -> {

            Elements townTds = townTr.getElementsByTag("td");
            Element townCodeTd = townTds.get(0);
            Element townNameTd = townTds.get(1);
            Elements aTagsInTownCodeTd = townCodeTd.getElementsByTag("a");
            Elements aTagsInTownNameTd = townNameTd.getElementsByTag("a");
            // 取城乡代码
            String regionTownCode;
            String townNextHref = null;

            boolean isTownLinkTag = !CollUtil.isEmpty(aTagsInTownCodeTd);
            if (isTownLinkTag) {
                regionTownCode = aTagsInTownCodeTd.get(0).text();
                townNextHref = aTagsInTownCodeTd.get(0).attr("href");
            } else regionTownCode = townCodeTd.text();
            String regionTownName;
            if (!CollUtil.isEmpty(aTagsInTownNameTd)) regionTownName = aTagsInTownNameTd.get(0).text();
            else regionTownName = townNameTd.text();

            final String codePrefix = regionTownCode.substring(0, 2);
            boolean condition1 = "44".equals(codePrefix) || "46".equals(codePrefix);

            if (condition1 && isTownLinkTag) {
                final String link = superCell.getLink();
                final String basePath = link.substring(0, link.lastIndexOf(Constant.PATH_CHAR) + 1);
                String url = basePath + townNextHref;
                newCell.setLink(url);
            } else if (isTownLinkTag) {
                int index = townNextHref.indexOf(Constant.PATH_CHAR);
                String provincePath = townNextHref.substring(index + 1, index + 3);
                String cityPath = Constant.PATH_CHAR + townNextHref.substring(index + 3, index + 5) + Constant.PATH_CHAR;
                String url = BASE_URL + provincePath + cityPath + townNextHref;
                newCell.setLink(url);
            }

            newCell.setCode(regionTownCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionTownName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(4);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        });
    }

    /**
     * 区县读取
     * @param BASE_URL
     * @param document
     * @param newCell
     * @param superCell
     * @param tableName
     */
    public static void readCounty (
        String BASE_URL,
        Document document,
        RegionCell newCell,
        RegionCell superCell,
        String tableName
    ) {
        Elements countyTrList = document.getElementsByClass("countytr");
        if (CollUtil.isEmpty(countyTrList)) return;
        countyTrList.forEach(countyTr -> {
            Elements countyTds = countyTr.getElementsByTag("td");
            Element countyCodeTd = countyTds.get(0);
            Element countyNameTd = countyTds.get(1);
            Elements aTagsInCountyCodeTd = countyCodeTd.getElementsByTag("a");
            Elements aTagsInCountyNameTd = countyNameTd.getElementsByTag("a");
            // 取城乡代码
            String regionCountyCode;
            String countyNextHref = null;

            boolean isCountyLinkTag = !CollUtil.isEmpty(aTagsInCountyCodeTd);
            if (isCountyLinkTag) {
                regionCountyCode = aTagsInCountyCodeTd.get(0).text();
                countyNextHref = aTagsInCountyCodeTd.get(0).attr("href");
            } else regionCountyCode = countyCodeTd.text();

            String regionCountyName;
            if (!CollUtil.isEmpty(aTagsInCountyNameTd)) regionCountyName = aTagsInCountyNameTd.get(0).text();
            else regionCountyName = countyNameTd.text();

            if (isCountyLinkTag) {
                int index = countyNextHref.indexOf(Constant.PATH_CHAR);
                String provincePath = countyNextHref.substring(index + 1, index + 3) + Constant.PATH_CHAR;
                String countyUrl = BASE_URL + provincePath + countyNextHref;
                newCell.setLink(countyUrl);
            }

            newCell.setCode(regionCountyCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionCountyName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(3);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        });
    }

    /**
     * 城市读取
     * @param BASE_URL
     * @param document
     * @param newCell
     * @param superCell
     * @param tableName
     */
    public static void readCity (
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
    ) {
        final Elements citytrList = document.getElementsByClass("citytr");
        if (CollUtil.isEmpty(citytrList)) return;
        citytrList.forEach(cityTr -> {
            Elements cityTds = cityTr.getElementsByTag("td");
            Element codeTd = cityTds.get(0);
            Element nameTd = cityTds.get(1);
            Elements asTagInCodeTd = codeTd.getElementsByTag("a");
            Elements asTagInNameTd = nameTd.getElementsByTag("a");

            String regionCityCode;
            String cityNextHref = null;

            boolean isLinkTag = !CollUtil.isEmpty(asTagInCodeTd);
            if (isLinkTag) {
                regionCityCode = asTagInCodeTd.get(0).text();
                cityNextHref = asTagInCodeTd.get(0).attr("href");
            } else
                regionCityCode = codeTd.text();
            String regionName;
            if (!CollUtil.isEmpty(asTagInNameTd)) regionName = asTagInNameTd.get(0).text();
            else regionName = nameTd.text();

            if (isLinkTag) newCell.setLink(BASE_URL + cityNextHref);

            newCell.setCode(regionCityCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(2);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        });
    }

    /**
     * 递归请求调用
     * @param regionCell
     * @param BASE_URL
     * @param tableName
     */
    @SneakyThrows
    public static void readDataRecursive(RegionCell regionCell, String BASE_URL, String tableName) {
        final String regionCellLink = regionCell.getLink();
        if (StringUtils.isEmpty(regionCellLink)) return;

        final HttpRequest httpRequest = HttpUtil
                .createGet(regionCellLink)
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT);
        if (!CollUtil.isEmpty(regionCell.getCookies())) {
            final Map<String, String> headers = MyUtil.getNewRequestHeader();
            regionCell.getCookies().forEach( cookie -> headers.put("Cookie", cookie.toString() + ";"));
            headers.put(REFERER,  regionCell.getLink());
            headers.put(USER_AGENT, BROWSER_AGENTS[Constant.R.nextInt(BROWSER_AGENTS.length)]);
            headers.put("Host", "www.stats.gov.cn");
            headers.put("Upgrade-Insecure-Requests", "1");
            httpRequest.addHeaders(headers);
        }

        final HttpResponse httpResponse = retryConn(httpRequest);
        // 封装cookie给下一次请求使用
        final RegionCell.RegionCellBuilder builder = RegionCell.builder();
        builder.cookies(httpResponse.getCookies());
        final RegionCell newCell = builder.build();

        if (!httpResponse.isOk()) {
            System.out.println(httpResponse.body());
            return;
        }
        final Document document = Jsoup.parse(httpResponse.body());
        readCity(BASE_URL, document, newCell, regionCell, tableName);
        readCounty(BASE_URL, document, newCell, regionCell, tableName);
        readTown(BASE_URL, document, newCell, regionCell, tableName);
        readVillage(document, newCell, regionCell, tableName);
        return;
    }

    /**
     * 读取省份数据
     * @param BASE_URL
     * @param tableName
     */
    public static void readProvinceData(String BASE_URL, String tableName) {
        final String s = HttpUtil.get(BASE_URL);
        final Elements provincetrs = Jsoup.parse(s).getElementsByClass("provincetr");
        provincetrs.forEach(tr -> {
            Elements provinceas = tr.getElementsByTag("a");
            provinceas.forEach(a -> {
                RegionCell cell = RegionCell.builder()
                    .name(a.text())
                    .code( a.attr("href").replace(".html", ""))
                    .link(BASE_URL + a.attr("href"))
                    .parentCode(String.valueOf(0))
                    .genTime(LocalDateTime.now())
                    .level(1)
                    .build();
                MyUtil.writeDataToDb(tableName, cell);
                readDataRecursive(cell, BASE_URL, tableName);
            });
        });
    }

}

  

启动类就不用写那么多东西了

package cn.cloud9;

import cn.cloud9.constant.Constant;
import cn.cloud9.util.MyUtil;

/**
 * Entry point of the crawler: creates the table of the requested year and
 * crawls all regions starting from the province index page.
 *
 * @author OnCloud9
 * @description
 * @project RegionReptile-Remaster
 * @date 2022-07-07 21:35
 */
public class MainApplication {

    /**
     * Runs the crawl.
     *
     * @param args optional first element: the four-digit year to crawl; "2009" when absent
     * @throws IllegalArgumentException if the supplied year is not exactly four digits
     */
    public static void main(String[] args) {
        final String year = 0 == args.length ? "2009" : args[0];
        // The year ends up inside a CREATE TABLE statement and a URL, so only plain digits are accepted.
        if (!year.matches("\\d{4}")) {
            throw new IllegalArgumentException("year must be a 4-digit number, got: " + year);
        }
        final String tableName = "region" + year;
        final String BASE_URL = Constant.ROOT_PATH.replace("${YEAR}", year);
        MyUtil.initialTableSpace(tableName);
        MyUtil.readProvinceData(BASE_URL, tableName);
    }
}

  

 

posted @ 2022-06-30 15:42  emdzz  阅读(1029)  评论(0编辑  收藏  举报