jsoup爬取国家统计局全国省市区数据

项目中经常用到全国省市区的数据表,但是这些数据又会经常变动,每次都需要找最新的数据,很麻烦,特此记录一下,用jsoup爬取国家统计局的数据。

1.引入jar包

<!-- jsoup: HTML parser and HTTP client used by the crawler below.
     Version 1.7.3 (2013) predates several published jsoup security fixes;
     the calls used in this article (Jsoup.connect, userAgent, timeout,
     ignoreContentType, get, getElementsByClass, getElementsByTag) are
     still available in this release. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>

2.创建数据表,生成Haha实体类(注意:下文示例代码中使用的实体类名为Area,请与实际生成的类名保持一致)

-- Administrative-division table filled by the crawler: one row per
-- province, city, county or town, linked to its parent through parent_id.
-- NOTE: DROP TABLE discards any previously crawled rows.
DROP TABLE IF EXISTS `haha`;
CREATE TABLE `haha` (
    `id` int NOT NULL AUTO_INCREMENT,
    -- 12-digit statistical division code; left nullable because the crawler
    -- only sets it for provinces whose name is found in its lookup table.
    `code` bigint DEFAULT NULL,
    `name` varchar(255) NOT NULL,
    -- id of the parent row; 0 marks a top-level (province) row.
    `parent_id` int NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`),
    -- Children are always looked up by parent, so index the link column.
    KEY `haha_parent_id_idx` (`parent_id`)
-- No AUTO_INCREMENT seed: a freshly created table should start numbering at 1.
-- utf8mb4 instead of the deprecated 3-byte utf8 alias, so any place name can be stored.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

 

3.实现接口

        // Province-level codes keyed by province name. The index page only exposes
        // province names and links, so the 12-digit codes are looked up from this table.
        // Beijing and Tianjin use their province-level codes (110000000000 / 120000000000);
        // the ...0100000000 values are the city-level codes of the municipal-district rows
        // that are scraped one level below and must not be duplicated here.
        String provinceJson = "[{\"code\":110000000000,\"id\":1,\"name\":\"北京市\",\"parentId\":0}," +
                "{\"code\":120000000000,\"id\":2,\"name\":\"天津市\",\"parentId\":0}," +
                "{\"code\":130000000000,\"id\":3,\"name\":\"河北省\",\"parentId\":0}," +
                "{\"code\":140000000000,\"id\":4,\"name\":\"山西省\",\"parentId\":0}," +
                "{\"code\":150000000000,\"id\":5,\"name\":\"内蒙古自治区\",\"parentId\":0}," +
                "{\"code\":210000000000,\"id\":6,\"name\":\"辽宁省\",\"parentId\":0}," +
                "{\"code\":220000000000,\"id\":7,\"name\":\"吉林省\",\"parentId\":0}," +
                "{\"code\":230000000000,\"id\":8,\"name\":\"黑龙江省\",\"parentId\":0}," +
                "{\"code\":310000000000,\"id\":9,\"name\":\"上海市\",\"parentId\":0}," +
                "{\"code\":320000000000,\"id\":10,\"name\":\"江苏省\",\"parentId\":0}," +
                "{\"code\":330000000000,\"id\":11,\"name\":\"浙江省\",\"parentId\":0}," +
                "{\"code\":340000000000,\"id\":12,\"name\":\"安徽省\",\"parentId\":0}," +
                "{\"code\":350000000000,\"id\":13,\"name\":\"福建省\",\"parentId\":0}," +
                "{\"code\":360000000000,\"id\":14,\"name\":\"江西省\",\"parentId\":0}," +
                "{\"code\":370000000000,\"id\":15,\"name\":\"山东省\",\"parentId\":0}," +
                "{\"code\":410000000000,\"id\":16,\"name\":\"河南省\",\"parentId\":0}," +
                "{\"code\":420000000000,\"id\":17,\"name\":\"湖北省\",\"parentId\":0}," +
                "{\"code\":430000000000,\"id\":18,\"name\":\"湖南省\",\"parentId\":0}," +
                "{\"code\":440000000000,\"id\":19,\"name\":\"广东省\",\"parentId\":0}," +
                "{\"code\":450000000000,\"id\":20,\"name\":\"广西壮族自治区\",\"parentId\":0}," +
                "{\"code\":460000000000,\"id\":21,\"name\":\"海南省\",\"parentId\":0}," +
                "{\"code\":500000000000,\"id\":22,\"name\":\"重庆市\",\"parentId\":0}," +
                "{\"code\":510000000000,\"id\":23,\"name\":\"四川省\",\"parentId\":0}," +
                "{\"code\":520000000000,\"id\":24,\"name\":\"贵州省\",\"parentId\":0}," +
                "{\"code\":530000000000,\"id\":25,\"name\":\"云南省\",\"parentId\":0}," +
                "{\"code\":540000000000,\"id\":26,\"name\":\"西藏自治区\",\"parentId\":0}," +
                "{\"code\":610000000000,\"id\":27,\"name\":\"陕西省\",\"parentId\":0}," +
                "{\"code\":620000000000,\"id\":28,\"name\":\"甘肃省\",\"parentId\":0}," +
                "{\"code\":630000000000,\"id\":29,\"name\":\"青海省\",\"parentId\":0}," +
                "{\"code\":640000000000,\"id\":30,\"name\":\"宁夏回族自治区\",\"parentId\":0}," +
                "{\"code\":650000000000,\"id\":31,\"name\":\"新疆维吾尔自治区\",\"parentId\":0}," +
                "{\"code\":710000000000,\"id\":32,\"name\":\"台湾\",\"parentId\":0}," +
                "{\"code\":810000000000,\"id\":33,\"name\":\"香港特别行政区\",\"parentId\":0}," +
                "{\"code\":820000000000,\"id\":34,\"name\":\"澳门特别行政区\",\"parentId\":0}]\n";

        JSONArray array = JSONArray.parseArray(provinceJson);

        // One base URL for every request, so the index page and the province/city
        // pages always come from the same yearly release of the statistics site.
        final String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";
        final String userAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
        final int timeoutMillis = 6000;

        // Address of the index page listing all provinces
        String url = baseUrl + "index.html";

        // Open a connection to the target address
        Connection connect = Jsoup.connect(url);
        // Set the user agent and timeout, then issue a GET request
        Document document = connect.userAgent(userAgent).
                timeout(timeoutMillis).ignoreContentType(true).get();

        // Rows of the province table
        ListIterator<Element> elements = document.getElementsByClass("provincetr").listIterator();

        while (elements.hasNext()) {
            ListIterator<Element> tds = elements.next().children().listIterator();

            while (tds.hasNext()) {
                Element td = tds.next();
                if (td.children().isEmpty()) {
                    // Cell without a province link; child(0) would throw here.
                    continue;
                }
                Element element = td.child(0);
                String provinceName = element.text();

                Area province = new Area();
                province.setName(provinceName);
                for (int i = 0; i < array.size(); i++) {
                    JSONObject json = array.getJSONObject(i);
                    if (provinceName.equals(json.getString("name"))) {
                        province.setCode(json.getLong("code"));
                        break;
                    }
                }

                province.save();

                String provinceHref = element.attr("href");
                if (provinceHref.isEmpty()) {
                    // No province page to descend into.
                    continue;
                }

                // Province page: one "citytr" row per city. Links on the province
                // page are relative to the release root.
                Document provinceDoc = Jsoup.connect(baseUrl + provinceHref).userAgent(userAgent).
                        timeout(timeoutMillis).ignoreContentType(true).get();
                ListIterator<Element> citys = provinceDoc.getElementsByClass("citytr").listIterator();
                while (citys.hasNext()) {
                    Element cityRow = citys.next();
                    ListIterator<Element> cityCells = cityRow.getElementsByTag("td").listIterator();
                    if (!cityCells.hasNext()) {
                        continue;
                    }

                    // First cell holds the code, the following cell holds the name
                    // (same layout the county/town rows are parsed with below).
                    Area city = new Area();
                    int index = 1;
                    while (cityCells.hasNext()) {
                        Element c = cityCells.next();
                        if (index == 1) {
                            index++;
                            city.setCode(Long.parseLong(c.text().trim()));
                        } else {
                            city.setName(c.text().trim());
                        }
                    }
                    city.setParentId(province.getId());
                    city.save();

                    // Take the link from this row only. A row without a link must not
                    // fall back to the previous row's URL, otherwise that city's
                    // counties would be saved again under the wrong parent.
                    ListIterator<Element> as = cityRow.getElementsByTag("a").listIterator();
                    String cityHref = as.hasNext() ? as.next().attr("href") : "";
                    if (cityHref.isEmpty()) {
                        continue;
                    }

                    Document cityDoc = Jsoup.connect(baseUrl + cityHref).userAgent(userAgent).
                            timeout(timeoutMillis).ignoreContentType(true).get();

                    // A city page lists either counties ("countytr") or, for cities
                    // without a county level, towns ("towntr"). Both row kinds share the
                    // same cell layout and are stored directly under the city.
                    for (String rowClass : new String[]{"countytr", "towntr"}) {
                        ListIterator<Element> rows = cityDoc.getElementsByClass(rowClass).listIterator();
                        while (rows.hasNext()) {
                            ListIterator<Element> cells = rows.next().getElementsByTag("td").listIterator();

                            Area child = new Area();
                            int cellIndex = 1;
                            while (cells.hasNext()) {
                                Element cell = cells.next();
                                if (cellIndex == 1) {
                                    cellIndex++;
                                    child.setCode(Long.parseLong(cell.text().trim()));
                                } else {
                                    child.setName(cell.text().trim());
                                }
                            }
                            child.setParentId(city.getId());
                            child.save();
                        }
                    }
                }
            }
        }

  

 

4.请求接口

http://localhost/demo

 

我就惯着你们。。。(全国省市区sql文件,更新时间2019年10月31日)

链接: https://pan.baidu.com/s/1X2PDK4WL4dMB2UVcdU-_Mw

提取码: atzp

 

posted @ 2021-02-02 11:57  路边一草鞋  阅读(724)  评论(1)  编辑  收藏  举报