jsoup爬取国家统计局全国省市区数据
项目中经常用到全国省市区的数据表,但是这些数据又会经常变动,每次都需要去找最新的数据,很麻烦,特此记录一下:用jsoup爬取国家统计局的数据。
1.引入jar包
<!-- jsoup: HTML fetching and parsing library used by the crawler below.
     Upgraded from 1.7.3, which predates the fix for the parser
     denial-of-service issue CVE-2021-37714 (fixed in 1.14.2). -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
2.创建数据表 haha,并生成对应的实体类(下文代码中使用的实体类为 Area)
-- Administrative-division table: one row per province, city, county or town.
-- Rows form a tree through parent_id.
DROP TABLE IF EXISTS `haha`;
CREATE TABLE `haha` (
  `id` int NOT NULL AUTO_INCREMENT,
  -- 12-digit statistical division code; NULL when no code is known.
  `code` bigint DEFAULT NULL,
  `name` varchar(255) NOT NULL,
  -- id of the parent row; 0 marks a top-level (province) row.
  `parent_id` int NOT NULL DEFAULT 0,
  PRIMARY KEY (`id`),
  -- Children-of-parent lookups are the main access path for a region tree.
  KEY `haha_parent_id_idx` (`parent_id`),
  KEY `haha_code_idx` (`code`)
  -- No AUTO_INCREMENT seed: a freshly created table should start ids at 1.
  -- utf8mb4 instead of utf8 (3-byte) so rare characters in place names can be stored.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
3.实现接口
// Crawls the National Bureau of Statistics division-code pages and stores
// provinces, cities and their counties/towns as Area rows linked by parentId.
//
// Province codes keyed by name. The province index page is only read for
// names and links here, so the 12-digit codes are looked up from this list.
String provinceJson = "[{\"code\":110000000000,\"id\":1,\"name\":\"北京市\",\"parentId\":0},"
        + "{\"code\":120000000000,\"id\":2,\"name\":\"天津市\",\"parentId\":0},"
        + "{\"code\":130000000000,\"id\":3,\"name\":\"河北省\",\"parentId\":0},"
        + "{\"code\":140000000000,\"id\":4,\"name\":\"山西省\",\"parentId\":0},"
        + "{\"code\":150000000000,\"id\":5,\"name\":\"内蒙古自治区\",\"parentId\":0},"
        + "{\"code\":210000000000,\"id\":6,\"name\":\"辽宁省\",\"parentId\":0},"
        + "{\"code\":220000000000,\"id\":7,\"name\":\"吉林省\",\"parentId\":0},"
        + "{\"code\":230000000000,\"id\":8,\"name\":\"黑龙江省\",\"parentId\":0},"
        + "{\"code\":310000000000,\"id\":9,\"name\":\"上海市\",\"parentId\":0},"
        + "{\"code\":320000000000,\"id\":10,\"name\":\"江苏省\",\"parentId\":0},"
        + "{\"code\":330000000000,\"id\":11,\"name\":\"浙江省\",\"parentId\":0},"
        + "{\"code\":340000000000,\"id\":12,\"name\":\"安徽省\",\"parentId\":0},"
        + "{\"code\":350000000000,\"id\":13,\"name\":\"福建省\",\"parentId\":0},"
        + "{\"code\":360000000000,\"id\":14,\"name\":\"江西省\",\"parentId\":0},"
        + "{\"code\":370000000000,\"id\":15,\"name\":\"山东省\",\"parentId\":0},"
        + "{\"code\":410000000000,\"id\":16,\"name\":\"河南省\",\"parentId\":0},"
        + "{\"code\":420000000000,\"id\":17,\"name\":\"湖北省\",\"parentId\":0},"
        + "{\"code\":430000000000,\"id\":18,\"name\":\"湖南省\",\"parentId\":0},"
        + "{\"code\":440000000000,\"id\":19,\"name\":\"广东省\",\"parentId\":0},"
        + "{\"code\":450000000000,\"id\":20,\"name\":\"广西壮族自治区\",\"parentId\":0},"
        + "{\"code\":460000000000,\"id\":21,\"name\":\"海南省\",\"parentId\":0},"
        + "{\"code\":500000000000,\"id\":22,\"name\":\"重庆市\",\"parentId\":0},"
        + "{\"code\":510000000000,\"id\":23,\"name\":\"四川省\",\"parentId\":0},"
        + "{\"code\":520000000000,\"id\":24,\"name\":\"贵州省\",\"parentId\":0},"
        + "{\"code\":530000000000,\"id\":25,\"name\":\"云南省\",\"parentId\":0},"
        + "{\"code\":540000000000,\"id\":26,\"name\":\"西藏自治区\",\"parentId\":0},"
        + "{\"code\":610000000000,\"id\":27,\"name\":\"陕西省\",\"parentId\":0},"
        + "{\"code\":620000000000,\"id\":28,\"name\":\"甘肃省\",\"parentId\":0},"
        + "{\"code\":630000000000,\"id\":29,\"name\":\"青海省\",\"parentId\":0},"
        + "{\"code\":640000000000,\"id\":30,\"name\":\"宁夏回族自治区\",\"parentId\":0},"
        + "{\"code\":650000000000,\"id\":31,\"name\":\"新疆维吾尔自治区\",\"parentId\":0},"
        + "{\"code\":710000000000,\"id\":32,\"name\":\"台湾\",\"parentId\":0},"
        + "{\"code\":810000000000,\"id\":33,\"name\":\"香港特别行政区\",\"parentId\":0},"
        + "{\"code\":820000000000,\"id\":34,\"name\":\"澳门特别行政区\",\"parentId\":0}]\n";
JSONArray array = JSONArray.parseArray(provinceJson);

// Entry page of the listing. Every sub-page URL is resolved against the page
// it was found on (absUrl), so all levels come from the same yearly release.
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";
String userAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
int timeoutMillis = 6000;

// Fetch the index page with a browser-like user agent and a request timeout.
Connection connect = Jsoup.connect(url);
Document document = connect.userAgent(userAgent)
        .timeout(timeoutMillis).ignoreContentType(true).get();

// Each "provincetr" row holds several provinces, one link per table cell.
for (Element provinceRow : document.getElementsByClass("provincetr")) {
    // Iterating the links (rather than child(0) of every cell) skips empty cells
    // instead of failing on them.
    for (Element provinceLink : provinceRow.getElementsByTag("a")) {
        String provinceName = provinceLink.text();
        Area province = new Area();
        province.setName(provinceName);
        for (int i = 0; i < array.size(); i++) {
            JSONObject json = array.getJSONObject(i);
            if (provinceName.equals(json.getString("name"))) {
                province.setCode(json.getLong("code"));
                break;
            }
        }
        province.save();

        String provinceUrl = provinceLink.absUrl("href");
        if (provinceUrl.isEmpty()) {
            // Link could not be resolved; nothing to crawl below this province.
            continue;
        }
        Document provincePage = Jsoup.connect(provinceUrl).userAgent(userAgent)
                .timeout(timeoutMillis).ignoreContentType(true).get();

        for (Element cityRow : provincePage.getElementsByClass("citytr")) {
            Area city = new Area();
            String cityUrl = null;
            boolean codeRead = false;
            // First link in the row carries the code; any later link carries the
            // name and points at the city's own page.
            for (Element cityLink : cityRow.getElementsByTag("a")) {
                if (!codeRead) {
                    codeRead = true;
                    city.setCode(Long.parseLong(cityLink.text().trim()));
                } else {
                    cityUrl = cityLink.absUrl("href");
                    city.setName(cityLink.text().trim());
                }
            }
            city.setParentId(province.getId());
            city.save();

            if (cityUrl == null || cityUrl.isEmpty()) {
                // No page for this city: skip instead of re-crawling the previous
                // city's page and attaching its rows to the wrong parent.
                continue;
            }
            Document cityPage = Jsoup.connect(cityUrl).userAgent(userAgent)
                    .timeout(timeoutMillis).ignoreContentType(true).get();

            // County rows are stored first, then town rows; both become direct
            // children of the city. NOTE(review): town rows presumably only show
            // up here for cities without a county level — confirm against the site.
            for (String rowClass : new String[] {"countytr", "towntr"}) {
                for (Element row : cityPage.getElementsByClass(rowClass)) {
                    Area county = new Area();
                    boolean firstCell = true;
                    // Cells are read instead of links: first cell is the code,
                    // the last remaining cell wins as the name.
                    for (Element cell : row.getElementsByTag("td")) {
                        if (firstCell) {
                            firstCell = false;
                            county.setCode(Long.parseLong(cell.text().trim()));
                        } else {
                            county.setName(cell.text().trim());
                        }
                    }
                    county.setParentId(city.getId());
                    county.save();
                }
            }
        }
    }
}
4.请求接口
http://localhost/demo
我就惯着你们。。。(全国省市区sql文件,更新时间2019年10月31日)
链接: https://pan.baidu.com/s/1X2PDK4WL4dMB2UVcdU-_Mw
提取码: atzp