【爬虫】Java爬取省市县行政区域统计数据
前言
网上看了好几个用Python写的爬虫,都是用来爬取省市县行政区域统计数据的
官网除了省市县以外,还有区,街道,居委村委层级
https://zhuanlan.zhihu.com/p/512852193
所以自己用Java写一个完整爬取的,之前写过的一版不是很理想
这次换了更轻量的库来重构,逻辑也直观些
依赖库:
Hutool工具库,有Http工具包和DB操作的API
Jsoup解析HTML代码,爬虫标配
Lombok简化PO
放FastJson是考虑可能不用DB存放,直接写JSON文件,在这里没用到
<dependencies> <dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.8.4</version> </dependency> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>8.0.15</version> </dependency> <dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.62</version> </dependency> <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.13.1</version> </dependency> <dependency> <groupId>org.projectlombok</groupId> <artifactId>lombok</artifactId> <version>1.18.10</version> </dependency> <!-- 连接池https://mvnrepository.com/artifact/com.alibaba/druid --> <dependency> <groupId>com.alibaba</groupId> <artifactId>druid</artifactId> <version>1.1.14</version> </dependency> </dependencies>
HutoolDb需要的配置文件:
## db.setting文件 url = jdbc:mysql://localhost:3308/my?serverTimezone=Asia/Shanghai user = root pass = 123456 ## 可选配置 # 是否在日志中显示执行的SQL showSql = true # 是否格式化显示的SQL formatSql = false # 是否显示SQL参数 showParams = true # 打印SQL的日志等级,默认debug,可以是info、warn、error sqlLevel = debug #---------------------------------------------------------------------------------------------------------------- ## 连接池配置项 #———————————————— #版权声明:本文为CSDN博主「soulCoke」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。 #原文链接:https://blog.csdn.net/qq_36328170/article/details/105687633 ## ---------------------------------------------------- Druid # 初始化时建立物理连接的个数。初始化发生在显示调用init方法,或者第一次getConnection时 initialSize = 1 # 最大连接池数量 maxActive = 8 # 最小连接池数量 minIdle = 0 # 获取连接时最大等待时间,单位毫秒。配置了maxWait之后, 缺省启用公平锁,并发效率会有所下降, 如果需要可以通过配置useUnfairLock属性为true使用非公平锁。 maxWait = 0 # 是否缓存preparedStatement,也就是PSCache。 PSCache对支持游标的数据库性能提升巨大,比如说oracle。 在mysql5.5以下的版本中没有PSCache功能,建议关闭掉。作者在5.5版本中使用PSCache,通过监控界面发现PSCache有缓存命中率记录, 该应该是支持PSCache。 poolPreparedStatements = false # 要启用PSCache,必须配置大于0,当大于0时, poolPreparedStatements自动触发修改为true。 在Druid中,不会存在Oracle下PSCache占用内存过多的问题, 可以把这个数值配置大一些,比如说100 maxOpenPreparedStatements = -1 # 用来检测连接是否有效的sql,要求是一个查询语句。 如果validationQuery为null,testOnBorrow、testOnReturn、 testWhileIdle都不会其作用。 validationQuery = SELECT 1 # 申请连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能。 testOnBorrow = true # 归还连接时执行validationQuery检测连接是否有效,做了这个配置会降低性能 testOnReturn = false # 建议配置为true,不影响性能,并且保证安全性。 申请连接的时候检测,如果空闲时间大于 timeBetweenEvictionRunsMillis,执行validationQuery检测连接是否有效。 testWhileIdle = false # 有两个含义: 1) Destroy线程会检测连接的间隔时间 2) testWhileIdle的判断依据,详细看testWhileIdle属性的说明 timeBetweenEvictionRunsMillis = 60000 # 物理连接初始化的时候执行的sql connectionInitSqls = SELECT 1 # 属性类型是字符串,通过别名的方式配置扩展插件, 常用的插件有: 监控统计用的filter:stat 日志用的filter:log4j 防御sql注入的filter:wall # filters = stat # 类型是List<com.alibaba.druid.filter.Filter>, 如果同时配置了filters和proxyFilters, 是组合关系,并非替换关系 # proxyFilters =
表结构:
CREATE TABLE `region2021` ( `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码', `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码', `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称', `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL, `LEVEL` int DEFAULT NULL COMMENT '层级', `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型', `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间', PRIMARY KEY (`CODE`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;
代码部分:
封装的PO类:
package cn.cloud9.rdp.po;

import lombok.*;

import java.time.LocalDateTime;

/**
 * Smallest storage unit of an administrative region: one entry scraped from a
 * listing page, at any level of the hierarchy.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}, constructors and the builder are
 * generated by the Lombok annotations below.
 *
 * @projectName: administrative-region crawler
 * @author: Cloud9
 * @date: 2022-06-29 10:30
 * @version: 1.0
 */
@Data
@EqualsAndHashCode
@AllArgsConstructor
@NoArgsConstructor
@Builder
public class RegionCell {
    /** Region name. */
    private String name;
    /** URL of the page that lists the next level below this region. */
    private String url;
    /** Statistical division code of this region. */
    private String regionCode;
    /** Statistical division code of the parent region. */
    private String parentCode;
    /** Urban-rural classification code. */
    private String typeCode;
    /** Level of the region in the hierarchy. */
    private Integer level;
    /** Creation time of this record. */
    private LocalDateTime genTime;
}
Main启动类:
本来是想用递归写逻辑的,但是每一层的逻辑不是完全一样,所以不采用递归
层级是可以确认的,最底层到village就没有了
package cn.cloud9.rdp;

import cn.cloud9.rdp.po.RegionCell;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.HttpCookie;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

/**
 * Crawls the administrative-division listing rooted at {@link #HEADER_URL} and
 * stores every region found in the {@link #TABLE_NAME} table.
 *
 * <p>The province index is read first; every linked page below it is then
 * walked by {@link #crawlChildren}, which recognises the row classes
 * {@code citytr}, {@code countytr}, {@code towntr} and {@code villagetr} on
 * whatever page it is given.
 *
 * @projectName: administrative-region crawler
 * @author: Cloud9
 * @date: 2022-06-29 09:55
 * @version: 1.0
 */
public class MainApplication {

    public static final String HEADER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
    public static final String COOKIE_KEY = "SF_cookie_1";
    public static final String COOKIE_HEADER_KEY = "Cookie";
    public static final String USER_AGENT = "User-Agent";
    public static final String[] BROWSER_AGENTS = {
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"
    };
    public final static Random R = new Random();
    public static final String TABLE_NAME = "region2021";

    /** Row classes of the child tables; the array index + 2 is the LEVEL stored for that row. */
    private static final String[] ROW_CLASSES = {"citytr", "countytr", "towntr", "villagetr"};

    /** LEVEL of village rows, whose cells are laid out as code / type code / name. */
    private static final int VILLAGE_LEVEL = 5;

    /**
     * Creates a fresh, mutable header map pre-filled with the {@code Host} header.
     *
     * @return a new map on every call; callers may add further headers to it
     */
    public static Map<String, String> getNewRequestHeader() {
        Map<String, String> header = new ConcurrentHashMap<>();
        header.put("Host", "www.stats.gov.cn");
        return header;
    }

    /**
     * Empties the target table, reads the province index and crawls every page
     * linked below it.
     *
     * @param args unused
     * @throws SQLException if the table cannot be truncated
     */
    public static void main(String[] args) throws SQLException {
        Db db = Db.use();
        db.execute("TRUNCATE TABLE " + TABLE_NAME);

        HttpResponse httpResponse = HttpUtil.createGet(HEADER_URL).execute();
        HttpCookie cookie = httpResponse.getCookie(COOKIE_KEY);
        System.out.println(cookie);

        int status = httpResponse.getStatus();
        String body = httpResponse.body();
        System.out.println("省 响应状态 " + status);
        System.out.println("省 响应内容 " + body);
        if (status != 200) {
            System.out.println("爬取异常,程序终止");
            return;
        }

        // The cookie may be missing; an empty string means "send no Cookie header".
        String rootCookies = cookie == null ? "" : cookie.toString();

        // Provinces sit in <tr class="provincetr"> rows, several <a> links per row.
        Elements provinceRows = Jsoup.parse(body).getElementsByClass("provincetr");
        for (int rowIdx = 0; rowIdx < provinceRows.size(); rowIdx++) {
            Elements links = provinceRows.get(rowIdx).getElementsByTag("a");
            for (int linkIdx = 0; linkIdx < links.size(); linkIdx++) {
                Element a = links.get(linkIdx);
                RegionCell province = RegionCell.builder()
                        .name(a.text())
                        .url(HEADER_URL + a.attr("href"))
                        // The index page carries no province code, so one is synthesised
                        // from the 1-based row position followed by the 1-based link position.
                        .regionCode(String.valueOf(rowIdx + 1) + (linkIdx + 1))
                        .parentCode(String.valueOf(0))
                        .level(1)
                        .genTime(LocalDateTime.now())
                        .build();
                System.out.println(province);
                save(db, province);
                crawlChildren(db, province, rootCookies);
            }
        }
        System.out.println("爬取完毕");
    }

    /**
     * Fetches the page linked from {@code parent}, stores every region row found
     * on it and descends into each stored row that has a link of its own.
     *
     * <p>All four row classes are looked for on every page, so a city page that
     * lists towns directly (no county level) is handled as well.
     *
     * @param db           database handle used for the inserts
     * @param parent       region whose child page is read; nothing happens when it has no URL
     * @param cookieHeader value for the {@code Cookie} header, or an empty string for none
     */
    private static void crawlChildren(Db db, RegionCell parent, String cookieHeader) {
        String parentUrl = parent.getUrl();
        if (parentUrl == null) {
            return;
        }

        Map<String, String> header = getNewRequestHeader();
        if (!cookieHeader.isEmpty()) {
            header.put(COOKIE_HEADER_KEY, cookieHeader);
        }
        // Present a randomly chosen browser on every request.
        header.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);

        HttpRequest request = HttpUtil.createGet(parentUrl).addHeaders(header);
        HttpResponse response = request.execute();
        System.out.println(parent.getName() + " | " + parentUrl + " | 请求状态" + response.getStatus());

        // Cookies of this response go to the next level; keep the current ones if none came back.
        String responseCookies = joinCookies(response.getCookies());
        String nextCookies = responseCookies.isEmpty() ? cookieHeader : responseCookies;

        // Child hrefs are resolved against the directory of the page that lists them.
        // For the regular province/city/county/town chain this yields the same URLs
        // as prefixing the province and city digits onto HEADER_URL.
        String baseUrl = parentUrl.substring(0, parentUrl.lastIndexOf('/') + 1);

        Document page = Jsoup.parse(response.body());
        for (int i = 0; i < ROW_CLASSES.length; i++) {
            int level = i + 2;
            for (Element row : page.getElementsByClass(ROW_CLASSES[i])) {
                RegionCell child = parseRow(row, level, parent.getRegionCode(), baseUrl);
                if (child == null) {
                    continue;
                }
                System.out.println(child);
                save(db, child);
                crawlChildren(db, child, nextCookies);
            }
        }
    }

    /**
     * Converts one table row into a {@link RegionCell}.
     *
     * @param row        the {@code <tr>} element
     * @param level      LEVEL to store; {@link #VILLAGE_LEVEL} selects the three-cell village layout
     * @param parentCode code of the region whose page contains the row
     * @param baseUrl    directory URL that the row's href is appended to
     * @return the parsed cell, or {@code null} when the row has fewer cells than its layout needs
     */
    private static RegionCell parseRow(Element row, int level, String parentCode, String baseUrl) {
        boolean village = level == VILLAGE_LEVEL;
        Elements tds = row.getElementsByTag("td");
        if (tds.size() < (village ? 3 : 2)) {
            return null;
        }
        Element codeTd = tds.get(0);
        Element nameTd = tds.get(village ? 2 : 1);

        RegionCell.RegionCellBuilder builder = RegionCell.builder()
                .regionCode(linkTextOrOwnText(codeTd))
                .name(linkTextOrOwnText(nameTd))
                .parentCode(parentCode)
                .level(level)
                .genTime(LocalDateTime.now());
        if (village) {
            builder.typeCode(tds.get(1).text());
        }
        // A region without a next level has no <a> in its code cell and therefore no URL.
        Elements codeLinks = codeTd.getElementsByTag("a");
        if (!CollectionUtil.isEmpty(codeLinks)) {
            builder.url(baseUrl + codeLinks.get(0).attr("href"));
        }
        return builder.build();
    }

    /** Returns the text of the first {@code <a>} inside {@code td}, or the cell's own text when there is none. */
    private static String linkTextOrOwnText(Element td) {
        Elements links = td.getElementsByTag("a");
        return CollectionUtil.isEmpty(links) ? td.text() : links.get(0).text();
    }

    /** Joins the string form of all cookies with "; " so that every cookie is sent, not just the last one. */
    private static String joinCookies(List<HttpCookie> cookies) {
        if (CollectionUtil.isEmpty(cookies)) {
            return "";
        }
        return cookies.stream().map(HttpCookie::toString).collect(Collectors.joining("; "));
    }

    /**
     * Inserts one region row. A failure is reported and the crawl continues,
     * because a single bad row should not end the whole run.
     */
    private static void save(Db db, RegionCell cell) {
        try {
            db.insert(Entity
                    .create(TABLE_NAME)
                    .set("CODE", cell.getRegionCode())
                    .set("PARENT_CODE", cell.getParentCode())
                    .set("NAME", cell.getName())
                    .set("LEVEL", cell.getLevel())
                    .set("TYPE_CODE", cell.getTypeCode())
                    .set("LINK", cell.getUrl())
                    .set("GEN_TIME", cell.getGenTime())
            );
        } catch (SQLException e) {
            System.err.println("Failed to store region " + cell.getRegionCode() + " (" + cell.getName() + ")");
            e.printStackTrace();
        }
    }
}
注意项:
在爬取居委村委层级时存在反爬限制,请求会被阻塞10分钟,程序不会报连接超时
这个问题暂时没找到解决办法,就是这样爬取的效率慢很多
我的思路是想,可不可以判断是否阻塞,如果阻塞就直接重新请求尝试
2022年7月6日22点49分更新:
对请求进行封装,使用递归不停止请求
/**
 * Executes {@code httpRequest} and keeps retrying until a response is obtained.
 *
 * <p>The retry is a loop rather than a recursive call, so a long run of
 * failures cannot exhaust the call stack. A short pause separates attempts so
 * that an immediately failing connection is not retried in a tight spin.
 *
 * @param httpRequest the request to execute; {@code TIMEOUT} is applied to it
 * @return the first response that is obtained without an exception
 * @throws IllegalStateException if the thread is interrupted while waiting to retry
 */
private static HttpResponse retryConn(HttpRequest httpRequest) {
    httpRequest
            .timeout(TIMEOUT)
            .setConnectionTimeout(TIMEOUT)
            .setReadTimeout(TIMEOUT);
    while (true) {
        try {
            return httpRequest.execute();
        } catch (Exception exception) {
            exception.printStackTrace();
        }
        try {
            Thread.sleep(1000L);
        } catch (InterruptedException interrupted) {
            // Restore the flag so callers can see the interruption.
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Interrupted while waiting to retry the request", interrupted);
        }
    }
}
所有的请求都这样改成异常递归执行
// HttpResponse response = HttpUtil.createGet(cell.getUrl()).addHeaders(header).execute(); HttpResponse response = retryConn(HttpUtil.createGet(cell.getLink()).addHeaders(header));
但是发现还是有爆栈的情况:
所以又对应地写了一份补数据的逻辑:
1、补数据直接采用递归实现
2、首先要查询已经爬取的数据,查询那些本该有子节点,但实际为空的记录
3、查询得到之后遍历记录记载的LINK,继续爬取
4、发现44和46两个省份的link规则不一样,要单独做调整(Ctrl + F 输44,搜下面的代码)
5、LEVEL的层级也不能是确定的1 - 2 - 3 - 4 - 5, 所以改用自连接LEFT JOIN,根据上级代码查询
6、逻辑可重复执行
package cn.cloud9.fix;

import cn.cloud9.po.RegionCell;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.druid.util.StringUtils;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.HttpCookie;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Fills in regions that an earlier crawl left without children.
 *
 * <p>For each level 1 to 4 the table is queried for rows that have a LINK but no
 * row pointing at them through PARENT_CODE; every such row is crawled again,
 * recursively. Rows are written with an insert-or-update keyed on CODE, so the
 * fix can be run repeatedly.
 *
 * @author OnCloud9
 * @description
 * @project RegionReptile
 * @date 2022-07-04 21:21
 */
public class DataFixApplication {

    private static final int TIMEOUT = 3000;

    /** Row classes of the child tables; the array index + 2 is the LEVEL stored for that row. */
    private static final String[] ROW_CLASSES = {"citytr", "countytr", "towntr", "villagetr"};

    /** LEVEL of village rows, whose cells are laid out as code / type code / name. */
    private static final int VILLAGE_LEVEL = 5;

    private static String YEAR;
    private static String TABLE_NAME;

    public static final String CODE = "CODE";
    public static final String PARENT_CODE = "PARENT_CODE";
    public static final String NAME = "NAME";
    public static final String LEVEL = "LEVEL";
    public static final String TYPE_CODE = "TYPE_CODE";
    public static final String LINK = "LINK";
    public static final String GEN_TIME = "GEN_TIME";
    public static final String REFERER = "Referer";
    public static final String USER_AGENT = "User-Agent";
    public static final String[] BROWSER_AGENTS = {
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    };
    public final static Random R = new Random();

    private static Db db = Db.use();

    /**
     * Runs {@link #fixData(String)} for every year given on the command line.
     * Without arguments nothing is done.
     *
     * @param args years to fix, for example {@code 2021}
     */
    public static void main(String[] args) {
        for (String year : args) {
            fixData(year);
        }
    }

    /**
     * Re-crawls every region of the given year's table that has a link but no children.
     *
     * @param year four-digit year; selects the table {@code region<year>}
     * @throws IllegalArgumentException if {@code year} is not exactly four digits
     */
    public static void fixData(String year) {
        // The year becomes part of a table name inside SQL text, so only digits are accepted.
        if (year == null || !year.matches("\\d{4}")) {
            throw new IllegalArgumentException("year must consist of exactly four digits: " + year);
        }
        YEAR = year;
        TABLE_NAME = "region" + YEAR;

        int lostTotal = 0;
        for (int level = 1; level <= 4; level++) {
            List<RegionCell> lostCells = queryLostData(db, TABLE_NAME, level);
            lostTotal += lostCells.size();
            lostCells.forEach(DataFixApplication::readDataRecursive);
        }
        if (lostTotal == 0) {
            System.out.println(YEAR + "年数据补完!");
        }
    }

    /**
     * Fetches the page linked from {@code regionCell} and stores and descends
     * into every region row found on it.
     *
     * <p>A failed request is reported and skipped rather than propagated: the
     * region is still without children afterwards, so the next run finds it again.
     *
     * @param regionCell region whose child page is read; ignored when it has no link
     */
    private static void readDataRecursive(RegionCell regionCell) {
        final String link = regionCell.getLink();
        if (StringUtils.isEmpty(link)) {
            return;
        }

        final Map<String, String> headers = new HashMap<>();
        headers.put(REFERER, link);
        headers.put(USER_AGENT, BROWSER_AGENTS[R.nextInt(BROWSER_AGENTS.length)]);
        headers.put("Host", "www.stats.gov.cn");
        headers.put("Upgrade-Insecure-Requests", "1");
        if (!CollUtil.isEmpty(regionCell.getCookies())) {
            // Put all cookies into ONE header value; a put() per cookie would keep only the last.
            final StringBuilder cookieHeader = new StringBuilder();
            regionCell.getCookies().forEach(cookie -> {
                if (cookieHeader.length() > 0) {
                    cookieHeader.append("; ");
                }
                cookieHeader.append(cookie);
            });
            headers.put("Cookie", cookieHeader.toString());
        }

        final HttpRequest httpRequest = HttpUtil
                .createGet(link)
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT)
                .addHeaders(headers);

        final HttpResponse httpResponse;
        try {
            httpResponse = httpRequest.execute();
        } catch (RuntimeException e) {
            System.err.println("Request failed, left for the next run: " + link + " -> " + e);
            return;
        }
        if (!httpResponse.isOk()) {
            return;
        }

        // Cookies of this response are handed to the requests for the next level.
        final List<HttpCookie> cookies = httpResponse.getCookies();
        final Document document = Jsoup.parse(httpResponse.body());
        for (int i = 0; i < ROW_CLASSES.length; i++) {
            readRows(document, ROW_CLASSES[i], i + 2, cookies, regionCell);
        }
    }

    /**
     * Stores every row of class {@code rowClass} found in {@code document} and
     * recurses into each of them.
     *
     * <p>Each row gets its own {@link RegionCell}, so a row without a link
     * cannot pick up the link of the row before it.
     *
     * <p>The link is the row's href appended to the directory of the parent's
     * link. This covers the pages that needed special handling for codes
     * starting with 44 and 46, and for the regular hierarchy it yields the same
     * address as prefixing the province and city digits onto the root URL.
     *
     * @param document  parsed page of {@code superCell}
     * @param rowClass  CSS class of the rows to read
     * @param level     LEVEL to store; {@link #VILLAGE_LEVEL} selects the three-cell village layout
     * @param cookies   cookies to attach to each new cell for its own request
     * @param superCell the parent region; its link must be non-empty
     */
    private static void readRows(
            Document document, String rowClass, int level, List<HttpCookie> cookies, RegionCell superCell
    ) {
        final boolean village = level == VILLAGE_LEVEL;
        final String superLink = superCell.getLink();
        final String basePath = superLink.substring(0, superLink.lastIndexOf("/") + 1);

        for (Element row : document.getElementsByClass(rowClass)) {
            final Elements tds = row.getElementsByTag("td");
            if (tds.size() < (village ? 3 : 2)) {
                continue; // a row with too few cells cannot be parsed
            }
            final Element codeTd = tds.get(0);
            final Element nameTd = tds.get(village ? 2 : 1);

            final RegionCell cell = RegionCell.builder().cookies(cookies).build();
            cell.setCode(linkTextOrOwnText(codeTd));
            cell.setParentCode(superCell.getCode());
            cell.setName(linkTextOrOwnText(nameTd));
            if (village) {
                cell.setTypeCode(tds.get(1).text());
            }
            final Elements codeLinks = codeTd.getElementsByTag("a");
            if (!CollUtil.isEmpty(codeLinks)) {
                cell.setLink(basePath + codeLinks.get(0).attr("href"));
            }
            cell.setGenTime(LocalDateTime.now());

            writeDataToDb(level, cell);
            // Returns immediately for cells without a link.
            readDataRecursive(cell);
        }
    }

    /** Returns the text of the first {@code <a>} inside {@code td}, or the cell's own text when there is none. */
    private static String linkTextOrOwnText(Element td) {
        final Elements links = td.getElementsByTag("a");
        return CollUtil.isEmpty(links) ? td.text() : links.get(0).text();
    }

    /**
     * Inserts the cell, or updates the existing row with the same CODE.
     * A failure is reported and the run continues.
     *
     * @param level LEVEL value to store
     * @param cell  region to store
     */
    private static void writeDataToDb(int level, RegionCell cell) {
        try {
            db.insertOrUpdate(Entity
                    .create(TABLE_NAME)
                    .set(CODE, cell.getCode())
                    .set(PARENT_CODE, cell.getParentCode())
                    .set(NAME, cell.getName())
                    .set(LEVEL, level)
                    .set(TYPE_CODE, cell.getTypeCode())
                    .set(LINK, cell.getLink())
                    .set(GEN_TIME, cell.getGenTime()),
                    CODE
            );
        } catch (SQLException e) {
            System.err.println("Failed to store region " + cell.getCode() + " (" + cell.getName() + ")");
            e.printStackTrace();
        }
    }

    /**
     * Finds the regions of one level that have a link but no child rows.
     *
     * <pre>
     * SELECT SUPER.*
     * FROM (SELECT * FROM region2021 WHERE `LEVEL` = 4) AS SUPER
     * LEFT JOIN (SELECT * FROM region2021) AS SUB ON SUPER.CODE = SUB.PARENT_CODE
     * WHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL
     * </pre>
     *
     * @param db        database handle
     * @param tableName table to inspect; inserted into the SQL text as is
     * @param level     LEVEL of the candidate parent rows
     * @return the rows of that level that lack children
     */
    @SneakyThrows
    private static List<RegionCell> queryLostData(
            Db db, String tableName, int level
    ) {
        String sql = "SELECT \n" +
                "\tSUPER.*\n" +
                "FROM \n" +
                "\t(SELECT * FROM " + tableName + " WHERE `LEVEL` = ? ) AS SUPER\n" +
                "\tLEFT JOIN (SELECT * FROM " + tableName + " ) AS SUB ON SUPER.CODE = SUB.PARENT_CODE\n" +
                "\tWHERE SUB.PARENT_CODE IS NULL AND SUPER.LINK IS NOT NULL";
        return db.query(sql, RegionCell.class, level);
    }
}
2022年7月9日06点08分更新
通过写补偿逻辑,发现可以进一步优化代码结构:
1、常量统一存放
package cn.cloud9.constant; import java.util.Random; public interface Constant { int TIMEOUT = 3000; String PATH_CHAR = "/"; String CODE = "CODE"; String PARENT_CODE = "PARENT_CODE"; String NAME = "NAME"; String LEVEL = "LEVEL"; String TYPE_CODE = "TYPE_CODE"; String LINK = "LINK"; String GEN_TIME = "GEN_TIME"; String REFERER = "Referer"; String USER_AGENT = "User-Agent"; String[] BROWSER_AGENTS = { "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" }; String ROOT_PATH = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/${YEAR}/"; Random R = new Random(); }
2、调用的方法封装在Util中
package cn.cloud9.util;

import cn.cloud9.constant.Constant;
import cn.cloud9.po.RegionCell;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.db.Db;
import cn.hutool.db.Entity;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.druid.util.StringUtils;
import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.Map;

import static cn.cloud9.constant.Constant.*;

/**
 * Crawling helpers: table creation, HTTP retry, row persistence and the
 * per-level page readers that recurse from province pages down to village rows.
 *
 * @author OnCloud9
 * @project RegionReptile-Remaster
 * @date 2022-07-07 22:01
 */
public class MyUtil {

    private static final Db db = Db.use();

    /**
     * Creates a fresh, mutable map pre-filled with the fixed request headers
     * ({@code Host} and {@code Upgrade-Insecure-Requests}).
     *
     * @return a new header map the caller may extend
     */
    public static Map<String, String> getNewRequestHeader() {
        // A plain HashMap instead of double-brace initialization, which would
        // create an anonymous HashMap subclass for every call site.
        final Map<String, String> headers = new HashMap<>();
        headers.put("Host", "www.stats.gov.cn");
        headers.put("Upgrade-Insecure-Requests", "1");
        return headers;
    }

    /**
     * Creates the region table if it does not exist yet.
     *
     * @param tableName name of the table; it is concatenated into the DDL as-is,
     *                  so it must be a trusted identifier
     */
    @SneakyThrows
    public static void initialTableSpace(String tableName) {
        // LINK is declared "DEFAULT NULL COMMENT '网页地址'": the text is the column
        // comment, not a default value, consistent with all other columns.
        final String ddl = "CREATE TABLE IF NOT EXISTS " + tableName + " (\n"
                + " `CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '行政区代码',\n"
                + " `PARENT_CODE` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '上级代码',\n"
                + " `NAME` varchar(24) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '名称',\n"
                + " `LINK` varchar(252) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '网页地址',\n"
                + " `LEVEL` int DEFAULT NULL COMMENT '层级',\n"
                + " `TYPE_CODE` varchar(12) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '类型',\n"
                + " `GEN_TIME` datetime DEFAULT NULL COMMENT '创建时间',\n"
                + " PRIMARY KEY (`CODE`) USING BTREE, \n"
                + " KEY `IDX_LEVEL` (`LEVEL`) USING BTREE,\n"
                + " KEY `IDX_PC` (`PARENT_CODE`) USING BTREE \n"
                + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;";
        db.execute(ddl);
    }

    /**
     * Executes the request, retrying until an attempt completes without throwing.
     *
     * <p>The retry is a loop rather than recursion, so a long outage cannot
     * exhaust the call stack. There is deliberately no upper bound on attempts.
     *
     * @param httpRequest the request to execute
     * @return the response of the first attempt that did not throw
     */
    public static HttpResponse retryConn(HttpRequest httpRequest) {
        while (true) {
            try {
                return httpRequest
                        .timeout(TIMEOUT)
                        .setConnectionTimeout(TIMEOUT)
                        .setReadTimeout(TIMEOUT)
                        .execute();
            } catch (Exception exception) {
                exception.printStackTrace();
            }
        }
    }

    /**
     * Inserts one region row (insert only, never update).
     *
     * <p>A failing insert, e.g. a primary-key conflict on CODE, is reported on
     * stderr and skipped so that the crawl continues with the next row.
     *
     * @param tableName target table
     * @param cell      region data to persist
     */
    public static void writeDataToDb(String tableName, RegionCell cell) {
        final Entity row = Entity
                .create(tableName)
                .set(CODE, cell.getCode())
                .set(PARENT_CODE, cell.getParentCode())
                .set(NAME, cell.getName())
                .set(LEVEL, cell.getLevel())
                .set(TYPE_CODE, cell.getTypeCode())
                .set(LINK, cell.getLink())
                .set(GEN_TIME, cell.getGenTime());
        try {
            db.insert(row);
        } catch (SQLException e) {
            System.err.println("Insert into " + tableName + " skipped for CODE="
                    + cell.getCode() + ": " + e.getMessage());
        }
    }

    /**
     * Returns the text of the first {@code <a>} inside the cell, or the cell's
     * own text when it contains no anchor.
     */
    private static String anchorOrCellText(Element td) {
        final Elements anchors = td.getElementsByTag("a");
        return CollUtil.isEmpty(anchors) ? td.text() : anchors.get(0).text();
    }

    /**
     * Returns the {@code href} of the first {@code <a>} inside the cell, or
     * {@code null} when the cell contains no anchor.
     */
    private static String anchorHref(Element td) {
        final Elements anchors = td.getElementsByTag("a");
        return CollUtil.isEmpty(anchors) ? null : anchors.get(0).attr("href");
    }

    /**
     * Reads the village / neighbourhood-committee rows ({@code villagetr}) of a
     * page and stores each as a level-5 record.
     *
     * @param document  parsed page
     * @param newCell   reusable cell that is filled per row and written out
     * @param superCell cell of the page being read; its code becomes the parent code
     * @param tableName target table
     */
    public static void readVillage(
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
    ) {
        Elements villageTrList = document.getElementsByClass("villagetr");
        if (CollUtil.isEmpty(villageTrList)) return;

        for (Element villageTr : villageTrList) {
            Elements villageTdList = villageTr.getElementsByTag("td");
            // Column order on the page: code, type code, name.
            newCell.setCode(villageTdList.get(0).text());
            newCell.setParentCode(superCell.getCode());
            newCell.setName(villageTdList.get(2).text());
            newCell.setTypeCode(villageTdList.get(1).text());
            newCell.setLevel(5);
            newCell.setGenTime(LocalDateTime.now());
            writeDataToDb(tableName, newCell);
        }
    }

    /**
     * Reads the street / town rows ({@code towntr}) of a page, stores each as a
     * level-4 record and recurses into its link.
     *
     * @param BASE_URL  root URL of the crawled year
     * @param document  parsed page
     * @param newCell   reusable cell that is filled per row and written out
     * @param superCell cell of the page being read; its code becomes the parent code
     * @param tableName target table
     */
    public static void readTown(
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
    ) {
        Elements townTrList = document.getElementsByClass("towntr");
        if (CollUtil.isEmpty(townTrList)) return;

        for (Element townTr : townTrList) {
            Elements townTds = townTr.getElementsByTag("td");
            Element townCodeTd = townTds.get(0);

            String regionTownCode = anchorOrCellText(townCodeTd);
            String townNextHref = anchorHref(townCodeTd);
            String regionTownName = anchorOrCellText(townTds.get(1));

            String url = null;
            if (townNextHref != null) {
                final String codePrefix = regionTownCode.substring(0, 2);
                if ("44".equals(codePrefix) || "46".equals(codePrefix)) {
                    // Codes starting with 44 / 46: resolve the href against the
                    // directory of the page it was found on.
                    final String link = superCell.getLink();
                    final String basePath = link.substring(0, link.lastIndexOf(Constant.PATH_CHAR) + 1);
                    url = basePath + townNextHref;
                } else {
                    // Otherwise rebuild the province and city path segments from
                    // the digits that follow the first slash of the href.
                    int index = townNextHref.indexOf(Constant.PATH_CHAR);
                    String provincePath = townNextHref.substring(index + 1, index + 3);
                    String cityPath = Constant.PATH_CHAR
                            + townNextHref.substring(index + 3, index + 5)
                            + Constant.PATH_CHAR;
                    url = BASE_URL + provincePath + cityPath + townNextHref;
                }
            }

            // Always assign the link (null when the row has no anchor) so a link
            // left over from a previous row is never stored or followed again.
            newCell.setLink(url);
            newCell.setCode(regionTownCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionTownName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(4);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        }
    }

    /**
     * Reads the county / district rows ({@code countytr}) of a page, stores each
     * as a level-3 record and recurses into its link.
     *
     * @param BASE_URL  root URL of the crawled year
     * @param document  parsed page
     * @param newCell   reusable cell that is filled per row and written out
     * @param superCell cell of the page being read; its code becomes the parent code
     * @param tableName target table
     */
    public static void readCounty(
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
    ) {
        Elements countyTrList = document.getElementsByClass("countytr");
        if (CollUtil.isEmpty(countyTrList)) return;

        for (Element countyTr : countyTrList) {
            Elements countyTds = countyTr.getElementsByTag("td");
            Element countyCodeTd = countyTds.get(0);

            String regionCountyCode = anchorOrCellText(countyCodeTd);
            String countyNextHref = anchorHref(countyCodeTd);
            String regionCountyName = anchorOrCellText(countyTds.get(1));

            String countyUrl = null;
            if (countyNextHref != null) {
                // The two digits after the first slash of the href are used as
                // the province path segment.
                int index = countyNextHref.indexOf(Constant.PATH_CHAR);
                String provincePath = countyNextHref.substring(index + 1, index + 3) + Constant.PATH_CHAR;
                countyUrl = BASE_URL + provincePath + countyNextHref;
            }

            // Reset the link for rows without an anchor; see readTown.
            newCell.setLink(countyUrl);
            newCell.setCode(regionCountyCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionCountyName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(3);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        }
    }

    /**
     * Reads the city rows ({@code citytr}) of a page, stores each as a level-2
     * record and recurses into its link.
     *
     * @param BASE_URL  root URL of the crawled year
     * @param document  parsed page
     * @param newCell   reusable cell that is filled per row and written out
     * @param superCell cell of the page being read; its code becomes the parent code
     * @param tableName target table
     */
    public static void readCity(
            String BASE_URL,
            Document document,
            RegionCell newCell,
            RegionCell superCell,
            String tableName
    ) {
        final Elements citytrList = document.getElementsByClass("citytr");
        if (CollUtil.isEmpty(citytrList)) return;

        for (Element cityTr : citytrList) {
            Elements cityTds = cityTr.getElementsByTag("td");
            Element codeTd = cityTds.get(0);

            String regionCityCode = anchorOrCellText(codeTd);
            String cityNextHref = anchorHref(codeTd);
            String regionName = anchorOrCellText(cityTds.get(1));

            // Reset the link for rows without an anchor; see readTown.
            newCell.setLink(cityNextHref == null ? null : BASE_URL + cityNextHref);
            newCell.setCode(regionCityCode);
            newCell.setParentCode(superCell.getCode());
            newCell.setName(regionName);
            newCell.setGenTime(LocalDateTime.now());
            newCell.setLevel(2);
            writeDataToDb(tableName, newCell);
            readDataRecursive(newCell, BASE_URL, tableName);
        }
    }

    /**
     * Fetches the page behind {@code regionCell}'s link and hands the parsed
     * document to the city, county, town and village readers, which in turn call
     * back into this method for every row that carries a link.
     *
     * <p>Returns immediately when the cell has no link. A non-OK response is
     * printed to stdout and the page is skipped.
     *
     * @param regionCell cell whose link is fetched; it is the parent of all rows found
     * @param BASE_URL   root URL of the crawled year
     * @param tableName  target table
     */
    public static void readDataRecursive(RegionCell regionCell, String BASE_URL, String tableName) {
        final String regionCellLink = regionCell.getLink();
        if (StringUtils.isEmpty(regionCellLink)) return;

        final HttpRequest httpRequest = HttpUtil
                .createGet(regionCellLink)
                .timeout(TIMEOUT)
                .setConnectionTimeout(TIMEOUT)
                .setReadTimeout(TIMEOUT);

        // Extra headers are only attached when cookies from the previous response exist.
        if (!CollUtil.isEmpty(regionCell.getCookies())) {
            // Host and Upgrade-Insecure-Requests are already part of the base map.
            final Map<String, String> headers = MyUtil.getNewRequestHeader();

            // Send every cookie in one header value; putting them one by one
            // under the same key would keep only the last cookie.
            final StringBuilder cookieHeader = new StringBuilder();
            for (Object cookie : regionCell.getCookies()) {
                if (cookieHeader.length() > 0) cookieHeader.append(' ');
                cookieHeader.append(cookie).append(';');
            }
            headers.put("Cookie", cookieHeader.toString());
            headers.put(REFERER, regionCellLink);
            headers.put(USER_AGENT, BROWSER_AGENTS[Constant.R.nextInt(BROWSER_AGENTS.length)]);
            httpRequest.addHeaders(headers);
        }

        final HttpResponse httpResponse = retryConn(httpRequest);

        // Carry the response cookies in the child cell so the next request can send them.
        final RegionCell newCell = RegionCell.builder()
                .cookies(httpResponse.getCookies())
                .build();

        if (!httpResponse.isOk()) {
            System.out.println(httpResponse.body());
            return;
        }

        final Document document = Jsoup.parse(httpResponse.body());
        readCity(BASE_URL, document, newCell, regionCell, tableName);
        readCounty(BASE_URL, document, newCell, regionCell, tableName);
        readTown(BASE_URL, document, newCell, regionCell, tableName);
        readVillage(document, newCell, regionCell, tableName);
    }

    /**
     * Reads the province rows ({@code provincetr}) of the root page, stores each
     * province as a level-1 record with parent code {@code "0"} and recurses
     * into its page.
     *
     * @param BASE_URL  root URL of the crawled year; also the page that is fetched
     * @param tableName target table
     */
    public static void readProvinceData(String BASE_URL, String tableName) {
        final String html = HttpUtil.get(BASE_URL);
        final Elements provincetrs = Jsoup.parse(html).getElementsByClass("provincetr");

        for (Element tr : provincetrs) {
            for (Element a : tr.getElementsByTag("a")) {
                final String href = a.attr("href");
                RegionCell cell = RegionCell.builder()
                        .name(a.text())
                        // The province code is the href without its ".html" suffix.
                        .code(href.replace(".html", ""))
                        .link(BASE_URL + href)
                        .parentCode(String.valueOf(0))
                        .genTime(LocalDateTime.now())
                        .level(1)
                        .build();
                MyUtil.writeDataToDb(tableName, cell);
                readDataRecursive(cell, BASE_URL, tableName);
            }
        }
    }
}
启动类就不用写那么多东西了
package cn.cloud9;

import cn.cloud9.constant.Constant;
import cn.cloud9.util.MyUtil;

/**
 * Entry point of the region crawler.
 *
 * @author OnCloud9
 * @project RegionReptile-Remaster
 * @date 2022-07-07 21:35
 */
public class MainApplication {

    /**
     * Crawls the region data of one year into the table {@code region<year>}.
     *
     * @param args optional; {@code args[0]} is the four-digit year to crawl,
     *             defaulting to {@code "2009"} when no argument is given
     * @throws IllegalArgumentException if the year is not exactly four digits
     */
    public static void main(String[] args) {
        final String year = 0 == args.length ? "2009" : args[0];

        // The year ends up inside a table name (concatenated into SQL) and a URL,
        // so reject anything that is not a plain four-digit number.
        if (!year.matches("\\d{4}")) {
            throw new IllegalArgumentException("Year must be exactly four digits, got: " + year);
        }

        String tableName = "region" + year;
        String BASE_URL = Constant.ROOT_PATH.replace("${YEAR}", year);

        MyUtil.initialTableSpace(tableName);
        MyUtil.readProvinceData(BASE_URL, tableName);
    }
}