Java学习-057-Jsoup爬虫获取中国所有的三级行政区划数据(二),并生成数据库 SQL 脚本插入语句
不多废话,直接上码,小主您稳着……
package com.fanfengping.zeus.uitl;

import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.testng.annotations.Test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Scrapes an HTML table of administrative region codes and writes one SQL
 * {@code insert} statement per region into {@code initRegion.sql} in the
 * current working directory ({@code user.dir}).
 *
 * <p>For every table row with more than three cells, the second cell is read
 * as the region code and the third cell as the region name. Rows whose code is
 * not exactly six ASCII digits are skipped.
 */
@Slf4j
public class JsoupGetRegionSql {

    /** Page whose table rows are scraped. */
    private static final String SOURCE_URL =
            "http://www.mca.gov.cn/article/sj/xzqh/2019/201901-06/201904301706.html";

    /** Name of the generated script, created under {@code user.dir}. */
    private static final String OUTPUT_FILE_NAME = "initRegion.sql";

    /** Length of a region code as consumed by the level/parent derivation below. */
    private static final int CODE_LENGTH = 6;

    /** Parent code assigned to level-1 regions. */
    private static final String ROOT_PARENT_CODE = "000000";

    /** Statement template; arguments are code, region name, level, parent code. */
    private static final String INSERT_TEMPLATE =
            "insert into region_code (code, region, level, parent_code, dtime, note, ctime)"
                    + " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());";

    /**
     * Downloads the source page, derives level and parent code for every valid
     * row, and writes the resulting insert statements to the output file.
     * Prints the number of generated statements and the output path to stdout.
     *
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    @Test
    public void getRegionSql() throws Exception {
        String fp = System.getProperty("user.dir") + File.separator + OUTPUT_FILE_NAME;
        Path target = Paths.get(fp);

        // Fetch before touching the file so a network failure does not leave
        // an empty or truncated script behind.
        Document doc = Jsoup.connect(SOURCE_URL)
                // The ';' separators are required for these header values to be well-formed.
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
                .maxBodySize(0)
                .timeout(100000)
                .get();

        int count = 0;

        // Default open options create the file or truncate an existing one, so
        // no separate delete/create step is needed. UTF-8 is explicit because
        // the statements contain non-ASCII text. try-with-resources guarantees
        // the writer is closed even if a write fails.
        try (BufferedWriter writer = Files.newBufferedWriter(target, StandardCharsets.UTF_8)) {
            for (Element tr : doc.select("tr")) {
                Elements tds = tr.select("td");
                if (tds.size() <= 3) {
                    continue;
                }

                String regionCode = tds.get(1).text();
                String regionArea = tds.get(2).text();
                if (!isRegionCode(regionCode)) {
                    continue;
                }

                writer.write(toInsertStatement(regionCode, regionArea));
                writer.write(System.lineSeparator());
                count++;
            }
        }

        System.out.println("总数量:" + count);
        System.out.println(fp);
    }

    /**
     * Reports whether the given text can be parsed as an {@code int}.
     *
     * @param code text to check; {@code null} yields {@code false}
     * @return {@code true} if {@link Integer#parseInt(String)} accepts the text
     */
    public boolean validCode(String code) {
        try {
            Integer.parseInt(code);
            return true;
        } catch (NumberFormatException e) {
            // Not an integer (or null): simply not a valid code.
            return false;
        }
    }

    /** Returns true only for strings made of exactly six ASCII digits. */
    private static boolean isRegionCode(String code) {
        if (code == null || code.length() != CODE_LENGTH) {
            return false;
        }
        for (int i = 0; i < code.length(); i++) {
            char c = code.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /** Derives the level: 1 if the code ends with "0000", 2 if it ends with "00", else 3. */
    private static int levelOf(String code) {
        if (code.endsWith("0000")) {
            return 1;
        }
        return code.endsWith("00") ? 2 : 3;
    }

    /** Derives the parent code by zero-filling the trailing digits for the given level. */
    private static String parentCodeOf(String code, int level) {
        switch (level) {
            case 1:
                return ROOT_PARENT_CODE;
            case 2:
                return code.substring(0, 2) + "0000";
            default:
                return code.substring(0, 4) + "00";
        }
    }

    /** Builds the insert statement for one region, without a trailing line separator. */
    private static String toInsertStatement(String code, String name) {
        int level = levelOf(code);
        String parentCode = parentCodeOf(code, level);
        // The name comes from a scraped page and is placed inside a quoted SQL
        // literal, so embedded single quotes are doubled to keep it well-formed.
        String safeName = name.replace("'", "''");
        return String.format(INSERT_TEMPLATE, code, safeName, level, parentCode);
    }
}
控制台输出如下所示:
数据库文件截图如下所示:
欢迎 【 留言 || 关注 || 打赏 】 。您的每一份心意都是对我的鼓励和支持!非常感谢!欢迎互加,相互交流学习!
作者:范丰平,本文链接:https://www.cnblogs.com/fengpingfan/p/10903440.html
Copyright @范丰平 版权所有,如需转载请标明本文原始链接出处,严禁商业用途! 我的个人博客链接地址:http://www.cnblogs.com/fengpingfan