Java学习-057-Jsoup爬虫获取中国所有的三级行政区划数据(二),并生成数据库 SQL 脚本插入语句

废话不多说,直接上码,小主您稳着……

  1 package com.fanfengping.zeus.uitl;
  2 
  3 import com.alibaba.fastjson.JSONObject;
  4 import lombok.extern.slf4j.Slf4j;
  5 import org.jsoup.Jsoup;
  6 import org.jsoup.nodes.Document;
  7 import org.jsoup.nodes.Element;
  8 import org.jsoup.select.Elements;
  9 import org.testng.annotations.Test;
 10 
 11 import java.io.File;
 12 import java.io.FileWriter;
 13 import java.util.ArrayList;
 14 import java.util.HashMap;
 15 import java.util.List;
 16 import java.util.Map;
 17 
 18 @Slf4j
 19 public class JsoupGetRegionSql {
 20     @Test
 21     public void getRegionSql () throws Exception {
 22         String url = "http://www.mca.gov.cn/article/sj/xzqh/2019/201901-06/201904301706.html";
 23         String fp = System.getProperty("user.dir") + File.separator + "initRegion.sql";
 24 
 25         int count = 0;
 26 
 27         File file = new File(fp);
 28 
 29         if (file.exists()) {
 30             file.delete();
 31         }
 32 
 33         file.createNewFile();
 34 
 35         FileWriter fileWriter = new FileWriter(file.getName(), true);
 36 
 37         Document doc = Jsoup.connect(url)
 38                 .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
 39                 .header("Accept", "text/html,application/xhtml+xml,application/xmlq=0.9,image/webp,image/apng,*/*q=0.8,application/signed-exchangev=b3")
 40                 .maxBodySize(0)
 41                 .timeout(100000)
 42                 .get();
 43 
 44         Elements trs = doc.select("tr");
 45 
 46         List<Map<String, Object>> adminRegion = new ArrayList<>();
 47         List<Map<String, Object>> adminRegionSec = new ArrayList<>();
 48         List<Map<String, Object>> adminRegionThi = new ArrayList<>();
 49 
 50 
 51         for (Element tr : trs ) {
 52             Elements tds = tr.select("td");
 53 
 54             Map<String, Object> region = new HashMap<>();
 55 
 56             if (tds.size() > 3) {
 57                 String regionCode = tds.get(1).text();
 58                 String regionArea = tds.get(2).text();
 59                 String parentCode = "";
 60 
 61                 if (validCode(regionCode)) {
 62                     int leveType = 2;
 63                     parentCode = regionCode.substring(0,2) + "0000";
 64 
 65                     if (!regionCode.endsWith("00")) {
 66                         leveType = 3;
 67                         parentCode = regionCode.substring(0,4) + "00";
 68                     }
 69 
 70                     if (regionCode.endsWith("0000")) {
 71                         leveType = 1;
 72                         parentCode = "000000";
 73                     }
 74 
 75                     region.put("code", regionCode);
 76                     region.put("region", regionArea);
 77                     region.put("parentCode", parentCode);
 78                     region.put("level", leveType);
 79 
 80                     switch ((Integer) region.get("level")) {
 81                         case 1:
 82                             adminRegion.add(region);
 83                             break;
 84                         case 2:
 85                             adminRegionSec.add(region);
 86                             break;
 87                         default:
 88                             adminRegionThi.add(region);
 89                             break;
 90                     }
 91 
 92                     count++;
 93                     String content = String.format("insert into region_code (code, region, level, parent_code, dtime, note, ctime)" +
 94                             " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());" + System.getProperty("line.separator"), regionCode, regionArea, leveType, parentCode);
 95 
 96                     fileWriter.write(content);
 97                 }
 98             }
 99         }
100 
101         System.out.println("总数量:" + count);
102 
103         System.out.println(fp);
104 
105         fileWriter.close();
106     }
107 
108     public boolean validCode(String code) {
109         try {
110             Integer.parseInt(code);
111             return true;
112         } catch (Exception e) {
113             return false;
114         }
115     }
116 }

  

  控制台输出如下所示:

  

  

  数据库文件截图如下所示:

  

 

 

 

 

  

 

 

  

 

 

posted @ 2019-05-22 00:22  范丰平  Views(546)  Comments(0)  Edit  收藏  举报