获取省市区镇爬虫

 注意:需要分批导入数据库!(下面的示例在 main 中按区县逐批获取乡镇数据,每处理完一批即清空列表。)

 

  1 package com.mock.utils;
  2 
  3 import java.io.IOException;
  4 import java.net.MalformedURLException;
  5 import java.util.ArrayList;
  6 import java.util.List;
  7 
  8 import org.jsoup.Jsoup;
  9 import org.jsoup.nodes.Document;
 10 import org.jsoup.nodes.Element;
 11 import org.jsoup.select.Elements;
 12 
 13 import com.gargoylesoftware.htmlunit.BrowserVersion;
 14 import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 15 import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
 16 import com.gargoylesoftware.htmlunit.WebClient;
 17 import com.gargoylesoftware.htmlunit.WebClientOptions;
 18 import com.gargoylesoftware.htmlunit.html.HtmlPage;
 19 import com.justsy.army.mgt.mock.model.City;
 20 
 21 public class NationalBureauOfStatics {
 22     private static final String ADDRESS = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
 23     private static final String fix = ".html";
 24 
 25     public static void main(String[] args) {
 26         List<City> provinceList = new ArrayList<>();
 27         List<City> cityList = new ArrayList<>();
 28         List<City> countyList = new ArrayList<>();
 29         List<City> townList = new ArrayList<>();
 30         provinceList = getTVMall(provinceList, new City(), ADDRESS, 0);
 31         for (City city : provinceList) {
 32             cityList = getTVMall(cityList, city, city.getHtmlAddr(), 1);
 33         }
 34         for (City city : cityList) {
 35             countyList = getTVMall(countyList, city, city.getHtmlAddr(), 2);
 36         }
 37         for (City city : countyList) {
 38             townList = getTVMall(townList, city, city.getHtmlAddr(), 3);
 39 
 40         for (City item: townList) {
 41             System.out.println(item.toString());
 42         }
 43         townList .clear();
 44         }
 45     }
 46 
 47     public static List<City> getTVMall(List<City> list, City city, String address, int type) {
 48         WebClient webClient = new WebClient(BrowserVersion.CHROME);
 49         // webclient参数载体
 50         WebClientOptions clientOptions = webClient.getOptions();
 51         // 设置webClient的相关参数
 52         clientOptions.setJavaScriptEnabled(true);
 53         clientOptions.setCssEnabled(false);
 54         webClient.setAjaxController(new NicelyResynchronizingAjaxController());
 55         clientOptions.setTimeout(35000);
 56         clientOptions.setThrowExceptionOnScriptError(false);
 57         try {
 58             HtmlPage htmlPage = webClient.getPage(address);
 59             Document dom = Jsoup.parse(htmlPage.asXml());
 60             Elements ele = null;
 61             if (type == 0) {
 62                 ele = dom.getElementsByClass("provincetable");
 63             } else if (type == 1) {
 64                 ele = dom.getElementsByClass("citytable");
 65             } else if (type == 2) {
 66                 ele = dom.getElementsByClass("countytable");
 67             } else if (type == 3) {
 68                 ele = dom.getElementsByClass("towntable");
 69             }
 70             dom = Jsoup.parse(ele.toString());
 71             ele = dom.getElementsByTag("tr");
 72             if (ele != null) {
 73                 getList(list, ele, city, type);
 74             }
 75         } catch (FailingHttpStatusCodeException e) {
 76             e.printStackTrace();
 77         } catch (MalformedURLException e) {
 78             e.printStackTrace();
 79         } catch (IOException e) {
 80             e.printStackTrace();
 81         }
 82         return list;
 83     }
 84 
 85     private static List<City> getList(List<City> list, Elements ele, City city, int type) {
 86         if (type == 0) {
 87             for (int i = 3; i < ele.size(); i++) {
 88                 Element item = ele.get(i);
 89                 Elements aElements = item.getElementsByTag("a");
 90                 for (int j = 0; j < aElements.size(); j++) {
 91                     City c = new City();
 92                     String html = aElements.get(j).attr("href");
 93                     String name = aElements.get(j).text();
 94                     c.setProvince(name);
 95                     c.setHtmlAddr(ADDRESS + html);
 96                     c.setCode(html.replace(fix, "0000000000"));
 97                     list.add(c);
 98                 }
 99             }
100             return list;
101         }
102         for (int i = 0; i < ele.size(); i++) {
103             Element item = ele.get(i);
104             Elements aElements = item.getElementsByTag("a");
105             if (aElements.size() > 0) {
106                 City c = new City();
107                 String html = aElements.get(0).attr("href");
108                 String code = aElements.get(0).text();
109                 String name = aElements.get(1).text();
110                 if (type == 1) {
111                     c.setProvince(city.getProvince());
112                     c.setCity(name);
113                 } else if (type == 2) {
114                     c.setProvince(city.getProvince());
115                     c.setCity(city.getCity());
116                     c.setCounty(name);
117                 } else if (type == 3) {
118                     c.setProvince(city.getProvince());
119                     c.setCity(city.getCity());
120                     c.setCounty(city.getCounty());
121                     c.setTown(name);
122                 }
123                 c.setCode(code);
124                 String provinceCode = city.getCode().substring(0, 2);
125                 if (!html.startsWith(provinceCode + "/")) {
126                     html = provinceCode + "/" + html;
127                 }
128                 c.setHtmlAddr(ADDRESS + html);
129                 list.add(c);
130                 System.out.println(c.toString());
131             }
132         }
133         return list;
134     }
135 }

 

posted @ 2022-12-05 11:58  啄木鸟伍迪  阅读(40)  评论(0)    收藏  举报
//火箭 GenerateContentList();