获取省市区镇爬虫
需要做分批导入数据库! 需要做分批导入数据库! 需要做分批导入数据库!
1 package com.mock.utils; 2 3 import java.io.IOException; 4 import java.net.MalformedURLException; 5 import java.util.ArrayList; 6 import java.util.List; 7 8 import org.jsoup.Jsoup; 9 import org.jsoup.nodes.Document; 10 import org.jsoup.nodes.Element; 11 import org.jsoup.select.Elements; 12 13 import com.gargoylesoftware.htmlunit.BrowserVersion; 14 import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 15 import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; 16 import com.gargoylesoftware.htmlunit.WebClient; 17 import com.gargoylesoftware.htmlunit.WebClientOptions; 18 import com.gargoylesoftware.htmlunit.html.HtmlPage; 19 import com.justsy.army.mgt.mock.model.City; 20 21 public class NationalBureauOfStatics { 22 private static final String ADDRESS = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"; 23 private static final String fix = ".html"; 24 25 public static void main(String[] args) { 26 List<City> provinceList = new ArrayList<>(); 27 List<City> cityList = new ArrayList<>(); 28 List<City> countyList = new ArrayList<>(); 29 List<City> townList = new ArrayList<>(); 30 provinceList = getTVMall(provinceList, new City(), ADDRESS, 0); 31 for (City city : provinceList) { 32 cityList = getTVMall(cityList, city, city.getHtmlAddr(), 1); 33 } 34 for (City city : cityList) { 35 countyList = getTVMall(countyList, city, city.getHtmlAddr(), 2); 36 } 37 for (City city : countyList) { 38 townList = getTVMall(townList, city, city.getHtmlAddr(), 3); 39 40 for (City item: townList) { 41 System.out.println(item.toString()); 42 } 43 townList .clear(); 44 } 45 } 46 47 public static List<City> getTVMall(List<City> list, City city, String address, int type) { 48 WebClient webClient = new WebClient(BrowserVersion.CHROME); 49 // webclient参数载体 50 WebClientOptions clientOptions = webClient.getOptions(); 51 // 设置webClient的相关参数 52 clientOptions.setJavaScriptEnabled(true); 53 clientOptions.setCssEnabled(false); 54 webClient.setAjaxController(new NicelyResynchronizingAjaxController()); 55 clientOptions.setTimeout(35000); 56 clientOptions.setThrowExceptionOnScriptError(false); 57 try { 58 HtmlPage htmlPage = webClient.getPage(address); 59 Document dom = Jsoup.parse(htmlPage.asXml()); 60 Elements ele = null; 61 if (type == 0) { 62 ele = dom.getElementsByClass("provincetable"); 63 } else if (type == 1) { 64 ele = dom.getElementsByClass("citytable"); 65 } else if (type == 2) { 66 ele = dom.getElementsByClass("countytable"); 67 } else if (type == 3) { 68 ele = dom.getElementsByClass("towntable"); 69 } 70 dom = Jsoup.parse(ele.toString()); 71 ele = dom.getElementsByTag("tr"); 72 if (ele != null) { 73 getList(list, ele, city, type); 74 } 75 } catch (FailingHttpStatusCodeException e) { 76 e.printStackTrace(); 77 } catch (MalformedURLException e) { 78 e.printStackTrace(); 79 } catch (IOException e) { 80 e.printStackTrace(); 81 } 82 return list; 83 } 84 85 private static List<City> getList(List<City> list, Elements ele, City city, int type) { 86 if (type == 0) { 87 for (int i = 3; i < ele.size(); i++) { 88 Element item = ele.get(i); 89 Elements aElements = item.getElementsByTag("a"); 90 for (int j = 0; j < aElements.size(); j++) { 91 City c = new City(); 92 String html = aElements.get(j).attr("href"); 93 String name = aElements.get(j).text(); 94 c.setProvince(name); 95 c.setHtmlAddr(ADDRESS + html); 96 c.setCode(html.replace(fix, "0000000000")); 97 list.add(c); 98 } 99 } 100 return list; 101 } 102 for (int i = 0; i < ele.size(); i++) { 103 Element item = ele.get(i); 104 Elements aElements = item.getElementsByTag("a"); 105 if (aElements.size() > 0) { 106 City c = new City(); 107 String html = aElements.get(0).attr("href"); 108 String code = aElements.get(0).text(); 109 String name = aElements.get(1).text(); 110 if (type == 1) { 111 c.setProvince(city.getProvince()); 112 c.setCity(name); 113 } else if (type == 2) { 114 c.setProvince(city.getProvince()); 115 c.setCity(city.getCity()); 116 c.setCounty(name); 117 } else if (type == 3) { 118 c.setProvince(city.getProvince()); 119 c.setCity(city.getCity()); 120 c.setCounty(city.getCounty()); 121 c.setTown(name); 122 } 123 c.setCode(code); 124 String provinceCode = city.getCode().substring(0, 2); 125 if (!html.startsWith(provinceCode + "/")) { 126 html = provinceCode + "/" + html; 127 } 128 c.setHtmlAddr(ADDRESS + html); 129 list.add(c); 130 System.out.println(c.toString()); 131 } 132 } 133 return list; 134 } 135 }
博客园地址:https://www.cnblogs.com/lixiuming521125/