Java使用Jsoup获取全国省市区(县)街道村等信息

1、在pom.xml文件中配置jsoup

 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.1</version>
        </dependency>

 

 

 

2、配置数据来源地址(写这篇文章时最新的是2020年版,建议先到下面的列表页查看最新年份,再按最新年份的地址配置):

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

 

 





http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/

 

 







3、写入txt文件的数据格式,可以按照自己想要的格式在parseNextLevel方法和printInfo方法中修改



import
java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * 全国省市县镇村数据爬取 * @author Saffichan * @date 2021-07-22 10:19:39 * @version 1.0.0 */ public class JsoupTest { private static Map<Integer, String> cssMap = new HashMap<Integer, String>(); private static BufferedWriter bufferedWriter = null; static { cssMap.put(1, "provincetr");// cssMap.put(2, "citytr");// cssMap.put(3, "countytr");// 县/区 cssMap.put(4, "towntr");//cssMap.put(5, "villagetr");// } public static void main(String[] args) throws IOException { int level = 1; initFile(); // 获取全国各个省级信息 Document connect = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/"); Elements rowProvince = connect.select("tr." + cssMap.get(level)); for (Element provinceElement : rowProvince)// 遍历每一行的省份城市 { Elements select = provinceElement.select("a"); for (Element province : select)// 每一个省份 { printInfoProvice(province, level); parseNextLevel(province, level + 1); } } closeStream(); } private static void initFile() { try { bufferedWriter = new BufferedWriter(new FileWriter(new File("d:\\ShowAdress.txt"), true)); } catch (IOException e) { e.printStackTrace(); } } private static void closeStream() { if (bufferedWriter != null) { try { bufferedWriter.close(); } catch (IOException e) { e.printStackTrace(); } bufferedWriter = null; } } private static void parseNextLevel(Element parentElement, int level) throws IOException { try { Thread.sleep(500);//睡眠一下,否则可能出现各种错误状态码 } catch (InterruptedException e) { e.printStackTrace(); } Document doc = connect(parentElement.attr("abs:href")); if (doc != null) { Elements newsHeadlines = doc.select("tr." 
+ cssMap.get(level));// // 获取表格的一行数据 for (Element element : newsHeadlines) { printInfo(parentElement,element, level + 1); Elements select = element.select("a");// 在递归调用的时候,这里是判断是否是村一级的数据,村一级的数据没有a标签 if (select.size() != 0) { parseNextLevel(select.last(), level + 1); } } } } /** * 写一行数据到数据文件中去 * @param element 爬取到的数据元素 * * @param level 城市级别 */ private static void printInfo(Element parentElement,Element element, int level) { try { String reg = "[^\u4e00-\u9fa5]"; if(level==3){ bufferedWriter.write(parentElement.toString().replaceAll(reg, "")+"-["+(level-1)+"]"+element.select("td").last().text()+","); }else if(level==4){ bufferedWriter.write(parentElement.toString().replaceAll(reg, "")+"-["+(level-1)+"]"+element.select("td").last().text()+","); }else if(level==5){ bufferedWriter.write(parentElement.toString().replaceAll(reg, "")+"-["+(level-1)+"]"+element.select("td").last().text()+","); } bufferedWriter.newLine(); bufferedWriter.flush(); } catch (IOException e) { e.printStackTrace(); } } private static void printInfoProvice(Element element, int level) { try { String reg = "[^\u4e00-\u9fa5]"; bufferedWriter.write("province"+element.toString().replaceAll(reg, "")+","); bufferedWriter.newLine(); bufferedWriter.flush(); } catch (IOException e) { e.printStackTrace(); } } private static Document connect(String url) { if (url == null || url.isEmpty()) { throw new IllegalArgumentException("The input url('" + url + "') is invalid!"); } try { return Jsoup.connect(url).timeout(100 * 1000).get(); } catch (IOException e) { e.printStackTrace(); return null; } } }

 

输出的数据格式是按照我自己的数据表字段定的,获取完之后还要再做处理才存进数据表。获取到的数据中会有部分地名乱码,处理方式和数据表会在另一篇博客中提供。

 

posted @ 2021-07-23 15:58  shuzu渊  阅读(3286)  评论(0)  编辑  收藏  举报