给老子爬爬爬!2019国家统计局最新城乡划分代码

爬一下最新的行政区划

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html

引入依赖(下面的代码还用到了 fastjson 以及 Lombok 的 @Slf4j,因此除了这里列出的依赖之外,还需要自行引入 fastjson 和 slf4j-api)

<!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->
<dependency>
    <groupId>com.belerweb</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>4.4.1</version>
</dependency>

代码

package com.demo.tools;

import com.alibaba.fastjson.JSON;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import okhttp3.ConnectionPool;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

/**
 * Crawls the 2019 urban/rural division codes published by the National Bureau of Statistics
 * of China, three levels deep (province, city, county), and stores the resulting tree as JSON.
 *
 * Created by 小LUA on 2020-03-30 11:39.
 */
@Slf4j
public class GetProvince {

    /** Root of the 2019 code pages; every relative link found on a page is resolved against it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";

    /** Link texts made of digits only are the code column of a row, not a place name. */
    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]+");

    /**
     * City pages whose rows use the CSS class "towntr" instead of "countytr"
     * (presumably cities without a county level - verify against the site).
     */
    private static final Set<String> TOWN_ROW_PAGES = new HashSet<>(Arrays.asList(
            BASE_URL + "44/4419.html",
            BASE_URL + "44/4420.html",
            BASE_URL + "46/4604.html"));

    /** How many times a page is requested before the crawl gives up on it. */
    private static final int MAX_ATTEMPTS = 10;

    /** Base pause between two attempts for the same page; multiplied by the attempt number. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    private static final OkHttpClient client = new OkHttpClient.Builder()
            .connectTimeout(5, TimeUnit.MINUTES)
            .writeTimeout(5, TimeUnit.MINUTES)
            .readTimeout(5, TimeUnit.MINUTES)
            .connectionPool(new ConnectionPool(0, 30, TimeUnit.MINUTES))
            .build();

    /**
     * Returns the upper-case first pinyin letter of the first character of a name.
     *
     * @param chinese the name to inspect; may be {@code null} or empty
     * @return a one-letter string, or {@code ""} when no pinyin can be determined
     *         (null/empty input, or a first character pinyin4j has no reading for)
     */
    private static String getFirstSpell(String chinese) {
        if (chinese == null || chinese.isEmpty()) {
            return "";
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.UPPERCASE);
        try {
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(chinese.charAt(0), format);
            // No reading available (e.g. a non-Chinese or garbled character): report "unknown".
            if (readings == null || readings.length == 0
                    || readings[0] == null || readings[0].isEmpty()) {
                return "";
            }
            return String.valueOf(readings[0].charAt(0));
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            System.err.println("Cannot derive pinyin for \"" + chinese + "\": " + e);
            return "";
        }
    }

    /**
     * Downloads a page and returns its content as text.
     *
     * @param url absolute URL of the page
     * @return the response body decoded as GBK
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String readUrl(String url) throws IOException {
        System.out.println("读取URL:" + url);
        Request request = new Request.Builder()
                .url(url)
                .build();
        // try-with-resources releases the connection even when reading the body fails.
        try (Response response = client.newCall(request).execute()) {
            // Decoding as gb2312 garbles place names that contain rare characters such as
            // 埇桥区 or 鄠邑区; GBK is a superset of GB2312 that covers them.
            return new String(response.body().bytes(), "GBK");
        }
    }

    /**
     * Crawls all provinces with their cities and counties, prints the tree as JSON and
     * writes it to the output file.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or parsed, or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        List<Location> all = new ArrayList<>();
        Elements provinceRows = fetchRows(BASE_URL + "index.html", "provincetr", "provinceList");
        for (Element row : provinceRows) {
            // Each province row holds several links, one per province.
            for (Element link : row.select("a")) {
                all.add(crawlProvince(link));
            }
        }
        String jsonString = JSON.toJSONString(all);
        System.out.println(jsonString);
        write(jsonString);
    }

    /**
     * Writes the given text to the file "2019省市区-大陆.json" in the working directory.
     *
     * @param str the text to store
     * @throws IOException if the file cannot be created or written
     */
    public static void write(String str) throws IOException {
        // Explicit UTF-8 keeps the JSON file identical on every platform.
        try (FileOutputStream out = new FileOutputStream("2019省市区-大陆.json")) {
            out.write(str.getBytes("UTF-8"));
        }
    }

    /** Builds a province (level 1) from its index-page link and crawls the cities below it. */
    private static Location crawlProvince(Element link) throws IOException, InterruptedException {
        String name = link.text();
        String href = link.attr("href");
        String code = codeFromHref(href);
        String cityUrl = BASE_URL + href;
        System.out.println(name + "," + code + "," + cityUrl);
        // Province pages are named by a 2-digit prefix; pad it to the 6-digit code.
        Long provinceCode = Long.valueOf(code + "0000");

        Location province = newLocation(provinceCode, name, 1, null);
        List<Location> cities = new ArrayList<>();
        province.setChilds(cities);

        for (Element row : fetchRows(cityUrl, "citytr", "cityList")) {
            for (Element cityLink : row.select("a")) {
                // Every row links both its numeric code and its name; keep the name link only.
                if (DIGITS_ONLY.matcher(cityLink.text()).matches()) {
                    continue;
                }
                cities.add(crawlCity(cityLink, provinceCode));
            }
        }
        return province;
    }

    /** Builds a city (level 2) from its province-page link and crawls the counties below it. */
    private static Location crawlCity(Element link, Long provinceCode)
            throws IOException, InterruptedException {
        String name = link.text();
        String href = link.attr("href");
        String code = codeFromHref(href);
        String countyUrl = BASE_URL + href;
        System.out.println(name + "," + code + "," + countyUrl);
        // City pages are named by a 4-digit prefix; pad it to the 6-digit code.
        Long cityCode = Long.valueOf(code + "00");

        Location city = newLocation(cityCode, name, 2, provinceCode);
        List<Location> counties = new ArrayList<>();
        city.setChilds(counties);

        String rowClass = TOWN_ROW_PAGES.contains(countyUrl) ? "towntr" : "countytr";
        for (Element row : fetchRows(countyUrl, rowClass, "countyList")) {
            for (Element countyLink : row.select("a")) {
                String countyName = countyLink.text();
                // Every row links both its numeric code and its name; keep the name link only.
                if (DIGITS_ONLY.matcher(countyName).matches()) {
                    continue;
                }
                String countyCode = codeFromHref(countyLink.attr("href"));
                System.out.println(countyName + "," + countyCode);
                counties.add(newLocation(Long.valueOf(countyCode), countyName, 3, cityCode));
            }
        }
        return city;
    }

    /**
     * Fetches a page and returns the elements carrying the given CSS class, retrying a
     * bounded number of times (with a growing pause) when the request fails or the page
     * contains no such element.
     *
     * @param url      absolute URL of the page
     * @param rowClass CSS class of the wanted rows
     * @param label    name printed in front of the "是否为空" progress message
     * @return the matching elements, never empty
     * @throws IOException          if no rows were obtained after {@code MAX_ATTEMPTS} attempts
     * @throws InterruptedException if the thread is interrupted while pausing between attempts
     */
    private static Elements fetchRows(String url, String rowClass, String label)
            throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                Elements rows = Jsoup.parse(readUrl(url)).getElementsByClass(rowClass);
                System.out.println(label + "是否为空:" + rows.isEmpty());
                if (!rows.isEmpty()) {
                    return rows;
                }
            } catch (IOException e) {
                lastFailure = e;
                System.err.println("Request failed (attempt " + attempt + "): " + url + ", " + e);
            }
            if (attempt < MAX_ATTEMPTS) {
                // Back off instead of hammering the server with immediate retries.
                Thread.sleep(RETRY_DELAY_MILLIS * attempt);
            }
        }
        throw new IOException("No \"" + rowClass + "\" rows found at " + url
                + " after " + MAX_ATTEMPTS + " attempts", lastFailure);
    }

    /**
     * Extracts the code from a relative link: the text between the first "/" (or the start
     * of the link when there is none) and the first ".".
     */
    private static String codeFromHref(String href) throws IOException {
        int start = href.indexOf("/") + 1;
        int dot = href.indexOf(".");
        if (dot <= start) {
            throw new IOException("Cannot extract a division code from href \"" + href + "\"");
        }
        return href.substring(start, dot);
    }

    /** Creates a location with its sort letter derived from the name; children are left unset. */
    private static Location newLocation(Long code, String name, int level, Long parentCode) {
        Location location = new Location();
        location.setCode(code);
        location.setName(name);
        location.setLevel(level);
        location.setLetterSort(getFirstSpell(name));
        location.setParentCode(parentCode);
        return location;
    }

}

/**
 * One node of the crawled division tree (a province, a city or a county).
 * Getters and setters are generated by Lombok.
 */
@Getter
@Setter
class Location{
    /** Numeric division code of this node. */
    private Long code;
    /** Display name as shown on the source page. */
    private String name;
    /** Depth in the tree: the crawler sets 1 for provinces, 2 for cities, 3 for counties. */
    private Integer level;
    /** Upper-case first pinyin letter of the name, or "" when it could not be determined. */
    private String letterSort;
    /** Code of the parent node; the crawler leaves it unset for provinces. */
    private Long parentCode;
    /** Child nodes (cities of a province, counties of a city); the crawler leaves it unset for counties. */
    private List<Location> childs;
}

爬完数据后,我只是把它存到了 json 文件里。如果你需要存到数据库,只需要对 all 进行处理即可;或者先读文件再处理:

/**
 * Loads the division tree from the file "2019省市区-大陆.json" in the working directory
 * and visits every top-level entry.
 *
 * @throws IOException if the file cannot be opened or read
 */
private static void read() throws IOException {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader and the underlying file stream even on failure;
    // the file is decoded explicitly as UTF-8 instead of the platform default charset.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream("2019省市区-大陆.json"), "UTF-8"))) {
        String t;
        while ((t = reader.readLine()) != null) {
            sb.append(t);
        }
    }
    List<Location> cities = JSONArray.parseArray(sb.toString(), Location.class);
    cities.forEach(e -> {
        // TODO
    });
}

另附:全部数据(很小一部分名称含有 GB2312 字符集之外的生僻字,按 gb2312 解码后就会变成乱码,导致首字母识别不出来,需要手动改正;可以搜索【"letterSort": ""】定位。把解码字符集换成 GBK 即可避免这个问题)

 

一共有6个,不算多。

数据文件:https://github.com/Mysakura/DataFiles

============================================

算了,我帮你们找出来了

{
    "code": 341302,
    "letterSort": "Y",
    "level": 3,
    "name": "埇桥区",
    "parentCode": 341300
}

{
    "code": 410304,
    "letterSort": "C",
    "level": 3,
    "name": "瀍河回族区",
    "parentCode": 410300
}

{
    "code": 411502,
    "letterSort": "S",
    "level": 3,
    "name": "浉河区",
    "parentCode": 411500
}

{
    "code": 420104,
    "letterSort": "Q",
    "level": 3,
    "name": "硚口区",
    "parentCode": 420100
}

{
    "code": 420505,
    "letterSort": "X",
    "level": 3,
    "name": "猇亭区",
    "parentCode": 420500
}

{
    "code": 610118,
    "letterSort": "H",
    "level": 3,
    "name": "鄠邑区",
    "parentCode": 610100
}

 

posted @ 2020-03-31 10:58  露娜妹  阅读(1811)  评论(1编辑  收藏  举报