我们从阿里云的数字可视化平台获取数据 http://datav.aliyun.com/tools/atlas

爬取的链接如下:

湖北省(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420000.json

湖北地级市(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420100.json(以武汉市为例)

湖北区/县(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420111.json

注意:为什么选择不包含子区域的地址?因为不包含子区域时,每个地址只返回当前省/市/区县自身的经纬度,即一个地址只爬取一条记录。

我们可以先从数据库中获取省市区县的area_code编码,从而可以得到上面省市区的地址。

我们使用webmagic框架来爬取,这里直接上代码:

爬取类

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;

import javax.swing.plaf.synth.ColorType;
import java.math.BigDecimal;

//WebMagic的结构分为Downloader(下载)、PageProcessor(解析处理)、Scheduler(管理URL并去重)、Pipeline(持久化)四大组件
@Component
public class LngLatProcessor implements PageProcessor {
    public void process(Page page) {
        //打印页面内容
        //打印页面内容
        Json json = page.getJson();
        String s = json.get();
        JSONObject jsonObject = JSON.parseObject(s);
        JSONArray features = jsonObject.getJSONArray("features");
        int size = features.size();
        for (int i = 0; i < size; i++) {
            JSONObject jsonObject1 = features.getJSONObject(i);
            JSONObject properties = jsonObject1.getJSONObject("properties");
            JSONArray center = properties.getJSONArray("center");
            if(null == center){
                continue;
            }
            String name = properties.getString("name");
            Integer code = properties.getInteger("adcode");
            BigDecimal longitude = (BigDecimal) center.get(0);
            BigDecimal latitude = (BigDecimal) center.get(1);
            page.putField("name", name);

            page.putField("code", code);
            page.putField("longitude", longitude.doubleValue());
            page.putField("latitude", latitude.doubleValue());
        }

    }
    public Site getSite() {
        return Site.me()
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.2.300")
                .setSleepTime(1000)
                .setTimeOut(10000)
                .setRetryTimes(3);
    }
}

定制pipeline输出

import com.ljxx.pts.dao.AreasMapper;
import com.ljxx.pts.entity.Areas;
import com.ljxx.pts.service.AreasService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import javax.annotation.Resource;
import java.util.Date;
import java.util.Map;

/**
 * Custom pipeline that writes the crawled centre coordinate of an area back to the database.
 *
 * <p>Reads the {@code code}, {@code latitude} and {@code longitude} result fields and hands
 * them to {@link AreasService#updateLngLat}.
 */
@Component
public class MyPipeline implements Pipeline {
    @Autowired
    private AreasService areasService;

    /**
     * Persists one crawl result.
     *
     * <p>Results that lack the area code or either coordinate are ignored: without a code
     * there is no row to address, and without coordinates there is nothing to store.
     *
     * @param resultItems the fields extracted by the page processor
     * @param task        the spider task that produced the result (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Keys are looked up exactly as written here (all lower case).
        Integer code = resultItems.get("code");
        Double latitude = resultItems.get("latitude");
        Double longitude = resultItems.get("longitude");
        if (code == null || latitude == null || longitude == null) {
            return;
        }

        Areas areaCode = new Areas();
        areaCode.setId(code);
        areaCode.setLatitude(latitude);
        areaCode.setLongitude(longitude);
        areaCode.setUpdateTime(new Date());
        areasService.updateLngLat(areaCode);
    }
}

updateLngLat方法

/**
 * Stores the given area's values by delegating to the mapper's primary-key update.
 *
 * <p>Runs in a transaction that is rolled back on any {@link Exception}.
 * NOTE(review): the mapper method name suggests only non-null properties are written;
 * confirm against the mapper implementation.
 *
 * @param areaCode the area whose id identifies the row and whose fields carry the new values
 */
@Transactional(rollbackFor = Exception.class)
public void updateLngLat(final Areas areaCode) {
    this.areasMapper.updateByPrimaryKeySelective(areaCode);
}

测试类

import com.ljxx.pts.dao.AreasMapper;
import com.ljxx.pts.entity.Areas;
import com.ljxx.pts.webmagic.LngLatProcessor;
import com.ljxx.pts.webmagic.MyPipeline;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

import javax.annotation.Resource;
import java.util.List;

/**
 * Integration test that crawls the centre coordinates for every area of one level and stores
 * them through {@link MyPipeline}.
 */
@SpringBootTest
@RunWith(SpringJUnit4ClassRunner.class)
public class TestDemo {

    /** Address prefix of the boundary documents; the area code and ".json" are appended. */
    private static final String BOUND_URL_PREFIX = "https://geo.datav.aliyun.com/areas_v3/bound/";

    /** Area level to crawl: 1 = province, 2 = prefecture-level city, 3 = district/county. */
    private static final int AREA_LEVEL = 3;

    @Autowired
    private LngLatProcessor processor;
    @Autowired
    private MyPipeline myPipeline;
    @Resource
    private AreasMapper areasMapper;

    /**
     * Loads all areas of {@link #AREA_LEVEL}, queues one boundary URL per area on a single
     * spider and runs it until every URL has been processed.
     */
    @Test
    public void test(){
        Areas query = new Areas();
        query.setAreaLevel(AREA_LEVEL);
        List<Areas> areas = areasMapper.select(query);
        if (areas == null || areas.isEmpty()) {
            return;
        }

        // One spider for all URLs: the worker threads are shared across the whole crawl
        // instead of starting a fresh five-thread spider for every single address.
        Spider spider = Spider.create(processor)
                .addPipeline(myPipeline)
                .addPipeline(new ConsolePipeline())
                .thread(5);

        int queued = 0;
        for (Areas area : areas) {
            Integer id = area.getId();
            if (id == null) {
                continue;
            }
            spider.addUrl(BOUND_URL_PREFIX + id + ".json");
            queued++;
        }

        if (queued > 0) {
            spider.run();
        }
    }

}

当设置areaLevel的值为1时,获取的是省的经纬度;值为2时,获取的是地级市的经纬度;值为3时,获取的是区县的经纬度。

 

posted on 2024-05-25 14:38  周文豪  阅读(61)  评论(0)  编辑  收藏  举报