我们从阿里云的数字可视化平台获取数据 http://datav.aliyun.com/tools/atlas
爬取的链接如下:
湖北省(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420000.json
湖北地级市(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420100.json(以武汉市为例)
湖北区/县(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420111.json
注意:为什么选择不包含子区域的地址?因为不包含子区域时,每个地址只返回当前省/市/区县自身的经纬度,即一个地址只爬取一条记录。
我们可以先从数据库中获取省市区县的area_code编码,从而可以得到上面省市区的地址。
我们使用webmagic框架来爬取,这里直接上代码:
爬取类
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Json; import javax.swing.plaf.synth.ColorType; import java.math.BigDecimal; //WebMagic的结构分为Downloader(下载)、PageProcessor(解析处理)、Scheduler(管理URL并去重)、Pipeline(持久化)四大组件 @Component public class LngLatProcessor implements PageProcessor { public void process(Page page) { //打印页面内容 //打印页面内容 Json json = page.getJson(); String s = json.get(); JSONObject jsonObject = JSON.parseObject(s); JSONArray features = jsonObject.getJSONArray("features"); int size = features.size(); for (int i = 0; i < size; i++) { JSONObject jsonObject1 = features.getJSONObject(i); JSONObject properties = jsonObject1.getJSONObject("properties"); JSONArray center = properties.getJSONArray("center"); if(null == center){ continue; } String name = properties.getString("name"); Integer code = properties.getInteger("adcode"); BigDecimal longitude = (BigDecimal) center.get(0); BigDecimal latitude = (BigDecimal) center.get(1); page.putField("name", name); page.putField("code", code); page.putField("longitude", longitude.doubleValue()); page.putField("latitude", latitude.doubleValue()); } } public Site getSite() { return Site.me() .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.2.300") .setSleepTime(1000) .setTimeOut(10000) .setRetryTimes(3); } }
定制pipeline输出
import com.ljxx.pts.dao.AreasMapper; import com.ljxx.pts.entity.Areas; import com.ljxx.pts.service.AreasService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import javax.annotation.Resource; import java.util.Date; import java.util.Map; // 定制pipeline输出 @Component public class MyPipeline implements Pipeline { @Resource private AreasMapper areasMapper; @Autowired private AreasService areasService; @Override public void process(ResultItems resultItems, Task task) { Areas areaCode = new Areas(); areaCode.setUpdateTime(new Date()); for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { // 一次只能取一个属性 if ("code".equalsIgnoreCase(entry.getKey())) { Integer code = (Integer) entry.getValue(); System.out.println(code); areaCode.setId(code); } if ("latitude".equalsIgnoreCase(entry.getKey())) { Double latitude = (Double) entry.getValue(); System.out.println(latitude); areaCode.setLatitude(latitude); } if ("longitude".equalsIgnoreCase(entry.getKey())) { Double longitude = (Double) entry.getValue(); System.out.println(longitude); areaCode.setLongitude(longitude); } } areasService.updateLngLat(areaCode); } }
updateLngLat方法
/**
 * Updates the stored record for the given area by delegating to
 * {@code areasMapper.updateByPrimaryKeySelective}.
 *
 * <p>Runs in a transaction that rolls back on any {@link Exception}.
 * NOTE(review): "selective" presumably means only non-null properties are
 * written — confirm against the mapper implementation.
 *
 * @param areaCode the area record carrying the id and the new values
 */
@Transactional(rollbackFor = Exception.class)
public void updateLngLat(Areas areaCode) {
    areasMapper.updateByPrimaryKeySelective(areaCode);
}
测试类
import com.ljxx.pts.dao.AreasMapper; import com.ljxx.pts.entity.Areas; import com.ljxx.pts.webmagic.LngLatProcessor; import com.ljxx.pts.webmagic.MyPipeline; import org.junit.Test; import org.junit.runner.RunWith; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; import javax.annotation.Resource; import java.util.List; @SpringBootTest @RunWith(SpringJUnit4ClassRunner.class) public class TestDemo { @Autowired private LngLatProcessor processor; @Autowired private MyPipeline myPipeline; @Resource private AreasMapper areasMapper; @Test public void test(){ // 先获取省的code,省市区的级别分别为1,2,3 Areas areas1 = new Areas(); areas1.setAreaLevel(3); List<Areas> areas = areasMapper.select(areas1); for (Areas area : areas) { Integer id = area.getId(); Spider.create( processor) .addUrl("https://geo.datav.aliyun.com/areas_v3/bound/"+id.toString()+".json") .addPipeline(myPipeline) .addPipeline(new ConsolePipeline()) .thread(5) .run(); } } }
当设置areaLevel的值为1时,获取的是省的经纬度;值为2时,获取的是地级市的经纬度;值为3时,获取的是区县的经纬度。