WebMagic 学习之路-1:采集安居客列表页测试
---恢复内容开始---
package com.action; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.management.JMException; import org.bson.Document; import com.model.Model_AnjukeList; import com.mongodb.BasicDBObject; import com.util.Constants; import com.util.GetDate; import com.util.MD5With32; import com.util.MongoDBUtil; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; public class GetAnjukeListNum implements PageProcessor { public static Model_AnjukeList anjukeList; public static List<String> list = new ArrayList<String>(); public static List<BasicDBObject> list_insert = new ArrayList<BasicDBObject>(); private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8") .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"); @Override public Site getSite() { // TODO Auto-generated method stub return this.site; } @Override public void process(Page page) { String found = null; BasicDBObject doc = null; page.addTargetRequests(Constants.list_urls); System.out.println("code:"+page.getStatusCode()); String pg = page.getHtml().toString(); if(pg.length()>100){ Pattern p = Pattern.compile(Constants.anjuke_Reg_Found); Matcher m = p.matcher(pg); while(m.find()){ found = m.group(0).replace("\"found\":", "").replace(",", ""); String id = MD5With32.encryption(page.getUrl().toString()); if(!Constants.map_urls.containsKey(id)){ continue; } Model_AnjukeList model_AnjukeList = Constants.map_urls.get(id);
//mongo存储! doc = new BasicDBObject("_id",id) .append("city", model_AnjukeList.getCity()) .append("towards", model_AnjukeList.getTowards()) .append("zone_urls", model_AnjukeList.getZone_urls()) .append("zone", model_AnjukeList.getZone()) .append("site", model_AnjukeList.getSite()) .append("decoration", model_AnjukeList.getDecoration()) .append("flag", model_AnjukeList.getFlag()) .append("street", model_AnjukeList.getStreet()) .append("type", model_AnjukeList.getType()) .append("page", model_AnjukeList.getPage()) .append("urls", model_AnjukeList.getUrls()) .append("found", found) .append("update_time", model_AnjukeList.getUpdate_time()) ; list_insert.add(doc); } } } public static void main(String[] args) { String city = "北京"; String urls = "https://beijing.anjuke.com/sale/"; MongoGetUrls.GetMongoUrls(city); System.out.println("任务总数:"+Constants.list_urls.size()); Spider.create(new GetAnjukeListNum()) .addUrl(urls) .addPipeline(new ConsolePipeline()) .thread(30) .run(); MongoDBUtil.saveMany(..., list_insert); } }
第一次使用 WebMagic,很多东西还不太懂,也没有做任何重写。
很多功能都是用纯 Java 实现的。
让我们慢慢发现webmagic的强大吧!
转载注明出处
如果本文对你有帮助,请帮忙啦~~
打开支付宝首页搜“522901509”领红包,领到大红包的小伙伴赶紧使用哦!