webmagic学习之路-3:采集安居客经纪人详情页
这里希望安居客的同行轻喷!!单纯地做测试,玩玩。
就这么糟践你们的服务器了!!!sorry!
这次学会了在 webmagic 中设置可被处理的 HTTP 返回状态码。之前返回 404 的页面根本不会进入 process 方法,纳闷了很久,也百度了半天。
看了好半天源码,才知道原来可以通过 Site.setAcceptStatCode 方法设置允许进入 process 的状态码。促使我去看源码的原因是:用 logger 打印的内容显示,webmagic 其实已经获取到了 404 响应,只是没有交给 process 处理而已。
也同时学会了 scheduler
package com.action;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.management.JMException;
import javax.swing.plaf.synth.SynthSpinnerUI;
import org.apache.commons.collections.bag.SynchronizedSortedBag;
import org.apache.log4j.Logger;
import org.bson.Document;
import com.model.AgentListByNumModel;
import com.model.AgentListModel;
import com.model.Model_AnjukeList;
import com.mongodb.BasicDBObject;
import com.util.Constants;
import com.util.GetDate;
import com.util.MysqlUtils;
import com.util.MD5With32;
import com.util.MongoDBUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.PriorityScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic page processor that crawls Anjuke agent detail pages by numeric id
 * and stores one {@link AgentListByNumModel} row per service area of the agent.
 *
 * <p>The site is configured to accept both 200 and 404 responses so that
 * {@link #process(Page)} is also invoked for agent ids that do not exist; those
 * ids are stored as "empty" rows.
 *
 * <p>{@link #main(String[])} runs the spider with several worker threads that
 * all call {@link #process(Page)} on the same instance, so that method keeps
 * its working state in local variables only.
 */
public class GetAnjukeAgentByNum implements PageProcessor {

    static Logger logger = Logger.getLogger(GetAnjukeAgentByNum.class);

    // NOTE(review): the package-visible static fields below are retained only
    // because other classes in this package may reference them. process() no
    // longer reads or writes anjukeList / list_insert, because sharing them
    // between the spider's worker threads was not thread-safe.
    static AgentListByNumModel anjukeList;
    static List<String> list = new ArrayList<String>();
    static List<AgentListByNumModel> list_insert = new ArrayList<AgentListByNumModel>();
    static BasicDBObject doc = null;

    /** Number of anti-crawler verification pages seen so far; update via {@link #recordBlocked()}. */
    static int num = 0;

    /** Exclusive upper bound of the agent ids enqueued by {@link #main(String[])}. */
    private static final int MAX_AGENT_ID = 100000;

    /** Number of spider worker threads started by {@link #main(String[])}. */
    private static final int THREAD_COUNT = 25;

    private static final String AGENT_URL_PREFIX = "https://junranfangchan.anjuke.com/gongsi-jjr-";

    private static final String CRUMB_CITY_XPATH = "//div[@class='p_1180 p_crumbs']/a[2]/text()";
    private static final String CRUMB_NAME_XPATH = "//div[@class='p_1180 p_crumbs']/a[4]/text()";

    private static final String[] NO_ZONE_STREET = { "", "" };

    private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8").setUserAgent(
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
            .setAcceptStatCode(acceptedStatusCodes());

    /** Builds the set of HTTP status codes whose pages are handed to {@link #process(Page)}. */
    private static Set<Integer> acceptedStatusCodes() {
        Set<Integer> codes = new HashSet<Integer>();
        codes.add(200);
        // 404 must be accepted explicitly, otherwise process() is never called for it.
        codes.add(404);
        return codes;
    }

    /**
     * Returns the crawl configuration (sleep time, retries, charset, user agent
     * and the accepted status codes 200 and 404).
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Handles one downloaded agent page.
     *
     * <ul>
     *   <li>404 responses, pages without a city breadcrumb and pages containing
     *       "经纪人店铺暂时关闭" are stored as an empty row that carries only the URL.</li>
     *   <li>Anti-crawler verification pages ("访问验证-安居客") are counted and
     *       not stored.</li>
     *   <li>Any other page is parsed and stored, one row per service area.</li>
     * </ul>
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = String.valueOf(page.getUrl());

        if (page.getStatusCode() == 404) {
            saveEmptyRecord(pageUrl);
            return;
        }

        String html = String.valueOf(page.getHtml());

        // Must be checked before any parsing: a verification page has none of
        // the expected elements and must not be stored as a missing agent.
        // NOTE(review): leaving the URL unstored presumably lets a later run
        // retry it - confirm against how Constants.map_id is populated.
        if (html.contains("访问验证-安居客")) {
            System.out.println("被封次数 : " + recordBlocked());
            return;
        }

        String crumbCity = page.getHtml().xpath(CRUMB_CITY_XPATH).get();
        String city = crumbCity == null ? "" : crumbCity.replace("经纪人", "");
        if (city.equals("") || html.contains("经纪人店铺暂时关闭")) {
            saveEmptyRecord(pageUrl);
            return;
        }

        String crumbName = page.getHtml().xpath(CRUMB_NAME_XPATH).get();
        String name = crumbName == null ? "" : crumbName.replace("的店铺", "");
        String company = page.getHtml().xpath("//div[@class='section service']/dl/dd/p[1]/a/text()").get();
        String companyUrl = page.getHtml().xpath("//div[@class='section service']/dl/dd/p[1]/a/@href").get();
        String store = page.getHtml().xpath("//div[@class='section service']/dl/dd/p[2]/a/text()").get();
        String storeUrl = page.getHtml().xpath("//div[@class='section service']/dl/dd/p[2]/a/@href").get();

        // List.toString() yields "[a, b]"; strip the brackets to get "a, b".
        String comms = page.getHtml().xpath("//dl[@class='item last']/dd/a/text()").all() + "";
        comms = comms.replace("[", "").replace("]", "");

        String contact = extractPhone(page.getHtml().xpath("//head/meta[3]/@content").get());

        // One (zone, street) pair per service area; if the page lists no
        // service areas, fall back to the address title of the listing.
        List<String[]> zoneStreets = new ArrayList<String[]>();
        List<String> areas = page.getHtml().xpath("//div[@class='section service']/dl[3]/dd/a/text()").all();
        if (areas == null || areas.isEmpty()) {
            String addressTitle = page.getHtml()
                    .xpath("//div[@class='details-item']/span[@class='comm-address']/@title").get();
            zoneStreets.add(parseZoneStreetFromAddress(addressTitle));
        } else {
            for (String area : areas) {
                zoneStreets.add(splitZoneStreet(area));
            }
        }

        List<AgentListByNumModel> records = new ArrayList<AgentListByNumModel>();
        for (String[] zoneStreet : zoneStreets) {
            records.add(new AgentListByNumModel("", city, zoneStreet[0], zoneStreet[1], "anjuke",
                    GetDate.getDay0(), pageUrl, name, pageUrl, company, companyUrl, store, storeUrl,
                    contact, comms));
        }
        if (!records.isEmpty()) {
            MysqlUtils.InsertAnjukeAgentByNum(records);
        }
    }

    /** Stores a row that carries only the source name, the date and the page URL. */
    private static void saveEmptyRecord(String pageUrl) {
        List<AgentListByNumModel> records = new ArrayList<AgentListByNumModel>();
        records.add(new AgentListByNumModel("", "", "", "", "anjuke", GetDate.getDay0(), pageUrl, "",
                pageUrl, "", "", "", "", "", ""));
        MysqlUtils.InsertAnjukeAgentByNum(records);
    }

    /** Increments the shared blocked-page counter under a lock and returns the new value. */
    private static synchronized int recordBlocked() {
        num = num + 1;
        return num;
    }

    /** Returns the first match of {@code Constants.reg_phone} in {@code text}, or "" if none. */
    private static String extractPhone(String text) {
        if (text == null) {
            return "";
        }
        Matcher matcher = Pattern.compile(Constants.reg_phone).matcher(text);
        return matcher.find() ? matcher.group(0) : "";
    }

    /**
     * Splits a "zone-street" string into {zone, street}.
     *
     * @return a two-element array; both elements are "" when {@code text} is
     *         null or has no '-', and street is "" when nothing follows the '-'
     */
    private static String[] splitZoneStreet(String text) {
        if (text == null || !text.contains("-")) {
            return NO_ZONE_STREET;
        }
        String[] parts = text.split("-");
        String zone = parts.length > 0 ? parts[0] : "";
        String street = parts.length > 1 ? parts[1] : "";
        return new String[] { zone, street };
    }

    /**
     * Extracts {zone, street} from an address title of the form
     * "...[zone-street rest...]...": the text between '[' and the first space
     * inside the brackets is split on '-'.
     *
     * @return a two-element array; both elements are "" when the title is null
     *         or does not have that form
     */
    private static String[] parseZoneStreetFromAddress(String title) {
        if (title == null) {
            return NO_ZONE_STREET;
        }
        int open = title.indexOf('[');
        int close = title.indexOf(']');
        if (open < 0 || close <= open) {
            return NO_ZONE_STREET;
        }
        String inside = title.substring(open + 1, close);
        int space = inside.indexOf(' ');
        if (space < 0) {
            return NO_ZONE_STREET;
        }
        return splitZoneStreet(inside.substring(0, space));
    }

    /**
     * Enqueues the detail-page URL of every agent id in [0, MAX_AGENT_ID) that
     * is not already a key of {@code Constants.map_id}, then runs the spider.
     *
     * <p>To debug a single page, replace the loop with a single
     * {@code spider.addUrl(AGENT_URL_PREFIX + id + "/")} call and use one thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): presumably this call fills Constants.map_id - confirm in MysqlUtils.
        MysqlUtils.SelectSpiderID();

        PriorityScheduler scheduler = new PriorityScheduler();
        Spider spider = Spider.create(new GetAnjukeAgentByNum()).setScheduler(scheduler)
                .addPipeline(new ConsolePipeline());

        for (int n = 0; n < MAX_AGENT_ID; n++) {
            if (Constants.map_id.containsKey(n + "")) {
                System.out.println("contain : " + n);
                continue;
            }
            String url = AGENT_URL_PREFIX + n + "/";
            scheduler.push(new Request(url), spider);
        }

        System.out.println("total task num :" + scheduler.getTotalRequestsCount(spider));
        spider.thread(THREAD_COUNT).run();
    }
}
转载注明出处
如果本文对你有帮助,请帮忙啦~~
打开支付宝首页搜“522901509”领红包,领到大红包的小伙伴赶紧使用哦!