webmagic学习之路-2:采集安居客经纪人列表
相比第 1 篇稍微成熟了一点,会用的东西也多了。正则用得不好,还有很多东西不会,大神轻喷!
package com.action; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.management.JMException; import javax.swing.plaf.synth.SynthSpinnerUI; import org.bson.Document; import com.model.AgentListModel; import com.model.Model_AnjukeList; import com.mongodb.BasicDBObject; import com.util.Constants; import com.util.GetDate; import com.util.MysqlUtils; import com.util.MD5With32; import com.util.MongoDBUtil; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.monitor.SpiderMonitor; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; public class GetAnjukeAgentList implements PageProcessor { static AgentListModel anjukeList; static List<String> list = new ArrayList<String>(); static List<AgentListModel> list_insert = new ArrayList<AgentListModel>(); static BasicDBObject doc = null; private Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8") .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"); @Override public Site getSite() { // TODO Auto-generated method stub return this.site; } @Override public void process(Page page) { System.out.println("code:"+page.getStatusCode()); System.out.println(page.getUrl()); if(!page.getUrl().regex("https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/").match()/*&&!page.getHtml().regex("https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/p[0-9]+/").match()*/){ page.addTargetRequests(page.getHtml().xpath("//span[@class='elems-l']/a/@href").regex("https://[a-z]+.anjuke.com/tycoon/[a-z]+/").all()); if(page.getUrl().regex("https://[a-z]+.anjuke.com/tycoon/[a-z]+/").match()){ 
page.addTargetRequests(page.getHtml().xpath("//div[@class='sub-items']/a/@href").all()); } }else{ //3行可以移动到这里!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //分割线-------------------- List<Selectable> htmls = page.getHtml().xpath("//div[@class='jjr-info']").nodes(); for(Selectable html: htmls){ String name = html.xpath("//div/h3/a/text()").get(); String staffNo = html.xpath("//div/h3/a/@href").get(); String company = html.xpath("//p[@class='jjr-desc']/a[1]/text()").get(); String company_url = html.xpath("//p[@class='jjr-desc']/a[1]/@href").get(); String store = html.xpath("//p[@class='jjr-desc']/a[2]/text()").get(); String store_url = html.xpath("//p[@class='jjr-desc']/a[2]/@href").get(); anjukeList = new AgentListModel("", "", "", "", "anjuke", GetDate.getDay0(), page.getUrl()+"", name, staffNo, company, company_url, store, store_url); list_insert.add(anjukeList); } String city = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[2]/text()").get().replace("经纪人", ""); String zone = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[3]/text()").get().replace("经纪人", ""); String street = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[4]/text()").get().replace("经纪人", ""); MysqlUtils.InsertAnjukeAgent(list_insert,city,zone,street); list_insert.clear(); if(page.getHtml().regex("https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/p[0-9]+/").match()){ //获取分页 page.addTargetRequests(page.getHtml().xpath("//div[@class='page-content']/div/a/@href").all()); } } } public static void main(String[] args) { List<String> list = new ArrayList<String>(); list.add("https://chongqing.anjuke.com/tycoon/");for (int i = 0; i < list.size(); i++) { Spider.create(new GetAnjukeAgentList()) .addUrl(list.get(i)) .addPipeline(new ConsolePipeline()) .thread(20) .run(); } } }
这段代码有个很大的疑问,不知道有没有大神给解释一下。
// Quoted from process(): reads the text of breadcrumb links 2, 3 and 4 as city, zone and street, removing the "经纪人" substring from each. NOTE(review): each .get() is dereferenced directly — presumably it yields null when the xpath matches nothing, which would throw here; confirm against the WebMagic Selectable docs.
String city = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[2]/text()").get().replace("经纪人", ""); String zone = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[3]/text()").get().replace("经纪人", ""); String street = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[4]/text()").get().replace("经纪人", "");
如果把上面这 3 行的位置移动一下,
移到代码里“分割线”标记的上面去,
这 3 个 xpath 就会匹配不到内容。我研究了很长时间也没搞明白,就没再研究下去了。
评论区留言告知下,谢谢!!
转载注明出处
如果本文对你有帮助,请帮忙啦~~
打开支付宝首页搜“522901509”领红包,领到大红包的小伙伴赶紧使用哦!