1. Java Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.4</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.4</version>
</dependency>
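webmagic-core provides the crawler engine itself (downloading, scheduling, extraction, pipelines), while webmagic-extension layers conveniences such as annotation-driven extraction on top of it; the example below mainly uses the core APIs.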
2. Sample code:
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
public class GithubRepoPageProcessor implements PageProcessor {

    // Part 1: site-level crawl configuration, e.g. retry count and the
    // politeness delay between requests (charset, timeouts, etc. can also be set here)
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    // process() is the core callback for custom crawler logic; extraction code goes here
    @Override
    public void process(Page page) {
        // Part 2: define how to extract fields from the page and store them
        String titleXpath = "//div[@class='postTitle']/a/span/text()";
        String dateXpath = "//div[@class='dayTitle']/a/text()";
        String detailUrlsXpath = "//div[@class='postTitle']/a[@class='postTitle2 vertical-middle']/@href";
        String nextPageXpath = "//div[@class='pager']/a/@href";
        String nextPageCss = "#homepage_top_pager > div:nth-child(1) > a:nth-child(8)";

        page.putField("title", page.getHtml().xpath(titleXpath).toString());
        if (page.getResultItems().get("title") == null) {
            // no title found: skip this page so it is not passed to the pipeline
            page.setSkip(true);
        }
        page.putField("date", page.getHtml().xpath(dateXpath).toString());

        // Part 3: discover follow-up URLs on the page and queue them for crawling
        if (page.getHtml().xpath(detailUrlsXpath).match()) {
            Selectable detailUrls = page.getHtml().xpath(detailUrlsXpath);
            page.addTargetRequests(detailUrls.all());
        }
        if (page.getHtml().xpath(nextPageXpath).match()) {
            Selectable nextPageUrl = page.getHtml().xpath(nextPageXpath);
            page.addTargetRequests(nextPageUrl.all());
        }
        // fallback: locate the "next page" link with a CSS selector and extract its href
        // (the original used the XPath selector here, which made this branch redundant)
        if (page.getHtml().css(nextPageCss).match()) {
            Selectable nextPageUrl = page.getHtml().css(nextPageCss, "href");
            page.addTargetRequests(nextPageUrl.all());
        }
        // page.addTargetRequests(page.getHtml().links().regex("http://www.cnblogs.com/dick159/default.html?page=1").all());
    }
    @Override
    public Site getSite() {
        return site;
    }
    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor())
                // start crawling from "http://www.cnblogs.com/dick159/default.html?page=1"
                .addUrl("http://www.cnblogs.com/dick159/default.html?page=1")
                // write results as JSON files under D:\webmagic
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                // crawl with a single thread
                .thread(1)
                // start the spider and block until crawling finishes
                .run();
    }
}
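JsonFilePipeline writes each crawled page's fields as a JSON file under the given directory (D:\webmagic\ here, so the path is Windows-specific). If you would rather handle the extracted fields in code, WebMagic lets you register your own Pipeline implementation. Below is a minimal sketch that just prints each field to the console; the class name LoggingPipeline is our own choice for illustration, not part of WebMagic.

import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// LoggingPipeline is a hypothetical example class, not shipped with WebMagic
public class LoggingPipeline implements Pipeline {
    @Override
    public void process(ResultItems resultItems, Task task) {
        // resultItems holds every field stored via page.putField()
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}

Register it with .addPipeline(new LoggingPipeline()) in place of (or alongside) the JsonFilePipeline above; WebMagic runs every registered pipeline for each page.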