一:java maven依赖:
<!-- WebMagic core artifact (webmagic-core), version 0.7.4 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<!-- WebMagic extension artifact (webmagic-extension); keep its version in step with webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
二:示例代码:
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * Example WebMagic {@link PageProcessor} that crawls the blog at
 * {@code http://www.cnblogs.com/dick159/}.
 *
 * <p>For every downloaded page it stores the post title and date as result fields, then
 * queues the post-detail links and pager links found on the page for further crawling.
 * Results are written as JSON files by the pipeline configured in {@link #main(String[])}.
 */
public class GithubRepoPageProcessor implements PageProcessor {

    // Part 1: crawl configuration for the target site: retry up to 3 times and
    // sleep 1000 (milliseconds, per the WebMagic Site documentation) between requests.
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the fields of interest from {@code page} and schedules follow-up requests.
     *
     * <p>This is the core hook for custom crawler logic: WebMagic calls it once per
     * downloaded page.
     *
     * @param page the downloaded page to extract data and links from
     */
    @Override
    public void process(Page page) {
        // Part 2: define how to extract information from the page and store it.
        String titleXpath = "//div[@class='postTitle']/a/span/text()";
        String dateXpath = "//div[@class='dayTitle']/a/text()";
        String detailUrlsXpath = "//div[@class='postTitle']/a[@class='postTitle2 vertical-middle']/@href";
        String nextPageXpath = "//div[@class='pager']/a/@href";
        String nextPageCss = "#homepage_top_pager > div:nth-child(1) > a:nth-child(8)";

        Selectable html = page.getHtml();

        String title = html.xpath(titleXpath).toString();
        page.putField("title", title);
        if (title == null) {
            // No title on this page: mark its results as skipped. Link discovery below
            // still runs so that the crawl can continue from this page.
            page.setSkip(true);
        }
        page.putField("date", html.xpath(dateXpath).toString());

        // Part 3: discover follow-up URLs on the page and queue them for crawling.
        // Each selector is evaluated once and the result reused for both the match
        // check and the extraction.
        Selectable detailUrls = html.xpath(detailUrlsXpath);
        if (detailUrls.match()) {
            page.addTargetRequests(detailUrls.all());
        }

        Selectable pagerUrls = html.xpath(nextPageXpath);
        if (pagerUrls.match()) {
            page.addTargetRequests(pagerUrls.all());
        }

        // The CSS selector targets a pager anchor element, so take its links rather than
        // re-running the XPath above (which only re-added URLs that were already queued).
        Selectable nextPageAnchor = html.css(nextPageCss);
        if (nextPageAnchor.match()) {
            page.addTargetRequests(nextPageAnchor.links().all());
        }
    }

    /**
     * Returns the site configuration used by the spider for this processor.
     *
     * @return the {@link Site} settings created for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded crawl from the first page of the blog's post list and
     * writes the extracted results as JSON files.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor())
                // Start crawling from the first page of the post list.
                .addUrl("http://www.cnblogs.com/dick159/default.html?page=1")
                // Write results to the "webmagic" folder on drive D.
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                // Crawl with a single thread.
                .thread(1)
                // Start the spider (blocks until the crawl finishes).
                .run();
    }
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效
2018-10-31 spring cloud zuul
2018-10-31 spring cloud turbine
2018-10-31 spring cloud feign
2018-10-31 spring cloud DashBoard
2018-10-31 spring cloud 消费者
2018-10-31 spring cloud 服务提供者
2018-10-31 spring cloud: eureka搭建