随笔 - 581  文章 - 0 评论 - 48 阅读 - 131万
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

 

一:java maven依赖:

<!-- WebMagic core artifact (groupId us.codecraft), pinned to version 0.7.4. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<!-- WebMagic extension artifact; kept at the same version (0.7.4) as webmagic-core. -->
<!-- NOTE(review): presumably this is what provides JsonFilePipeline used by the sample; confirm. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>

二:示例代码:
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * WebMagic {@link PageProcessor} that crawls a cnblogs blog.
 *
 * <p>For every fetched page it extracts the post title and the day title, and it queues
 * the post-detail links and the pager links found on the page for later fetching.
 *
 * <p>Despite its name, this class does not touch GitHub; the name is kept so existing
 * references keep working.
 */
public class GithubRepoPageProcessor implements PageProcessor {

    /** XPath selecting the text of a post title. */
    private static final String TITLE_XPATH = "//div[@class='postTitle']/a/span/text()";

    /** XPath selecting the text of the day heading. */
    private static final String DATE_XPATH = "//div[@class='dayTitle']/a/text()";

    /** XPath selecting the href of every post-detail link. */
    private static final String DETAIL_URLS_XPATH =
            "//div[@class='postTitle']/a[@class='postTitle2 vertical-middle']/@href";

    /** XPath selecting the href of every link inside the pager. */
    private static final String NEXT_PAGE_XPATH = "//div[@class='pager']/a/@href";

    /** CSS selector for one specific anchor inside the top pager. */
    private static final String NEXT_PAGE_CSS =
            "#homepage_top_pager > div:nth-child(1) > a:nth-child(8)";

    /** URL the crawl starts from. */
    private static final String START_URL = "http://www.cnblogs.com/dick159/default.html?page=1";

    /** Output directory used when no directory is passed on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "D:\\webmagic\\";

    // Part 1: crawl configuration - retry count (3) and sleep time between fetches
    // (1000; milliseconds according to the WebMagic Site documentation).
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the fields of one page and queues the follow-up URLs found on it.
     *
     * @param page the fetched page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();

        // Part 2: extract the page information and store it in the result items.
        String title = html.xpath(TITLE_XPATH).toString();
        page.putField("title", title);
        if (title == null) {
            // No title on this page: mark it skipped so it is not handed to the pipelines.
            // Link discovery below still runs so that the crawl can continue from here.
            page.setSkip(true);
        }
        page.putField("date", html.xpath(DATE_XPATH).toString());

        // Part 3: discover follow-up URLs on this page.
        enqueueIfMatched(page, html.xpath(DETAIL_URLS_XPATH));
        enqueueIfMatched(page, html.xpath(NEXT_PAGE_XPATH));
        // Previously this branch tested the CSS selector but then re-added the XPath
        // results, so the CSS-selected link itself was never queued. Use its own link.
        enqueueIfMatched(page, html.css(NEXT_PAGE_CSS).links());
    }

    /**
     * Adds every URL in {@code urls} to the page's target requests when the selection
     * matched at least one element; does nothing otherwise.
     */
    private static void enqueueIfMatched(Page page, Selectable urls) {
        if (urls.match()) {
            page.addTargetRequests(urls.all());
        }
    }

    /**
     * Returns the crawl configuration used by this processor.
     *
     * @return the site settings created when this processor was constructed
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from {@link #START_URL} with a single thread and writes the
     * results as JSON files.
     *
     * @param args optional; {@code args[0]} overrides the output directory, which
     *             otherwise defaults to {@code D:\webmagic\}
     */
    public static void main(String[] args) {
        String outputDir = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_DIR;

        Spider.create(new GithubRepoPageProcessor())
                // Start crawling from the first list page of the blog.
                .addUrl(START_URL)
                // Write the extracted items as JSON files into the output directory.
                .addPipeline(new JsonFilePipeline(outputDir))
                // Crawl with one thread.
                .thread(1)
                // Start the spider; run() blocks until the crawl has finished.
                .run();
    }
}
posted on   毛会懂  阅读(11)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效
历史上的今天:
2018-10-31 spring cloud zuul
2018-10-31 spring cloud turbine
2018-10-31 spring cloud feign
2018-10-31 spring cloud DashBoard
2018-10-31 spring cloud 消费者
2018-10-31 spring cloud 服务提供者
2018-10-31 spring cloud: eureka搭建
点击右上角即可分享
微信分享提示