SpringBoot+MybatisPlus+Webmagic+AMIS爬取什么值得买并展示
1. WebMagic爬虫框架
WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
1.1 相关文档
官网:
中文文档地址:
English:
1.2 WebMagic结构如下
WebMagic的结构分为
Downloader
、PageProcessor
、Scheduler
、Pipeline
四大组件,并由Spider将它们彼此组织起来。这四大组件对应爬虫生命周期中的下载、处理、管理和持久化等功能。
2.SpringBoot集成MybatisPlus+WebMagic
2.1 集成WebMagic
spring boot
与webmagic
的结合主要有三个模块,分别为爬取模块Processor
,入库模块Pipeline
,向数据库存入爬取数据,和定时任务模块Scheduled
,负责定时爬取网站数据。
2.1.1 添加maven依赖
<!-- Crawler framework: WebMagic core module plus its extension module (same version for both) -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
2.1.2 爬取模块Processor
爬取什么值得买的页面的Processor,分析什么值得买的页面数据,获取相应的链接和标题,放入webmagic的Page中,到入库模块取出添加到数据库。代码如下
package com.dxz.spider.HttpUtil; import com.dxz.spider.model.SmzdmModel; import com.dxz.spider.util.TimeUtil; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; @Slf4j @Component public class SmzdmPageProcessor implements PageProcessor { // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 //抓取网站的相关配置,包括:编码、抓取间隔、重试次数等 private Site site = Site.me() .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36") .setTimeOut(10 * 1000) .setRetryTimes(3) .setRetrySleepTime(3000); // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 @Override public void process(Page page) { // 部分二:定义如何抽取页面信息,并保存下来 \\w+ if (page.getUrl().regex("https://search.smzdm.com/\\?c=faxian&s=GU&v=b&p=\\d+").match()){ page.addTargetRequests(page.getHtml().xpath("//ul[@id='J_feed_pagenation']/li/a").links().all()); page.addTargetRequests(page.getHtml().xpath("//div[@class=feed-main-con]/ul[@id='feed-main-list']/li/div/div[@class='z-feed-content']/h5/a").links().all()); }else { SmzdmModel smzdmModel = new SmzdmModel(); String imgLocation = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='info']/a/img[@class=main-img]/@src").get(); // 获取物品的url String url= page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='info']/a/@href").get(); String title= page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='info']/div[@class='info-right']/div[@class='title-box']/h1[@class='title']/text()").get(); String price = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='info']/div[@class='info-right']/div[@class='title-box']/div[@class='price']/span/text()").get(); String introduce = 
page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/article/div[@class='baoliao-block']/p/text()").get(); String baoliao = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/article/p/text()").get(); String time = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='info']/div[@class='info-right']/div[@class='info-details']/div[@class='author-info']/span[@class='time']/text()").get(); String zhi = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/div[@class='score_rateBox']/div[@class='score_rate']/span[@id='rating_worthy_num']/text()").get(); String buZhi = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/div[@class='score_rateBox']/div[@class='score_rate']/span[@id='rating_unworthy_num']/text()").get(); String start = page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/div[@class='operate_box']/div[@class='operate_icon']/a[@class='fav']/span/text()").get(); String pl =page.getHtml().xpath("//section[@id='feed-wrap']/article/div[@id='feed-main']/div[@class='item-name']/div[@class='operate_box']/div[@class='operate_icon']/a[@class='comment']/em[@class='commentNum']/text()").get(); if (StringUtils.isBlank(introduce)){ smzdmModel.setIntroduce(baoliao); } time = TimeUtil.handSmzdm(time); smzdmModel.setUrl(url); smzdmModel.setTitle(title); smzdmModel.setPrice(price); smzdmModel.setIntroduce(introduce); smzdmModel.setFbtime(time); smzdmModel.setNoZhi(buZhi); smzdmModel.setZhi(zhi); smzdmModel.setStart(start); smzdmModel.setPl( pl); smzdmModel.setImgurl(imgLocation); // 将爬取结果存储起来,key为smzdm value为爬取的数据即为smzdmModel的对象 page.putField("smzdm",smzdmModel); } } @Override public Site getSite() { return site; } }
2.1.3 入库模块Pipeline
入库模块结合MyBatisPlus模块一起组合成入库方法,实现webmagic的Pipeline接口,在process方法中获取爬虫模块的数据,然后调用MybatisPlus的save方法。代码如下:
package com.dxz.spider.HttpUtil; import com.dxz.spider.model.HotWeeklyBlogs; import com.dxz.spider.model.SmzdmModel; import com.dxz.spider.service.SmzdmService; import com.dxz.spider.service.WeeklyService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; @Component public class MysqlPipeline implements Pipeline { @Autowired private WeeklyService weeklyService; @Autowired private SmzdmService smzdmService; @Override public void process(ResultItems resultItems, Task task) { // 取出processor过程中保存的结果,和Map类似,取出的key为smzdm和blogs HotWeeklyBlogs blogs = resultItems.get("blogs"); SmzdmModel smzdmModel = resultItems.get("smzdm"); if (blogs!=null){ weeklyService.save(blogs); }else if (smzdmModel!=null){ smzdmService.save(smzdmModel); System.out.println(smzdmModel.toString()); } } }
2.1.4 定时任务模块Scheduled
使用spring boot自带的定时任务注解@Scheduled(cron = "* * * * * ? ")
,每分钟执行一次爬取任务(注意:Spring的cron表达式共6位,第一位是秒;6位全为 * 时会每秒触发一次,若要每分钟执行一次应写作 0 * * * * ?),在定时任务里调取webmagic的爬取模块Processor
。代码如下:
package com.dxz.spider.HttpUtil;

import com.dxz.spider.WebMagicBugs.HttpClientDownloader;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

/**
 * Scheduled entry point that starts the smzdm crawl.
 */
@Component
@Slf4j
public class AllSpiderStarter {

    /**
     * First search listing page of the crawl. It has to match the listing-page pattern of
     * {@link SmzdmPageProcessor}; any other URL would be parsed as an item detail page.
     */
    private static final String START_URL = "https://search.smzdm.com/?c=faxian&s=GU&v=b&p=1";

    @Autowired
    private MysqlPipeline mysqlPipeline;

    /**
     * Runs one crawl with 5 threads and stores the results through {@link MysqlPipeline}.
     *
     * <p>Spring cron expressions have six fields and the first one is seconds, so
     * {@code "0 * * * * ?"} fires once per minute (at second 0). An expression of six
     * wildcards would fire every second. {@code Spider.run()} blocks until the crawl ends.
     */
    @Scheduled(cron = "0 * * * * ?")
    public void WeeklyScheduled() {
        log.info("开始执行爬取任务");
        Spider.create(new SmzdmPageProcessor())
                .setDownloader(new HttpClientDownloader())
                .addUrl(START_URL)
                .thread(5)
                .addPipeline(mysqlPipeline)
                .run();
    }
}
在springboot
启动类上加注解@EnableScheduling
import com.dxz.spider.util.HotMonthWebMagic;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point of the crawler application.
 *
 * <p>{@code @EnableScheduling} switches on processing of {@code @Scheduled} methods and
 * {@code @MapperScan} registers the mapper interfaces found in {@code com.dxz.spider.mapper}.
 */
@SpringBootApplication
@EnableScheduling
@MapperScan("com.dxz.spider.mapper")
public class SpiderApplication {

    /**
     * Boots the application context.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        // Instance form of the static SpringApplication.run(Class, String...) helper.
        SpringApplication application = new SpringApplication(SpiderApplication.class);
        application.run(args);
    }
}
2.2 集成MybatisPlus
2.2.1 MyBatisPlus
使用上基本和MyBatis一致,但是集成了基本的CRUD接口,对基本的CRUD可以直接调用。
官网地址
2.2.2 导入maven依赖
<!-- MyBatis-Plus Spring Boot starter -->
<dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>3.0.5</version>
</dependency>
2.2.3 编写Mapper、Service和Model
什么值得买爬取的Model类
package com.dxz.spider.model;

import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

/**
 * Database model of one crawled smzdm.com item.
 *
 * <p>All values are kept as the raw strings extracted from the page. Accessors,
 * {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok's {@code @Data}.
 */
@Data
// TODO: name of the backing database table; change it to match your schema
@TableName("smzdm")
public class SmzdmModel {

    /**
     * Title of the item.
     */
    private String title;

    /**
     * Price text.
     */
    private String price;

    /**
     * Short introduction of the item.
     */
    private String introduce;

    /**
     * Number of users who rated the item "worth it".
     */
    private String zhi;

    /**
     * Number of users who rated the item "not worth it".
     */
    // TODO: name of the backing database column; change it to match your schema
    @TableField(value = "NoZhi")
    private String NoZhi;

    /**
     * Number of users who added the item to their favorites.
     * NOTE(review): the name looks like a misspelling of "star" — confirm before renaming,
     * since the column and the generated accessors depend on it.
     */
    private String start;

    /**
     * Number of comments.
     */
    private String pl;

    /**
     * Publication time.
     */
    private String fbtime;

    /**
     * URL of the item.
     */
    private String url;

    /**
     * URL of the item image.
     */
    private String imgurl;
}
编写Mapper类
/**
 * MyBatis-Plus mapper for {@link SmzdmModel}; the generic CRUD methods come from
 * {@code BaseMapper}.
 */
public interface SmzdmMapper extends BaseMapper<SmzdmModel> {

    /**
     * Loads every row of the {@code smzdm} table.
     *
     * @return all stored items
     */
    @Select("select * from smzdm")
    List<SmzdmModel> selectAll();
}
Mapper继承BaseMapper<T>接口,获取基础的CRUD。接着编写Service类,继承ServiceImpl:
/**
 * Service for crawled smzdm items; the generic CRUD operations (e.g. {@code save}) are
 * inherited from MyBatis-Plus' {@code ServiceImpl}.
 */
@Service
@Slf4j
public class SmzdmService extends ServiceImpl<SmzdmMapper, SmzdmModel> {

    /**
     * Returns every stored item.
     *
     * @return all rows of the item table as returned by {@link SmzdmMapper#selectAll()}
     */
    public List<SmzdmModel> selectAll() {
        // ServiceImpl already injects the mapper as the protected field "baseMapper";
        // no separate mapper field is declared in this class.
        return baseMapper.selectAll();
    }
}
编写application.properties
# Data source used by MyBatis-Plus (local MySQL instance, schema "dxzstudy")
spring.datasource.username=root
spring.datasource.password=123456
spring.datasource.url=jdbc:mysql://localhost:3306/dxzstudy?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver

# Location of the MyBatis mapper XML files.
# Comments in .properties files must start with '#' or '!'; a line starting with "//"
# would be read as a property whose key is "//".
mybatis-plus.mapper-locations=classpath:mapperxml/*.xml
集成完毕!
3.编写视图AMIS
3.1 What is AMIS ?
amis 是一个前端低代码框架,它使用 JSON 配置来生成页面,可以极大节省页面开发工作量,极大提升开发前端界面的效率。 有了AMIS,对于基本的界面,就算程序员不会前端。只要会JSON配置,或者说只要会汉语就能很快上手了。百度开源的神器!!!
参考文档
https://baidu.github.io/amis/docs/intro?page=1
3.2 下载css和js
从官网下载sdk.css和sdk.js
3.3 编写HTML页面
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8"/>
    <title>什么值得买</title>
    <!-- NOTE(review): presumably set so that the crawled image URLs load without a Referer header - confirm -->
    <meta name="referrer" content="no-referrer" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
    <meta http-equiv="X-UA-Compatible" content="IE=Edge"/>
    <link rel="stylesheet" href="/static/sdk.css"/>
    <style>
        html, body, .app-wrapper {
            position: relative;
            width: 100%;
            height: 100%;
            margin: 0;
            padding: 0;
        }
    </style>
</head>
<body>
<!-- Mount point of the amis page -->
<div id="root" class="app-wrapper"></div>
<script src="/static/sdk.js"></script>
<script type="text/javascript">
    (function () {
        var amis = amisRequire('amis/embed');

        // The whole page is described by one amis schema object.
        amis.embed('#root', {
            "$schema": "https://houtai.baidu.com/v2/schemas/page.json#",
            "type": "page",
            "title": "什么值得买优衣库专场",

            // Toolbar: "add" button opening a demo form (no api configured, so nothing is submitted).
            "toolbar": [
                {
                    "type": "button",
                    "actionType": "dialog",
                    "label": "新增",
                    "icon": "fa fa-plus pull-left",
                    "primary": true,
                    "dialog": {
                        "title": "新增",
                        "body": {
                            "type": "form",
                            "name": "sample-edit-form",
                            "api": "",
                            "controls": [
                                { "type": "alert", "level": "info", "body": "因为没有配置 api 接口,不能真正的提交哈!" },
                                { "type": "text", "name": "text", "label": "文本", "required": true },
                                { "type": "divider" },
                                { "type": "image", "name": "image", "label": "图片", "required": true },
                                { "type": "divider" },
                                { "type": "date", "name": "date", "label": "日期", "required": true },
                                { "type": "divider" },
                                {
                                    "type": "select",
                                    "name": "type",
                                    "label": "选项",
                                    "options": [
                                        { "label": "漂亮", "value": "1" },
                                        { "label": "开心", "value": "2" },
                                        { "label": "惊吓", "value": "3" },
                                        { "label": "紧张", "value": "4" }
                                    ]
                                }
                            ]
                        }
                    }
                }
            ],

            "body": [
                // Inline keyword search form.
                {
                    "type": "form",
                    "title": "条件输入",
                    "className": "m-t",
                    "wrapWithPanel": false,
                    "target": "service1",
                    "mode": "inline",
                    "controls": [
                        {
                            "type": "text",
                            "name": "keywords",
                            "placeholder": "关键字",
                            "addOn": {
                                "type": "button",
                                "icon": "fa fa-search",
                                "actionType": "submit",
                                "level": "primary"
                            }
                        }
                    ]
                },

                // Table of crawled items, loaded from the backend /getAll endpoint.
                {
                    "type": "crud",
                    "api": "http://localhost:8080/getAll",
                    "defaultParams": { "perPage": 5 },
                    "columns": [
                        { "name": "title", "label": "标题", "type": "text" },
                        { "name": "price", "label": "价格", "type": "text" },
                        { "name": "url", "label": "商品链接", "type": "text" },
                        {
                            "type": "image",
                            "label": "物品图片",
                            "multiple": false,
                            "name": "imgurl",
                            "popOver": {
                                "title": "查看大图",
                                "body": "<div class=\"w-xxl\"><img class=\"w-full\" src=\"${imgurl}\"/></div>"
                            }
                        },
                        { "name": "fbtime", "type": "date", "label": "发布日期" },

                        // Per-row actions: view, edit, delete.
                        {
                            "type": "container",
                            "label": "操作",
                            "body": [
                                {
                                    "type": "button",
                                    "icon": "fa fa-eye",
                                    "level": "link",
                                    "actionType": "dialog",
                                    "tooltip": "查看",
                                    "dialog": {
                                        "title": "查看",
                                        "body": {
                                            "type": "form",
                                            "controls": [
                                                { "type": "static", "name": "title", "label": "标题" },
                                                { "type": "divider" },
                                                { "type": "static", "name": "price", "label": "价格" },
                                                { "type": "divider" },
                                                {
                                                    "type": "static-image",
                                                    "label": "图片",
                                                    "name": "imgurl",
                                                    "popOver": {
                                                        "title": "查看大图",
                                                        "body": "<div class=\"w-xxl\"><img class=\"w-full\" src=\"${imgurl}\"/></div>"
                                                    }
                                                },
                                                { "type": "divider" },
                                                { "name": "fbtime", "type": "static", "label": "发布时间" },
                                                { "type": "divider" },
                                                { "name": "url", "type": "static", "label": "购买链接" }
                                            ]
                                        }
                                    }
                                },
                                {
                                    "type": "button",
                                    "icon": "fa fa-pencil",
                                    "tooltip": "编辑",
                                    "level": "link",
                                    "actionType": "drawer",
                                    "drawer": {
                                        "position": "left",
                                        "size": "lg",
                                        "title": "编辑",
                                        "body": {
                                            "type": "form",
                                            "name": "sample-edit-form",
                                            "controls": [
                                                { "type": "alert", "level": "info", "body": "因为没有配置 api 接口,不能真正的提交哈!" },
                                                { "type": "hidden", "name": "id" },
                                                { "type": "text", "name": "text", "label": "文本", "required": true },
                                                { "type": "divider" },
                                                { "type": "image", "name": "image", "multiple": false, "label": "图片", "required": true },
                                                { "type": "divider" },
                                                { "type": "date", "name": "date", "label": "日期", "required": true },
                                                { "type": "divider" },
                                                {
                                                    "type": "select",
                                                    "name": "type",
                                                    "label": "选项",
                                                    // Same four options as the "add" form above.
                                                    "options": [
                                                        { "label": "漂亮", "value": "1" },
                                                        { "label": "开心", "value": "2" },
                                                        { "label": "惊吓", "value": "3" },
                                                        { "label": "紧张", "value": "4" }
                                                    ]
                                                }
                                            ]
                                        }
                                    }
                                },
                                {
                                    "type": "button",
                                    "level": "link",
                                    "icon": "fa fa-times text-danger",
                                    "actionType": "ajax",
                                    "tooltip": "删除",
                                    "confirmText": "您确认要删除? 没有配置 api 确定了也没用,还是不要确定了",
                                    "api": ""
                                }
                            ]
                        }
                    ]
                }
            ]
        });
    })();
</script>
</body>
</html>
3.4 配置SpringBoot
编写后台访问接口,这里只写了查找的接口,编辑和删除的可以自行编写
package com.dxz.spider.web;

import com.dxz.spider.model.SmzdmModel;
import com.dxz.spider.service.SmzdmService;
import com.dxz.spider.web.SmzdmVO.GoodsVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

/**
 * REST endpoint that exposes the crawled smzdm items to the front end.
 */
@RestController
@Slf4j
public class SmzdmWeb {

    @Autowired
    private SmzdmService smzdmService;

    /**
     * Returns all stored items wrapped in the response envelope.
     *
     * <p>The envelope is always returned, also when no item has been crawled yet; in that
     * case {@code data} is an empty list. Returning {@code null} here would produce an
     * empty HTTP body instead of a JSON document.
     *
     * @return envelope with status {@code 0}, a success message and the item list
     */
    @RequestMapping(value = "/getAll", method = RequestMethod.GET)
    public GoodsVO selectByPage() {
        log.info("请求什么值得买的getAll接口");
        List<SmzdmModel> smzdmModels = smzdmService.selectAll();

        GoodsVO goodsVO = new GoodsVO();
        goodsVO.setStatus(0);
        goodsVO.setMsg("请求成功");
        goodsVO.setData(smzdmModels);
        return goodsVO;
    }
}
编写视图控制器
package com.dxz.spider.config; import org.springframework.context.annotation.Configuration; import org.springframework.web.servlet.config.annotation.CorsRegistry; import org.springframework.web.servlet.config.annotation.ResourceHandlerRegistry; import org.springframework.web.servlet.config.annotation.ViewControllerRegistry; import org.springframework.web.servlet.config.annotation.WebMvcConfigurationSupport; @Configuration public class WebConfig extends WebMvcConfigurationSupport { /** * 映射静态文件 * @param registry */ @Override protected void addResourceHandlers(ResourceHandlerRegistry registry) { registry.addResourceHandler("/static/**").addResourceLocations("classpath:/static/"); super.addResourceHandlers(registry); } /** * 映射视图 * @param registry */ @Override protected void addViewControllers(ViewControllerRegistry registry) { registry.addViewController("/smzdm").setViewName("smzdm"); super.addViewControllers(registry); } /** * 跨域配置 * @param registry */ @Override protected void addCorsMappings(CorsRegistry registry) { registry.addMapping("/**") .allowedOrigins("http://localhost:8080") .allowedMethods("*") .allowedHeaders("*"); super.addCorsMappings(registry); } }
3.5 运行查看