【爬虫入门】HttpClient+Jsoup进行简单的网页访问和信息保存
【项目选型】
(Maven)SpringBoot+JPA
【项目搭建】
pom.xml:
<!-- Parent POM: Spring Boot starter parent, version 2.5.0 -->
<parent>
    <artifactId>spring-boot-starter-parent</artifactId>
    <groupId>org.springframework.boot</groupId>
    <version>2.5.0</version>
</parent>
<dependencies>
    <!-- Spring Boot web (Spring MVC) starter -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Spring Data JPA starter -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <!-- Apache HttpClient, used to fetch pages and images
         (no <version> here; presumably supplied by the parent's dependency management) -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!-- Jsoup HTML parser; version is pinned explicitly -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.2</version>
    </dependency>
    <!-- Apache Commons Lang 3 utilities -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
    </dependency>
</dependencies>
AppMain.class:
/**
 * Spring Boot entry point of the crawler.
 *
 * <p>{@code @EnableScheduling} switches on scheduled-task support for the application.
 */
@EnableScheduling
@SpringBootApplication
public class AppMain {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        Class<AppMain> primarySource = AppMain.class;
        SpringApplication.run(primarySource, args);
    }
}
【分析】
【具体实现】
POJO+JPA+SQL建表
POJO类: @Table(name = "md_item") @Entity public class Product { @Id @GeneratedValue(strategy = GenerationType.IDENTITY) String id; String proid; String proauthor; String protitle; String probackcount; String probackermoney; String promoneypercent; String starttime; String endtime; String protype; String prostatus; String proimgpath; // get/set/toString } JPA: /** * extends JpaRepository<PoJo类,Key主键类型> */ public interface ProductDao extends JpaRepository<Product,Long> { } SQL: DROP TABLE IF EXISTS `md_item`; CREATE TABLE `md_item` ( `id` bigint(20) NOT NULL AUTO_INCREMENT, `proid` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `proauthor` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `protitle` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `probackcount` varchar(20) DEFAULT NULL, `probackermoney` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `promoneypercent` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `starttime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `endtime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `protype` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `prostatus` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, `proimgpath` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=82 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
封装HttpClient 实现获取网页代码和图片下载
/** * 封装HttpClient,交给Spring容器管理 */ @Component public class HttpUtils { //httpclient连接池! private PoolingHttpClientConnectionManager clientConnectionManager; /** * 在构造方法中new一个 */ public HttpUtils() { this.clientConnectionManager = new PoolingHttpClientConnectionManager(); //设置最大连接数 this.clientConnectionManager.setMaxTotal(100); //设置每个主机的最大连接数 this.clientConnectionManager.setDefaultMaxPerRoute(10); } /** * 使用get请求获得页面 * @param url * @return 页面数据 */ public String doGetHtml(String url){ //获取HttpClient对象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.clientConnectionManager).build(); //创建HttpGet请求对象,设置url地址 HttpGet httpGet = new HttpGet(url); //设置请求信息 httpGet.setConfig(this.getConfig()); CloseableHttpResponse httpResponse=null; //使用HttpClient发起请求,获得相应 try { httpResponse = httpClient.execute(httpGet); if(httpResponse.getStatusLine().getStatusCode()==200){ //判断Entity是否为空,如果不为空就可以使用EntityUtils if(httpResponse.getEntity()!=null){ String content = EntityUtils.toString(httpResponse.getEntity(),"utf-8"); return content; }else{ return "ERROR"; } } } catch (IOException e) { e.printStackTrace(); }finally { if(httpResponse!=null){ try { httpResponse.close(); } catch (IOException e) { e.printStackTrace(); } } } //解析响应,返回结果 return ""; } /** * 设置RequestConfig * @return */ private RequestConfig getConfig() { RequestConfig config=RequestConfig.custom() .setConnectTimeout(1000) //创建链接的最长时间 .setConnectionRequestTimeout(500) //获取链接的最长时间 .setSocketTimeout(10000) //数据传输的最长时间 .build(); return config; } /** * 下载图片 * @param url * @return 图片名称 */ public String doGetImage(String url){ //获取HttpClient对象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.clientConnectionManager).build(); //创建HttpGet请求对象,设置url地址 HttpGet httpGet = new HttpGet(url); //设置请求信息 httpGet.setConfig(this.getConfig()); CloseableHttpResponse httpResponse=null; //使用HttpClient发起请求,获得相应 try { httpResponse = httpClient.execute(httpGet); 
if(httpResponse.getStatusLine().getStatusCode()==200){ //判断Entity是否为空,如果不为空就可以使用EntityUtils if(httpResponse.getEntity()!=null){ //下载图片 //获取图片后缀 String exName=url.substring(url.lastIndexOf(".")); //创建图片名,重命名图片 String picName= UUID.randomUUID().toString()+exName; //下载图片 OutputStream outputStream=new FileOutputStream(new File("G:/IJDailyCode/Crawler/src/main/resources/downloadImg/"+picName)); httpResponse.getEntity().writeTo(outputStream); //返回图片名称 return picName; }else{ return "ERROR"; } } } catch (IOException e) { e.printStackTrace(); }finally { if(httpResponse!=null){ try { httpResponse.close(); } catch (IOException e) { e.printStackTrace(); } } } //出现其他问题返回结果 return ""; }
任务方法:
/**
 * Scheduled crawl task: fetches the listing pages, parses each listed item and stores the
 * items that are not found by the existence check.
 */
@Component
public class GetTask {
    @Autowired
    HttpUtils httpUtils;
    @Autowired
    ProductService productService;

    /**
     * Entry point of the crawl, run by the scheduler with a fixed delay of 100*1000.
     * Requests listing pages 1 through 8 and hands each page's HTML to {@code parse}.
     */
    @Scheduled(fixedDelay = 100*1000)
    public void crawlerMain(){
        String mainUrl="https://zhongchou.modian.com/all/top_time/going/";
        /* Loop over the page numbers */
        for(int page=1;page<=8;++page){
            // The page number is appended directly to the base URL.
            String tempUrl=mainUrl+page;
            String html = httpUtils.doGetHtml(tempUrl);
            /* Parse the page and extract the item data */
            this.parse(html);
        }
    }

    /**
     * Parses one listing page and saves every item that the existence check does not find.
     *
     * @param html HTML of a listing page as returned by {@code httpUtils.doGetHtml}
     */
    private void parse(String html) {
        Document doc = Jsoup.parse(html);
        // Each matched <li> is treated as one item.
        Elements proElms = doc.select("div.pro_field > ul > li");
        for (Element proElm:proElms) {
            /**
             * Assorted selector lookups for the individual fields
             */
            String proId = proElm.attr("data-pro-id");
            String proAuthor=proElm.select("div.author > a > p").text();
            // (further selector lookups of the same kind were omitted by the author)
            // NOTE(review): infoDoc is not declared in the visible code; presumably it comes
            // from the omitted part — confirm.
            String imgUrl=infoDoc.getElementById("big_logo").attr("src");
            // Null properties of the probe entity are ignored by the query, so setting only proid is enough
            Product proExample=new Product();
            proExample.setProid(proId);
            // Query and check whether the data already exists
            List<Product> examples = productService.findAll(proExample);
            System.out.println("list有无数据:"+examples.size());
            if(examples.size()>0){
                System.out.println("===数据已存在===");
                continue;
            }
            /* Bind the values */
            proExample.setProauthor(proAuthor);
            // (further setter calls of the same kind were omitted by the author)
            // NOTE(review): proStatus is not declared in the visible code — see the omitted lookups.
            proExample.setProstatus(proStatus);
            /* Download the image; the value returned by doGetImage is stored as the image path */
            String proImgPath=httpUtils.doGetImage(imgUrl);
            proExample.setProimgpath(proImgPath);
            /* Save the item */
            productService.save(proExample);
        }
    }
}
最没用的Service
/**
 * {@link ProductService} implementation that delegates straight to {@link ProductDao}.
 */
@Service
@Transactional
public class ProductServiceImpl implements ProductService {

    @Autowired
    private ProductDao productDao;

    /**
     * Persists the given product.
     *
     * @param product entity to store
     */
    @Override
    public void save(Product product) {
        productDao.save(product);
    }

    /**
     * Looks up products by example, using the given product as the probe.
     *
     * @param product probe entity
     * @return the products returned by the repository for that example
     */
    @Override
    public List<Product> findAll(Product product) {
        return productDao.findAll(Example.of(product));
    }
}
【保存结果展示】
【遇到问题】
有些网页 GET 下来的数据本身就是 NULL,而代码没有对此做判断,导致保存失败。这里暂时没有补上空值判断。
【重要】在判断数据是否重复时,使用 JpaRepository 的 findAll(Example) 方式(以一个 POJO 作为查询模板自动生成查询条件),返回的 List.size() 始终为 0,导致重复数据仍然会被保存。
【重要】@Table(name=" 表名")报红