JAVA程序使用Jsoup 爬取网页内容

  本程序将展示使用Jsoup爬取51job招聘信息的示例,仅用于学习Jsoup,不会用于其他用途

 

  1. 新建一个springboot项目

 

    添加Jsoup的依赖,以及mysql和mybatis的依赖,其中mysql和mybatis相关的依赖用于将爬取到的数据存入mysql数据库中

    

 1    <dependency>
 2             <groupId>org.springframework.boot</groupId>
 3             <artifactId>spring-boot-starter-web</artifactId>
 4         </dependency>
 5 
 6 
 7         <dependency>
 8             <groupId>org.jsoup</groupId>
 9             <artifactId>jsoup</artifactId>
10             <version>1.10.2</version>
11         </dependency>
12 
13         <dependency>
14             <groupId>tk.mybatis</groupId>
15             <artifactId>mapper-spring-boot-starter</artifactId>
16             <version>2.0.4</version>
17         </dependency>
18 
19         <dependency>
20             <groupId>org.springframework.boot</groupId>
21             <artifactId>spring-boot-starter-jdbc</artifactId>
22             <version>2.2.1.RELEASE</version>
23         </dependency>
24 
25         <dependency>
26             <groupId>org.mybatis.spring.boot</groupId>
27             <artifactId>mybatis-spring-boot-starter</artifactId>
28             <version>2.0.1</version>
29         </dependency>
30 
31         <dependency>
32             <groupId>mysql</groupId>
33             <artifactId>mysql-connector-java</artifactId>
34             <version>5.1.48</version>
35         </dependency>
36 
37         <dependency>
38             <groupId>com.alibaba</groupId>
39             <artifactId>druid</artifactId>
40             <version>1.1.1</version>
41         </dependency>

 

  2. 配置文件application.yml

    主要配置数据库的链接字符串信息

     

 1 server:
 2   port: 7999
 3 spring:
 4   servlet:
 5     multipart:
 6       max-request-size: 100MB #最大请求文件的大小
 7       max-file-size: 20MB #设置单个文件最大长度
 8   http:
 9     encoding:
10       charset: utf-8
11       force: true
12       enabled: true
13   datasource:
14     platform: mysql
15     type: com.alibaba.druid.pool.DruidDataSource
16     initialSize: 5
17     minIdle: 3
18     maxActive: 500
19     maxWait: 60000
20     timeBetweenEvictionRunsMillis: 60000
21     minEvictableIdleTimeMillis: 30000
22     validationQuery: select 1
23     testOnBorrow: true
24     poolPreparedStatements: true
25     maxPoolPreparedStatementPerConnectionSize: 20
26     driverClassName: com.mysql.jdbc.Driver
27     url: jdbc:mysql://localhost:3306/job?serverTimezone=UTC&useSSL=false&useUnicode=true&characterEncoding=utf-8&useAffectedRows=true&rewriteBatchedStatements=true
28     username: root
29     password: root

 

  3. springboot启动类

     额外添加了对mybatis mapper的扫描

     

 1 package com.devin.jobsearch;
 2 
 3 import org.springframework.boot.SpringApplication;
 4 import org.springframework.boot.autoconfigure.SpringBootApplication;
 5 import tk.mybatis.spring.annotation.MapperScan;
 6 
 7 
 8 @MapperScan("com.devin.jobsearch.mapper")
 9 @SpringBootApplication
10 public class JobSearchApplication {
11 
12     public static void main(String[] args) {
13         SpringApplication.run(JobSearchApplication.class, args);
14     }
15 
16 }

 

 4. 数据库表和对应的model

    

CREATE TABLE `job` (
  `job_id` varchar(128) NOT NULL,
  `job_name` varchar(512) DEFAULT NULL,
  `job_detail` text,
  `job_company_name` varchar(512) DEFAULT NULL,
  `job_company_image` varchar(512) DEFAULT NULL,
  `job_company_desc` text,
  `job_company_url` varchar(512) DEFAULT NULL,
  `job_url` varchar(512) DEFAULT NULL,
  `job_location` varchar(128) DEFAULT NULL,
  `job_location_detail` varchar(4000) DEFAULT NULL,
  `job_salary` varchar(512) DEFAULT NULL,
  `job_date` varchar(128) DEFAULT NULL,
  `job_restrict_str` varchar(512) DEFAULT NULL,
  PRIMARY KEY (`job_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

 

  

  1 package com.devin.jobsearch.model;
  2 
  3 import javax.persistence.Table;
  4 
  5 /**
  6  * @author Devin Zhang
  7  * @className JobModel
  8  * @description TODO
  9  * @date 2020/4/22 9:42
 10  */
 11 @Table(name = "job")
 12 public class JobModel {
 13     private String jobId;
 14     private String jobName;
 15     private String jobDetail;
 16     private String jobCompanyName;
 17     private String jobCompanyImage;
 18     private String jobCompanyDesc;
 19     private String jobCompanyUrl;
 20     private String jobUrl;
 21     private String jobLocation;
 22     private String jobLocationDetail;
 23     private String jobSalary;
 24     private String jobDate;
 25     private String jobRestrictStr;
 26 
 27     public String getJobId() {
 28         return jobId;
 29     }
 30 
 31     public void setJobId(String jobId) {
 32         this.jobId = jobId;
 33     }
 34 
 35     public String getJobName() {
 36         return jobName;
 37     }
 38 
 39     public void setJobName(String jobName) {
 40         this.jobName = jobName;
 41     }
 42 
 43     public String getJobDetail() {
 44         return jobDetail;
 45     }
 46 
 47     public void setJobDetail(String jobDetail) {
 48         this.jobDetail = jobDetail;
 49     }
 50 
 51     public String getJobCompanyName() {
 52         return jobCompanyName;
 53     }
 54 
 55     public void setJobCompanyName(String jobCompanyName) {
 56         this.jobCompanyName = jobCompanyName;
 57     }
 58 
 59     public String getJobCompanyDesc() {
 60         return jobCompanyDesc;
 61     }
 62 
 63     public void setJobCompanyDesc(String jobCompanyDesc) {
 64         this.jobCompanyDesc = jobCompanyDesc;
 65     }
 66 
 67     public String getJobCompanyUrl() {
 68         return jobCompanyUrl;
 69     }
 70 
 71     public void setJobCompanyUrl(String jobCompanyUrl) {
 72         this.jobCompanyUrl = jobCompanyUrl;
 73     }
 74 
 75     public String getJobUrl() {
 76         return jobUrl;
 77     }
 78 
 79     public void setJobUrl(String jobUrl) {
 80         this.jobUrl = jobUrl;
 81     }
 82 
 83     public String getJobLocation() {
 84         return jobLocation;
 85     }
 86 
 87     public void setJobLocation(String jobLocation) {
 88         this.jobLocation = jobLocation;
 89     }
 90 
 91     public String getJobLocationDetail() {
 92         return jobLocationDetail;
 93     }
 94 
 95     public void setJobLocationDetail(String jobLocationDetail) {
 96         this.jobLocationDetail = jobLocationDetail;
 97     }
 98 
 99     public String getJobSalary() {
100         return jobSalary;
101     }
102 
103     public void setJobSalary(String jobSalary) {
104         this.jobSalary = jobSalary;
105     }
106 
107     public String getJobDate() {
108         return jobDate;
109     }
110 
111     public void setJobDate(String jobDate) {
112         this.jobDate = jobDate;
113     }
114 
115     public String getJobCompanyImage() {
116         return jobCompanyImage;
117     }
118 
119     public void setJobCompanyImage(String jobCompanyImage) {
120         this.jobCompanyImage = jobCompanyImage;
121     }
122 
123     public String getJobRestrictStr() {
124         return jobRestrictStr;
125     }
126 
127     public void setJobRestrictStr(String jobRestrictStr) {
128         this.jobRestrictStr = jobRestrictStr;
129     }
130 
131     @Override
132     public String toString() {
133         return "JobModel{" +
134                 "jobId='" + jobId + '\'' +
135                 ", jobName='" + jobName + '\'' +
136                 ", jobDetail='" + jobDetail + '\'' +
137                 ", jobCompanyName='" + jobCompanyName + '\'' +
138                 ", jobCompanyImage='" + jobCompanyImage + '\'' +
139                 ", jobCompanyDesc='" + jobCompanyDesc + '\'' +
140                 ", jobCompanyUrl='" + jobCompanyUrl + '\'' +
141                 ", jobUrl='" + jobUrl + '\'' +
142                 ", jobLocation='" + jobLocation + '\'' +
143                 ", jobLocatonDetail='" + jobLocationDetail + '\'' +
144                 ", jobSalary='" + jobSalary + '\'' +
145                 ", jobDate='" + jobDate + '\'' +
146                 ", jobRestrictStr='" + jobRestrictStr + '\'' +
147                 '}';
148     }
149 }

 

  5. 数据库操作mapper类 和 mapper配置文件

 

     因为使用了tkmybatis,所以mapper类 和 mapper配置文件中不需要额外添加任何代码和配置

      

 JobMapper.java
 1 package com.devin.jobsearch.mapper;
 2 
 3 import com.devin.jobsearch.model.JobModel;
 4 import tk.mybatis.mapper.common.Mapper;
 5 import tk.mybatis.mapper.common.MySqlMapper;
 6 
 7 /**
 8  * @author Devin Zhang
 9  * @className JobMapper
10  * @description TODO
11  * @date 2020/4/22 16:24
12  */
13 
14 public interface JobMapper extends Mapper<JobModel>, MySqlMapper<JobModel> {
15 }

 

   JobMapper.xml

   

1 <?xml version="1.0" encoding="UTF-8" ?>
2 <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
3 <mapper namespace="com.devin.jobsearch.mapper.JobMapper" >
4 
5 </mapper>

 

   6. 工具类

    

    

SearchUtil.java  主要用于Jsoup加载url返回一个Document对象
 1 package com.devin.jobsearch.util;
 2 
 3 
 4 import org.jsoup.Jsoup;
 5 import org.jsoup.nodes.Document;
 6 import org.springframework.stereotype.Component;
 7 
 8 /**
 9  * @author Devin Zhang
10  * @className SearchUtil
11  * @description TODO
12  * @date 2020/4/22 10:07
13  */
14 
15 @Component
16 public class SearchUtil {
17 
18     public  Document getDocument(String url) throws Exception {
19         Document document = Jsoup
20                 .connect(url)
21                 .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
22                 .get();
23         return document;
24     }
25 }

 

FileUtil.java 主要用于记录错误信息
 1 package com.devin.jobsearch.util;
 2 
 3 import org.springframework.stereotype.Component;
 4 
 5 import java.io.BufferedWriter;
 6 import java.io.File;
 7 import java.io.FileOutputStream;
 8 import java.io.OutputStreamWriter;
 9 
10 /**
11  * @author Devin Zhang
12  * @className FileUtil
13  * @description TODO
14  * @date 2020/4/22 17:56
15  */
16 @Component
17 public class FileUtil {
18 
19     private String filePath = "D:\\data\\job\\fail.log";
20 
21     public void writeLog(String log) {
22         BufferedWriter bw = null;
23         try {
24             bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath), true)));
25             bw.write(log);
26             bw.flush();
27         } catch (Exception e) {
28             e.printStackTrace();
29         } finally {
30             try {
31                 bw.close();
32             } catch (Exception e) {
33                 e.printStackTrace();
34             }
35         }
36     }
37 }

 

    7. 爬虫的逻辑处理

       我们打开51job的搜索页,分别查看首页,第2页,第3页,可以看到URL中只有页码参数发生了变化,所以我们可以按页码循环爬取全部的职位信息

         

    首页       

    https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
 

   第二页   

   https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=


   第三页   

   https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

 

      代码实现:

      首先创建一个接口,接口中定义了两个方法,一个用于获取总的页数,一个用于循环去爬取数据

      爬虫的方法实现了该接口,后续如果我们要爬取其他网站,只需要实现该接口,编写逻辑即可

 

       

IJobHandle.java

 1 package com.devin.jobsearch.Service;
 2 
 /**
  * Contract of a job-site crawler. One implementation per site: it reports how
  * many result pages exist and crawls them.
  *
  * @author Devin Zhang
  * @date 2020/4/22 16:39
  */
 public interface IJobHandle {

     /**
      * Determines how many result pages the site offers.
      *
      * @return the total number of result pages
      * @throws Exception if the page count cannot be obtained
      */
     int getJobPage() throws Exception;

     /**
      * Runs the crawl.
      *
      * @throws Exception if the crawl cannot be carried out
      */
     void handle() throws Exception;
 }

 

      

Job51SearchHandle.java

      

   

  1 package com.devin.jobsearch.Service;
  2 
  3 import com.devin.jobsearch.mapper.JobMapper;
  4 import com.devin.jobsearch.model.JobModel;
  5 import com.devin.jobsearch.util.FileUtil;
  6 import com.devin.jobsearch.util.SearchUtil;
  7 import org.jsoup.nodes.Document;
  8 import org.jsoup.nodes.Element;
  9 import org.jsoup.select.Elements;
 10 import org.springframework.stereotype.Service;
 11 import org.springframework.util.CollectionUtils;
 12 import org.springframework.util.StringUtils;
 13 
 14 import javax.annotation.Resource;
 15 import java.util.ArrayList;
 16 import java.util.List;
 17 import java.util.regex.Matcher;
 18 import java.util.regex.Pattern;
 19 
 20 /**
 21  * @author Devin Zhang
 22  * @className Job51SearchHandle
 23  * @description TODO
 24  * @date 2020/4/21 16:41
 25  */
 26 
 27 @Service
 28 public class Job51SearchHandle implements IJobHandle {
 29 
 30     @Resource
 31     private SearchUtil searchUtil;
 32     @Resource
 33     private JobMapper jobMapper;
 34     @Resource
 35     private FileUtil fileUtil;
 36 
 37     private static final String PAGEPATTERN = "pagePattern";
 38     private static final String JOB51URL = "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,pagePattern.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
 39 
 40 
 41     /**
 42      * 获取51job总共有多少页
 43      *
 44      * @return
 45      * @throws Exception
 46      */
 47     @Override
 48     public int getJobPage() throws Exception {
 49         String url = JOB51URL;
 50         url = url.replace(PAGEPATTERN, "1");
 51         Document document = searchUtil.getDocument(url);
 52         return Integer.parseInt(document.getElementById("hidTotalPage").val());
 53     }
 54 
 55 
 56     /**
 57      * 分页爬取51job
 58      */
 59     @Override
 60     public void handle() throws Exception {
 61         //目标地址
 62         int pageTotal = this.getJobPage();
 63         List<JobModel> jobModelList = null;
 64         for (int page = 1; page <= pageTotal; page++) {
 65             try {
 66                 jobModelList = new ArrayList();
 67                 System.out.println("开始爬取第:" + page + "页的数据");
 68                 String url = JOB51URL;
 69                 url = url.replace(PAGEPATTERN, page + "");
 70                 Document document = searchUtil.getDocument(url);
 71                 //  右侧导航栏
 72                 Elements nav_com = document.getElementsByClass("el");
 73                 for (Element element : nav_com) {
 74                     if (element.children().first().tagName("p").hasClass("t1") &&
 75                             element.children().first().tagName("p").children().hasClass("check")) {
 76 
 77                         String jobName = element.children().first().tagName("p").children().tagName("span").text();
 78                         String jobUrl = element.children().first().tagName("p").child(2).child(0).attr("href");
 79                         String companyName = element.child(1).text();
 80                         String companyUrl = element.child(1).child(0).attr("href");
 81                         String jobLocation = element.child(2).text();
 82                         String jobSalary = element.child(3).text();
 83                         String jobDate = element.child(4).text();
 84 
 85                         JobModel jobModel = new JobModel();
 86                         jobModel.setJobName(jobName);
 87                         jobModel.setJobUrl(jobUrl);
 88                         jobModel.setJobCompanyName(companyName);
 89                         jobModel.setJobCompanyUrl(companyUrl);
 90                         jobModel.setJobLocation(jobLocation);
 91                         jobModel.setJobSalary(jobSalary);
 92                         jobModel.setJobDate(jobDate);
 93 
 94                         //爬取明细
 95                         Document detailDocument = searchUtil.getDocument(jobUrl);
 96 
 97                         String jobRestrict = detailDocument.getElementsByClass("msg ltype").text();
 98                         String jobDesc = detailDocument.getElementsByClass("bmsg job_msg inbox").text();
 99                         String jobLocationDetail = "";
100                         if (detailDocument.getElementsByClass("bmsg inbox").size() > 0) {
101                             jobLocationDetail = detailDocument.getElementsByClass("bmsg inbox").first().child(0).text();
102                         }
103                         String companyDesc = detailDocument.getElementsByClass("tmsg inbox").text();
104                         String companyImage = "";
105                         if (detailDocument.getElementsByClass("com_name himg").size() > 0) {
106                             companyImage = detailDocument.getElementsByClass("com_name himg").first().child(0).attr("src");
107                         }
108 
109 
110                         jobModel.setJobRestrictStr(jobRestrict);
111                         jobModel.setJobDetail(jobDesc);
112                         jobModel.setJobLocationDetail(jobLocationDetail);
113                         jobModel.setJobCompanyDesc(companyDesc);
114                         jobModel.setJobCompanyImage(companyImage);
115 
116                         String jobId = "";
117                         String patternStr = "/[0-9]*.html";
118                         Pattern pattern = Pattern.compile(patternStr);
119                         Matcher matcher = pattern.matcher(jobUrl);
120                         if (matcher.find()) {
121                             jobId = matcher.group();
122                             jobId = jobId.replaceAll(".html", "").replaceAll("/","");
123                         }
124                         if (StringUtils.isEmpty(jobId)) {
125                             patternStr = "jobid=[0-9]*";
126                             pattern = Pattern.compile(patternStr);
127                             matcher = pattern.matcher(jobUrl);
128                             if (matcher.find()) {
129                                 jobId = matcher.group();
130                                 jobId = jobId.replaceAll("jobid=", "");
131                             }
132                         }
133                         if (StringUtils.isEmpty(jobId)) {
134                             patternStr = "#[0-9]*";
135                             pattern = Pattern.compile(patternStr);
136                             matcher = pattern.matcher(jobUrl);
137                             if (matcher.find()) {
138                                 jobId = matcher.group();
139                                 jobId = jobId.replaceAll("#", "");
140                             }
141                         }
142                         jobModel.setJobId(jobId);
143                         System.out.println(jobModel);
144                         jobModelList.add(jobModel);
145                     }
146                 }
147                 if (!CollectionUtils.isEmpty(jobModelList)) {
148                     System.out.println("第" + page + "页数据,开始插入数据");
149                     jobMapper.insertList(jobModelList);
150                 }
151                 Thread.sleep(3000); //sleep 3 秒,防止访问太频繁,被禁掉
152             } catch (Exception e) {
153                 e.printStackTrace();
154                 if (null != jobModelList) {
155                     fileUtil.writeLog(jobModelList.toString());
156                 }
157             }
158         }
159     }
160     
161 }

 

     8. 调用

     我们新建一个controller,访问该接口时直接调用爬取逻辑

      

 1 package com.devin.jobsearch.controller;
 2 
 3 import com.devin.jobsearch.Service.Job51SearchHandle;
 4 import org.springframework.web.bind.annotation.GetMapping;
 5 import org.springframework.web.bind.annotation.RequestMapping;
 6 import org.springframework.web.bind.annotation.RestController;
 7 
 8 import javax.annotation.Resource;
 9 
10 /**
11  * @author Devin Zhang
12  * @className JobController
13  * @description TODO
14  * @date 2020/4/22 16:36
15  */
16 @RestController
17 @RequestMapping("/job")
18 public class JobController {
19 
20     @Resource
21     private Job51SearchHandle job51SearchHandle;
22 
23     @GetMapping("/51jobHandle")
24     public String handle51JobController() throws Exception {
25         job51SearchHandle.handle();
26         return "success";
27     }
28 }

 

   访问: localhost:7999/job/51jobHandle 即可触发爬取,可以看到爬取到的数据已经存到数据库了

    

 

 

   github地址:  https://github.com/devinzhang0209/jobsearch.git

   最后,再次说明,本文章的目的只是为了学习Jsoup,爬取到的数据也不会用于其他用途,毕竟爬虫爬的好,牢饭吃的饱。

 

 

 

 

 

posted @ 2020-04-23 13:25  devin_w_zhang  阅读(673)  评论(0编辑  收藏  举报