Java爬虫框架--WebMagic

WebMagic框架教程 http://webmagic.io/docs/zh/

爬取世纪佳缘小姐姐信息

/**
 * Entity holding the information scraped for one member.
 *
 * <p>Created 2019/5/17 13:44.
 *
 * @author mxh
 */
public class Info {

    /** Primary key; not set by the five-argument constructor. */
    private Integer id;

    /** Nickname. */
    private String name;

    /** Photo URL. */
    private String image;

    /** Basic information. */
    private String info;

    /** Love declaration. */
    private String mottos;

    /** Recommendation reason. */
    private String reason;

    /** Creates an empty instance; all fields are {@code null}. */
    public Info() {
    }

    /**
     * Creates an instance with every field except {@code id} populated.
     *
     * @param name   nickname
     * @param image  photo URL
     * @param info   basic information
     * @param mottos love declaration
     * @param reason recommendation reason
     */
    public Info(String name, String image, String info, String mottos, String reason) {
        this.name = name;
        this.image = image;
        this.info = info;
        this.mottos = mottos;
        this.reason = reason;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public String getMottos() {
        return mottos;
    }

    public void setMottos(String mottos) {
        this.mottos = mottos;
    }

    public String getReason() {
        return reason;
    }

    public void setReason(String reason) {
        this.reason = reason;
    }

    /** Returns all fields in the form {@code Info{id=..., name='...', ...}}. */
    @Override
    public String toString() {
        return "Info{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", image='" + image + '\'' +
                ", info='" + info + '\'' +
                ", mottos='" + mottos + '\'' +
                ", reason='" + reason + '\'' +
                '}';
    }
}

dao层

import org.springframework.stereotype.Repository;

/**
 * DAO-layer mapper for {@link Info} records; its {@code addInfo} statement is
 * defined in the MyBatis {@code mapper.xml} under this interface's namespace.
 *
 * <p>Created 2019/5/17 13:46.
 *
 * @author mxh
 */
@Repository
public interface SJJYMapper {

    /**
     * Inserts one scraped record into the {@code info} table.
     *
     * @param info the record to insert; the mapped statement reads its name,
     *             image, info, mottos and reason properties
     * @return presumably the number of rows affected, as is usual for a
     *         MyBatis insert - TODO confirm
     */
    int addInfo(Info info);

}

爬虫框架持久层

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;

/**
 * WebMagic pipeline that persists the fields extracted by the page processor:
 * one {@link Info} row is written per position of the extracted lists.
 *
 * <p>Created 2019/5/17 13:59.
 *
 * @author mxh
 */
@Service
public class SJJYPipeline implements Pipeline {

    @Autowired
    private SJJYMapper sjjyMapper;

    /**
     * Reads the "names", "images", "infos", "mottoes" and "reasons" fields of
     * the page result and inserts one {@link Info} per list position.
     *
     * <p>The fields are read as the lists the processor stored. Stringifying
     * a list and splitting on "," would leave the surrounding brackets and
     * separator spaces inside the saved values, and would shift every
     * following row whenever a value itself contains a comma.
     *
     * @param resultItems fields extracted from one page
     * @param task        the running spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        System.out.println("get page: " + resultItems.getRequest().getUrl());
        List<String> names = resultItems.get("names");
        List<String> images = resultItems.get("images");
        List<String> infos = resultItems.get("infos");
        List<String> mottoes = resultItems.get("mottoes");
        List<String> reasons = resultItems.get("reasons");

        // Only positions present in every list can form a complete row.
        int rows = commonLength(names, images, infos, mottoes, reasons);
        if (names != null && rows != names.size()) {
            System.out.println("field lists differ in length, saving only the first " + rows + " rows");
        }
        for (int i = 0; i < rows; i++) {
            Info info = new Info(names.get(i), images.get(i), infos.get(i), mottoes.get(i), reasons.get(i));
            sjjyMapper.addInfo(info);
            System.out.println("add info: " + info);
        }
    }

    /** Returns the size of the shortest list, or 0 when any list is missing. */
    private static int commonLength(List<?>... columns) {
        if (columns.length == 0) {
            return 0;
        }
        int length = Integer.MAX_VALUE;
        for (List<?> column : columns) {
            if (column == null) {
                return 0;
            }
            length = Math.min(length, column.size());
        }
        return length;
    }
}

爬虫框架数据筛选逻辑层

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.DateUtils;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.DefaultCookieSpec;
import org.apache.http.message.BasicHeader;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;

/**
 * @auther mxh
 * @time 2019/5/16 17:01
 *
 * 爬虫框架数据筛选逻辑层
 */
@Service
public class SJJYProcessor implements PageProcessor {

    private Site site = Site.me().setCharset("utf8").setRetryTimes(1000).setSleepTime(1000);

    // 用来存储cookie信息
    private Set<Cookie> cookies;

    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        //照片
        List<String> images = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanPic\"]/a/img/@_src").all();
        //姓名
        List<String> names = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanName\"]/a/text()").all();
        //基本信息
        List<String> infos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanArea\"]/text()").all();
        //爱情宣言
        List<String> mottos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanText\"]/text()").all();
        //推荐理由
        List<String> reasons = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanLy\"]/text()").all();

        /*输出到控制台 并使dao层接收到数据*/
        page.putField("names",names);
        page.putField("images",images);
        page.putField("infos",infos);
        page.putField("mottoes",mottos);
        page.putField("reasons",reasons);
    }

    @Override
    public Site getSite() {
        //设置主机地址
        site.setDomain("www.jiayuan.com");
        //手动设置cookie
        //site.addCookie("PHPSESSID","f16de947c3a48a1084d22dd7e72cd283");
        /*site.addCookie("PHPSESSID","8b392aacbf80a4d6cf102938271273a7");
        site.addCookie("COMMON_HASH","0d8c3daa82c80277292723d74ff197d0");
        site.addCookie("PROFILE","207838031%3A%25E5%25BD%25BC%25E5%25BE%2597%25E5%25B8%2595%25E5%2585%258B%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3.0");
        site.addCookie("RAW_HASH","fYGR2xG5XJL10gfFF4mP3qO0yN65wBrTZpeOrelDWKHerbx69EjQ138l9BfHlTYP%2AGuyrs-5xYCSsUMipqBNkKqExN%2AWVe7sWAWAa5w8VXf-TMA.");
        site.addCookie("SESSION_HASH","c2dbd047d891295d1b3e4d5b4cb687e71eeb1afd");
        site.addCookie("accessID","20190516163650639629");
        site.addCookie("ip_loc","31");
        site.addCookie("save_jy_login_name","15735400536");
        site.addCookie("stadate1","206838031");
        site.addCookie("user_access","1");
        site.addCookie("main_search:207838031","%7C%7C%7C00");
        site.addCookie("last_login_time","1558057676");*/

        //自动追加
        for (org.apache.http.cookie.Cookie cookie : cookies) { 
        site.addCookie(cookie.getName().toString(), cookie.getValue().toString());
     }
return site;
}

// 自动登陆方法
public void login() {
//注册chrome
System.setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");
WebDriver driver = new ChromeDriver();
driver.get("http://login.jiayuan.com/?refrer=http://www.jiayuan.com&host=0");// 打开网址
// 防止页面未能及时加载出来而设置一段时间延迟
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// 设置用户名密码
driver.findElement(By.id("login_email")).sendKeys("15735400536"); // 用户名
driver.findElement(By.id("login_password")).sendKeys("mxh970923"); // 密码
// 模拟点击 //form[@id='form-group-login']/button
driver.findElement(By.xpath("//*[@id=\"login_btn\"]"))
.click(); // xpath语言:id为form-group-login的form下的button


// 防止页面未能及时加载出来而设置一段时间延迟
try {
Thread.sleep(15000);
} catch (InterruptedException e) {
e.printStackTrace();
}


// 获取cookie信息
cookies = driver.manage().getCookies();


driver.close();
}


controller

import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.*;
import us.codecraft.webmagic.Spider;
import org.apache.http.cookie.Cookie;
import java.util.List;
import org.apache.http.client.CookieStore;

/**
 * HTTP entry points that start the crawl and exercise the login helpers.
 *
 * <p>Created 2019/5/16 17:10.
 *
 * @author mxh
 */
@Controller
public class SJJYController {

    @Autowired
    private SJJYProcessor sjjyProcessor;

    @Autowired
    private SJJYPipeline sjjyPipeline;

    /**
     * Logs in through the automated browser, then crawls result pages 1 to 9
     * one after another. The request blocks until every page has been
     * processed.
     *
     * @return {@code "success"} once all pages have been crawled
     */
    @ResponseBody
    @RequestMapping(value = "/start",method = RequestMethod.GET)
    public String start(){
        // Simulated browser login; captures the session cookies used by the spiders.
        sjjyProcessor.login();
        for (int i=1;i<=9;i++){
            Spider.create(sjjyProcessor)
                    .addUrl("http://www.jiayuan.com/usercp/dynmatch/ajax/jymatch_list.php?p="+i)
                    .addPipeline(sjjyPipeline)
                    .thread(5)
                    .run();
        }
        return "success";
    }

    /**
     * Attempts a login by posting to the passport endpoint.
     *
     * @return {@code "login success"}, or {@code "login failed"} when the
     *         attempt threw
     */
    @ResponseBody
    @RequestMapping(value = "/login",method = RequestMethod.GET)
    public String login(){
        String url ="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/";
        try {
            // NOTE(review): getCookieBySendPost is not defined in the SJJYProcessor
            // shown in this post - confirm it exists in the real class.
            sjjyProcessor.getCookieBySendPost(url);
        } catch (Exception e) {
            e.printStackTrace();
            // Do not report success when the login attempt threw.
            return "login failed";
        }
        return "login success";
    }

    /**
     * Experiment: requests the passport endpoint and prints the cookies it
     * sets, including the PHPSESSID value when present.
     *
     * @return always {@code "Hello World"}
     */
    @ResponseBody
    @RequestMapping(value = "/test2",method = RequestMethod.GET)
    public String test2(){
        HttpGet httpget = new HttpGet("https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/");
        CookieStore cookieStore = new BasicCookieStore();
        // try-with-resources releases the client's connections on every path.
        try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
            // The body is not needed; cookies are recorded while the request
            // executes, so the response can be closed right away.
            httpClient.execute(httpget).close();
            List<Cookie> cookies = cookieStore.getCookies();
            System.out.println(cookies);
            for (Cookie cookie : cookies) {
                if ("PHPSESSID".equals(cookie.getName())) {
                    System.out.println(cookie.getValue());
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        return "Hello World";
    }


}
    

 application.properties

# Port the web server listens on
server.port=8001
# Package MyBatis scans for type aliases
mybatis.type-aliases-package=com.example.shijijiayuan.demo
# Classpath pattern locating the MyBatis mapper XML
mybatis.mapper-locations=classpath*:mapper.xml
# JDBC connection to the local MySQL instance (the database name is masked in this post)
spring.datasource.url=jdbc:mysql://localhost:3306/******
spring.datasource.username=root
spring.datasource.password=root
# JDBC driver class to load for the MySQL connection
spring.datasource.driver-class-name=com.mysql.jdbc.Driver

mapper.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- Statements for the SJJYMapper interface; the namespace must equal its fully qualified name. -->
<mapper namespace="com.example.shijijiayuan.demo.SJJYMapper" >

    <!-- Inserts one Info row. The id column is not listed, so it is left to the
         database (presumably auto-generated - confirm against the table definition). -->
    <insert id="addInfo" parameterType="com.example.shijijiayuan.demo.Info">
        INSERT INTO info(name,image,info,mottos,reason) VALUES(#{name}, #{image}, #{info}, #{mottos}, #{reason})
    </insert>

</mapper>

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.example</groupId>
    <artifactId>shijijiayuan</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>shijijiayuan</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!--WebMagic-->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!--myBatis-->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.30</version>
        </dependency>

        <!--Selenium (selenium-api was declared twice; one declaration is enough)-->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.14.0</version>
        </dependency>

        <!--HttpClient-->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>

记得下载相应的浏览器驱动,注意驱动的版本号要与本机浏览器的版本号一致哦

博主这里用的是谷歌浏览器驱动

世纪佳缘网站登录要做验证码验证,博主暂时不会写那么智能的代码,所以只能手动选择了

代码中有些jar包可能导入得不正确,注意哦,不要盲目copy

posted @ 2019-05-17 20:38  尘世间迷茫的小书童  阅读(2831)  评论(0)  编辑  收藏  举报