动态页面爬虫前的准备:https://www.cnblogs.com/maohuidong/p/18517953
一:java添加maven依赖:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
<!--selenium依赖-->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
二:重写PageProcessor:
import java.util.List;
import java.util.Set;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Crawls http://yapi.xxxxx.so/project/317/interface/api, which requires a login
 * (login page: http://yapi.xxxx.so/login).
 * The target list is paginated but the URL does not change between pages, so
 * WebMagic alone cannot fetch page 2 and later. Selenium is used to click the
 * "next page" control and the rendered DOM is then parsed directly.
 */
public class YapiPageProcessor implements PageProcessor {
    private Site site = Site.me().setRetryTimes(3).setSleepTime(0).setTimeOut(3000);
    // Session cookies captured by the Selenium login; null until login() has run.
    private Set<Cookie> cookies;
    private RemoteWebDriver driver;

    public YapiPageProcessor() {
        System.setProperty("webdriver.chrome.driver",
                "E:\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
        ChromeOptions chromeOptions = new ChromeOptions();
        // Works around the 403 "origin not allowed" error on newer Chrome/driver combos.
        chromeOptions.addArguments("--remote-allow-origins=*");
        // Uncomment for headless mode if the browser window should stay hidden.
        // chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--window-size=1440,1080"); // browser window size
        // Assign the field directly instead of going through a shadowing local.
        this.driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Extracts key/value pairs from the first page (downloaded by MyDownloader),
     * then drives the browser through the remaining pages, since their URL never
     * changes and they can only be reached by clicking "next page".
     */
    @Override
    public void process(Page page) {
        // First page: parse the HTML WebMagic handed us.
        List<String> keys = page.getHtml().xpath("//tbody[@class='ant-table-tbody']/tr/td/a/span/text()").all();
        List<String> values = page.getHtml().xpath("//tbody[@class='ant-table-tbody']/tr/td/div/span[3]/text()").all();
        // Guard against a key/value count mismatch instead of risking IndexOutOfBoundsException.
        int pairs = Math.min(keys.size(), values.size());
        for (int i = 0; i < pairs; i++) {
            page.putField(keys.get(i), values.get(i));
        }
        // Remaining pages: keep clicking "next page" while it is enabled.
        while (!driver.findElements(By.xpath("//li[@title='下一页' and @aria-disabled='false']")).isEmpty()) {
            driver.findElement(By.xpath("//li[@title='下一页' and @aria-disabled='false']")).click();
            try {
                Thread.sleep(2000); // wait for the next page to render
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore interrupt status before rethrowing
                throw new RuntimeException(e);
            }
            List<WebElement> elements = driver.findElements(By.xpath("//tbody[@class='ant-table-tbody']/tr"));
            // Selenium XPath row indices are 1-based.
            for (int row = 1; row <= elements.size(); row++) {
                WebElement key = driver.findElement(By.xpath("//tbody[@class='ant-table-tbody']/tr[" + row + "]/td/a/span"));
                WebElement value = driver.findElement(By.xpath("//tbody[@class='ant-table-tbody']/tr[" + row + "]/td/div/span[3]"));
                page.putField(key.getText(), value.getText());
            }
        }
    }

    /**
     * Simulates a user login with Selenium and captures the session cookies
     * so they can later be injected into WebMagic via getSite().
     *
     * @param loginUrl login page URL
     * @param userName account name typed into the "email" field
     * @param password account password typed into the "password" field
     */
    public void login(String loginUrl, String userName, String password) {
        driver.get(loginUrl);
        driver.findElement(By.id("email")).clear();
        driver.findElement(By.id("email")).sendKeys(userName);
        driver.findElement(By.id("password")).clear();
        driver.findElement(By.id("password")).sendKeys(password);
        // Click the login button.
        driver.findElement(By.xpath("//button[@type='submit']")).click();
        // Capture the authenticated session cookies for reuse by the downloader.
        cookies = driver.manage().getCookies();
        // driver.close(); // keep the browser open: process() still needs it
    }

    @Override
    public Site getSite() {
        // Copy the Selenium cookies into WebMagic; skip safely if login() was never called.
        if (cookies != null) {
            for (Cookie cookie : cookies) {
                site.addCookie(cookie.getName(), cookie.getValue());
            }
        }
        return site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
    }

    public static void main(String[] args) {
        YapiPageProcessor pageProcessor = new YapiPageProcessor();
        // Log in with Selenium first so the cookies exist before crawling starts.
        pageProcessor.login("http://yapi.xxxx.so/login", "xxx", "xxx");
        Spider.create(pageProcessor)
                .addUrl("http://yapi.xxx.so/project/317/interface/api")
                .setDownloader(new MyDownloader(pageProcessor.driver)) // custom Selenium-backed downloader
                // Write results as JSON under D:\webmagic
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                // Single thread: one shared browser instance is not thread-safe.
                .thread(1)
                .run();
        System.out.println("爬取结束");
    }
}
三:重写Downloader:
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.selector.PlainText;
import org.openqa.selenium.Cookie;
import java.util.Map;
public class MyDownloader implements Downloader {
// The shared Selenium driver used for every download (same instance that performed the login).
private RemoteWebDriver driver;

/**
 * @param webDriver an already-configured browser session, typically the one
 *                  YapiPageProcessor used to log in
 */
public MyDownloader(RemoteWebDriver webDriver) {
    this.driver = webDriver;
}
/**
 * Downloads a page by driving the shared Selenium browser instead of an HTTP client.
 * Selenium starts on the default "data:" domain, and cookies are domain-scoped,
 * so the URL must be opened once before the login cookies can be attached, then
 * opened a second time with the authenticated cookies in place.
 *
 * @param request request carrying the target URL
 * @param task    task whose Site holds the cookies captured at login
 * @return the rendered page, or null if the wait was interrupted
 */
@Override
public Page download(Request request, Task task) {
    try {
        driver.get(request.getUrl()); // first visit: establishes the domain for cookie injection
        Thread.sleep(3000); // wait for the browser/page to load
        // Pull the cookies that getSite() populated after the Selenium login.
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies()
                    .entrySet()) {
                Cookie cookie = new Cookie(cookieEntry.getKey(),
                        cookieEntry.getValue());
                driver.manage().addCookie(cookie);
            }
            // Second visit: now authenticated via the injected cookies.
            driver.get(request.getUrl());
        }
        Thread.sleep(2000);
        // Scroll near the bottom so lazily-rendered rows are present in the DOM.
        driver.executeScript("window.scrollTo(0, document.body.scrollHeight - 1000)");
        Thread.sleep(2000); // wait for scrolling/rendering to finish
        // Wrap the rendered HTML in a Page for the PageProcessor.
        Page page = createPage(request.getUrl(), driver.getPageSource());
        // driver.close(); // close here if the browser is no longer needed
        return page;
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status instead of swallowing it
        e.printStackTrace();
    }
    return null;
}
@Override
public void setThread(int threadNum) {
// No-op: a single shared browser instance is used, so the spider's thread count is ignored.
}
/**
 * Wraps a raw HTML string into a WebMagic Page marked as successfully downloaded.
 *
 * @param url     the URL the content was fetched from
 * @param content the rendered page source
 * @return a Page ready to be handed to the PageProcessor
 */
private Page createPage(String url, String content) {
    Page result = new Page();
    result.setRequest(new Request(url));
    result.setUrl(new PlainText(url));
    result.setRawText(content);
    result.setDownloadSuccess(true);
    return result;
}