<!-- Web-scraping dependencies: start -->
<!-- NOTE(review): the sample code shown in this file only calls jsoup; httpclient and htmlunit are not referenced by it. Confirm they are needed elsewhere before keeping them. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.9</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Web-scraping dependencies: end -->
一、创建要爬取的字段对应的实体类
package com.tecnon.common.utils;
import lombok.Data;
/**
 * Plain data holder for one scraped book entry.
 *
 * <p>All fields are kept as strings. Accessors are not written by hand; the class
 * relies on Lombok's {@code @Data} annotation for them.
 */
@Data
public class POItoExcel {
/**
* Book title.
*/
private String bookName;
/**
* Price.
*/
private String price;
/**
* Author.
*/
private String author;
/**
* Publisher.
* NOTE(review): unlike the other fields, this name starts with an upper-case letter.
* Renaming it may affect reflection- or field-name-based consumers (e.g. serializers);
* confirm against callers before changing.
*/
private String Press;
/**
* Publication time.
*/
private String pressTime;
}
二、爬取代码实现
这是我要爬取的页面链接:https://www.bookuu.com/search.php?cid=101702
通过 main 方法运行,实现如下:
/**
 * Scrapes result pages 1 and 2 of the bookuu.com category listing and prints the
 * collected entries as JSON on standard output.
 *
 * <p>Each page is fetched with jsoup, sending a desktop-browser user agent and a
 * referer equal to the page's own URL. The try/catch sits inside the page loop, so an
 * exception on one page is reported on standard error and the loop moves on to the
 * next page.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final String userAgent =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36";
    List<POItoExcel> poItoExcelList = new ArrayList<>();
    for (int i = 1; i <= 2; i++) {
        String url = "https://www.bookuu.com/search.php?cid=101702&page=" + i;
        try {
            Document document = Jsoup.connect(url)
                    .header("user-agent", userAgent)
                    // The referer is the page URL itself, so reuse it instead of rebuilding it.
                    .header("referer", url)
                    .get();
            // Every element carrying the "wd-640" class is treated as one book entry.
            for (Element item : document.body().getElementsByClass("wd-640")) {
                poItoExcelList.add(toBook(item));
            }
        } catch (Exception e) {
            // Say which page failed before dumping the trace; the loop then continues.
            System.err.println("Failed to scrape page " + i + ": " + url);
            e.printStackTrace();
        }
        System.out.println("第" + i + "页结束");
    }
    System.out.println("----" + StringUtil.getJsonFromObject(poItoExcelList) + "----");
}

/**
 * Builds one {@code POItoExcel} from a single book entry element.
 *
 * <p>Each field is filled with the combined text of the descendants matched by the
 * corresponding class lookup; a lookup that matches nothing yields an empty string.
 *
 * @param item the element representing one book entry
 * @return a new object populated with the entry's title, price, author, publisher
 *         and publication time
 */
private static POItoExcel toBook(Element item) {
    POItoExcel book = new POItoExcel();
    book.setBookName(item.getElementsByClass("fs-16").text());
    book.setPrice(item.getElementsByClass("fs-21").text());
    // NOTE(review): the three lookups below pass several class names in one string and
    // differ only in which names are present. They presumably rely on jsoup comparing
    // the argument against the element's whole class attribute. Do not replace them
    // with CSS selectors such as ".wd-30p.fl.to-hd" without confirming, since those
    // would no longer tell author, publisher and publication time apart.
    book.setAuthor(item.getElementsByClass("wd-30p fl to-hd mr-10").text());
    book.setPress(item.getElementsByClass("wd-30p fl to-hd cl-9 mr-10").text());
    book.setPressTime(item.getElementsByClass("wd-30p fl to-hd cl-9").text());
    return book;
}
如有问题,请加 QQ:501397578