Jsoup爬取京东和融e购商品列表工具类
1.新建Maven项目,添加Jsoup和Lombok的依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Jsoup-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The sample code uses the diamond operator and method references,
             so the compiler level must be at least Java 8. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Pin the source encoding so the build does not depend on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching and parsing: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>

        <!-- Lombok is an annotation processor needed only at compile time,
             so it is kept off the runtime classpath with "provided" scope. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>
2.新建实体类,代码如下:
1 package cn.lxcourse.jsoup.pojo; 2 3 import lombok.AllArgsConstructor; 4 import lombok.Data; 5 import lombok.NoArgsConstructor; 6 7 @Data 8 @NoArgsConstructor 9 @AllArgsConstructor 10 public class Content { 11 private String price; 12 private String title; 13 private String imgSrc; 14 }
3.编写工具类,代码如下:
1 package cn.lxcourse.jsoup.util; 2 3 import cn.lxcourse.jsoup.pojo.Content; 4 import org.jsoup.Jsoup; 5 import org.jsoup.nodes.Document; 6 import org.jsoup.nodes.Element; 7 import org.jsoup.select.Elements; 8 9 import java.net.URL; 10 import java.util.ArrayList; 11 import java.util.List; 12 13 /** 14 * 爬虫工具 15 */ 16 public class JsoupUtils { 17 18 /** 19 * 爬取京东商品列表 20 * @param keywords 21 * @return 22 * @throws Exception 23 */ 24 public static List<Content> getJDGoods(String keywords) throws Exception { 25 String url = "https://search.jd.com/Search?keyword=Java" + keywords; 26 Document document = Jsoup.parse(new URL(url), 300000); 27 //商品列表 28 Element j_goodsList = document.getElementById("J_goodsList"); 29 Elements glEtemElements = j_goodsList.getElementsByClass("gl-item"); 30 31 List<Content> list = new ArrayList<>(); 32 for (Element element : glEtemElements) { 33 34 String imgSrc = element.getElementsByTag("img").eq(0).attr("source-data-lazy-img"); 35 String price = element.getElementsByClass("p-price").eq(0).text(); 36 String title = element.getElementsByClass("p-name").eq(0).text(); 37 38 Content content = new Content(); 39 content.setImgSrc(imgSrc); 40 content.setPrice(price); 41 content.setTitle(title); 42 43 list.add(content); 44 } 45 46 return list; 47 } 48 49 /** 50 * 爬取工行融e购商品列表 51 * @param keywords 52 * @return 53 * @throws Exception 54 */ 55 public static List<Content> getRongYiGouGoods(String keywords) throws Exception { 56 //https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=java 57 String url = "https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=" + keywords; 58 59 Document document = Jsoup.parse(new URL(url), 30000); 60 Element ajaxQueryContent = document.getElementById("ajaxQueryContent"); 61 62 Elements liElements = ajaxQueryContent.getElementsByTag("li"); 63 64 List<Content> list = new ArrayList<>(); 65 66 for (Element el : liElements) { 67 String src = el.getElementsByTag("img").eq(0).attr("src"); 68 String price = 
el.getElementsByClass("p-price").eq(0).text(); 69 String title = el.getElementsByClass("p-name").eq(0).select("a").eq(0).attr("title"); 70 Content content = new Content(); 71 content.setTitle(title); 72 content.setPrice(price); 73 content.setImgSrc(src); 74 list.add(content); 75 } 76 77 return list; 78 } 79 80 public static void main(String[] args) throws Exception { 81 //getJDGoods("Java").forEach(System.out::println); 82 getRongYiGouGoods("java").forEach(System.out::println); 83 } 84 }