【反反爬】使用Jsoup爬取数据保存Excel

   本文主要使用Jsoup爬取XXX网站的房屋信息,比如楼盘、户型、价格、地址等,然后保存到Excel,便于对比和筛选,选出符合预期的好房。

  注意,有些网站有反爬取机制,请求失败时需要动态更换代理IP后重试。

 <!-- Logging: SLF4J facade plus the simple console binding -->
 <dependency>
     <groupId>org.slf4j</groupId>
     <artifactId>slf4j-api</artifactId>
     <version>1.7.25</version>
 </dependency>
 <dependency>
     <groupId>org.slf4j</groupId>
     <artifactId>slf4j-simple</artifactId>
     <version>1.7.25</version>
 </dependency>

 <!-- HTML parsing -->
 <dependency>
     <groupId>org.jsoup</groupId>
     <artifactId>jsoup</artifactId>
     <version>1.11.3</version>
 </dependency>

 <!-- NOTE(review): not referenced by the snippets in this article; kept as declared -->
 <dependency>
     <groupId>com.squareup.okhttp3</groupId>
     <artifactId>okhttp</artifactId>
     <version>3.3.0</version>
 </dependency>

 <!-- The Java snippets call Lists.newArrayList(); presumably this is
      com.google.common.collect.Lists from Guava - confirm against the real imports -->
 <dependency>
     <groupId>com.google.guava</groupId>
     <artifactId>guava</artifactId>
     <version>29.0-jre</version>
 </dependency>

 <!-- Excel output: poi provides the HSSF (.xls) classes, poi-ooxml the XSSF (.xlsx) ones -->
 <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi</artifactId>
     <version>4.1.2</version>
 </dependency>

 <dependency>
     <groupId>org.apache.poi</groupId>
     <artifactId>poi-ooxml</artifactId>
     <version>4.1.2</version>
 </dependency>
pom.xml

 

爬取数据的部分代码


 1 public static List<List<String>> getData(String urls) throws Exception {
 2 
 3         // 读取数据
 4         List<List<String>> data = Lists.newArrayList();
 5 
 6         // 代理ip和端口,需动态替换(可以本地新建一个ip列表,动态读取,获取失败替换代理ip即可)
 7         String ip = "xxx.xxx.xxx.xxx";
 8         int port = 80;
 9 
10         Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
11         URL url = new URL(urls);
12         URLConnection urlConnection = url.openConnection(proxy);
13         urlConnection.setUseCaches(false);
14         urlConnection.connect();
15         InputStream is = urlConnection.getInputStream();
16         BufferedReader buffer = new BufferedReader(new InputStreamReader(is));
17         StringBuffer bs = new StringBuffer();
18         String l = null;
19         while ((l = buffer.readLine()) != null) {
20             bs.append(l);
21         }
22         System.out.println(bs.toString());
23 
24         Document doc = Jsoup.parse(bs.toString());
25         Elements els = doc.body().getElementsByClass("list-item");
26         for (Element el : els) {
27             List<String> rowData = Lists.newArrayList();
28             Elements titleEls = el.getElementsByClass("house-title");
29             log.info("# 标题:{}", titleEls.get(0).getElementsByTag("a").text());
30             rowData.add(titleEls.get(0).getElementsByTag("a").text());
31 
32             Elements itemEls = el.getElementsByClass("details-item");
33             Elements itemSpanEls = itemEls.get(0).getElementsByTag("span");
34             log.info("# 户型:{}", itemSpanEls.get(0).text());
35             rowData.add(itemSpanEls.get(0).text());
36             log.info("# 面积:{}", itemSpanEls.get(1).text());
37             rowData.add(itemSpanEls.get(1).text());
38             log.info("# 楼层:{}", itemSpanEls.get(2).text());
39             rowData.add(itemSpanEls.get(2).text());
40             log.info("# 年限:{}", itemSpanEls.get(3).text());
41             rowData.add(itemSpanEls.get(3).text());
42 
43             String[] address = itemEls.get(1).getElementsByTag("span").text().split(" ");
44             if (address==null || address.length == 0) {
45                 continue;
46             }
47             log.info("# 楼盘:{}", address[0]);
48             log.info("# 地址:{}", address[1]);
49             rowData.add(address[0]);
50             rowData.add(address[1]);
51 
52             Elements priceEls = el.getElementsByClass("pro-price");
53             Elements priceSpanEls = priceEls.get(0).getElementsByTag("span");
54             log.info("# 总价:{}", priceSpanEls.get(0).getElementsByTag("strong").text());
55             rowData.add(priceSpanEls.get(0).getElementsByTag("strong").text());
56             log.info("# 单价:{}", priceSpanEls.get(1).text());
57             rowData.add(priceSpanEls.get(1).text());
58             data.add(rowData);
59         }
60 
61         return data;
62     }

 

数据写入excel的代码

 1 public static void writeExcel(List<String> titleList, List<List<String>> dataList) throws Exception {
 2         //open file.
 3         File excel = new File("D:\\Users\\Desktop\\data.xls");
 4         excel.deleteOnExit();
 5         excel.createNewFile();
 6         FileOutputStream fos = new FileOutputStream(excel);
 7 
 8         Workbook book = new HSSFWorkbook();
 9 
10         //create Sheet named "Sheet_1". 0 means this is 1st page.
11         Sheet sheet = book.createSheet("安居客房源信息");
12 
13         // 写入标题
14         Row titleRow = sheet.createRow(0);
15         for (int x = 0; x < titleList.size(); x++) {
16             Cell cell0 = titleRow.createCell(x);
17             cell0.setCellValue(titleList.get(x));
18         }
19 
20         // 写入数据
21         for (int i = 0; i < dataList.size(); i++) {
22             int row = i + 1;
23             Row dataRow = sheet.createRow(row);
24             List<String> rowData = dataList.get(i);
25             for (int j = 0; j < titleList.size(); j++) {
26                 Cell dataCell = dataRow.createCell(j);
27                 dataCell.setCellValue(rowData.get(j));
28             }
29         }
30 
31         book.write(fos);
32         book.close();
33 
34         log.info("# write data success");
35     }

 

运行方法(建议第一次先不获取所有数据,只获取第一页数据,用来看实际效果,不然ip被封了无法继续使用)

 1 public static void main(String[] args) {
 2         try {
 3             String url = "file:///D:/Users/Desktop/test.html";
 4 //            url = "https://hanchuanshi.anjuke.com/sale/p1-rd1/#filtersort";
 5 
 6             List<List<String>> data = Lists.newArrayList();
 7             for (int i = 1; i <= 50; i++) {
 8                 url = "https://hanchuanshi.anjuke.com/sale/p" + i + "-rd1/#filtersort";
 9                 data.addAll(getData(url));
10             }
11 
12             List<String> titleList = Arrays.asList("标题", "户型", "面积", "楼层", "年限", "楼盘", "地址", "总价", "单价");
13             writeExcel(titleList, data);
14         } catch (Exception e) {
15             e.printStackTrace();
16         }
17     }

 


posted @ 2020-08-05 15:56  稻草人_yhc  阅读(427)  评论(0编辑  收藏  举报