Java crawler example
package com.zjazn;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

public class Data {

    // 2. Parse the data
    public static void main(String[] args) {
        String html = getData();
        Document document = Jsoup.parse(html);
        List<MyData> myDataList = new ArrayList<MyData>();
        Elements courses = document.select(".learn-path-container>div");
        for (Element course : courses) {
            String courseName = course.select("a>div").first().text();
            String courseNum = course.select("a>div").last().text();
            // only keep entries whose second <div> looks like "N门课程" (N courses)
            if (courseNum.indexOf("门") > -1) {
                int num = Integer.parseInt(courseNum.substring(0, courseNum.indexOf("门")));
                String imgPath = course.select("a>img").attr("src");
                String suffix = imgPath.substring(imgPath.lastIndexOf("."));
                MyData myData = new MyData();
                myData.setName(courseName);
                myData.setImgPath(imgPath);
                myData.setNum(num);
                myDataList.add(myData);
                downloadFile(imgPath, "E://myimg", courseName + suffix);
            }
        }
        System.out.println(myDataList.toString());
    }
    // 1. Fetch the data (HTML)
    public static String getData() {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/");
        CloseableHttpResponse response = null;
        HttpEntity entity = null;
        String html = null;
        try {
            response = httpClient.execute(httpGet);           // send the request
            if (response.getStatusLine().getStatusCode() == 200) {
                entity = response.getEntity();                // get the response entity (HTML)
                html = EntityUtils.toString(entity, "UTF-8"); // decode the HTML with the given charset
            }
            return html;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    // Download a resource (e.g. an image) given its URL, a target directory and a file name
    // NOTE: some pages cannot be fetched because the site has anti-crawling measures; see the last section below
    public static void downloadFile(String urlStr, String directory, String fileName) {
        FileOutputStream out = null;
        InputStream in = null;
        try {
            URL url = new URL(urlStr);
            URLConnection urlConnection = url.openConnection();
            in = urlConnection.getInputStream();
            byte[] buf = new byte[1000];
            File dir = new File(directory);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            out = new FileOutputStream(directory + "\\" + fileName);
            int len = -1;
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
                if (out != null) {
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
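The CSS selector logic in main() is easier to follow with a concrete page in front of you. Below is a minimal, self-contained sketch that runs the same select(".learn-path-container>div"), text() and attr("src") calls against a tiny hand-written HTML fragment; the fragment is invented for illustration and is not the real lanqiao.cn markup.

// Standalone Jsoup demo; the HTML string is an assumption, made up for illustration only.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        String html = "<div class=\"learn-path-container\">"
                + "<div><a href=\"#\"><img src=\"/img/java.png\"/>"
                + "<div>Java 学习路径</div><div>12门课程</div></a></div>"
                + "</div>";
        Document doc = Jsoup.parse(html);
        for (Element course : doc.select(".learn-path-container>div")) {
            String name = course.select("a>div").first().text();  // "Java 学习路径"
            String count = course.select("a>div").last().text();  // "12门课程"
            String img = course.select("a>img").attr("src");      // "/img/java.png"
            System.out.println(name + " | " + count + " | " + img);
        }
    }
}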
package com.zjazn;

import lombok.Data;

// Data holder for one course; Lombok's @Data generates getters, setters, equals/hashCode and toString
@Data
public class MyData {
    private String name;
    private String imgPath;
    private Integer num;
}
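If Lombok is not on the classpath, @Data can be replaced by writing the boilerplate by hand. A rough equivalent of what the annotation generates for this class (getters, setters and toString(), omitting equals/hashCode) is sketched below.

package com.zjazn;

// Hand-written equivalent of the Lombok-generated code (getters, setters, toString only).
public class MyData {
    private String name;
    private String imgPath;
    private Integer num;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getImgPath() { return imgPath; }
    public void setImgPath(String imgPath) { this.imgPath = imgPath; }
    public Integer getNum() { return num; }
    public void setNum(Integer num) { this.num = num; }

    @Override
    public String toString() {
        return "MyData(name=" + name + ", imgPath=" + imgPath + ", num=" + num + ")";
    }
}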
## Bypassing the site's anti-crawling check
// Requires java.io.BufferedReader, java.io.InputStreamReader, java.net.HttpURLConnection, java.net.URL
public static String getData(String targetUrl) throws IOException {
    URL url = new URL(targetUrl); // target URL
    // Open the connection and send a browser-like User-Agent so the site does not reject the crawler
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");
    conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
    BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
    char[] chars = new char[1024];
    int len;
    StringBuilder html = new StringBuilder();
    while ((len = reader.read(chars)) != -1) {
        html.append(chars, 0, len);
    }
    reader.close();
    return html.toString();
}
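The same User-Agent trick can also be applied to the HttpClient-based getData() above by setting the header on the HttpGet before executing it, which is usually enough to get past a simple User-Agent check. A sketch (the method name getDataWithUserAgent is hypothetical; same URL and header as above, not verified against the target site):

// Sketch: the anti-crawling workaround applied to the Apache HttpClient request.
public static String getDataWithUserAgent() throws IOException {
    CloseableHttpClient httpClient = HttpClients.createDefault();
    HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/");
    // Pretend to be a normal browser so the site does not reject the request
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}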