Java crawler example
package com.zjazn;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

public class Data {

    // 2. Parse the data
    public static void main(String[] args) {
        String html = getData();
        Document document = Jsoup.parse(html);
        List<MyData> myDataList = new ArrayList<MyData>();
        Elements courses = document.select(".learn-path-container>div");
        for (Element course : courses) {
            String courseName = course.select("a>div").first().text();
            String courseNum = course.select("a>div").last().text();
            // only keep entries whose second <div> looks like "N门课程" (N courses)
            if (courseNum.indexOf("门") > -1) {
                int num = Integer.parseInt(courseNum.substring(0, courseNum.indexOf("门")));
                String imgPath = course.select("a>img").attr("src");
                String suffix = imgPath.substring(imgPath.lastIndexOf("."));
                MyData myData = new MyData();
                myData.setName(courseName);
                myData.setImgPath(imgPath);
                myData.setNum(num);
                myDataList.add(myData);
                downloadFile(imgPath, "E://myimg", courseName + suffix);
            }
        }
        System.out.println(myDataList.toString());
    }
    // 1. Fetch the data (HTML)
    public static String getData() {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/");
        CloseableHttpResponse response = null;
        HttpEntity entity = null;
        String html = null;
        try {
            response = httpClient.execute(httpGet);           // send the request
            if (response.getStatusLine().getStatusCode() == 200) {
                entity = response.getEntity();                // get the response entity (HTML)
                html = EntityUtils.toString(entity, "UTF-8"); // decode the HTML with the given charset
            }
            return html;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    // Download a resource (e.g. an image) given its URL, a target directory and a file name
    // NOTE: some pages cannot be fetched because the site has anti-crawling measures; see the last section below
    public static void downloadFile(String urlStr, String directory, String fileName) {
        FileOutputStream out = null;
        InputStream in = null;
        try {
            URL url = new URL(urlStr);
            URLConnection urlConnection = url.openConnection();
            in = urlConnection.getInputStream();
            byte[] buf = new byte[1000];
            File dir = new File(directory);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            out = new FileOutputStream(directory + "\\" + fileName);
            int len = -1;
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
                if (out != null) {
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
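The CSS selector logic in main() is easier to follow with a concrete page in front of you. Below is a minimal, self-contained sketch that runs the same select(".learn-path-container>div"), text() and attr("src") calls against a tiny hand-written HTML fragment; the fragment is invented for illustration and is not the real lanqiao.cn markup.

// Standalone Jsoup demo; the HTML string is an assumption, made up for illustration only.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        String html = "<div class=\"learn-path-container\">"
                + "<div><a href=\"#\"><img src=\"/img/java.png\"/>"
                + "<div>Java 学习路径</div><div>12门课程</div></a></div>"
                + "</div>";
        Document doc = Jsoup.parse(html);
        for (Element course : doc.select(".learn-path-container>div")) {
            String name = course.select("a>div").first().text();  // "Java 学习路径"
            String count = course.select("a>div").last().text();  // "12门课程"
            String img = course.select("a>img").attr("src");      // "/img/java.png"
            System.out.println(name + " | " + count + " | " + img);
        }
    }
}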
package com.zjazn;

import lombok.Data;

// Data holder for one course; Lombok's @Data generates getters, setters, equals/hashCode and toString
@Data
public class MyData {
    private String name;
    private String imgPath;
    private Integer num;
}
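If Lombok is not on the classpath, @Data can be replaced by writing the boilerplate by hand. A rough equivalent of what the annotation generates for this class (getters, setters and toString(), omitting equals/hashCode) is sketched below.

package com.zjazn;

// Hand-written equivalent of the Lombok-generated code (getters, setters, toString only).
public class MyData {
    private String name;
    private String imgPath;
    private Integer num;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getImgPath() { return imgPath; }
    public void setImgPath(String imgPath) { this.imgPath = imgPath; }
    public Integer getNum() { return num; }
    public void setNum(Integer num) { this.num = num; }

    @Override
    public String toString() {
        return "MyData(name=" + name + ", imgPath=" + imgPath + ", num=" + num + ")";
    }
}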
## Bypassing the site's anti-crawling check
// Requires java.io.BufferedReader, java.io.InputStreamReader, java.net.HttpURLConnection, java.net.URL
public static String getData(String targetUrl) throws IOException {
    URL url = new URL(targetUrl); // target URL
    // Open the connection and send a browser-like User-Agent so the site does not reject the crawler
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");
    conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
    BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
    char[] chars = new char[1024];
    int len;
    StringBuilder html = new StringBuilder();
    while ((len = reader.read(chars)) != -1) {
        html.append(chars, 0, len);
    }
    reader.close();
    return html.toString();
}
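The same User-Agent trick can also be applied to the HttpClient-based getData() above by setting the header on the HttpGet before executing it, which is usually enough to get past a simple User-Agent check. A sketch (the method name getDataWithUserAgent is hypothetical; same URL and header as above, not verified against the target site):

// Sketch: the anti-crawling workaround applied to the Apache HttpClient request.
public static String getDataWithUserAgent() throws IOException {
    CloseableHttpClient httpClient = HttpClients.createDefault();
    HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/");
    // Pretend to be a normal browser so the site does not reject the request
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
        return null;
    }
}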