java.net.*爬取网页，Jsoup解析网页内容

java.net.* 建立网络连接
Jsoup解析网页内容
package com.sun.util;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads web pages with {@code java.net} and extracts Douban movie comments from
 * them with Jsoup, then writes the collected text to a file.
 */
public class DataDownUtil {

    /** Encoding used when the caller supplies none. */
    private static final String DEFAULT_ENCODING = "utf-8";

    /** Upper bound on establishing a connection, so a dead host cannot hang the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15000;

    /** Upper bound on a single blocking read from the server. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Number of comment items requested per page (the {@code limit} query parameter). */
    private static final int PAGE_SIZE = 20;

    /** Largest {@code start} offset that is requested (inclusive). */
    private static final int MAX_START = 60;

    /**
     * Downloads the resource at {@code url} and returns its text.
     *
     * <p>Lines are joined with {@code '\n'} so that tokens on adjacent source lines are
     * not fused together. Failures are reported on the console and whatever was read
     * before the failure (possibly the empty string) is returned; no exception is
     * propagated for malformed URLs or I/O errors.
     *
     * <p>Example pages:
     * <a href="http://www.baidu.com">Baidu</a>,
     * <a href="https://movie.douban.com/subject/3168101/comments?start=0&amp;limit=20&amp;sort=new_score&amp;status=P">a crawled page</a>.
     *
     * @author UPO
     * @param url      address of the page to download
     * @param encoding charset name used to decode the response; {@code null} or blank
     *                 falls back to utf-8
     * @return the page source, or the part of it read before an error occurred
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        String charsetName = (encoding == null || encoding.trim().isEmpty())
                ? DEFAULT_ENCODING
                : encoding;
        StringBuilder html = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            // The raw stream is declared as its own resource so it is closed even when
            // constructing the reader fails (e.g. unsupported charset name).
            try (InputStream in = connection.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the terminator; put one back to keep lines apart.
                    html.append(line).append('\n');
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
            System.out.println("网络连接不可用");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("网络连接失败");
        }
        return html.toString();
    }

    /**
     * Crawls the comment pages of Douban subject 26266893 (offsets 0 to
     * {@value #MAX_START} in steps of {@value #PAGE_SIZE}) and collects the name,
     * short text, time and vote count of every comment item.
     *
     * <p>A page that could not be downloaded, or that has no element with id
     * {@code comments}, is skipped with a message on {@code System.err}. The collected
     * text is also printed to {@code System.out}.
     *
     * @return the collected comments, one {@code name/desc/time/votes} group per item
     */
    public static String getContext() {
        StringBuilder context = new StringBuilder();
        for (int start = 0; start <= MAX_START; start += PAGE_SIZE) {
            // URL layout taken from the browser address bar; each page loads PAGE_SIZE items.
            String url = "https://movie.douban.com/subject/26266893/comments?start=" + start
                    + "&limit=" + PAGE_SIZE + "&sort=new_score&status=P";

            // 1. Fetch the page source.
            String html = getHtmlResourceByUrl(url, DEFAULT_ENCODING);
            // 2. Parse it.
            Document document = Jsoup.parse(html);
            // 3. The outermost container has id "comments".
            Element comments = document.getElementById("comments");
            if (comments == null) {
                // Happens when the download failed (empty html) or the layout differs.
                System.err.println("No element with id 'comments', skipping page: " + url);
                continue;
            }
            // 4. Every comment inside it carries the class "comment-item".
            Elements items = comments.getElementsByClass("comment-item");
            for (Element item : items) {
                // Elements.last() yields null when the item contains no <a> tag.
                Element lastLink = item.getElementsByTag("a").last();
                String name = lastLink == null ? "" : lastLink.text();
                String desc = item.getElementsByClass("short").text();
                String time = item.getElementsByClass("comment-time").text();
                String votes = item.getElementsByClass("votes").text();
                context.append("\n");
                context.append("name:" + name + "\ndesc:" + desc + "\ntime:" + time + "\nvotes:" + votes);
                context.append("\n");
            }
        }
        System.out.println(context);
        return context.toString();
    }

    /**
     * Writes {@code context} to the file at {@code filePath} as UTF-8, replacing any
     * existing content. Missing parent directories are created first. Failures are
     * reported on the console and not propagated.
     *
     * @author 孙敬钦
     * @version 1.0
     * @param context  the parsed text to store
     * @param filePath path of the file to write
     */
    public static void writeFileByLine(String context, String filePath) {
        File file = new File(filePath);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            // Opening the file below will fail and report the underlying cause.
            System.err.println("Could not create directory: " + parent);
        }
        try (PrintWriter printWriter = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8))) {
            printWriter.print(context);
            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (printWriter.checkError()) {
                System.err.println("Error while writing file: " + file);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls the comments and stores them in a local text file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("你好阿泡");
        // 1. Fetch and parse the page data (getContext() already prints the result).
        String context = getContext();

        // 2. Save to a txt file.
        String filePath = "D:/movie/bigdata.txt";
        writeFileByLine(context, filePath);
        // 3. Saving to HDFS is not implemented yet.
    }

}
posted @ 2020-07-31 14:37 懒惰的星期六 阅读(289) 评论(0) 编辑 收藏 举报
刷新页面返回顶部
懒惰的星期六

java.net.*爬取网页，Jsoup解析网页内容

公告