使用 java.net.* 爬取网页，使用 Jsoup 解析网页内容
java.net.* 建立网络连接
Jsoup解析网页内容
package com.sun.util; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class DataDownUtil { /** * @author UPO * @param url * @param encoding * @return String 网页的源代码 * <a href="http://www.baidu.com">百度</a> * <a href="https://movie.douban.com/subject/3168101/comments?start=0&limit=20&sort=new_score&status=P">爬取的网页</a> * */ public static String getHtmlResourceByUrl(String url,String encoding){ StringBuffer buffer=new StringBuffer(); URL urlobj=null; URLConnection uc=null; InputStreamReader isr=null; BufferedReader reader=null; try { //建立网络连接 urlobj=new URL(url); //打开网络 uc=urlobj.openConnection(); //建立文件输入流的对象 isr=new InputStreamReader(uc.getInputStream(), encoding); //建立文件缓冲写入流(相当于ctrl+v放入内存中) reader=new BufferedReader(isr); //建立临时变量 String temp=null; while((temp=reader.readLine())!=null){ buffer.append(temp); //buffer.append("\n"); } } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); System.out.println("网络连接不可用"); }catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); System.out.println("网络连接失败"); }finally { if(isr!=null){ try { isr.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } return buffer.toString(); } public static String getContext(){ StringBuffer context=new StringBuffer(); int start=0; while(start>=0&&start<=60){ //查看网页url地址栏 String url="https://movie.douban.com/subject/26266893/comments?start="+start+"&limit=20&sort=new_score&status=P"; String encoding="utf-8"; //观察可知每一页加载20个评价item start=start+20; //1.获取网页源代码 String 
html=getHtmlResourceByUrl(url, encoding); //System.out.println(html); //2.解析 Document document=Jsoup.parse(html); //3.最外层的id是:comments Element element=document.getElementById("comments"); //4.里面的每一个item的id是:comment-item Elements elements=element.getElementsByClass("comment-item"); for (Element ele : elements) { //https://movie.douban.com/subject/3168101/comments?start=20&limit=20&sort=new_score&status=P String name=ele.getElementsByTag("a").last().text(); String desc=ele.getElementsByClass("short").text(); String time=ele.getElementsByClass("comment-time").text(); String votes=ele.getElementsByClass("votes").text(); //System.out.println("\nname:"+name+"\ndesc:"+desc+"\ntime:"+time+"\nvotes:"+votes); context.append("\n"); context.append("name:"+name+"\ndesc:"+desc+"\ntime:"+time+"\nvotes:"+votes); context.append("\n"); } } System.out.println(context); return context.toString(); } /** * 将文件一行行写入到文件中 * @author 孙敬钦 * @version 1.0 * @param content 解析到的文件内容 * @param filePath 存储的文件名字 * @return void */ public static void writeFileByLine(String context,String filePath){ File file=new File(filePath); PrintWriter printWriter=null;; try { printWriter=new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8")); printWriter.print(context); printWriter.flush(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { //关闭printWriter if(printWriter!=null){ printWriter.close(); } } } public static void main(String[] args) { System.out.println("你好阿泡"); //1.得到解析的网页数据 String context=getContext(); System.out.println(context); //2.保存到txt文件 String filePath="D:/movie/bigdata.txt"; writeFileByLine(context, filePath); //3.保存到hdfs文件系统 } }