// java正则 读取html 获取标题/超链接/链接文本/内容
// 参考链接: http://yijianfengvip.blog.163.com/blog/static/17527343220114278593064/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regex-based helpers for downloading an HTML page and extracting its title,
 * hyperlinks, script blocks, style blocks and tag-free text.
 *
 * <p>The extraction is purely pattern based (no DOM is built), so it is a
 * best-effort tool for reasonably well-formed markup. All patterns are
 * compiled once and matched case-insensitively across line breaks.
 */
public class WebContent
{
    /** Encoding used when the server does not announce a usable charset. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    private static final int HTML_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE = Pattern.compile("<title\\b[^>]*>.*?</title>", HTML_FLAGS);

    private static final Pattern LINK = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>", HTML_FLAGS);

    private static final Pattern SCRIPT = Pattern.compile("<script\\b.*?</script>", HTML_FLAGS);

    private static final Pattern STYLE = Pattern.compile("<style\\b.*?</style>", HTML_FLAGS);

    /** Any tag; the negated class (unlike {@code .}) also crosses line breaks. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    private static final Pattern YAHOO_BODY =
            Pattern.compile("<div class=\"original\">(.*?)</p></div>", Pattern.DOTALL);

    private static final Pattern IMAGE_CAPTION = Pattern.compile("<p><em>.*?</em></p>");

    /**
     * Downloads the complete content of a web page.
     *
     * <p>The response is decoded with the charset announced in the
     * Content-Type header, falling back to UTF-8, and is read through a
     * buffer. The underlying stream is always closed.
     *
     * @param urlString absolute URL of the page
     * @return the page content as text
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getOneHtml(String urlString) throws Exception
    {
        final URLConnection connection = new URL(urlString).openConnection();
        final String encoding = resolveCharset(connection.getContentType());
        final InputStream in = connection.getInputStream();
        try
        {
            final BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
            final StringBuilder page = new StringBuilder();
            final char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1)
            {
                page.append(buffer, 0, read);
            }
            return page.toString();
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Downloads the complete content of a web page.
     *
     * <p>Earlier revisions used the third-party HTMLParser library here, which
     * this file never imported. The method now shares the buffered
     * standard-library implementation of {@link #getOneHtml(String)} and
     * returns the whole document.
     *
     * @param urlString absolute URL of the page
     * @return the page content as text
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getAllHtml(String urlString) throws Exception
    {
        return getOneHtml(urlString);
    }

    /**
     * Returns the text of the page title.
     *
     * @param s HTML source; {@code null} is treated as empty
     * @return the text inside every {@code <title>} element, concatenated and
     *         stripped of tags; empty if there is no title
     */
    public String getTitle(final String s)
    {
        final StringBuilder title = new StringBuilder();
        for (final String match : findAll(TITLE, s))
        {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Returns every hyperlink of the page.
     *
     * @param s HTML source; {@code null} is treated as empty
     * @return each complete {@code <a ... href=...>...</a>} element, in
     *         document order
     */
    public List<String> getLink(final String s)
    {
        return findAll(LINK, s);
    }

    /**
     * Returns every script block of the page.
     *
     * @param s HTML source; {@code null} is treated as empty
     * @return each complete {@code <script>...</script>} element, in document
     *         order
     */
    public List<String> getScript(final String s)
    {
        return findAll(SCRIPT, s);
    }

    /**
     * Returns every embedded style sheet of the page.
     *
     * @param s HTML source; {@code null} is treated as empty
     * @return each complete {@code <style>...</style>} element, in document
     *         order
     */
    public List<String> getCSS(final String s)
    {
        return findAll(STYLE, s);
    }

    /**
     * Removes all markup tags from a piece of HTML.
     *
     * @param s HTML source; {@code null} is treated as empty
     * @return the input with every {@code <...>} tag removed, including tags
     *         that span several lines
     */
    public String outTag(final String s)
    {
        if (s == null)
        {
            return "";
        }
        return TAG.matcher(s).replaceAll("");
    }

    /**
     * Downloads a Yahoo Knowledge article and extracts its title and body.
     *
     * <p>A download failure is reported on standard error and yields empty
     * values instead of an exception, so a batch of URLs keeps going.
     *
     * @param s URL of the article
     * @return a map with the keys {@code "title"} and {@code "original"}
     *         (the tag-free article text)
     */
    public HashMap<String, String> getFromYahoo(final String s)
    {
        final HashMap<String, String> hm = new HashMap<String, String>();
        String html = "";
        System.out.println("\n------------------开始读取网页(" + s + ")--------------------");
        try
        {
            html = getOneHtml(s);
        }
        catch (final Exception e)
        {
            // Best effort: report the failure and continue with an empty page.
            System.err.println("Failed to read " + s + ": " + e);
        }
        System.out.println("------------------读取网页(" + s + ")结束--------------------\n");
        System.out.println("------------------分析(" + s + ")结果如下--------------------\n");

        // Drop the site suffix that Yahoo appends to every page title.
        final String title = getTitle(html).replace("_雅虎知识堂", "");

        final StringBuilder body = new StringBuilder();
        final Matcher ma = YAHOO_BODY.matcher(html);
        while (ma.find())
        {
            body.append(ma.group());
        }
        String temp = body.toString().replace("<br>", "\n"); // turn line-break tags into newlines
        temp = IMAGE_CAPTION.matcher(temp).replaceAll(""); // drop image captions

        hm.put("title", title);
        hm.put("original", outTag(temp));
        return hm;
    }

    /**
     * Reads URLs from standard input, one per line, until a line containing
     * {@code go} (or end of input) is seen, then prints the title and body of
     * each Yahoo Knowledge article, for example
     * {@code http://ks.cn.yahoo.com/question/1307121201133.html}.
     *
     * @param args unused
     */
    public static void main(final String args[])
    {
        final List<String> list = new ArrayList<String>();
        System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行: \n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try
        {
            String url;
            // readLine() returns null at end of input; treat that like "go".
            while ((url = br.readLine()) != null && !"go".equals(url.trim()))
            {
                if (url.trim().length() > 0)
                {
                    list.add(url.trim());
                }
            }
        }
        catch (final IOException e)
        {
            System.err.println("Failed to read URLs from standard input: " + e);
        }
        final WebContent wc = new WebContent();
        for (final String url : list)
        {
            final HashMap<String, String> hm = wc.getFromYahoo(url);
            System.out.println("标题: " + hm.get("title"));
            System.out.println("内容: \n" + hm.get("original"));
        }
    }

    /** Collects every complete match of {@code pattern} in {@code s}, in order. */
    private static List<String> findAll(final Pattern pattern, final String s)
    {
        final List<String> matches = new ArrayList<String>();
        if (s == null)
        {
            return matches;
        }
        final Matcher ma = pattern.matcher(s);
        while (ma.find())
        {
            matches.add(ma.group());
        }
        return matches;
    }

    /** Picks the charset named in a Content-Type header, or UTF-8 if none is usable. */
    private static String resolveCharset(final String contentType)
    {
        if (contentType == null)
        {
            return DEFAULT_CHARSET;
        }
        final Matcher ma = CHARSET_PARAM.matcher(contentType);
        if (ma.find())
        {
            final String name = ma.group(1);
            try
            {
                if (Charset.isSupported(name))
                {
                    return name;
                }
            }
            catch (final IllegalArgumentException ignored)
            {
                // Syntactically illegal charset name in the header; use the default.
            }
        }
        return DEFAULT_CHARSET;
    }
}