package Test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal regex-based page scraper: downloads a page and pulls out its title,
 * links, scripts, style blocks and {@code <h1>} text.
 *
 * <p>All extraction is done with regular expressions, not an HTML parser, so
 * it only copes with reasonably well-formed markup.
 */
public class Test {

    /** Tag names are case-insensitive in HTML and elements may span several lines. */
    private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final Pattern TITLE = Pattern.compile("<title>.*?</title>", TAG_FLAGS);

    /**
     * Anchor element with an href attribute. The value may be double-quoted,
     * single-quoted or unquoted. {@code \s} after {@code <a} keeps other tags
     * that merely start with "a" (e.g. {@code <area>}) from starting a match.
     */
    private static final Pattern LINK = Pattern.compile(
            "<a\\s[^>]*?href\\s*=\\s*(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            TAG_FLAGS);

    private static final Pattern SCRIPT = Pattern.compile("<script\\b.*?</script>", TAG_FLAGS);

    private static final Pattern STYLE = Pattern.compile("<style\\b.*?</style>", TAG_FLAGS);

    private static final Pattern H1 = Pattern.compile("<h1.*?</h1>", TAG_FLAGS);

    /** Any tag, from '<' up to the first '>', line breaks included. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

    /** {@code <p><em>...</em></p>} fragments that are removed from the h1 text. */
    private static final Pattern EM_PARAGRAPH = Pattern.compile("<p><em>.*?</em></p>");

    /** The {@code charset=...} parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the page at the given URL and returns its text.
     *
     * <p>The text is decoded with the charset named in the response's
     * Content-Type header, falling back to UTF-8 when the header names none
     * (or names one this JVM does not support). A charset declared only in a
     * {@code <meta>} tag is not detected.
     *
     * <p>Line terminators are discarded: the lines of the page are joined
     * without any separator, so the result is one long line.
     *
     * @param htmlurl address of the page to read
     * @return the page text with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (MalformedURLException me) {
            System.out.println("您输入的URL格式有问题!请检查后再次输入!");
            throw me;
        }

        URLConnection conn = url.openConnection();
        Charset charset = charsetOf(conn.getContentType());

        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream even when readLine() fails.
        try (BufferedReader in =
                new BufferedReader(new InputStreamReader(conn.getInputStream(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /**
     * Returns the text of the page title.
     *
     * <p>If the input contains several {@code <title>} elements their texts
     * are concatenated in document order.
     *
     * @param s HTML text to search
     * @return the title with all tags removed, or an empty string if there is none
     */
    public String getTitle(String s) {
        StringBuilder joined = new StringBuilder();
        for (String hit : findAll(TITLE, s)) {
            joined.append(hit);
        }
        return outTag(joined.toString());
    }

    /**
     * Returns every anchor element that carries an {@code href} attribute.
     *
     * @param s HTML text to search
     * @return the complete {@code <a ...>...</a>} fragments, in document order
     */
    public List<String> getLink(String s) {
        return findAll(LINK, s);
    }

    /**
     * Returns every script element.
     *
     * @param s HTML text to search
     * @return the complete {@code <script ...>...</script>} fragments, in document order
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT, s);
    }

    /**
     * Returns every style element.
     *
     * @param s HTML text to search
     * @return the complete {@code <style ...>...</style>} fragments, in document order
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE, s);
    }

    /**
     * Removes every tag from the given text.
     *
     * @param s text that may contain markup
     * @return {@code s} with everything from a '<' up to the next '>' removed
     */
    public String outTag(String s) {
        return ANY_TAG.matcher(s).replaceAll("");
    }

    /**
     * Downloads a page and extracts its title and {@code <h1>} text.
     *
     * <p>This is best-effort: if the page cannot be read the failure is
     * reported on standard error and both values come back empty.
     *
     * @param s address of the page to analyse
     * @return a map with the tag-free title under {@code "title"} and the
     *         tag-free text of all {@code <h1>} elements under {@code "no"}
     */
    public HashMap<String, String> getFrom360buy(String s) {
        HashMap<String, String> hm = new HashMap<>();
        String html = "";
        System.out.println("------------开始读取网页("+s+")---------");
        try {
            html = getOneHtml(s);
        } catch (IOException | RuntimeException e) {
            // Keep going with an empty page so one bad URL does not abort a batch.
            System.err.println("读取网页(" + s + ")失败: " + e);
        }
        System.out.println("--------------读取网页("+s+")结束----------");
        System.out.println("--------------分析("+s+")结果如下----------");

        // getTitle already strips the tags.
        String title = getTitle(html);

        StringBuilder headings = new StringBuilder();
        for (String h1 : findAll(H1, html)) {
            headings.append(h1);
        }
        String temp = EM_PARAGRAPH.matcher(headings.toString()).replaceAll("");

        hm.put("title", title);
        hm.put("no", outTag(temp));
        return hm;
    }

    /**
     * Reads URLs from standard input, one per line, until a line containing
     * {@code go} (or end of input), then analyses each URL and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> urls = new ArrayList<>();
        System.out.println("输入URL,一行一个,输入结束后输入go程序开始运行");
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String line;
            // readLine() returns null at end of input; treat that like "go".
            while ((line = br.readLine()) != null) {
                String entry = line.trim();
                if (entry.equals("go")) {
                    break;
                }
                if (!entry.isEmpty()) {
                    urls.add(entry);
                }
            }
        } catch (IOException e) {
            System.err.println("读取输入失败: " + e);
        }

        Test t = new Test();
        for (String url : urls) {
            HashMap<String, String> hm = t.getFrom360buy(url);
            System.out.println(url);
            System.out.println("标题: " + hm.get("title"));
            System.out.println("内容: " + hm.get("no"));
        }
    }

    /** Collects every match of {@code pattern} in {@code text}, in order of appearance. */
    private static List<String> findAll(Pattern pattern, String text) {
        List<String> hits = new ArrayList<>();
        Matcher m = pattern.matcher(text);
        while (m.find()) {
            hits.add(m.group());
        }
        return hits;
    }

    /** Picks the charset named in a Content-Type value, or UTF-8 if none is usable. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to the default.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}
由于分配到爬虫任务,就在网上找了下资料,仿照了下代码,发现是可以用的!
试了几个页面,有title的页面能够读出标题,没有的话自然也就读不出。想得到自己想要的内容,主要还是看正则表达式写得正确与否!
用京东的团购页面(http://tuan.360buy.com/beijing-0-0-1-0-0-index.html)做了实验,代码读取的是团购产品的内容,能够得到结果!
有一个问题:从有title的页面读取title时,控制台输出的是乱码。还不知道怎么解决,再继续尝试下!