网页爬虫(蜘蛛)
所用的知识点有:IO 流、正则表达式(Pattern、Matcher)、URL
package com.regexTest;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small "web spider" demo that scans text for e-mail addresses.
 *
 * <p>It reads a local text file and then a web page line by line, and prints
 * every substring that matches {@link #MAIL_PATTERN} to standard output, one
 * match per line.
 */
public class Zhizhu {

    /**
     * E-mail pattern: one or more word characters, {@code @}, one or more word
     * characters, then one or more {@code .word} groups. Compiled once and
     * shared by both scans instead of being recompiled per method.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the local-file scan followed by the web-page scan.
     *
     * @param args unused
     * @throws IOException if either source cannot be opened or read; a failure
     *     in the local scan prevents the web scan from running
     */
    public static void main(String[] args) throws IOException {
        test_1(); // local file
        test_2(); // web page
    }

    /**
     * Downloads a fixed web page and prints every e-mail address found in it.
     *
     * @throws IOException if the connection cannot be opened or the response
     *     cannot be read
     */
    private static void test_2() throws IOException {
        URL url = new URL("http://www.cnblogs.com/skyseraph/p/6443596.html");
        URLConnection conn = url.openConnection();
        // Decode with an explicit charset instead of the platform default so the
        // result does not vary between machines. The pattern only matches
        // [a-zA-Z_0-9], '@' and '.', so any ASCII-compatible encoding yields the
        // same matches.
        // NOTE(review): assumes the page is served in an ASCII-compatible
        // encoding such as UTF-8 - confirm against the response headers.
        try (BufferedReader bufin = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            printMailAddresses(bufin);
        }
    }

    /**
     * Reads a fixed local text file and prints every e-mail address found in it.
     *
     * @throws IOException if the file cannot be opened or read
     */
    private static void test_1() throws IOException {
        // try-with-resources closes the file handle even when reading fails.
        try (BufferedReader br = new BufferedReader(
                new FileReader("D:\\BaiduYunDownload\\mail.txt"))) {
            printMailAddresses(br);
        }
    }

    /**
     * Prints each match of {@link #MAIL_PATTERN} found in the reader's content.
     *
     * <p>Matching is done per line, so an address split across a line break is
     * not detected. The reader is not closed here; the caller owns it.
     *
     * @param reader source of the text to scan
     * @throws IOException if reading a line fails
     */
    private static void printMailAddresses(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            // Bind the compiled pattern to this line to obtain a matcher.
            Matcher m = MAIL_PATTERN.matcher(line);
            // find() advances to the next match; group() returns the matched text.
            while (m.find()) {
                System.out.println(m.group());
            }
        }
    }
}