Java 爬取网页邮箱
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and prints every e-mail address found in it.
 *
 * <p>The page is read line by line as UTF-8 text; each line is scanned with
 * {@link #EMAIL_PATTERN} and every match is written to standard output.
 *
 * @author lsh
 */
public class GetEmail {

    /** Address of the page that is downloaded and scanned. */
    private static final String PAGE_URL =
            "https://book.douban.com/subject/24753651/discussion/58975313/";

    /**
     * E-mail rule: one or more word characters, an {@code @}, a domain label of
     * 2 to 8 lower-case letters or digits, and a literal {@code .com} suffix.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@[0-9a-z]{2,8}\\.com");

    /**
     * Fetches {@link #PAGE_URL} and prints each e-mail address it contains,
     * one per line, prefixed with {@code "邮箱 : "}.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL(PAGE_URL);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            // `line` always holds one line of the page; null signals end of stream.
            String line;
            while ((line = br.readLine()) != null) {
                for (String email : extractEmails(line)) {
                    System.out.println("邮箱 : " + email);
                }
            }
        } finally {
            // Release the connection even when opening or reading the stream fails.
            conn.disconnect();
        }
    }

    /**
     * Returns every substring of {@code line} that matches {@link #EMAIL_PATTERN},
     * in order of appearance.
     *
     * @param line text to scan; {@code null} is treated as containing no addresses
     * @return the matches in left-to-right order; empty if there are none
     */
    static List<String> extractEmails(String line) {
        List<String> emails = new ArrayList<>();
        if (line == null) {
            return emails;
        }
        // Bind the pattern to this line to obtain the matching engine.
        Matcher matcher = EMAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            emails.add(matcher.group());
        }
        return emails;
    }
}
传播知识,分享快乐!
作者:IT_BULL
出处:http://www.cnblogs.com/itBulls/
本文版权归作者和博客园共有，欢迎转载，但未经作者同意必须保留此段声明，且在文章页面明显位置给出原文链接，否则保留追究法律责任的权利。
博客园-博客园。