// Reposted from: https://www.cnblogs.com/akiradunn/p/5855073.html
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts addresses from text with regular expressions.
 *
 * <p>Two sources are supported: a web page fetched over HTTP(S), which is scanned for
 * HTTP links, and a local file, which is scanned for email addresses. Every match is
 * written to an output file on its own line, terminated by {@code "\r\n"}.
 */
public class HtmlAddressCatch {

    /** Matches links of the form {@code http(s)://word.word.word}. */
    private static final Pattern HTTP_LINK_PATTERN =
            Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

    /** Matches email addresses of the form {@code word@word.word}. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+\\.\\w+");

    /** File that {@link #getWebTextContent(String)} writes the links it finds to. */
    private static final String WEB_OUTPUT_FILE = "D:\\text.txt";

    /**
     * Fetches a sample page and writes the HTTP links found in it to the output file.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
        HtmlAddressCatch.getWebTextContent(webaddress);
        // To scan a local file for email addresses instead:
        //   HtmlAddressCatch.getLocalTextContent("D:\\test\\test.html", "D:\\test\\http.txt");
    }

    /**
     * Downloads the page at {@code webaddress} and writes every HTTP link found in it to
     * the output file, one per line.
     *
     * <p>I/O failures are reported on standard error and are not propagated to the caller.
     *
     * @param webaddress the HTTP or HTTPS URL of the page to scan
     */
    public static void getWebTextContent(String webaddress) {
        HttpURLConnection con = null;
        try {
            URL url = new URL(webaddress);
            con = (HttpURLConnection) url.openConnection();
            // The response is opened before the output file is created, so a failed
            // request does not truncate a previously written result file.
            try (InputStream body = con.getInputStream();
                 BufferedReader reader = new BufferedReader(
                         new InputStreamReader(body, StandardCharsets.UTF_8));
                 OutputStream out = new FileOutputStream(WEB_OUTPUT_FILE)) {
                writeMatches(reader, HTTP_LINK_PATTERN, out);
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid URL: " + webaddress);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to extract links from " + webaddress);
            e.printStackTrace();
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Scans the local file {@code localaddress} and writes every email address found in
     * it to {@code targetaddress}, one per line.
     *
     * <p>I/O failures are reported on standard error and are not propagated to the caller.
     *
     * @param localaddress  path of the file to scan
     * @param targetaddress path of the file that receives the matches; it is created or
     *                      overwritten
     */
    public static void getLocalTextContent(String localaddress, String targetaddress) {
        try (InputStream in = new FileInputStream(localaddress);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.UTF_8));
             OutputStream out = new FileOutputStream(targetaddress)) {
            writeMatches(reader, EMAIL_PATTERN, out);
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + localaddress + " or " + targetaddress);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to extract addresses from " + localaddress);
            e.printStackTrace();
        }
    }

    /**
     * Writes each match of {@code pattern} in the text supplied by {@code reader} to
     * {@code out}, each followed by {@code "\r\n"}.
     *
     * <p>The text is scanned line by line. Neither pattern can match across a line
     * break, so no match is lost at a line boundary.
     */
    private static void writeMatches(BufferedReader reader, Pattern pattern, OutputStream out)
            throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = pattern.matcher(line);
            while (matcher.find()) {
                out.write((matcher.group() + "\r\n").getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}