正则表达式抓取文件内容中的http链接地址

转自：https://www.cnblogs.com/akiradunn/p/5855073.html
  1 import java.io.BufferedReader;
  2 
  3 import java.io.FileInputStream;
  4 
  5 import java.io.FileNotFoundException;
  6 
  7 import java.io.FileOutputStream;
  8 
  9 import java.io.IOException;
 10 
 11 import java.io.InputStreamReader;
 12 
 13 import java.net.HttpURLConnection;
 14 
 15 import java.net.MalformedURLException;
 16 
 17 import java.net.URL;
 18 
 19 import java.util.regex.Matcher;
 20 
 21 import java.util.regex.Pattern;
 22 
 23 //正则表达式抓取网页数据
 24 
 25 public class HtmlAddressCatch {
 26 
 27 
 28 public static void main(String[] args) {
 29 
 30   String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
 31   HtmlAddressCatch.getWebTextContent(webaddress);
 32   /*String localaddress = "D:\\test\\test.html";
 33   String targetaddress = "D:\\test\\http.txt";
 34   HtmlAddressCatch.getLocalTextContent(localaddress , targetaddress);*/
 35     
 36 }
 37 
 38 //给定http链接抓取地址
 39 
 40 public static void getWebTextContent(String webaddress){
 41 
 42 try {
 43 
 44 URL url = new URL(webaddress);
 45 
 46 HttpURLConnection con = (HttpURLConnection)url.openConnection();
 47 
 48 FileOutputStream file = new FileOutputStream("D:\text.txt");
 49 
 50 InputStreamReader read = new InputStreamReader(con.getInputStream());//使用InputStreamReader是为了将InputStream字节流转换成为字符流，一次读取更多的字节
 51 
 52 BufferedReader packetreader = new BufferedReader(read);//使用BufferedReader是为了在InputStreamReader的基础上一次读取更多的字节
 53 
 54 int i=0;
 55 
 56 String regex = "https?://\w+\.\w+\.\w+";
 57 
 58 Pattern p = Pattern.compile(regex);
 59 
 60 while((i=packetreader.read())!=-1)
 61 
 62 {
 63 
 64 String str = packetreader.readLine();
 65 
 66 Matcher m = p.matcher(str);
 67 
 68 while(m.find())
 69 
 70 {
 71 
 72 file.write((m.group()+"\r\n").getBytes());
 73 
 74 }
 75 
 76 }
 77 
 78 } catch (MalformedURLException e) {
 79 
 80 // TODO Auto-generated catch block
 81 
 82 e.printStackTrace();
 83 
 84 } catch (FileNotFoundException e) {
 85 
 86 // TODO Auto-generated catch block
 87 
 88 e.printStackTrace();
 89 
 90 } catch (IOException e) {
 91 
 92 // TODO Auto-generated catch block
 93 
 94 e.printStackTrace();
 95 
 96 }
 97 
 98 
 99 }
100 
101 
102 // 从本地test.html文件抓取http链接和邮箱地址
103 
104 public static void getLocalTextContent(String localaddress,String targetaddress){
105 
106 try {
107 
108 FileInputStream reader = new FileInputStream(localaddress);
109 
110 FileOutputStream writer = new FileOutputStream(targetaddress);
111 
112 byte[] buf = new byte[200];
113 
114 int point = 0;
115 
116 //String regex = "https?://\w+\.\w+\.\w+";http链接抓取
117 
118 String regex = "\w+@\w+\.\w+";//邮箱地址抓取
119 
120 Pattern p = Pattern.compile(regex);
121 
122 while((point=reader.read(buf))>0)
123 
124 {
125 
126 Matcher m = p.matcher(new String(buf));
127 
128 while(m.find())
129 
130 {
131 
132 writer.write((m.group()+"\r\n").getBytes());
133 
134 }
135 
136 }
137 
138 } catch (FileNotFoundException e) {
139 
140 // TODO Auto-generated catch block
141 
142 e.printStackTrace();
143 
144 } catch (IOException e) {
145 
146 // TODO Auto-generated catch block
147 
148 e.printStackTrace();
149 
150 }
151 
152 }
153 
154 }
posted on 2019-02-17 01:52 Sharpest 阅读(1160) 评论(0) 编辑收藏举报