【java】抓取页面内容,提取链接(此方法适用于无需账号密码的 HTTP GET 请求)

 1 package 网络编程;
 2 
 3 import java.io.BufferedReader;
 4 import java.io.BufferedWriter;
 5 import java.io.FileOutputStream;
 6 import java.io.IOException;
 7 import java.io.InputStreamReader;
 8 import java.io.OutputStreamWriter;
 9 import java.net.URL;
10 
/**
 * Downloads the Baidu home page and stores it as {@code baidu.html} in the
 * current working directory.
 */
public class TestBaidu {
    /**
     * Fetches {@code http://www.baidu.com}, decodes the response as UTF-8 and
     * writes it line by line to {@code baidu.html}, also encoded as UTF-8.
     * Each line is terminated with the platform line separator.
     *
     * @param args unused
     * @throws IOException if the page cannot be read or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.baidu.com");
        // Reading the stream in raw 1024-byte chunks and converting every chunk
        // with new String(bytes, 0, len) produced garbled output, so the stream
        // is decoded through a reader with an explicit charset instead.
        //
        // try-with-resources closes both streams even when the copy fails
        // half-way; closing the writer also flushes it.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(url.openStream(), "utf-8"));
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream("baidu.html"), "utf-8"))) {
            String str;
            while ((str = br.readLine()) != null) {
                bw.append(str);
                bw.newLine();
            }
        }
    }
}
抓取页面内容
 1 package 网络编程;
 2 
 3 import java.io.BufferedReader;
 4 import java.io.IOException;
 5 import java.io.InputStreamReader;
 6 import java.net.URL;
 7 import java.nio.charset.Charset;
 8 import java.util.regex.Matcher;
 9 import java.util.regex.Pattern;
10 
/**
 * Downloads the 163.com home page and prints every double-quoted
 * {@code http://} URL found in it.
 */
public class Get163URL {
    /**
     * Matches a double-quoted string that starts with {@code http://};
     * group 1 is the URL without the surrounding quotes. The reluctant
     * quantifier stops at the first closing quote.
     */
    private static final Pattern QUOTED_HTTP_URL = Pattern.compile("\"(http:\\/\\/.+?)\"");

    /**
     * Fetches {@code http://www.163.com}, decodes it as GBK and prints each
     * matched URL on its own line to standard output.
     *
     * @param args unused
     * @throws IOException if the page cannot be read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.163.com");
        StringBuilder page = new StringBuilder();
        // try-with-resources releases the network stream even if reading fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(url.openStream(), Charset.forName("gbk")))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Lines are joined without a separator, so the whole page is
                // scanned as one long string.
                page.append(line);
            }
        }
        Matcher m = QUOTED_HTTP_URL.matcher(page);
        while (m.find()) {
            System.out.println(m.group(1));
        }
    }
}
提取链接

 


 /**
  * Logs in to a wiki with a form POST and prints the page the server answers with.
  *
  * <p>A default {@link CookieManager} is installed before the request is made.
  */
 public class WikiDownload {
     static final String name = "username";
     static final String pwd = "password";

     /**
      * Posts the login form to the wiki's login action and prints the response
      * body to standard output. Any failure is reported with a stack trace.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         CookieManager manager = new CookieManager();
         CookieHandler.setDefault(manager);
         String wikiUrl = "http://wiki.xxxxx.org/pages/viewpage.action?pageId=71709153";
         String loginUrl = "http://wiki.xxxxx.org/login.action?os_destination=%2Fpages%2Fviewpage.action%3FpageId%3D71709153";
         try {
             URL url = new URL(loginUrl);
             HttpURLConnection connection = (HttpURLConnection) url.openConnection();
             connection.setRequestProperty("accept", "*/*");
             connection.setRequestProperty("connection", "Keep-Alive");
             connection.setRequestProperty("user-agent",
                     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");
             connection.setDoInput(true);
             connection.setDoOutput(true);
             connection.setUseCaches(false);
             connection.setRequestMethod("POST");
             // The form body is pure ASCII after percent-encoding; the charset
             // is still given explicitly so it never depends on the platform.
             try (OutputStreamWriter writer =
                          new OutputStreamWriter(connection.getOutputStream(), "utf-8")) {
                 writer.write(buildLoginForm(name, pwd, destinationOf(wikiUrl)));
             }
             // NOTE(review): the response is assumed to be UTF-8 - confirm
             // against the Content-Type header the wiki actually sends.
             try (BufferedReader in = new BufferedReader(
                     new InputStreamReader(connection.getInputStream(), "utf-8"))) {
                 StringBuilder result = new StringBuilder();
                 String line;
                 while ((line = in.readLine()) != null) {
                     // Every line, including the first, is preceded by a newline.
                     result.append("\n");
                     result.append(line);
                 }
                 System.out.println(result);
             }
         } catch (Exception e) {
             e.printStackTrace();
         }
     }

     /**
      * Returns the path and query string of a page URL, e.g.
      * {@code /pages/viewpage.action?pageId=1} for
      * {@code http://host/pages/viewpage.action?pageId=1}.
      *
      * <p>Splitting the URL on its host prefix and taking element 0 yields the
      * empty string in front of the match, which is why the URL is parsed instead.
      *
      * @param pageUrl absolute URL of the page to open after login
      * @return the path plus, if present, {@code ?} and the query string
      * @throws IllegalArgumentException if {@code pageUrl} is not a valid URL
      */
     static String destinationOf(String pageUrl) {
         try {
             return new URL(pageUrl).getFile();
         } catch (java.net.MalformedURLException e) {
             throw new IllegalArgumentException("Not a valid page URL: " + pageUrl, e);
         }
     }

     /**
      * Builds the {@code application/x-www-form-urlencoded} body of the login request.
      *
      * @param user        login name, sent as {@code os_username}
      * @param password    password, sent as {@code os_password}
      * @param destination path (and query) to open after login, sent as {@code os_destination}
      * @return the form body with every value URL-encoded as UTF-8
      */
     static String buildLoginForm(String user, String password, String destination) {
         // Values are encoded so that characters such as '&', '=' or spaces in
         // the credentials cannot break the form apart.
         return "os_username=" + encode(user)
                 + "&os_password=" + encode(password)
                 + "&login=%E7%99%BB%E5%BD%95&os_destination="
                 + encode(destination);
     }

     /** URL-encodes {@code value} using UTF-8. */
     private static String encode(String value) {
         try {
             return URLEncoder.encode(value, "utf-8");
         } catch (java.io.UnsupportedEncodingException e) {
             // Every Java platform is required to support UTF-8.
             throw new IllegalStateException("UTF-8 is not supported", e);
         }
     }
 }
获取需要登录的网页

 

posted @ 2017-04-15 15:42  xiongjiawei  阅读(351)  评论(0编辑  收藏  举报