(转)HttpURLConnection模拟登录后添加cookie读取网页

  1. package fileTest;  
  2.   
  3. import java.io.*;  
  4. import java.net.HttpURLConnection;  
  5. import java.net.URL;  
  6.   
  /**
   * Logs in to a site with a form POST, reuses the returned cookies, and saves
   * the text of every thread linked from a Baidu Tieba "serial" index thread.
   *
   * <p>Flow: {@link #getCookie} obtains the session cookies, {@link #getUrl}
   * collects the links found inside the posts of each index page, and
   * {@link #doRead} downloads each linked thread and extracts its title and
   * post text. The HTML parsing lives in small package-private helpers that
   * work on plain strings so it can be exercised without network access.
   */
  public class ConnTest {

      /** Charset used to decode every downloaded page. */
      private static final String PAGE_CHARSET = "gbk";
      /** Marker that ends the opening tag of a post body. */
      private static final String POST_BODY_BEGIN = "class=\"d_post_content\">";
      /** Marker that terminates a post body. */
      private static final String POST_BODY_END = "</p>";
      /** Start of an anchor whose first attribute is {@code href}. */
      private static final String HREF_BEGIN = "<a href=\"";
      /** Line separator used in the generated text files. */
      private static final String LINE_BREAK = "\r\n";
      /** Old-style thread address; its {@code kz} parameter is the thread id. */
      private static final String LEGACY_THREAD_PREFIX = "http://tieba.baidu.com/f?";
      /** New-style thread address prefix. */
      private static final String THREAD_PREFIX = "http://tieba.baidu.com/p/";

      /**
       * Downloads index pages 1..3, follows every link found in them and writes
       * the collected text to {@code F:\遮天.txt}; links that produced no text
       * are listed in {@code F:\errorList.txt}.
       *
       * @param args unused
       * @throws Exception if login, an index page download, or writing the
       *         output files fails
       */
      public static void main(String args[]) throws Exception {
          String lianzaiUrl = "http://tieba.baidu.com/p/1243174814?pn=";
          String loginAction = "https://passport.baidu.com/?login?";
          // Fetch the login cookies.
          // NOTE(review): "test"/"test" look like placeholder credentials - confirm.
          String cookie = getCookie("test", "test", loginAction);
          if (!cookie.contains("USERID=")) {
              System.out.println("登录失败");
              System.exit(1);
          }
          StringBuilder result = new StringBuilder();
          StringBuilder errorList = new StringBuilder();
          for (int i = 1; i <= 3; i++) {
              String[] all = getUrl(lianzaiUrl + i).split(";");
              for (int x = 0; x < all.length; x++) { // address of each linked thread
                  // "".split(";") yields one empty element; there is nothing to fetch.
                  if (all[x].length() == 0) {
                      continue;
                  }
                  String content = null;
                  try {
                      content = doRead(cookie, all[x]);
                  } catch (IOException e) {
                      // One unreachable or malformed link must not discard everything
                      // collected so far; it is recorded in the error list below.
                      System.out.println("读取失败,url:" + all[x] + "," + e);
                  }
                  if (content != null && content.length() > 0) {
                      result.append(content);
                  } else {
                      errorList.append(all[x]).append(LINE_BREAK);
                  }
              }
          }
          writeFile("F:\\遮天.txt", result.toString());
          writeFile("F:\\errorList.txt", errorList.toString());
      }

      /**
       * Downloads one thread with the given cookies and returns its title
       * followed by the text of its posts.
       *
       * @param cookie value sent as the {@code Cookie} request header
       * @param url address of the thread
       * @return the extracted text, or {@code null} when the page has no
       *         {@code <h1>} title
       * @throws IOException if the URL is malformed or the download fails
       */
      public static String doRead(String cookie, String url) throws IOException {
          HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
          conn.setRequestProperty("Cookie", cookie);
          String html = readBody(conn);
          String title = extractTitle(html);
          if (title == null) {
              System.out.println("帖子不存在,url:" + url);
              return null;
          }
          System.out.println("正在读取帖子:" + title + "...");
          return extractPostText(html);
      }

      /**
       * Collects every hyperlink found inside the posts of an index page.
       *
       * @param lianzaiUrl address of the index page
       * @return the links, rewritten by {@link #toSeeLzUrl}, each followed by
       *         {@code ;}; empty when the page contains no links
       * @throws Exception if the URL is malformed or the download fails
       */
      public static String getUrl(String lianzaiUrl) throws Exception {
          HttpURLConnection conn = (HttpURLConnection) new URL(lianzaiUrl).openConnection();
          return extractLinks(readBody(conn));
      }

      /**
       * Logs in with a form POST and returns the cookies set by the response.
       *
       * @param username login name, sent as form field {@code username}
       * @param password password, sent as form field {@code password}
       * @param loginAction address the form is posted to
       * @return the {@code name=value} part of every {@code Set-Cookie} header,
       *         each followed by {@code ;}
       * @throws Exception if the URL is malformed or the request fails
       */
      public static String getCookie(String username, String password, String loginAction) throws Exception {
          // Form values must be URL-encoded, otherwise '&', '=' or non-ASCII
          // characters in the credentials corrupt the request body.
          String param = "username=" + java.net.URLEncoder.encode(username, "UTF-8")
                  + "&password=" + java.net.URLEncoder.encode(password, "UTF-8");
          HttpURLConnection conn = (HttpURLConnection) new URL(loginAction).openConnection();
          try {
              conn.setDoInput(true);
              conn.setDoOutput(true);
              conn.setRequestMethod("POST");
              OutputStream out = conn.getOutputStream();
              try {
                  out.write(param.getBytes("UTF-8"));
                  out.flush();
              } finally {
                  out.close();
              }
              // Collect the cookies from the response headers.
              StringBuilder cookies = new StringBuilder();
              String key;
              for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                  if (key.equalsIgnoreCase("set-cookie")) {
                      cookies.append(firstCookiePair(conn.getHeaderField(i))).append(';');
                  }
              }
              return cookies.toString();
          } finally {
              conn.disconnect();
          }
      }

      /**
       * Returns the text between the first {@code <h1>} and the next
       * {@code </h1>}, or {@code null} when either tag is missing.
       */
      static String extractTitle(String html) {
          String open = "<h1>";
          int begin = html.indexOf(open);
          if (begin < 0) {
              return null;
          }
          begin += open.length();
          // Search after the opening tag so an earlier stray "</h1>" cannot
          // produce an inverted range.
          int end = html.indexOf("</h1>", begin);
          if (end < 0) {
              return null;
          }
          return html.substring(begin, end);
      }

      /**
       * Builds the saved text of a thread page: the title, a line break, the
       * concatenated post bodies with {@code <br>} and {@code </br>} turned into
       * line breaks, and a trailing line break.
       *
       * @return the text, or {@code null} when the page has no title
       */
      static String extractPostText(String html) {
          String title = extractTitle(html);
          if (title == null) {
              return null;
          }
          StringBuilder content = new StringBuilder(title).append(LINE_BREAK);
          int from = 0;
          while (true) {
              int begin = html.indexOf(POST_BODY_BEGIN, from);
              if (begin < 0) {
                  break;
              }
              begin += POST_BODY_BEGIN.length();
              int end = html.indexOf(POST_BODY_END, begin);
              if (end < 0) {
                  // Truncated markup: keep what is there rather than fail.
                  content.append(html.substring(begin));
                  break;
              }
              content.append(html, begin, end);
              from = end + POST_BODY_END.length();
          }
          String text = content.toString()
                  .replace("<br>", LINE_BREAK)
                  .replace("</br>", LINE_BREAK);
          return text + LINE_BREAK;
      }

      /**
       * Returns every link found inside the post bodies of {@code html},
       * rewritten by {@link #toSeeLzUrl} and each followed by {@code ;}.
       * Links outside a post body are ignored.
       */
      static String extractLinks(String html) {
          StringBuilder urls = new StringBuilder();
          int from = 0;
          while (true) {
              int begin = html.indexOf(POST_BODY_BEGIN, from);
              if (begin < 0) {
                  break;
              }
              begin += POST_BODY_BEGIN.length();
              int end = html.indexOf(POST_BODY_END, begin);
              if (end < 0) {
                  // Truncated markup: scan the remainder, then stop.
                  appendLinks(html.substring(begin), urls);
                  break;
              }
              appendLinks(html.substring(begin, end), urls);
              from = end + POST_BODY_END.length();
          }
          return urls.toString();
      }

      /** Appends the rewritten target of every anchor in {@code post} to {@code urls}. */
      private static void appendLinks(String post, StringBuilder urls) {
          int from = 0;
          while (true) {
              int begin = post.indexOf(HREF_BEGIN, from);
              if (begin < 0) {
                  break;
              }
              begin += HREF_BEGIN.length();
              int end = post.indexOf('"', begin);
              if (end < 0) {
                  break; // unterminated attribute value
              }
              String href = post.substring(begin, end).trim();
              if (href.length() > 0) {
                  urls.append(toSeeLzUrl(href)).append(';');
              }
              from = end + 1;
          }
      }

      /**
       * Rewrites a thread link to "original poster only" mode.
       *
       * <p>Examples:
       * <pre>
       * http://tieba.baidu.com/f?kz=1127473409      -> http://tieba.baidu.com/p/1127473409?see_lz=1
       * http://tieba.baidu.com/p/1196506653         -> http://tieba.baidu.com/p/1196506653?see_lz=1
       * http://tieba.baidu.com/p/1196506653?pn=2    -> http://tieba.baidu.com/p/1196506653?pn=2&amp;see_lz=1
       * http://tieba.baidu.com/p/1196506653?see_lz=1   (unchanged)
       * </pre>
       */
      static String toSeeLzUrl(String href) {
          if (href.startsWith(LEGACY_THREAD_PREFIX)) {
              String[] params = href.substring(LEGACY_THREAD_PREFIX.length()).split("&");
              for (int i = 0; i < params.length; i++) {
                  if (params[i].startsWith("kz=")) {
                      String kz = params[i].substring("kz=".length()).trim();
                      if (kz.length() > 0) {
                          return THREAD_PREFIX + kz + "?see_lz=1";
                      }
                  }
              }
          }
          if (href.contains("see_lz=")) {
              return href;
          }
          // A URL has a single '?'; further parameters are joined with '&'.
          return href + (href.indexOf('?') >= 0 ? "&" : "?") + "see_lz=1";
      }

      /**
       * Returns the leading {@code name=value} part of a {@code Set-Cookie}
       * header value, i.e. everything before the first {@code ;}, or the whole
       * value when it carries no attributes.
       */
      static String firstCookiePair(String setCookieValue) {
          int semicolon = setCookieValue.indexOf(';');
          return semicolon < 0 ? setCookieValue : setCookieValue.substring(0, semicolon);
      }

      /**
       * Reads the whole response body, joining the lines without separators,
       * then closes the stream and disconnects, also when reading fails.
       */
      private static String readBody(HttpURLConnection conn) throws IOException {
          StringBuilder body = new StringBuilder();
          try {
              BufferedReader reader = new BufferedReader(
                      new InputStreamReader(conn.getInputStream(), PAGE_CHARSET));
              try {
                  String line;
                  while ((line = reader.readLine()) != null) {
                      body.append(line);
                  }
              } finally {
                  reader.close();
              }
          } finally {
              conn.disconnect();
          }
          return body.toString();
      }

      /** Writes {@code text} to {@code path}, closing the file even when writing fails. */
      private static void writeFile(String path, String text) throws IOException {
          BufferedWriter writer = new BufferedWriter(new FileWriter(new File(path)));
          try {
              writer.write(text);
              writer.flush();
          } finally {
              writer.close();
          }
      }
  }
posted @ 2016-05-09 13:44  流浪随风  阅读(2613)  评论(0)  编辑  收藏  举报