用HttpClient导出人人日志

这两天想把自己在人人的日志导出来，本想偷个懒，去网上搜个现成的工具，结果在JavaEye上找到了一篇，但下载下来发现已经不适用于现在的情况。查了半天原因，原来是人人的blog地址发生了改变。现在把新的类贴出来，大家可以看看！这个版本比原来JavaEye上那位老兄发的那篇有以下两个方面的改进：导出为rss格式；准确判断用户是否登录成功。好了，代码贴出来，大家看看吧。有不明白的可以Email给我：zzcwfp@gmail.com
噢，对了：之所以把打印信息写成英文，是因为用ant编译后不支持UTF-8编码，根源在于Windows下的cmd不支持UTF-8，大家可以自己去网上google一下！所需的jar包请参考JavaEye那位老兄列出的那些，我没有增加别的，呵呵！
代码
  1 package com.soffon.tools;
  2  import java.io.BufferedReader;
  3  import java.io.File;
  4 import java.io.IOException;
  5 import java.io.InputStreamReader;
  6 import java.io.StringReader;
  7 import java.util.ArrayList;
  8 import java.util.Iterator;
  9 import java.util.List;
 10 import java.util.regex.Matcher;
 11 import java.util.regex.Pattern;
 12 import org.apache.commons.io.FileUtils;
 13 import org.apache.commons.lang.StringUtils;
 14 import org.apache.http.Header;
 15 import org.apache.http.HttpResponse;
 16 import org.apache.http.NameValuePair;
 17 import org.apache.http.client.HttpClient;
 18 import org.apache.http.client.ResponseHandler;
 19 import org.apache.http.client.entity.UrlEncodedFormEntity;
 20 import org.apache.http.client.methods.HttpGet;
 21 import org.apache.http.client.methods.HttpPost;
 22 import org.apache.http.client.params.CookiePolicy;
 23 import org.apache.http.cookie.Cookie;
 24 import org.apache.http.impl.client.BasicResponseHandler;
 25 import org.apache.http.impl.client.DefaultHttpClient;
 26 import org.apache.http.message.BasicNameValuePair;
 27 import org.apache.http.protocol.HTTP;
 28 public class BlogExport {
 29     private static final String Encoding = "UTF-8";
 30    
 31     private String uid;//登录后通过查询uid看看是否登录成功
 32     private final Pattern uidReg = Pattern.compile("http:\\/\\/www\\.renren\\.com\\/profile\\.do\\?id\\=(\\d+)");
 33     // The configuration items
 34     private static String userName;
 35     private static String password;
 36     private static String redirectURL = "http://www.renren.com/Home.do";
 37     private static String blogURL = "http://blog.renren.com/blog/0?from=homeleft&__view=async-html";
 38 //    private static String blogURL = "http://www.renren.com/home#//blog/blog/0?from=homeleft";
 39     private static String renRenLoginURL = "http://www.renren.com/PLogin.do";
 40     private String index = "index.html";
 41     private String blogIndex = "blogIndex.html";
 42     private String blogContent = "blogContent.html";
 43     private String xmlFile = "D:/nginx/html/airtles.xml";
 44     // The HttpClient is used in one session
 45     private HttpResponse response;
 46     private DefaultHttpClient httpclient;
 47     private final Pattern pageReg = Pattern.compile("当前显示1-(\\d*)篇/共(\\d*)篇");
 48     private final Pattern articleUrlReg = Pattern.compile("http://blog.renren.com/blog/\\d{2,}/(\\d+)");
 49     private final Pattern articleUrlNxtReg = Pattern
 50             .compile("http://blog.renren.com/blog/\\d+/(\\d+)\\?from=fanyeOld");
 51     private final Pattern articleContent = Pattern.compile("<div\\sid=\"blogContent\"\\sclass=\"text-article\">");
 52     private final Pattern articleTitle = Pattern
 53             .compile("<input\\stype=\"hidden\"\\sname=\"itemName\"\\sid=\"itemName\"\\svalue=\"");
 54     private final Pattern articleTimestamp = Pattern
 55             .compile("<span\\sclass=\"timestamp\">([\\d|\\s|\\-\\:]+)<span\\sclass=\"group\">");
 56     private String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
 57     private String xmlFooter = "\t</channel>\n</rss>";
 58    
 59     public BlogExport(){
 60      httpclient = new DefaultHttpClient();
 61      xmlHeader += "  <rss version=\"1.0\"> \n";
 62      xmlHeader += "  <channel> \n";
 63   xmlHeader += "\t<title>soffon.com Latest Articles</title> \n"
 64      +"\t<link>www.soffon.com</link> \n"
 65      +"\t<description>soffon.com Latest Articles</description> \n"
 66      +"\t<language>zh-cn</language> \n"
 67      +"\t<copyright>Copyright (C) soffon.com. All rights reserved.</copyright> \n" 
 68      +"\t<generator>Balder,Home</generator> \n";
 69     }
 70    
 71     private boolean login() {
 72         HttpPost httpost = new HttpPost(renRenLoginURL);
 73         // All the parameters post to the web site
 74         List<NameValuePair> nvps = new ArrayList<NameValuePair>();
 75         nvps.add(new BasicNameValuePair("origURL", redirectURL));
 76         nvps.add(new BasicNameValuePair("domain", "renren.com"));
 77         nvps.add(new BasicNameValuePair("isplogin", "true"));
 78         nvps.add(new BasicNameValuePair("formName", ""));
 79         nvps.add(new BasicNameValuePair("method", ""));
 80         nvps.add(new BasicNameValuePair("submit", "登录"));
 81         nvps.add(new BasicNameValuePair("email", userName));
 82         nvps.add(new BasicNameValuePair("password", password));
 83         nvps.add(new BasicNameValuePair("User-Agent", "Firefox Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3"));
 84         try {
 85             httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
 86             response = httpclient.execute(httpost);
 87         } catch (Exception e) {
 88             e.printStackTrace();
 89             return false;
 90         } finally {
 91             httpost.abort();
 92         }
 93         return true;
 94     }
 95     private String getRedirectLocation() {
 96         Header locationHeader = response.getFirstHeader("Location");
 97         if (locationHeader == null) {
 98             return null;
 99         }
100         return locationHeader.getValue();
101     }
102    
103     //来自：http://www.docin.com/p-89049017.html
104     private void setCookie(DefaultHttpClient httpclient,List<Cookie> cookies){
105      if(cookies.isEmpty()){
106       System.out.println("cookies is empty");
107       return;
108      }else{
109       for(Cookie cookie:cookies){
110        httpclient.getCookieStore().addCookie(cookie);
111       }
112      }
113      
114     }
115     private String getHtml(String redirectLocation) {
116         HttpGet httpget = new HttpGet(redirectLocation);
117        
118         // Create a response handler
119         ResponseHandler<String> responseHandler = new BasicResponseHandler();
120         String responseBody = "";
121         try {
122             responseBody = httpclient.execute(httpget, responseHandler);         
123         } catch (Exception e) {
124             e.printStackTrace();
125             responseBody = "";
126         } finally {
127             httpget.abort();
128         }
129         return responseBody;
130     }
131     public void closeConnect() {
132         httpclient.getConnectionManager().shutdown();
133     }
134     public void export() {
135         if (!login()) {
136             System.out.println("Exception：Login failed!");
137             return;
138         }      
139         //读取首页信息
140         readHomeInfo();
141         ////读取日志首页信息
142         readBlogIndexInfo();
143         //读取日志总数
144         int articleNum = parseArticleNum();
145         System.out.println("total " + articleNum + " blogs!");
146         //导出所有日志
147         exportAllArticle(articleNum);
148         closeConnect();
149         System.out.println("finish!");
150     }
151     private void exportAllArticle(int articleNum) {
152         //获得最新一篇日志的地址
153         String newestArticleUrl = getNewestArticleUrl();
154         if (StringUtils.isNotBlank(newestArticleUrl)) {
155             String blogUrl = newestArticleUrl;
156             //            String[] articles = new String[articleNum];
157             List<String> articles = new ArrayList<String>();
158             articles.add(xmlHeader);
159             //                        int i = 1;
160             for (int i = 1; i <= articleNum; i++) {
161                 System.out.println("export the " + i + " blog(total " + articleNum + ")");
162                 File blogContentFile = new File(blogContent);
163                 String blogHtml = getHtml(blogUrl);
164                 try {
165                     FileUtils.writeStringToFile(blogContentFile, blogHtml, Encoding);
166                 } catch (IOException e) {
167                     e.printStackTrace();
168                 }
169                 articles.add(toRss(blogContentFile));
170                                 Matcher matcher = articleUrlNxtReg.matcher(blogHtml);
171                 if (matcher.find()) {
172                     blogUrl = matcher.group();
173                 }
174                 try {
175                     Thread.sleep(500);
176                 } catch (InterruptedException e) {
177                     e.printStackTrace();
178                 }
179             }
180             writeArticleXmlFile(articles);
181         }
182     }
183     private void writeArticleXmlFile(List<String> articles) {
184         try {
185          articles.add(xmlFooter);
186             FileUtils.writeLines(new File(xmlFile), Encoding, articles);
187         } catch (IOException e) {
188             e.printStackTrace();
189         }
190     }
191     private String getNewestArticleUrl() {
192         String url = "";
193         try {
194             File blogIndexFile = new File(blogIndex);
195             for (Iterator<String> lineIterator = FileUtils.lineIterator(blogIndexFile, Encoding); lineIterator
196                     .hasNext();) {
197                 String lineStr = lineIterator.next();
198                 Matcher matcher = articleUrlReg.matcher(lineStr);
199                 if (matcher.find()) {
200                     url = matcher.group();
201                     break;
202                 }
203             }
204         } catch (IOException e) {
205             e.printStackTrace();
206         }
207         return url;
208     }
209     private int parseArticleNum() {
210         int articleNum = 0;
211         try {
212             File blogIndexFile = new File(blogIndex);
213             for (Iterator<String> lineIterator = FileUtils.lineIterator(blogIndexFile, Encoding); lineIterator
214                     .hasNext();) {
215                 String lineStr = lineIterator.next();
216                 Matcher matcher = pageReg.matcher(lineStr);
217                 if (matcher.find()) {
218                     String num = matcher.group(2);
219                     articleNum = Integer.parseInt(num);
220                     break;
221                 }
222             }
223         } catch (IOException e) {
224             e.printStackTrace();
225         }
226         return articleNum;
227     }
228     private void readBlogIndexInfo() {
229         //读取日志首页
230         String blogIndexHtml = getHtml(blogURL);
231         File blogIndexFile = new File(blogIndex);
232         try {
233             FileUtils.writeStringToFile(blogIndexFile, blogIndexHtml, Encoding);
234         } catch (IOException e) {
235             e.printStackTrace();
236         }
237     }
238     public void readHomeInfo() {
239         //读取首页
240         String redirectLocation = getRedirectLocation();
241         if (redirectLocation != null) {
242             String homeIndexHtml = getHtml(redirectLocation);
243             Matcher matcher = uidReg.matcher(homeIndexHtml);
244             if (matcher.find()) {
245                 uid = matcher.group(1);
246                 System.out.println(uid);
247             }  
248             if(uid != null && !uid.trim().equals("")){//登录成功
249              System.out.println("Login succes!");
250              File homeIndexFile = new File(index);
251              try {
252                  FileUtils.writeStringToFile(homeIndexFile, homeIndexHtml, Encoding);
253              } catch (IOException e) {
254                  e.printStackTrace();
255              }
256             }
257         }else{
258          System.err.println("Login failed!");
259 //         System.exit(-1);
260         }      
261     }
262     public String toRss(File file) {
263         String title = null;
264         String timeStamp = null;
265         String content = null;
266         try {
267             for (Iterator<String> lineIterator = FileUtils.lineIterator(file, Encoding); lineIterator
268                     .hasNext();) {
269                 String lineStr = lineIterator.next();
270                 Matcher contentMatcher = articleContent.matcher(lineStr);
271                 if (contentMatcher.find()) {
272                     lineIterator.next();
273                     content = lineIterator.next();
274                     continue;
275                 }
276                 Matcher titleMatcher = articleTitle.matcher(lineStr);
277                 if (titleMatcher.find()) {
278                     title = StringUtils.substring(lineStr, 58, lineStr.length() - 4);
279                     continue;
280                 }
281                 Matcher timeStampMatcher = articleTimestamp.matcher(lineStr);
282                 if (timeStampMatcher.find()) {
283                     timeStamp = timeStampMatcher.group(1);
284                     continue;
285                 }
286             }
287         } catch (IOException e) {
288             e.printStackTrace();
289         }
290         return toRss(title, timeStamp, content);
291     }
292    private String toRss(String title, String timeStamp, String content) {
293         StringBuffer xmlStr = new StringBuffer(10000).append("\t<item>\n");
294         xmlStr.append("\t\t<title>").append(title).append("</title>\n");
295         xmlStr.append("\t\t<pubdate>").append(timeStamp).append("</pubdate>\n");
296         xmlStr.append("\t\t<description><!--[CDATA[").append(content).append("]]--></description>\n");
297         xmlStr.append("\t</item>\n");
298         return xmlStr.toString();
299     }
300     public static void main(String[] args) throws Exception {
301      userName = "你的用户名";
302      password = "你的密码";     
303         BlogExport export = new BlogExport();
304         export.export();
305     }
306 }
posted @ 2010-12-16 13:43 zzc1986 阅读(521) 评论(0) 编辑收藏举报
刷新页面返回顶部
用HttpClient导出人人日志

公告