用HttpClient导出人人日志

这两天想把自己在人人网的日志导出来,本想偷个懒,去网上搜个现成的工具,结果在JavaEye上找到了一篇相关文章,但下载下来发现已经不适用于现在的情况。查了半天原因,原来是人人网的blog地址发生了变化。现在把新的类贴出来,大家可以看看!与JavaEye上那位老兄发的那篇相比,主要有以下两点改进:以rss格式输出;能准确判断用户是否登录成功。好了,代码贴在下面,大家看看吧。有不明白的地方可以Email给我:zzcwfp@gmail.com

哦:之所以将打印信息写成英文,是因为ant编译后不支持UTF-8编码,根源在于Windows下的cmd不支持UTF-8,大家可以自己去网上google一下!所需的jar包请参考JavaEye那位老兄列出的那些,我没有增加别的,呵呵!

 

 

代码
1 package com.soffon.tools;
2  import java.io.BufferedReader;
3  import java.io.File;
4 import java.io.IOException;
5 import java.io.InputStreamReader;
6 import java.io.StringReader;
7 import java.util.ArrayList;
8 import java.util.Iterator;
9 import java.util.List;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12 import org.apache.commons.io.FileUtils;
13 import org.apache.commons.lang.StringUtils;
14 import org.apache.http.Header;
15 import org.apache.http.HttpResponse;
16 import org.apache.http.NameValuePair;
17 import org.apache.http.client.HttpClient;
18 import org.apache.http.client.ResponseHandler;
19 import org.apache.http.client.entity.UrlEncodedFormEntity;
20 import org.apache.http.client.methods.HttpGet;
21 import org.apache.http.client.methods.HttpPost;
22 import org.apache.http.client.params.CookiePolicy;
23 import org.apache.http.cookie.Cookie;
24 import org.apache.http.impl.client.BasicResponseHandler;
25 import org.apache.http.impl.client.DefaultHttpClient;
26 import org.apache.http.message.BasicNameValuePair;
27 import org.apache.http.protocol.HTTP;
28 public class BlogExport {
29 private static final String Encoding = "UTF-8";
30
31 private String uid;//登录后通过查询uid看看是否登录成功
32 private final Pattern uidReg = Pattern.compile("http:\\/\\/www\\.renren\\.com\\/profile\\.do\\?id\\=(\\d+)");
33 // The configuration items
34 private static String userName;
35 private static String password;
36 private static String redirectURL = "http://www.renren.com/Home.do";
37 private static String blogURL = "http://blog.renren.com/blog/0?from=homeleft&__view=async-html";
38 // private static String blogURL = "http://www.renren.com/home#//blog/blog/0?from=homeleft";
39 private static String renRenLoginURL = "http://www.renren.com/PLogin.do";
40 private String index = "index.html";
41 private String blogIndex = "blogIndex.html";
42 private String blogContent = "blogContent.html";
43 private String xmlFile = "D:/nginx/html/airtles.xml";
44 // The HttpClient is used in one session
45 private HttpResponse response;
46 private DefaultHttpClient httpclient;
47 private final Pattern pageReg = Pattern.compile("当前显示1-(\\d*)篇/共(\\d*)篇");
48 private final Pattern articleUrlReg = Pattern.compile("http://blog.renren.com/blog/\\d{2,}/(\\d+)");
49 private final Pattern articleUrlNxtReg = Pattern
50 .compile("http://blog.renren.com/blog/\\d+/(\\d+)\\?from=fanyeOld");
51 private final Pattern articleContent = Pattern.compile("<div\\sid=\"blogContent\"\\sclass=\"text-article\">");
52 private final Pattern articleTitle = Pattern
53 .compile("<input\\stype=\"hidden\"\\sname=\"itemName\"\\sid=\"itemName\"\\svalue=\"");
54 private final Pattern articleTimestamp = Pattern
55 .compile("<span\\sclass=\"timestamp\">([\\d|\\s|\\-\\:]+)<span\\sclass=\"group\">");
56 private String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
57 private String xmlFooter = "\t</channel>\n</rss>";
58
59 public BlogExport(){
60 httpclient = new DefaultHttpClient();
61 xmlHeader += " <rss version=\"1.0\"> \n";
62 xmlHeader += " <channel> \n";
63 xmlHeader += "\t<title>soffon.com Latest Articles</title> \n"
64 +"\t<link>www.soffon.com</link> \n"
65 +"\t<description>soffon.com Latest Articles</description> \n"
66 +"\t<language>zh-cn</language> \n"
67 +"\t<copyright>Copyright (C) soffon.com. All rights reserved.</copyright> \n"
68 +"\t<generator>Balder,Home</generator> \n";
69 }
70
71 private boolean login() {
72 HttpPost httpost = new HttpPost(renRenLoginURL);
73 // All the parameters post to the web site
74 List<NameValuePair> nvps = new ArrayList<NameValuePair>();
75 nvps.add(new BasicNameValuePair("origURL", redirectURL));
76 nvps.add(new BasicNameValuePair("domain", "renren.com"));
77 nvps.add(new BasicNameValuePair("isplogin", "true"));
78 nvps.add(new BasicNameValuePair("formName", ""));
79 nvps.add(new BasicNameValuePair("method", ""));
80 nvps.add(new BasicNameValuePair("submit", "登录"));
81 nvps.add(new BasicNameValuePair("email", userName));
82 nvps.add(new BasicNameValuePair("password", password));
83 nvps.add(new BasicNameValuePair("User-Agent", "Firefox Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3"));
84 try {
85 httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
86 response = httpclient.execute(httpost);
87 } catch (Exception e) {
88 e.printStackTrace();
89 return false;
90 } finally {
91 httpost.abort();
92 }
93 return true;
94 }
95 private String getRedirectLocation() {
96 Header locationHeader = response.getFirstHeader("Location");
97 if (locationHeader == null) {
98 return null;
99 }
100 return locationHeader.getValue();
101 }
102
103 //来自:http://www.docin.com/p-89049017.html
104 private void setCookie(DefaultHttpClient httpclient,List<Cookie> cookies){
105 if(cookies.isEmpty()){
106 System.out.println("cookies is empty");
107 return;
108 }else{
109 for(Cookie cookie:cookies){
110 httpclient.getCookieStore().addCookie(cookie);
111 }
112 }
113
114 }
115 private String getHtml(String redirectLocation) {
116 HttpGet httpget = new HttpGet(redirectLocation);
117
118 // Create a response handler
119 ResponseHandler<String> responseHandler = new BasicResponseHandler();
120 String responseBody = "";
121 try {
122 responseBody = httpclient.execute(httpget, responseHandler);
123 } catch (Exception e) {
124 e.printStackTrace();
125 responseBody = "";
126 } finally {
127 httpget.abort();
128 }
129 return responseBody;
130 }
131 public void closeConnect() {
132 httpclient.getConnectionManager().shutdown();
133 }
134 public void export() {
135 if (!login()) {
136 System.out.println("Exception:Login failed!");
137 return;
138 }
139 //读取首页信息
140 readHomeInfo();
141 ////读取日志首页信息
142 readBlogIndexInfo();
143 //读取日志总数
144 int articleNum = parseArticleNum();
145 System.out.println("total " + articleNum + " blogs!");
146 //导出所有日志
147 exportAllArticle(articleNum);
148 closeConnect();
149 System.out.println("finish!");
150 }
151 private void exportAllArticle(int articleNum) {
152 //获得最新一篇日志的地址
153 String newestArticleUrl = getNewestArticleUrl();
154 if (StringUtils.isNotBlank(newestArticleUrl)) {
155 String blogUrl = newestArticleUrl;
156 // String[] articles = new String[articleNum];
157 List<String> articles = new ArrayList<String>();
158 articles.add(xmlHeader);
159 // int i = 1;
160 for (int i = 1; i <= articleNum; i++) {
161 System.out.println("export the " + i + " blog(total " + articleNum + ")");
162 File blogContentFile = new File(blogContent);
163 String blogHtml = getHtml(blogUrl);
164 try {
165 FileUtils.writeStringToFile(blogContentFile, blogHtml, Encoding);
166 } catch (IOException e) {
167 e.printStackTrace();
168 }
169 articles.add(toRss(blogContentFile));
170 Matcher matcher = articleUrlNxtReg.matcher(blogHtml);
171 if (matcher.find()) {
172 blogUrl = matcher.group();
173 }
174 try {
175 Thread.sleep(500);
176 } catch (InterruptedException e) {
177 e.printStackTrace();
178 }
179 }
180 writeArticleXmlFile(articles);
181 }
182 }
183 private void writeArticleXmlFile(List<String> articles) {
184 try {
185 articles.add(xmlFooter);
186 FileUtils.writeLines(new File(xmlFile), Encoding, articles);
187 } catch (IOException e) {
188 e.printStackTrace();
189 }
190 }
191 private String getNewestArticleUrl() {
192 String url = "";
193 try {
194 File blogIndexFile = new File(blogIndex);
195 for (Iterator<String> lineIterator = FileUtils.lineIterator(blogIndexFile, Encoding); lineIterator
196 .hasNext();) {
197 String lineStr = lineIterator.next();
198 Matcher matcher = articleUrlReg.matcher(lineStr);
199 if (matcher.find()) {
200 url = matcher.group();
201 break;
202 }
203 }
204 } catch (IOException e) {
205 e.printStackTrace();
206 }
207 return url;
208 }
209 private int parseArticleNum() {
210 int articleNum = 0;
211 try {
212 File blogIndexFile = new File(blogIndex);
213 for (Iterator<String> lineIterator = FileUtils.lineIterator(blogIndexFile, Encoding); lineIterator
214 .hasNext();) {
215 String lineStr = lineIterator.next();
216 Matcher matcher = pageReg.matcher(lineStr);
217 if (matcher.find()) {
218 String num = matcher.group(2);
219 articleNum = Integer.parseInt(num);
220 break;
221 }
222 }
223 } catch (IOException e) {
224 e.printStackTrace();
225 }
226 return articleNum;
227 }
228 private void readBlogIndexInfo() {
229 //读取日志首页
230 String blogIndexHtml = getHtml(blogURL);
231 File blogIndexFile = new File(blogIndex);
232 try {
233 FileUtils.writeStringToFile(blogIndexFile, blogIndexHtml, Encoding);
234 } catch (IOException e) {
235 e.printStackTrace();
236 }
237 }
238 public void readHomeInfo() {
239 //读取首页
240 String redirectLocation = getRedirectLocation();
241 if (redirectLocation != null) {
242 String homeIndexHtml = getHtml(redirectLocation);
243 Matcher matcher = uidReg.matcher(homeIndexHtml);
244 if (matcher.find()) {
245 uid = matcher.group(1);
246 System.out.println(uid);
247 }
248 if(uid != null && !uid.trim().equals("")){//登录成功
249 System.out.println("Login succes!");
250 File homeIndexFile = new File(index);
251 try {
252 FileUtils.writeStringToFile(homeIndexFile, homeIndexHtml, Encoding);
253 } catch (IOException e) {
254 e.printStackTrace();
255 }
256 }
257 }else{
258 System.err.println("Login failed!");
259 // System.exit(-1);
260 }
261 }
262 public String toRss(File file) {
263 String title = null;
264 String timeStamp = null;
265 String content = null;
266 try {
267 for (Iterator<String> lineIterator = FileUtils.lineIterator(file, Encoding); lineIterator
268 .hasNext();) {
269 String lineStr = lineIterator.next();
270 Matcher contentMatcher = articleContent.matcher(lineStr);
271 if (contentMatcher.find()) {
272 lineIterator.next();
273 content = lineIterator.next();
274 continue;
275 }
276 Matcher titleMatcher = articleTitle.matcher(lineStr);
277 if (titleMatcher.find()) {
278 title = StringUtils.substring(lineStr, 58, lineStr.length() - 4);
279 continue;
280 }
281 Matcher timeStampMatcher = articleTimestamp.matcher(lineStr);
282 if (timeStampMatcher.find()) {
283 timeStamp = timeStampMatcher.group(1);
284 continue;
285 }
286 }
287 } catch (IOException e) {
288 e.printStackTrace();
289 }
290 return toRss(title, timeStamp, content);
291 }
292 private String toRss(String title, String timeStamp, String content) {
293 StringBuffer xmlStr = new StringBuffer(10000).append("\t<item>\n");
294 xmlStr.append("\t\t<title>").append(title).append("</title>\n");
295 xmlStr.append("\t\t<pubdate>").append(timeStamp).append("</pubdate>\n");
296 xmlStr.append("\t\t<description><!--[CDATA[").append(content).append("]]--></description>\n");
297 xmlStr.append("\t</item>\n");
298 return xmlStr.toString();
299 }
300 public static void main(String[] args) throws Exception {
301 userName = "你的用户名";
302 password = "你的密码";
303 BlogExport export = new BlogExport();
304 export.export();
305 }
306 }

 

 

posted @ 2010-12-16 13:43  zzc1986  阅读(520)  评论(0)  编辑  收藏  举报