Java模拟登录新浪微博抓取数据【转载】
// ===== File: SinaWeibo.java =====
package com.shiyimm.crawler.weibo;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import net.sf.json.JSONException;
import net.sf.json.JSONObject;

import org.apache.commons.codec.binary.Base64;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;

import com.shiyimm.crawler.util.MyUrlUtil;
import com.shiyimm.crawler.util.UrlUtil;

/**
 * Simulates a browser login to Sina Weibo through the SSO endpoints on
 * {@code login.sina.com.cn}.
 *
 * <p>The flow is: {@link #preLogin()} fetches the server time, nonce and RSA
 * public key and encrypts the password with an external JavaScript file;
 * {@link #login()} then posts the login form and follows the redirect found in
 * the response. When a step fails, {@link #getErrInfo()} describes why.
 */
public class SinaWeibo {

    /** System property that may override the location of the encryption script. */
    private static final String ENCODER_SCRIPT_PROPERTY = "weibo.encoder.js";

    /** Location of the encryption script used when the system property is not set. */
    private static final String DEFAULT_ENCODER_SCRIPT = "E:\\encoder.js";

    /** Extracts the redirect target from the {@code location.replace("...")} call in the login response. */
    private static final Pattern REDIRECT_PATTERN =
            Pattern.compile("location\\.replace\\(\"(.+?)\"\\);");

    /** Marker in the redirect URL that is followed by the failure reason. */
    private static final String REASON_KEY = "reason=";

    private HttpClient client;
    private String username;    // login account (plain text)
    private String password;    // login password (plain text)
    private String su;          // login account (URL-encoded, then Base64-encoded)
    private String sp;          // login password (cipher text produced by the RSA script)
    private long servertime;    // timestamp returned by the server at pre-login; used for encryption and login
    private String nonce;       // string returned by the server at pre-login; used for encryption and login
    private String rsakv;       // string returned by the server at pre-login; used for encryption and login
    private String pubkey;      // RSA public key returned by the server at pre-login

    private String errInfo;     // error message recorded when a step fails
    private String location;    // redirect link extracted from the login response

    /**
     * Creates a login helper for one account.
     *
     * @param username login account in plain text, must not be {@code null}
     * @param password login password in plain text, must not be {@code null}
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public SinaWeibo(String username, String password) {
        if (username == null || password == null) {
            throw new IllegalArgumentException("username and password must not be null");
        }
        client = new DefaultHttpClient();
        this.username = username;
        this.password = password;
    }

    /**
     * Fetches the pre-login parameters and encrypts the password.
     *
     * @return {@code true} when the parameters were read and the password was
     *         encrypted; {@code false} otherwise, in which case
     *         {@link #getErrInfo()} holds the reason
     */
    public boolean preLogin() {
        errInfo = null; // do not report a stale error from an earlier attempt
        try {
            String encodedName = URLEncoder.encode(username, "UTF-8");
            // Explicit charset on both conversions so the result does not
            // depend on the platform default.
            su = new String(Base64.encodeBase64(encodedName.getBytes("UTF-8")), "UTF-8");

            // Base64 output may contain '+', '/' and '=', which must be
            // escaped when placed in a query string.
            String url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&rsakt=mod&checkpin=1&"
                    + "client=ssologin.js(v1.4.5)&_=" + getTimestamp()
                    + "&su=" + URLEncoder.encode(su, "UTF-8");

            String content = HttpTools.getRequest(client, url);
            System.out.println("content------------" + content);

            JSONObject json = JSONObject.fromObject(content);
            System.out.println(json);
            servertime = json.getLong("servertime");
            nonce = json.getString("nonce");
            rsakv = json.getString("rsakv");
            pubkey = json.getString("pubkey");

            return encodePwd();
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException and ClientProtocolException.
            errInfo = "预登录请求失败: " + e;
        } catch (JSONException e) {
            // The response was not the expected JSON document.
            errInfo = "预登录响应解析失败: " + e.getMessage();
        }
        return false;
    }

    /**
     * Performs the full login: pre-login, form post, then a GET on the
     * redirect target.
     *
     * @return {@code true} when login succeeded; {@code false} otherwise, in
     *         which case {@link #getErrInfo()} holds the reason
     */
    public boolean login() {
        if (!preLogin()) {
            return false;
        }

        String url = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";
        List<NameValuePair> parms = buildLoginParams();
        try {
            String content = HttpTools.postRequest(client, url, parms);
            System.out.println("content----------" + content);

            Matcher m = REDIRECT_PATTERN.matcher(content);
            if (!m.find()) {
                errInfo = "登录响应中未找到跳转地址";
                return false;
            }

            location = m.group(1);
            int reasonAt = location.indexOf(REASON_KEY);
            if (reasonAt >= 0) {
                // The redirect carries a failure reason instead of a session.
                String reason = location.substring(reasonAt + REASON_KEY.length());
                errInfo = URLDecoder.decode(reason, "GBK");
                return false;
            }

            String result = HttpTools.getRequest(client, location);
            System.out.println("result--------------" + result);
            return true;
        } catch (IOException e) {
            // Also covers ClientProtocolException.
            errInfo = "登录请求失败: " + e;
            e.printStackTrace();
        }
        return false;
    }

    /** Builds the form fields posted to the SSO login endpoint. */
    private List<NameValuePair> buildLoginParams() {
        List<NameValuePair> parms = new ArrayList<NameValuePair>();
        parms.add(new BasicNameValuePair("entry", "weibo"));
        // Previously misspelled as "geteway".
        // NOTE(review): "gateway" is the field name used by ssologin.js as far
        // as known — confirm against the live login form.
        parms.add(new BasicNameValuePair("gateway", "1"));
        parms.add(new BasicNameValuePair("from", ""));
        parms.add(new BasicNameValuePair("savestate", "7"));
        parms.add(new BasicNameValuePair("useticket", "1"));
        parms.add(new BasicNameValuePair("pagerefer", "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%2F"));
        parms.add(new BasicNameValuePair("vsnf", "1"));
        parms.add(new BasicNameValuePair("su", su));
        parms.add(new BasicNameValuePair("service", "miniblog"));
        parms.add(new BasicNameValuePair("servertime", servertime + ""));
        parms.add(new BasicNameValuePair("nonce", nonce));
        parms.add(new BasicNameValuePair("pwencode", "rsa2"));
        parms.add(new BasicNameValuePair("rsakv", rsakv));
        parms.add(new BasicNameValuePair("sp", sp));
        parms.add(new BasicNameValuePair("encoding", "UTF-8"));
        parms.add(new BasicNameValuePair("prelt", "182"));
        parms.add(new BasicNameValuePair("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
        parms.add(new BasicNameValuePair("returntype", "META"));
        return parms;
    }

    /**
     * Encrypts the password by calling the {@code encodePwd} function of the
     * external JavaScript file. The script path is taken from the
     * {@code weibo.encoder.js} system property, defaulting to
     * {@code E:\encoder.js}.
     *
     * @return {@code true} when {@code sp} was set; {@code false} when
     *         encryption failed
     */
    private boolean encodePwd() {
        ScriptEngine se = new ScriptEngineManager().getEngineByName("javascript");
        if (se == null) {
            // getEngineByName returns null when no matching engine is installed.
            System.out.println("未找到可用的JavaScript脚本引擎");
            errInfo = "密码加密失败!";
            return false;
        }

        String scriptPath = System.getProperty(ENCODER_SCRIPT_PROPERTY, DEFAULT_ENCODER_SCRIPT);
        FileReader fr = null;
        try {
            fr = new FileReader(scriptPath);
            se.eval(fr);
            Invocable invocableEngine = (Invocable) se;
            Object callbackvalue =
                    invocableEngine.invokeFunction("encodePwd", pubkey, servertime, nonce, password);
            if (callbackvalue != null) {
                // String.valueOf instead of a cast: the engine is not
                // guaranteed to hand back a java.lang.String instance.
                sp = String.valueOf(callbackvalue);
                return true;
            }
        } catch (FileNotFoundException e) {
            System.out.println("加密脚本encoder.js未找到");
        } catch (ScriptException e) {
            System.out.println("加密脚本执行出错: " + e.getMessage());
        } catch (NoSuchMethodException e) {
            System.out.println("加密脚本中未找到函数encodePwd");
        } finally {
            if (fr != null) {
                try {
                    fr.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
        errInfo = "密码加密失败!";
        return false;
    }

    /**
     * Returns the error message recorded by the last failed step.
     *
     * @return the message, or {@code null} when no failure has been recorded
     */
    public String getErrInfo() {
        return errInfo;
    }

    /**
     * Returns the current time in milliseconds since the epoch.
     *
     * @return the current timestamp
     */
    private long getTimestamp() {
        return System.currentTimeMillis();
    }

    /**
     * Demo entry point: logs in with the placeholder credentials and prints
     * the outcome.
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        SinaWeibo weibo = new SinaWeibo("账号", "密码");
        if (weibo.login()) {
            System.out.println("登陆成功!");
        } else {
            System.out.println("登录失败!");
            System.out.println(weibo.getErrInfo());
        }
    }
}

// ===== File: HttpTools.java =====
package com.shiyimm.crawler.weibo;

import java.io.IOException;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.util.EntityUtils;

/**
 * Static helpers that send GET and POST requests with a browser-like
 * User-Agent and return the response body as text.
 */
public class HttpTools {

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11";

    /** Charset passed to {@code EntityUtils.toString} as the default for decoding the body. */
    private static final String DEFAULT_CHARSET = "GBK";

    /**
     * Sends a plain HTTP GET request.
     *
     * @param client the HTTP client used to execute the request
     * @param url the address to fetch
     * @return the response body, or an empty string when the response has no body
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException on a connection problem
     */
    public static String getRequest(HttpClient client, String url) throws ClientProtocolException, IOException {
        return execute(client, new HttpGet(url));
    }

    /**
     * Sends a plain HTTP POST request with a UTF-8 URL-encoded form body.
     *
     * @param client the HTTP client used to execute the request
     * @param url the address to post to
     * @param parms the form fields
     * @return the response body, or an empty string when the response has no body
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException on a connection problem
     */
    public static String postRequest(HttpClient client, String url, List<NameValuePair> parms) throws ClientProtocolException, IOException {
        HttpPost post = new HttpPost(url);
        post.addHeader("Content-Type", "application/x-www-form-urlencoded");
        post.setEntity(new UrlEncodedFormEntity(parms, "UTF-8"));
        return execute(client, post);
    }

    /** Adds the shared User-Agent header, executes the request and reads the body as text. */
    private static String execute(HttpClient client, HttpUriRequest request) throws ClientProtocolException, IOException {
        request.addHeader("User-Agent", USER_AGENT);
        HttpResponse response = client.execute(request);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            // EntityUtils.toString rejects a null entity, so report an empty body.
            return "";
        }
        return EntityUtils.toString(entity, DEFAULT_CHARSET);
    }
}