微博Tag的抓取
问题
抓取页面中的标签内容
方案
使用Fiddler获取Cookie，并用它替换PostUtils中的Cookie占位符“你的Cookie”；程序的POST请求通过本地代理127.0.0.1:8888（Fiddler默认端口）发出。
package crwaler;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import util.PostUtils;
/**
 * Command-line crawler: for every row of {@code detial.csv} it takes the first
 * comma-separated field as an id, requests the matching weibo.com info page,
 * extracts the tags from the response and writes the row plus the tags to
 * {@code tag.csv}.
 */
public class TagUtil {

    /** Input CSV; only the first field of each row is interpreted (as the id). */
    private static final String INPUT_FILE = "detial.csv";

    /** Output CSV: the unchanged input row, a comma, then the space-separated tags. */
    private static final String OUTPUT_FILE = "tag.csv";

    /** Pause after each request, in milliseconds. */
    private static final long REQUEST_INTERVAL_MILLIS = 4 * 1000;

    /** Charset handed to {@link PostUtils#sendPost} for the form body and the response. */
    private static final Charset UTF_8 = Charset.forName("utf-8");

    /** Marker the response is split on; tags are read from the piece at index 2. */
    private static final String TAG_SECTION_MARKER = "标签";

    /**
     * Matches one tag: a run of ASCII letters, digits or characters in U+4E00..U+9FA5,
     * enclosed on both sides by the literal two-character text backslash + 't'.
     * Compiled once instead of on every call.
     */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("\\\\t([0-9a-zA-Z\\u4e00-\\u9fa5]+?)\\\\t");

    /**
     * Reads {@code detial.csv} line by line and writes each line, extended by its tags,
     * to {@code tag.csv}. The id and the tags are also echoed to standard output.
     *
     * @param args unused
     * @throws Exception if reading or writing the CSV files fails or the thread is
     *                   interrupted while pausing between requests
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes both files even when a row fails half-way.
        try (BufferedReader br = new BufferedReader(new FileReader(new File(INPUT_FILE)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(OUTPUT_FILE)))) {
            String row;
            while ((row = br.readLine()) != null) {
                String id = firstField(row);
                // A row without an id cannot be looked up; keep the row, skip the request.
                boolean requested = !id.isEmpty();
                String tag = requested ? getTags(id) : "";
                System.out.println(id);
                System.out.println(tag);
                bw.write(row + "," + tag + "\n");
                // The crawl is slow (one request every few seconds); flush so that
                // rows already fetched are on disk if the process is stopped.
                bw.flush();
                if (requested) {
                    Thread.sleep(REQUEST_INTERVAL_MILLIS);
                }
            }
        }
    }

    /**
     * Returns the text before the first comma of {@code row}, or the whole row when it
     * contains no comma. Unlike {@code row.split(",")[0]} this never throws for rows
     * that consist only of commas.
     */
    private static String firstField(String row) {
        int comma = row.indexOf(',');
        return comma < 0 ? row : row.substring(0, comma);
    }

    /**
     * Requests the info page for {@code id} and returns its tags.
     *
     * @param id value inserted into the page URL after the fixed {@code 100306} prefix
     * @return the tags joined by single spaces, or {@code ""} when none were found
     */
    private static String getTags(String id) {
        String url = "http://weibo.com/p/100306" + id + "/info?mod=pedit_more";
        Map<String, String> params = new HashMap<>();
        String body = "";
        // A fresh client per request (as before), but now closed so its connection
        // pool does not pile up over a long crawl.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            body = PostUtils.sendPost(httpClient, url, params, UTF_8);
        } catch (IOException e) {
            // Only close() can end up here; the body (if any) was already read.
            System.err.println("Failed to close HTTP client for id " + id + ": " + e);
        }
        return extractTags(body);
    }

    /**
     * Extracts the tags from a response body.
     *
     * @param body response text; may be {@code null} or empty
     * @return the tags joined by single spaces, or {@code ""} when the body has fewer
     *         than three pieces around the marker or no tag matches
     */
    private static String extractTags(String body) {
        if (body == null) {
            return "";
        }
        String[] sections = body.split(TAG_SECTION_MARKER);
        if (sections.length < 3) {
            return "";
        }
        Matcher matcher = TAG_PATTERN.matcher(sections[2]);
        StringBuilder tags = new StringBuilder();
        while (matcher.find()) {
            if (tags.length() > 0) {
                tags.append(' ');
            }
            tags.append(matcher.group(1));
        }
        return tags.toString();
    }
}
package util;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Static helpers that issue a single HTTP POST or GET through a caller-supplied
 * {@link HttpClient} and return the response body as text. Both helpers are
 * best-effort: failures are reported on standard error and yield an empty string.
 */
public class PostUtils {

    /** Host of the proxy every POST is routed through. */
    private static final String PROXY_HOST = "127.0.0.1";

    /** Port of that proxy. NOTE(review): presumably a local Fiddler instance - confirm. */
    private static final int PROXY_PORT = 8888;

    /** Placeholder value of the Cookie header; meant to be replaced with a real cookie string. */
    private static final String COOKIE = "你的Cookie";

    /**
     * Sends a POST request with browser-like headers via the local proxy and returns
     * the response body. The HTTP status code is printed to standard output.
     *
     * @param httpClient client used to execute the request; its response must be a
     *                   {@link CloseableHttpResponse}
     * @param url        request URL; also sent as the {@code Referer} header
     * @param params     form parameters; when {@code null} or empty no request body is sent
     * @param encoding   charset for the form body and the fallback charset for decoding
     *                   the response
     * @return the response body, or {@code ""} when the request fails or has no body
     */
    public static String sendPost(HttpClient httpClient, String url, Map<String, String> params, Charset encoding) {
        HttpPost httpPost = new HttpPost(url);
        HttpHost proxy = new HttpHost(PROXY_HOST, PROXY_PORT);
        httpPost.setConfig(RequestConfig.custom().setProxy(proxy).build());

        httpPost.addHeader("Host", "weibo.com");
        httpPost.addHeader("Upgrade-Insecure-Requests", "1");
        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36");
        httpPost.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpPost.addHeader("DNT", "1");
        httpPost.addHeader("Referer", url);
        httpPost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpPost.addHeader("Accept-Encoding", "gzip, deflate");
        httpPost.addHeader("Cookie", COOKIE);

        if (params != null && !params.isEmpty()) {
            List<NameValuePair> formParams = new ArrayList<NameValuePair>(params.size());
            for (Map.Entry<String, String> entry : params.entrySet()) {
                formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            httpPost.setEntity(new UrlEncodedFormEntity(formParams, encoding));
        }

        String resp = "";
        // try-with-resources releases the connection on every path.
        try (CloseableHttpResponse response = (CloseableHttpResponse) httpClient.execute(httpPost)) {
            System.out.println(response.getStatusLine().getStatusCode());
            // A response without an entity has nothing to decode.
            if (response.getEntity() != null) {
                String body = EntityUtils.toString(response.getEntity(), encoding);
                if (body != null) {
                    resp = body;
                }
            }
        } catch (Exception e) {
            // Best-effort by design: report the failure and return what we have.
            e.printStackTrace();
        }
        return resp;
    }

    /**
     * Sends a GET request with only a browser-like {@code User-Agent} header (no proxy,
     * no cookie) and returns the response body.
     *
     * @param httpClient client used to execute the request; its response must be a
     *                   {@link CloseableHttpResponse}
     * @param url        request URL
     * @param encoding   fallback charset for decoding the response when the response
     *                   itself declares none; may be {@code null}
     * @return the response body, or {@code ""} when the request fails or has no body
     */
    public static String sendGet(HttpClient httpClient, String url, Charset encoding) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");

        String resp = "";
        try (CloseableHttpResponse response = (CloseableHttpResponse) httpClient.execute(httpGet)) {
            if (response.getEntity() != null) {
                // Honour the caller's charset; previously the parameter was ignored.
                String body = EntityUtils.toString(response.getEntity(), encoding);
                if (body != null) {
                    resp = body;
                }
            }
        } catch (Exception e) {
            // Same best-effort policy as sendPost, but no longer silent.
            e.printStackTrace();
        }
        return resp;
    }
}