微博Tag的抓取

问题

抓取微博用户资料页（/info 页面）中的标签内容

方案

使用Fiddler抓包获取登录后的Cookie，并将其填入PostUtils中的Cookie请求头（代码中的请求会经由Fiddler的本地代理127.0.0.1:8888发出，运行时需保持Fiddler开启）。

package crwaler;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.HttpClients;

import util.PostUtils;

/**
 * Command-line crawler that reads records from {@code detial.csv}, fetches the Weibo
 * info page for the id in the first column of each record, extracts the tags found on
 * that page and writes the original record followed by the tags to {@code tag.csv}.
 */
public class TagUtil {
	/** Input CSV; the first column of every line is used as the Weibo id. */
	private static final String INPUT_FILE = "detial.csv";

	/** Output CSV; each input line is copied with the extracted tags appended as a last column. */
	private static final String OUTPUT_FILE = "tag.csv";

	/** Marker text the page is split on; tags are read from the text after its second occurrence. */
	private static final String TAG_MARKER = "标签";

	/**
	 * Matches one tag: a run of ASCII letters/digits or CJK characters enclosed by two
	 * literal backslash-t sequences (the two characters '\' and 't', not a tab character).
	 * Compiled once because it is reused for every page.
	 */
	private static final Pattern TAG_PATTERN = Pattern.compile("\\\\t([0-9a-zA-Z\\u4e00-\\u9fa5]+?)\\\\t");

	/** Pause between two consecutive page requests. */
	private static final long REQUEST_DELAY_MILLIS = 4 * 1000;

	/**
	 * Processes the input file line by line.
	 *
	 * @param args unused
	 * @throws Exception if reading or writing a file fails or the thread is interrupted
	 *             while pausing between requests
	 */
	public static void main(String[] args) throws Exception {
		// try-with-resources guarantees that both files are closed (and the buffered output
		// is flushed) even when the crawl aborts half-way through with an exception.
		try (BufferedReader br = new BufferedReader(new FileReader(new File(INPUT_FILE)));
				BufferedWriter bw = new BufferedWriter(new FileWriter(new File(OUTPUT_FILE)))) {
			String line;
			while ((line = br.readLine()) != null) {
				// The limit of 2 keeps trailing empty fields, so a line consisting only of
				// commas yields an empty id instead of an empty array.
				String id = line.split(",", 2)[0];
				String tag = getTags(id);
				System.out.println(id);
				System.out.println(tag);
				bw.write(line + "," + tag + "\n");
				Thread.sleep(REQUEST_DELAY_MILLIS);
			}
		}
	}

	/**
	 * Fetches the info page of the given id and returns the tags found on it.
	 *
	 * @param id Weibo id inserted into the page URL
	 * @return the tags separated by single spaces, or an empty string when none were found
	 */
	private static String getTags(String id){
		String url = "http://weibo.com/p/100306" + id + "/info?mod=pedit_more";
		HttpClient httpClient = HttpClients.createDefault();
		Map<String, String> map = new HashMap<>();
		String page;
		try {
			page = PostUtils.sendPost(httpClient, url, map, Charset.forName("utf-8"));
		} finally {
			// A fresh client is created for every id, so it has to be released again;
			// otherwise its connections pile up over a long crawl.
			closeQuietly(httpClient);
		}
		return extractTags(page);
	}

	/**
	 * Extracts the tags from the raw page text.
	 *
	 * @param page response body of the info page; may be {@code null} or empty
	 * @return the tags separated by single spaces, or an empty string when the page does
	 *         not contain the marker at least twice or no tag matches
	 */
	private static String extractTags(String page) {
		if (page == null) {
			return "";
		}
		String[] sections = page.split(TAG_MARKER);
		if (sections.length < 3) {
			return "";
		}
		Matcher matcher = TAG_PATTERN.matcher(sections[2]);
		StringBuilder tags = new StringBuilder();
		while (matcher.find()) {
			if (tags.length() > 0) {
				tags.append(' ');
			}
			tags.append(matcher.group(1));
		}
		return tags.toString();
	}

	/**
	 * Closes the client if it is closeable. A failure to close is reported but does not
	 * abort the crawl, because the page has already been fetched at that point.
	 */
	private static void closeQuietly(HttpClient httpClient) {
		if (httpClient instanceof Closeable) {
			try {
				((Closeable) httpClient).close();
			} catch (IOException e) {
				System.err.println("Failed to close HTTP client: " + e);
			}
		}
	}
}
package util;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;


/**
 * Static helpers for sending HTTP requests and returning the response body as text.
 */
public class PostUtils {
	/**
	 * Sends a POST request with browser-like headers and returns the response body.
	 * <p>
	 * The request is routed through a proxy at 127.0.0.1:8888, always uses {@code weibo.com}
	 * as {@code Host} header and carries a placeholder {@code Cookie} header that must be
	 * replaced with a real cookie value. The HTTP status code is printed to standard output.
	 *
	 * @param httpClient
	 *            client used to execute the request
	 * @param url
	 *            request URL; also sent as the {@code Referer} header
	 * @param params
	 *            form parameters for the request body; {@code null} or empty sends no body
	 * @param encoding
	 *            charset used to encode the form parameters and handed to
	 *            {@code EntityUtils.toString} for decoding the response
	 * @return the response body, or an empty string if the request failed or the response
	 *         had no body
	 */
	public static String sendPost(HttpClient httpClient, String url, Map<String, String> params, Charset encoding) {
		String resp = "";
		HttpPost httpPost = new HttpPost(url);
		HttpHost proxy = new HttpHost("127.0.0.1",8888);
		RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
		httpPost.setConfig(config);

		httpPost.addHeader("Host", "weibo.com");
		httpPost.addHeader("Upgrade-Insecure-Requests","1");
		httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36");
		httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
		httpPost.addHeader("DNT","1");
		httpPost.addHeader("Referer",url);
		httpPost.addHeader("Accept-Language","zh-CN,zh;q=0.8");
		httpPost.addHeader("Accept-Encoding", "gzip, deflate");
		httpPost.addHeader("Cookie","你的Cookie");

		if (params != null && params.size() > 0) {
			List<NameValuePair> formParams = new ArrayList<NameValuePair>();
			for (Map.Entry<String, String> entry : params.entrySet()) {
				formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
			}
			httpPost.setEntity(new UrlEncodedFormEntity(formParams, encoding));
		}

		CloseableHttpResponse response = null;
		try {
			response = (CloseableHttpResponse) httpClient.execute(httpPost);
			System.out.println(response.getStatusLine().getStatusCode());
			// Some responses carry no body at all; treat them as an empty result instead
			// of letting the conversion fail.
			if (response.getEntity() != null) {
				resp = EntityUtils.toString(response.getEntity(), encoding);
			}
		} catch (Exception e) {
			// Best effort: report the failure and fall through to return the empty string.
			e.printStackTrace();
		} finally {
			closeQuietly(response);
		}
		return resp;
	}

	/**
	 * Sends a GET request with a browser-like {@code User-Agent} header and returns the
	 * response body.
	 *
	 * @param httpClient
	 *            client used to execute the request
	 * @param url
	 *            request URL
	 * @param encoding
	 *            charset handed to {@code EntityUtils.toString} for decoding the response
	 * @return the response body, or an empty string if the request failed or the response
	 *         had no body
	 */
	public static String sendGet(HttpClient httpClient, String url,Charset encoding) {
		String resp = "";
		HttpGet httpGet = new HttpGet(url);
		httpGet.setHeader("User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
		CloseableHttpResponse response = null;
		try {
			response = (CloseableHttpResponse) httpClient.execute(httpGet);
			if (response.getEntity() != null) {
				// Pass the caller's charset along; it was previously accepted but ignored.
				resp = EntityUtils.toString(response.getEntity(), encoding);
			}
		} catch (Exception e) {
			// Best effort like sendPost: report the failure instead of hiding it.
			e.printStackTrace();
		} finally {
			closeQuietly(response);
		}
		return resp;
	}

	/**
	 * Closes the response if there is one. A failure while closing is ignored on purpose:
	 * the body has already been read (or the request already failed) at that point.
	 */
	private static void closeQuietly(CloseableHttpResponse response) {
		if (response == null) {
			return;
		}
		try {
			response.close();
		} catch (IOException ignored) {
			// nothing useful can be done about a failed close
		}
	}
}

posted on 2019-06-05 22:05  yanximin  阅读(446)  评论(0)  编辑  收藏  举报

导航