使用POST方法模拟登录爬取网页

最近在写爬虫时遇到一个问题:如何使用 POST 方法模拟登录状态来爬取网页。下面是极简版的代码:


import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

import java.net.HttpURLConnection;
import java.net.URL;

import java.util.HashMap;

/**
 * Minimal example of crawling a page that requires a logged-in session: a POST request
 * is replayed together with the cookie and request body captured from a real, logged-in
 * browser session.
 *
 * <p>{@code POST_URL}, {@code USER_AGENT}, {@code COOKIE} and {@code REQUEST_DATA} are
 * placeholders and must be filled in before the program can do anything useful.
 */
public class test {

	// Address the POST request is sent to.
	private static final String POST_URL = "";
	
	// User-Agent header value; set it to a Chrome user agent to imitate a browser request.
	private static final String USER_AGENT = "";
	
	// Cookie captured from a request to POST_URL after logging in to the site with an account.
	private static final String COOKIE = "";
	
	// Request body captured from a request to POST_URL after logging in to the site.
	private static final String REQUEST_DATA =  "";
	
	// Maximum number of requests sent before giving up on a non-200 answer.
	private static final int MAX_ATTEMPTS = 5;
	
	// Pause between two attempts, so a failing server is not hit in a tight loop.
	private static final long RETRY_DELAY_MILLIS = 1000L;
	
	// Timeouts that keep a stalled server from blocking the program indefinitely.
	private static final int CONNECT_TIMEOUT_MILLIS = 10000;
	private static final int READ_TIMEOUT_MILLIS = 30000;
	
	/**
	 * Sends the captured POST request, retrying up to {@code MAX_ATTEMPTS} times until the
	 * server answers with HTTP 200, then prints the response body to standard output.
	 *
	 * @param args unused
	 * @throws IllegalStateException if no attempt produced an HTTP 200 response
	 * @throws Exception if the URL is malformed, an I/O error occurs, or the thread is
	 *         interrupted while waiting between attempts
	 */
	public static void main(String[] args) throws Exception {
		HashMap<String, String> result = postCapture(REQUEST_DATA);
		int attempts = 1;
		
		while (!"200".equals(result.get("responseCode"))) {
			if (attempts >= MAX_ATTEMPTS) {
				throw new IllegalStateException("No HTTP 200 response after " + attempts
						+ " attempts; last status code was " + result.get("responseCode"));
			}
			Thread.sleep(RETRY_DELAY_MILLIS);
			result = postCapture(REQUEST_DATA);
			attempts++;
		}
		
		// Print the crawled result.
		System.out.println(result.get("value"));
	}
	
	/**
	 * Performs one POST request against {@code POST_URL} and collects the outcome.
	 *
	 * @param requestData request body, sent UTF-8 encoded exactly as given; {@code null}
	 *        is sent as an empty body
	 * @return a map with two entries: {@code "responseCode"} holds the HTTP status code as
	 *         a decimal string, and {@code "value"} holds the response body when the status
	 *         is 200 (an empty string otherwise). Line terminators of the body are dropped
	 *         because it is read line by line.
	 * @throws Exception if the URL is malformed or an I/O error occurs
	 */
	private static HashMap<String, String> postCapture(String requestData) throws Exception{
		HttpURLConnection httpConn = (HttpURLConnection) new URL(POST_URL).openConnection();
		try {
			httpConn.setDoInput(true); // the response will be read from this connection
			httpConn.setDoOutput(true); // a request body will be written to this connection
			httpConn.setUseCaches(false); // bypass any cache
			httpConn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
			httpConn.setReadTimeout(READ_TIMEOUT_MILLIS);
			httpConn.setRequestMethod("POST");
			httpConn.setRequestProperty("User-Agent", USER_AGENT);
			httpConn.setRequestProperty("Cookie", COOKIE);
			
			// Written with write(), not println(): no platform line terminator is appended
			// to the body, and write failures surface as exceptions instead of being hidden.
			try (OutputStreamWriter out = new OutputStreamWriter(httpConn.getOutputStream(), "UTF-8")) {
				out.write(requestData == null ? "" : requestData);
			}
			
			int responseCode = httpConn.getResponseCode();
			StringBuilder body = new StringBuilder();
			if (responseCode == HttpURLConnection.HTTP_OK) {
				try (BufferedReader reader = new BufferedReader(
						new InputStreamReader(httpConn.getInputStream(), "UTF-8"))) {
					String line;
					while ((line = reader.readLine()) != null) {
						body.append(line);
					}
				}
			}
			
			HashMap<String, String> map = new HashMap<>();
			map.put("responseCode", Integer.toString(responseCode));
			map.put("value", body.toString());
			return map;
		} finally {
			// Release the connection on every path, not only after a 200 response.
			httpConn.disconnect();
		}
	}

}


原文地址:
http://wangxin123.com/2016/12/19/使用Post方法模拟登陆爬取网页/

posted @ 2017-02-14 17:55  星空str  阅读(787)  评论(0)  编辑  收藏  举报