java抓取网页数据,登录之后抓取数据。
最近做了一个从网络上抓取数据的小程序,主要用于信贷方面:收集了一些黑名单网站,并把这些网站上的数据抓取到自己的系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。
首先需要一个jsoup的jar包,我用的是1.6.0版本。下载地址为:http://pan.baidu.com/s/1mgqOuHa
1,获取网页内容(核心代码,技术有限没封装)。
2,登录之后抓取网页数据(如何在请求中携带cookie)。
3,获取网站的ajax请求方法(返回json)。
以上这三点我用一个类全部包含(写得比较粗糙,望见谅;直接copy代码过去,应该就可以用)。
一,这个类分别包含上面1,2,3三种方法,直接运行main方法就可以进行测试。
package com.minxinloan.black.web.utils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demonstrates three ways of pulling data from a web site:
 * <ol>
 *   <li>logging in with jsoup and re-using the returned cookies on a second request,</li>
 *   <li>fetching a page that needs no login,</li>
 *   <li>reading a raw HTTP response and parsing it as a JSON array.</li>
 * </ol>
 * Also offers two helpers that turn json-lib structures into plain
 * {@code Map}/{@code List} collections.
 */
public class CookieUtil {

    public final static String CONTENT_TYPE = "Content-Type";

    /**
     * Runs the three demos in sequence. Results of the element accessors are
     * deliberately discarded; the calls only illustrate the API.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String listURL = "http://www.p2peye.com/blacklist.php?p=2";
        String logURL = "http://www.p2peye.com/member.php";

        try {
            // ---------------- 1. page that requires a login ----------------
            // NOTE(review): credentials are hard-coded for the demo; move them to
            // configuration before using this anywhere real.
            Connection.Response res = Jsoup.connect(logURL)
                    .data("mod", "logging",
                          "action", "login",
                          "loginsubmit", "yes",
                          "loginhash", "Lsc66",
                          "username", "puqiuxiaomao",
                          "password", "a1234567")
                    .method(Method.POST)
                    .execute();

            Connection con = Jsoup.connect(listURL);
            // Pretend to be a desktop browser.
            con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
            // Copy every cookie returned by the login response onto the follow-up
            // request; the session cookie name depends on the target site.
            for (Entry<String, String> cookie : res.cookies().entrySet()) {
                con = con.cookie(cookie.getKey(), cookie.getValue());
            }

            Document objectDoc = con.get();
            // getAllElements() already contains every descendant, so a single pass
            // visits each tag once (the former nested loop revisited them needlessly).
            for (Element element : objectDoc.getAllElements()) {
                element.attr("href"); // attribute value, e.g. the target of an <a> tag
                element.text();       // text content of the tag
            }

            // ---------------- 2. page that needs no login ----------------
            String pageUrl = "http://www.p2peye.com/blacklist.php?p=2";
            Document publicDoc = Jsoup.connect(pageUrl).get();
            for (Element element : publicDoc.getAllElements()) {
                element.attr("href");
                element.text();
            }

            // ---------------- 3. ajax endpoint returning JSON ----------------
            // pageUrl is only a placeholder here: point it at an endpoint that
            // really returns a JSON array, otherwise parsing fails and is reported.
            HttpURLConnection connection = null;
            BufferedReader reader = null;
            try {
                StringBuilder sb = new StringBuilder();
                URL getUrl = new URL(pageUrl);
                connection = (HttpURLConnection) getUrl.openConnection();
                reader = new BufferedReader(new InputStreamReader(
                        connection.getInputStream(), "utf-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
                List<Map<String, Object>> list = parseJSON2List(sb.toString());
            } catch (Exception e) {
                // Report instead of silently swallowing; the demo keeps going.
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // connection stays null when the URL could not be opened.
                if (connection != null) {
                    connection.disconnect();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a JSON object string into a map.
     * <p>
     * Values that are JSON arrays become a {@code List}: array elements that are
     * JSON objects are converted recursively into maps, any other element
     * (string, number, nested array, ...) is stored unchanged. All other values
     * are stored exactly as json-lib returns them.
     *
     * @param jsonStr text of a JSON object
     * @return map keyed by the object's property names
     */
    public static Map<String, Object> parseJSON2Map(String jsonStr) {
        Map<String, Object> map = new HashMap<String, Object>();
        JSONObject json = JSONObject.fromObject(jsonStr);
        for (Object k : json.keySet()) {
            Object v = json.get(k);
            if (v instanceof JSONArray) {
                List<Object> list = new ArrayList<Object>();
                Iterator<?> it = ((JSONArray) v).iterator();
                while (it.hasNext()) {
                    Object item = it.next();
                    if (item instanceof JSONObject) {
                        list.add(parseJSON2Map(item.toString()));
                    } else {
                        // Scalars used to trigger a ClassCastException; keep them as-is.
                        list.add(item);
                    }
                }
                map.put(k.toString(), list);
            } else {
                map.put(k.toString(), v);
            }
        }
        return map;
    }

    /**
     * Converts a JSON array string whose elements are JSON objects into a list
     * of maps (one map per element, built with {@link #parseJSON2Map(String)}).
     *
     * @param jsonStr text of a JSON array
     * @return one map per array element, in array order
     * @throws ClassCastException if an element of the array is not a JSON object
     */
    public static List<Map<String, Object>> parseJSON2List(String jsonStr) {
        JSONArray jsonArr = JSONArray.fromObject(jsonStr);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        Iterator<?> it = jsonArr.iterator();
        while (it.hasNext()) {
            JSONObject element = (JSONObject) it.next();
            list.add(parseJSON2Map(element.toString()));
        }
        return list;
    }
}
二,这个是获取验证码的类,可以研究下。(但你需要先分析出网站验证码的请求地址)
package com.minxinloan.black.web.utils;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * HTTP helpers used for captcha handling: download a captcha image, extract the
 * charset from a Content-Type header, write text to a file and fetch a page as text.
 */
public class Utils {

    /**
     * Requests {@code sUrl} and, unless {@code isOnlyReturnHeader} is set, saves the
     * response body to {@code path + "/code.bmp"}.
     * <p>
     * The body is written to disk only, so {@link Content#getBody()} of the result
     * is always the empty string; the response headers are available through
     * {@link Content#getHeaders()}. Redirects are not followed.
     *
     * @param method             "GET" or "POST" (case-insensitive); anything else,
     *                           including {@code null}, is treated as "POST"
     * @param sUrl               URL to request
     * @param paramMap           request parameters (e.g. user name and password); sent as a
     *                           form-encoded body for POST, appended to the query string
     *                           for GET; may be {@code null}
     * @param requestHeaderMap   extra request headers (e.g. the Cookie header); may be
     *                           {@code null}
     * @param isOnlyReturnHeader when {@code true} the body is not downloaded
     * @param path               directory that receives {@code code.bmp}
     * @return the response for status 200 or 201, otherwise {@code null}
     *         ({@code null} is also returned when any error occurs)
     */
    public static Content getRandom(String method, String sUrl,
            Map<String, String> paramMap,
            Map<String, String> requestHeaderMap,
            boolean isOnlyReturnHeader, String path) {
        Content content = null;
        HttpURLConnection httpUrlConnection = null;
        try {
            // HttpURLConnection only accepts upper-case method names.
            String verb = "GET".equalsIgnoreCase(method) ? "GET" : "POST";
            String query = encodeParams(paramMap);
            boolean sendBody = "POST".equals(verb) && query.length() > 0;

            String target = sUrl;
            if ("GET".equals(verb) && query.length() > 0) {
                target = sUrl + (sUrl.indexOf('?') >= 0 ? "&" : "?") + query;
            }

            httpUrlConnection = (HttpURLConnection) new URL(target).openConnection();
            httpUrlConnection.setRequestMethod(verb);
            httpUrlConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
            if (sendBody) {
                // Default for the form body; a caller-supplied header below may override it.
                httpUrlConnection.setRequestProperty("Content-Type",
                        "application/x-www-form-urlencoded");
            }
            if (requestHeaderMap != null) {
                for (Map.Entry<String, String> header : requestHeaderMap.entrySet()) {
                    if (header.getKey() != null) {
                        httpUrlConnection.setRequestProperty(header.getKey(), header.getValue());
                    }
                }
            }
            // Do not follow redirects, the caller handles them.
            httpUrlConnection.setInstanceFollowRedirects(false);
            httpUrlConnection.setDoOutput(sendBody);
            httpUrlConnection.setDoInput(true);
            httpUrlConnection.setConnectTimeout(5000);
            httpUrlConnection.setReadTimeout(5000);
            httpUrlConnection.setUseCaches(false);
            httpUrlConnection.setDefaultUseCaches(false);

            if (sendBody) {
                OutputStream bodyOut = httpUrlConnection.getOutputStream();
                try {
                    bodyOut.write(query.getBytes("UTF-8"));
                    bodyOut.flush();
                } finally {
                    closeQuietly(bodyOut);
                }
            } else {
                httpUrlConnection.connect();
            }

            int responseCode = httpUrlConnection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK
                    || responseCode == HttpURLConnection.HTTP_CREATED) {
                if (!isOnlyReturnHeader) {
                    // Location of the captcha image.
                    saveBody(httpUrlConnection, path + "/code.bmp");
                }
                // The previous code decoded an always-empty byte array using the charset of
                // a header named "" (always absent), which failed and made the whole method
                // return null. The decoded result could only ever be "".
                content = new Content(sUrl, "", httpUrlConnection.getHeaderFields());
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (httpUrlConnection != null) {
                httpUrlConnection.disconnect();
            }
        }
        return content;
    }

    /** Builds a UTF-8 form-encoded {@code k=v&k=v} string; empty for a null or empty map. */
    private static String encodeParams(Map<String, String> params) throws IOException {
        StringBuilder encoded = new StringBuilder();
        if (params == null) {
            return "";
        }
        for (Map.Entry<String, String> param : params.entrySet()) {
            if (param.getKey() == null) {
                continue;
            }
            if (encoded.length() > 0) {
                encoded.append('&');
            }
            String value = param.getValue() == null ? "" : param.getValue();
            encoded.append(URLEncoder.encode(param.getKey(), "UTF-8"))
                   .append('=')
                   .append(URLEncoder.encode(value, "UTF-8"));
        }
        return encoded.toString();
    }

    /** Copies the response body of {@code connection} into {@code targetFile}. */
    private static void saveBody(HttpURLConnection connection, String targetFile)
            throws IOException {
        InputStream in = null;
        FileOutputStream out = null;
        try {
            in = connection.getInputStream();
            out = new FileOutputStream(targetFile);
            byte[] buffer = new byte[4096];
            int count;
            while ((count = in.read(buffer)) != -1) {
                out.write(buffer, 0, count);
            }
            out.close();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure of close itself. */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // nothing useful can be done when close fails
            }
        }
    }

    /**
     * Extracts the charset named in a Content-Type header value such as
     * {@code text/html; charset="gbk"}.
     *
     * @param contentType header value, may be {@code null}
     * @return {@code null} when {@code contentType} is {@code null}; otherwise the last
     *         {@code charset} parameter whose value names a charset supported by this JVM,
     *         or {@code "UTF-8"} when there is none (missing, unsupported or malformed)
     */
    public static String getEncodingFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        String encoding = null;
        StringTokenizer tok = new StringTokenizer(contentType, ";");
        if (tok.hasMoreTokens()) {
            tok.nextToken(); // first token is the media type, not a parameter
            while (tok.hasMoreTokens()) {
                String assignment = tok.nextToken().trim();
                int eqIdx = assignment.indexOf('=');
                if (eqIdx == -1) {
                    continue;
                }
                String varName = assignment.substring(0, eqIdx).trim();
                if (!"charset".equalsIgnoreCase(varName)) {
                    continue;
                }
                String varValue = assignment.substring(eqIdx + 1).trim();
                // Strip surrounding quotes; the length check keeps a lone '"' from
                // producing substring(1, 0).
                if (varValue.length() >= 2 && varValue.startsWith("\"")
                        && varValue.endsWith("\"")) {
                    varValue = varValue.substring(1, varValue.length() - 1);
                }
                if (isSupportedCharset(varValue)) {
                    encoding = varValue;
                }
            }
        }
        return encoding == null ? "UTF-8" : encoding;
    }

    /** Like {@link Charset#isSupported(String)} but returns false for illegal names instead of throwing. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException illegalName) {
            // IllegalCharsetNameException (empty name, spaces, quotes, ...) lands here
            return false;
        }
    }

    /**
     * Writes {@code content} to the file at {@code path}, replacing previous contents.
     * The platform default charset is used.
     *
     * @param content text to write
     * @param path    target file
     * @return {@code true} if the text was written without error, {@code false} otherwise
     */
    public static boolean inFile(String content, String path) {
        File file = new File(path);
        PrintWriter out = null;
        try {
            // FileWriter creates the file when it does not exist yet.
            out = new PrintWriter(new FileWriter(file));
            out.write(content);
            // PrintWriter never throws on write; checkError() flushes and reports failures.
            return !out.checkError();
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // out is still null when the file could not be opened
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Fetches {@code httpurl} and returns the response body decoded as UTF-8, with each
     * line terminated by {@code "\n"}. The HTTP status code is printed to standard output.
     *
     * @param httpurl address of the page
     * @return the page text, or the empty string if anything goes wrong
     */
    public static String getHtmlReadLine(String httpurl) {
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            connection = (HttpURLConnection) new URL(httpurl).openConnection();
            connection.connect();
            System.out.println(connection.getResponseCode());
            reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "utf-8"));
            StringBuilder page = new StringBuilder();
            String currentLine;
            while ((currentLine = reader.readLine()) != null) {
                page.append(currentLine).append("\n");
            }
            return page.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        } finally {
            closeQuietly(reader);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}
/**
 * Read-only holder for the result of an HTTP request: the requested URL,
 * the response body as text and the response headers.
 */
class Content {

    /** Address that was requested. */
    private final String url;

    /** Response body as text. */
    private final String body;

    /** Response headers; the map passed to the constructor is stored as-is, not copied. */
    private final Map<String, List<String>> headers;

    /**
     * @param url     address that was requested
     * @param body    response body
     * @param headers response headers, header name mapped to its values
     */
    public Content(String url, String body, Map<String, List<String>> headers) {
        this.headers = headers;
        this.body = body;
        this.url = url;
    }

    /** @return the address that was requested */
    public String getUrl() {
        return this.url;
    }

    /** @return the response body */
    public String getBody() {
        return this.body;
    }

    /** @return the same header map instance that was given to the constructor */
    public Map<String, List<String>> getHeaders() {
        return this.headers;
    }
}