// File: HttpConnectionManager.java
package spider;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.net.ssl.SSLHandshakeException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
* http连接、抓取管理类
* @author lidongyang
* @createtime Oct 18, 2012 1:55:18 PM
*
* @note 基本测试版
*/
public class HttpConnectionManager {
/**
 * Maximum total number of connections kept in the pool.
 */
public static final int MAX_TOTAL_CONNECTIONS = 100;
/**
 * Default maximum number of connections per route.
 */
public static final int MAX_ROUTE_CONNECTIONS = 50;
/**
 * Connection timeout, passed to {@code CoreConnectionPNames.CONNECTION_TIMEOUT}
 * (HttpClient interprets the value as milliseconds).
 */
public static final int CONNECT_TIMEOUT = 50000;
/**
 * Socket timeout, passed to {@code CoreConnectionPNames.SO_TIMEOUT}
 * (HttpClient interprets the value as milliseconds).
 */
public static final int SOCKET_TIMEOUT = 50000;
/**
 * Maximum time a request may block while waiting for a connection from the
 * pool, passed to {@code ClientPNames.CONN_MANAGER_TIMEOUT}.
 */
public static final long CONN_MANAGER_TIMEOUT = 60000;
/**
 * HTTP parameters shared by every request issued through {@link #httpClient}.
 */
private static HttpParams parentParams;
/**
 * Pooling connection manager backing {@link #httpClient}.
 */
private static PoolingClientConnectionManager cm;
/**
 * The shared HTTP client.
 */
private static DefaultHttpClient httpClient;
/**
 * Default target host.
 * <p>
 * {@code HttpHost} expects a bare host name (the scheme is a separate,
 * optional constructor argument), so the name must not carry an
 * {@code http://} prefix; with the prefix the host never matches the route of
 * a real request.
 */
private static final HttpHost DEFAULT_TARGETHOST = new HttpHost("www.qq.com", 80);
/**
 * Initializes the shared HTTP connection pool and client: registers the
 * http/https schemes, applies pool limits and timeouts, installs default
 * request headers that imitate a browser, and attaches a retry handler.
 */
static {
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(
            new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    schemeRegistry.register(
            new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

    cm = new PoolingClientConnectionManager(schemeRegistry);
    cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
    cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
    // Cap the number of connections for the route to the default target host.
    cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), 20);

    parentParams = new BasicHttpParams();
    parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
    // Host used when a request does not name a target host itself.
    parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
    parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
    parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
    parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);
    parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
    parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true);

    // Default headers that imitate a browser. The element type is spelled out
    // so the collection is not a raw type.
    Collection<Header> defaultHeaders = new ArrayList<Header>();
    defaultHeaders.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
    defaultHeaders.add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
    defaultHeaders.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
    defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
    // NOTE(review): "deflate" is advertised here, but readHtmlContentFromEntity
    // only decodes gzip responses - confirm whether deflate should be offered.
    defaultHeaders.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
    parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, defaultHeaders);

    // Retry policy for requests that fail with an IOException.
    HttpRequestRetryHandler httpRequestRetryHandler = new HttpRequestRetryHandler() {
        @Override
        public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
            if (executionCount >= 5) {
                // Give up once the maximum number of attempts has been reached.
                return false;
            }
            if (exception instanceof NoHttpResponseException) {
                // The server dropped the connection: try again.
                return true;
            }
            if (exception instanceof SSLHandshakeException) {
                // Never retry SSL handshake failures.
                return false;
            }
            HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
            // Requests without an entity body are treated as idempotent and retried.
            return !(request instanceof HttpEntityEnclosingRequest);
        }
    };
    httpClient = new DefaultHttpClient(cm, parentParams);
    httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler);
}
/**
 * Fetches the source of a page through a hard-coded proxy, retrying through an
 * alternate proxy port while the result is empty.
 * <p>
 * One initial attempt is made, followed by at most four retries. The failure
 * message is printed only when the final attempt also came back empty.
 *
 * @param url URL of the page to fetch
 * @return the page source, or an empty string when every attempt failed
 */
public String getHtml(String url) {
    final int maxRetries = 4;
    HttpHost proxyHost = new HttpHost("211.142.236.137", 8080); // primary proxy
    String html = getHtml(url, proxyHost);
    for (int retry = 0; retry < maxRetries && StringUtils.isEmpty(html); retry++) {
        proxyHost = new HttpHost("211.142.236.137", 80); // switch to the alternate proxy
        html = getHtml(url, proxyHost);
    }
    if (StringUtils.isEmpty(html)) {
        // Checked after the loop so a successful last retry is not reported as a failure.
        System.out.println("抓取失败");
        // The fetch may yield null; normalize so callers never see null.
        html = "";
    }
    System.out.println(html.length());
    return html;
}
/**
 * Fetches the source of the page at {@code url}, sending the request through
 * the given proxy.
 *
 * @param url       URL of the page to fetch
 * @param proxyHost proxy the request is routed through
 * @return the page source; an empty string when the status code is not 200,
 *         the response carries no entity, or an I/O error occurs
 */
public String getHtml(String url, HttpHost proxyHost) {
    String pageSource = "";
    HttpGet request = new HttpGet(url);
    // Route this request through the proxy.
    request.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    try {
        HttpResponse response = httpClient.execute(request);
        int status = response.getStatusLine().getStatusCode();
        System.out.println(status);
        if (status == 200) {
            HttpEntity body = response.getEntity();
            if (body != null) {
                pageSource = readHtmlContentFromEntity(body);
            }
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this handler covers both.
        e.printStackTrace();
    } finally {
        // Hand the connection back to the pool whatever happened above.
        request.releaseConnection();
    }
    return pageSource;
}
/**
 * Reads the page source from a response entity, decompressing it first when
 * the entity is gzip-encoded.
 *
 * @param httpEntity the response entity
 * @return the page source; an empty string when an oversized entity has no
 *         content stream
 * @throws ParseException propagated from {@code EntityUtils.toString} /
 *                        {@code ContentType.getOrDefault}
 * @throws IOException    if reading or decompressing the content fails
 */
private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException {
    Header encoding = httpEntity.getContentEncoding();
    // Content-coding names are case-insensitive, so "GZIP" must match as well.
    boolean gzipped = encoding != null && "gzip".equalsIgnoreCase(encoding.getValue());

    // EntityUtils cannot handle entities whose declared length reaches Integer.MAX_VALUE.
    if (httpEntity.getContentLength() < Integer.MAX_VALUE) {
        return EntityUtils.toString(gzipped ? new GzipDecompressingEntity(httpEntity) : httpEntity);
    }

    ContentType contentType = ContentType.getOrDefault(httpEntity);
    // getCharset() is null when the Content-Type header carries no charset
    // parameter; fall back to ISO-8859-1, the default EntityUtils applies too.
    String charsetName = contentType.getCharset() != null
            ? contentType.getCharset().name()
            : "ISO-8859-1";

    InputStream in = httpEntity.getContent();
    if (in == null) {
        return "";
    }
    try {
        return gzipped ? unZip(in, charsetName) : readInStreamToString(in, charsetName);
    } finally {
        // Close the stream on failure paths as well, not only after a successful read.
        in.close();
    }
}
/**
 * Tests whether a proxy is usable by fetching a page through it.
 * <p>
 * The proxy counts as usable only when the request returns status 200 with a
 * non-empty body.
 *
 * @param proxyHost the proxy to test (address and port)
 * @param url       page used for the test request
 * @return {@code true} if the proxy is usable, {@code false} otherwise
 */
public boolean isProxyUsable(HttpHost proxyHost, String url) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    try {
        HttpResponse httpResponse = httpClient.execute(httpGet);
        int statusCode = httpResponse.getStatusLine().getStatusCode();
        System.out.println(statusCode);
        if (200 != statusCode) {
            return false;
        }
        HttpEntity httpEntity = httpResponse.getEntity();
        if (httpEntity == null) {
            return false;
        }
        String html = readHtmlContentFromEntity(httpEntity);
        if (html != null) {
            // Guarded: the read may yield null, which must not raise an exception here.
            System.out.println(html.length());
        }
        return StringUtils.isNotEmpty(html);
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this handler covers both.
        e.printStackTrace();
        return false;
    } finally {
        // Always hand the connection back to the pool; otherwise every early
        // return above would leak a pooled connection.
        httpGet.releaseConnection();
    }
}
/**
 * Decompresses a gzip stream returned by the server and decodes it to text.
 *
 * @param in      the gzip-compressed input stream of the response
 * @param charSet name of the charset used to decode the decompressed bytes
 * @return the decompressed content as a string
 * @throws IOException if the stream is not valid gzip data, reading fails, or
 *                     the charset name is not supported
 */
private String unZip(InputStream in, String charSet) throws IOException {
    ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
    GZIPInputStream gzipStream = null;
    try {
        gzipStream = new GZIPInputStream(in);
        byte[] chunk = new byte[1024];
        for (int read = gzipStream.read(chunk); read != -1; read = gzipStream.read(chunk)) {
            decompressed.write(chunk, 0, read);
        }
        return decompressed.toString(charSet);
    } finally {
        if (gzipStream != null) {
            gzipStream.close();
        }
        decompressed.close();
    }
}
/**
 * Reads an input stream line by line into a string.
 * <p>
 * Every line is terminated with {@code "\n"} in the result, whatever line
 * separator the stream used.
 *
 * @param in      the stream to read
 * @param charSet name of the charset used to decode the stream
 * @return the text read from the stream
 * @throws IOException if reading fails or the charset name is not supported
 */
private String readInStreamToString(InputStream in, String charSet) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(in, charSet));
    try {
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
        return content.toString();
    } finally {
        // Close the reader even when readLine() fails part-way through.
        reader.close();
    }
}
/**
 * Test helper: a task that fetches one URL through the enclosing manager.
 *
 * @author lidongyang
 * @createtime Oct 18, 2012 2:35:09 PM
 */
public class Test implements Runnable {
    String url;
    int threadNum;

    /** Creates a task with no URL and thread number 0. */
    public Test() {
        this(null, 0);
    }

    /**
     * @param url       URL the task fetches when run
     * @param threadNum number identifying the thread; stored but not used here
     */
    public Test(String url, int threadNum) {
        this.url = url;
        this.threadNum = threadNum;
    }

    /** Fetches {@link #url} via the enclosing instance's {@code getHtml(String)}. */
    @Override
    public void run() {
        HttpConnectionManager.this.getHtml(this.url);
    }
}
/**
 * Test entry point: fetches http://www.qq.com once and prints the elapsed
 * time in seconds.
 *
 * @param args command-line arguments (unused)
 * @throws InterruptedException declared by the signature; nothing in the body raises it
 */
public static void main(String[] args) throws InterruptedException {
    HttpConnectionManager manager = new HttpConnectionManager();
    long startMillis = System.currentTimeMillis();
    manager.getHtml("http://www.qq.com");
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis / 1000.0 + " 秒");
}
}
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.net.ssl.SSLHandshakeException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NoHttpResponseException;
import org.apache.http.ParseException;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
* http连接、抓取管理类
* @author lidongyang
* @createtime Oct 18, 2012 1:55:18 PM
*
* @note 基本测试版
*/
public class HttpConnectionManager {
/**
 * Maximum total number of connections kept in the pool.
 */
public static final int MAX_TOTAL_CONNECTIONS = 100;
/**
 * Default maximum number of connections per route.
 */
public static final int MAX_ROUTE_CONNECTIONS = 50;
/**
 * Connection timeout, passed to {@code CoreConnectionPNames.CONNECTION_TIMEOUT}
 * (HttpClient interprets the value as milliseconds).
 */
public static final int CONNECT_TIMEOUT = 50000;
/**
 * Socket timeout, passed to {@code CoreConnectionPNames.SO_TIMEOUT}
 * (HttpClient interprets the value as milliseconds).
 */
public static final int SOCKET_TIMEOUT = 50000;
/**
 * Maximum time a request may block while waiting for a connection from the
 * pool, passed to {@code ClientPNames.CONN_MANAGER_TIMEOUT}.
 */
public static final long CONN_MANAGER_TIMEOUT = 60000;
/**
 * HTTP parameters shared by every request issued through {@link #httpClient}.
 */
private static HttpParams parentParams;
/**
 * Pooling connection manager backing {@link #httpClient}.
 */
private static PoolingClientConnectionManager cm;
/**
 * The shared HTTP client.
 */
private static DefaultHttpClient httpClient;
/**
 * Default target host.
 * <p>
 * {@code HttpHost} expects a bare host name (the scheme is a separate,
 * optional constructor argument), so the name must not carry an
 * {@code http://} prefix; with the prefix the host never matches the route of
 * a real request.
 */
private static final HttpHost DEFAULT_TARGETHOST = new HttpHost("www.qq.com", 80);
/**
 * Initializes the shared HTTP connection pool and client: registers the
 * http/https schemes, applies pool limits and timeouts, installs default
 * request headers that imitate a browser, and attaches a retry handler.
 */
static {
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(
            new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    schemeRegistry.register(
            new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

    cm = new PoolingClientConnectionManager(schemeRegistry);
    cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
    cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
    // Cap the number of connections for the route to the default target host.
    cm.setMaxPerRoute(new HttpRoute(DEFAULT_TARGETHOST), 20);

    parentParams = new BasicHttpParams();
    parentParams.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
    // Host used when a request does not name a target host itself.
    parentParams.setParameter(ClientPNames.DEFAULT_HOST, DEFAULT_TARGETHOST);
    parentParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    parentParams.setParameter(ClientPNames.CONN_MANAGER_TIMEOUT, CONN_MANAGER_TIMEOUT);
    parentParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
    parentParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT);
    parentParams.setParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS, true);
    parentParams.setParameter(ClientPNames.HANDLE_REDIRECTS, true);

    // Default headers that imitate a browser. The element type is spelled out
    // so the collection is not a raw type.
    Collection<Header> defaultHeaders = new ArrayList<Header>();
    defaultHeaders.add(new BasicHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)"));
    defaultHeaders.add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
    defaultHeaders.add(new BasicHeader("Accept-Language", "zh-cn,zh,en-US,en;q=0.5"));
    defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8,gbk,gb2312;q=0.7,*;q=0.7"));
    // NOTE(review): "deflate" is advertised here, but readHtmlContentFromEntity
    // only decodes gzip responses - confirm whether deflate should be offered.
    defaultHeaders.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
    parentParams.setParameter(ClientPNames.DEFAULT_HEADERS, defaultHeaders);

    // Retry policy for requests that fail with an IOException.
    HttpRequestRetryHandler httpRequestRetryHandler = new HttpRequestRetryHandler() {
        @Override
        public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
            if (executionCount >= 5) {
                // Give up once the maximum number of attempts has been reached.
                return false;
            }
            if (exception instanceof NoHttpResponseException) {
                // The server dropped the connection: try again.
                return true;
            }
            if (exception instanceof SSLHandshakeException) {
                // Never retry SSL handshake failures.
                return false;
            }
            HttpRequest request = (HttpRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
            // Requests without an entity body are treated as idempotent and retried.
            return !(request instanceof HttpEntityEnclosingRequest);
        }
    };
    httpClient = new DefaultHttpClient(cm, parentParams);
    httpClient.setHttpRequestRetryHandler(httpRequestRetryHandler);
}
/**
 * Fetches the source of a page through a hard-coded proxy, retrying through an
 * alternate proxy port while the result is empty.
 * <p>
 * One initial attempt is made, followed by at most four retries. The failure
 * message is printed only when the final attempt also came back empty.
 *
 * @param url URL of the page to fetch
 * @return the page source, or an empty string when every attempt failed
 */
public String getHtml(String url) {
    final int maxRetries = 4;
    HttpHost proxyHost = new HttpHost("211.142.236.137", 8080); // primary proxy
    String html = getHtml(url, proxyHost);
    for (int retry = 0; retry < maxRetries && StringUtils.isEmpty(html); retry++) {
        proxyHost = new HttpHost("211.142.236.137", 80); // switch to the alternate proxy
        html = getHtml(url, proxyHost);
    }
    if (StringUtils.isEmpty(html)) {
        // Checked after the loop so a successful last retry is not reported as a failure.
        System.out.println("抓取失败");
        // The fetch may yield null; normalize so callers never see null.
        html = "";
    }
    System.out.println(html.length());
    return html;
}
/**
 * Fetches the source of the page at {@code url}, sending the request through
 * the given proxy.
 *
 * @param url       URL of the page to fetch
 * @param proxyHost proxy the request is routed through
 * @return the page source; an empty string when the status code is not 200,
 *         the response carries no entity, or an I/O error occurs
 */
public String getHtml(String url, HttpHost proxyHost) {
    String pageSource = "";
    HttpGet request = new HttpGet(url);
    // Route this request through the proxy.
    request.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    try {
        HttpResponse response = httpClient.execute(request);
        int status = response.getStatusLine().getStatusCode();
        System.out.println(status);
        if (status == 200) {
            HttpEntity body = response.getEntity();
            if (body != null) {
                pageSource = readHtmlContentFromEntity(body);
            }
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this handler covers both.
        e.printStackTrace();
    } finally {
        // Hand the connection back to the pool whatever happened above.
        request.releaseConnection();
    }
    return pageSource;
}
/**
 * Reads the page source from a response entity, decompressing it first when
 * the entity is gzip-encoded.
 *
 * @param httpEntity the response entity
 * @return the page source; an empty string when an oversized entity has no
 *         content stream
 * @throws ParseException propagated from {@code EntityUtils.toString} /
 *                        {@code ContentType.getOrDefault}
 * @throws IOException    if reading or decompressing the content fails
 */
private String readHtmlContentFromEntity(HttpEntity httpEntity) throws ParseException, IOException {
    Header encoding = httpEntity.getContentEncoding();
    // Content-coding names are case-insensitive, so "GZIP" must match as well.
    boolean gzipped = encoding != null && "gzip".equalsIgnoreCase(encoding.getValue());

    // EntityUtils cannot handle entities whose declared length reaches Integer.MAX_VALUE.
    if (httpEntity.getContentLength() < Integer.MAX_VALUE) {
        return EntityUtils.toString(gzipped ? new GzipDecompressingEntity(httpEntity) : httpEntity);
    }

    ContentType contentType = ContentType.getOrDefault(httpEntity);
    // getCharset() is null when the Content-Type header carries no charset
    // parameter; fall back to ISO-8859-1, the default EntityUtils applies too.
    String charsetName = contentType.getCharset() != null
            ? contentType.getCharset().name()
            : "ISO-8859-1";

    InputStream in = httpEntity.getContent();
    if (in == null) {
        return "";
    }
    try {
        return gzipped ? unZip(in, charsetName) : readInStreamToString(in, charsetName);
    } finally {
        // Close the stream on failure paths as well, not only after a successful read.
        in.close();
    }
}
/**
 * Tests whether a proxy is usable by fetching a page through it.
 * <p>
 * The proxy counts as usable only when the request returns status 200 with a
 * non-empty body.
 *
 * @param proxyHost the proxy to test (address and port)
 * @param url       page used for the test request
 * @return {@code true} if the proxy is usable, {@code false} otherwise
 */
public boolean isProxyUsable(HttpHost proxyHost, String url) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    try {
        HttpResponse httpResponse = httpClient.execute(httpGet);
        int statusCode = httpResponse.getStatusLine().getStatusCode();
        System.out.println(statusCode);
        if (200 != statusCode) {
            return false;
        }
        HttpEntity httpEntity = httpResponse.getEntity();
        if (httpEntity == null) {
            return false;
        }
        String html = readHtmlContentFromEntity(httpEntity);
        if (html != null) {
            // Guarded: the read may yield null, which must not raise an exception here.
            System.out.println(html.length());
        }
        return StringUtils.isNotEmpty(html);
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this handler covers both.
        e.printStackTrace();
        return false;
    } finally {
        // Always hand the connection back to the pool; otherwise every early
        // return above would leak a pooled connection.
        httpGet.releaseConnection();
    }
}
/**
 * Decompresses a gzip stream returned by the server and decodes it to text.
 *
 * @param in      the gzip-compressed input stream of the response
 * @param charSet name of the charset used to decode the decompressed bytes
 * @return the decompressed content as a string
 * @throws IOException if the stream is not valid gzip data, reading fails, or
 *                     the charset name is not supported
 */
private String unZip(InputStream in, String charSet) throws IOException {
    ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
    GZIPInputStream gzipStream = null;
    try {
        gzipStream = new GZIPInputStream(in);
        byte[] chunk = new byte[1024];
        for (int read = gzipStream.read(chunk); read != -1; read = gzipStream.read(chunk)) {
            decompressed.write(chunk, 0, read);
        }
        return decompressed.toString(charSet);
    } finally {
        if (gzipStream != null) {
            gzipStream.close();
        }
        decompressed.close();
    }
}
/**
 * Reads an input stream line by line into a string.
 * <p>
 * Every line is terminated with {@code "\n"} in the result, whatever line
 * separator the stream used.
 *
 * @param in      the stream to read
 * @param charSet name of the charset used to decode the stream
 * @return the text read from the stream
 * @throws IOException if reading fails or the charset name is not supported
 */
private String readInStreamToString(InputStream in, String charSet) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(in, charSet));
    try {
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
        return content.toString();
    } finally {
        // Close the reader even when readLine() fails part-way through.
        reader.close();
    }
}
/**
 * Test helper: a task that fetches one URL through the enclosing manager.
 *
 * @author lidongyang
 * @createtime Oct 18, 2012 2:35:09 PM
 */
public class Test implements Runnable {
    String url;
    int threadNum;

    /** Creates a task with no URL and thread number 0. */
    public Test() {
        this(null, 0);
    }

    /**
     * @param url       URL the task fetches when run
     * @param threadNum number identifying the thread; stored but not used here
     */
    public Test(String url, int threadNum) {
        this.url = url;
        this.threadNum = threadNum;
    }

    /** Fetches {@link #url} via the enclosing instance's {@code getHtml(String)}. */
    @Override
    public void run() {
        HttpConnectionManager.this.getHtml(this.url);
    }
}
/**
 * Test entry point: fetches http://www.qq.com once and prints the elapsed
 * time in seconds.
 *
 * @param args command-line arguments (unused)
 * @throws InterruptedException declared by the signature; nothing in the body raises it
 */
public static void main(String[] args) throws InterruptedException {
    HttpConnectionManager manager = new HttpConnectionManager();
    long startMillis = System.currentTimeMillis();
    manager.getHtml("http://www.qq.com");
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis / 1000.0 + " 秒");
}
}
// File: GetQqNews.java
package parser;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import spider.HttpConnectionManager;
/**
 * Test driver: downloads the qq.com front page and prints the link target and
 * text of each anchor found in the selected list.
 *
 * @author lidongyang
 * @createtime Oct 23, 2012 11:05:33 AM
 */
public class GetQqNews {
    /**
     * Fetches http://www.qq.com, selects the anchors under the
     * {@code [class=ft fl]} elements' list items and prints one
     * {@code href----text} line per anchor.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String pageSource = new HttpConnectionManager().getHtml("http://www.qq.com");
        Document page = Jsoup.parse(pageSource);
        Elements containers = page.select("[class=ft fl]");
        Elements links = containers.select("ul").select("li").select("a");
        for (Element link : links) {
            String href = link.attr("href");
            String title = link.text();
            System.out.println(href + "----" + title);
        }
    }
}