httpclient+Jsoup总结

Jsoup.parse解析HTML字符串,如Jsoup.parse("<html><head><title>Firstparse</title></head>")

Jsoup.connect解析url网站地址,如Jsoup.connect("http://www.baidu.com").get()

可以用httpclient获取网页,再用Jsoup.parse解析页面

String text = getMethod.getResponseBodyAsString();
Document doc = Jsoup.parse(text);

法一:httpclient+Jsoup

 1 String dataUrl = "http://hi.mop.com/?";
 2 HttpClient httpClient = new HttpClient();
 3 String cookies = "_ml=371386500452711504675;";
 4 GetMethod getMethod = new GetMethod(dataUrl);
 5 // 每次访问需授权的网址时需带上前面的 cookie 作为通行证
 6 getMethod.setRequestHeader("cookie", cookies);
 7 getMethod.setRequestHeader("Referer", "http://passport.mop.com/");
 8 getMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
 9 int statusCodes = httpClient.executeMethod(getMethod);
10 System.out.println(statusCodes);
11 System.out.println("模拟登录成功");        
12 String text = getMethod.getResponseBodyAsString();

 

法二:Jsoup Connection + Jsoup解析

Connection conn = Jsoup.connect(url);// url为目标网址字符串
conn.data("txtBill", key);// 设置关键字查询字段
Document doc = null;
doc = conn.timeout(100000).post();//设置请求类型为post型或者get型,超时100000毫秒    
results = doc.select(TagName);// 处理返回数据            
for (Element result : results){}

 

package Step1;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import javax.lang.model.element.Element;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * 
    * @ClassName: HttpLogin 
    * @Description:  java通过httpclient结合Jsoup
    * @author zeze
    * @date 2015年11月10日 下午5:07:33 
    *
 */

public class HttpLogin {

    public static void main(String[] args) {
        //Url
        String dataUrl = "http://hi.mop.com/?";
        HttpClient httpClient = new HttpClient();

        try {
            // 获得登陆后的 Cookie
            String cookies = "_ml=371386500452711504675;";
            GetMethod getMethod = new GetMethod(dataUrl);
            // 每次访问需授权的网址时需带上前面的 cookie 作为通行证
            getMethod.setRequestHeader("cookie", cookies);
            getMethod.setRequestHeader("Referer", "http://passport.mop.com/");
            getMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
            int statusCodes = httpClient.executeMethod(getMethod);
            System.out.println("状态码:"+statusCodes);
            
            //如果返回的头信息没有指定长度或长度大于1M则抛出异常需把 getResponseBodyAsString()换成 getResponseBodyAsStream()
            //String text = getMethod.getResponseBodyAsString();
            InputStream inputStream = getMethod.getResponseBodyAsStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
            StringBuffer stringBuffer = new StringBuffer();
            String str= "";
            while((str = br.readLine()) != null){
                stringBuffer.append(str );
            }
            
            //Element result=text;
             Document doc = Jsoup.parse(stringBuffer.toString());              
             Elements name=doc.select("[class=tc c068 fs14 yahei mt5 username]");
             System.out.println(name.text());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 

posted @ 2015-11-10 16:56  陈泽泽  阅读(1278)  评论(0)  编辑  收藏  举报