HtmlUnit的工具类(请求头,JavaScript,AJAX,验证代理服务器)

一. HtmlUnitUtils

package org.spider.htmlunit;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.Cookie;
import org.jsoup.helper.StringUtil;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/**
 * Static helpers that turn an {@link HtmlUnitBuilder} into a configured
 * HtmlUnit {@link WebClient} and fetch the configured URL with optional retries.
 *
 * <p>Created 2019/2/20 14:40.
 *
 * @author long.li
 */
public class HtmlUnitUtils {

    /**
     * Small demo: fetches a page with JavaScript and cookies enabled.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        HtmlUnitBuilder builder = HtmlUnitBuilder.config()
                // The scheme is required; a bare host name is rejected as a malformed URL.
                .url("https://www.baidu.com")
                .enableJS(true)
                .enableCookie(true);
        getPage(builder).asText();
    }

    /**
     * Builds a {@link WebClient} from {@code builder} and loads {@code builder.url()}.
     *
     * <p>An {@link IOException} triggers a retry, up to {@code builder.retry()} additional
     * attempts. A {@link MalformedURLException} is never retried because the same URL
     * cannot succeed on a later attempt. Any other exception propagates immediately.
     *
     * <p>After a successful load the method waits for background JavaScript for
     * {@code builder.waitForBackgroundJavaScript()} milliseconds when that value is positive.
     *
     * <p>The client is closed when the call fails. On success it stays open because the
     * returned page is attached to it; callers release it with
     * {@code page.getWebClient().close()} once they are done with the page.
     *
     * @param builder request configuration
     * @return the loaded page
     * @throws Exception the last failure once retries are exhausted, or any non-retryable failure
     */
    public static HtmlPage getPage(HtmlUnitBuilder builder) throws Exception {
        WebClient webClient = getWebClient(builder);
        boolean loaded = false;
        try {
            int failedAttempts = 0;
            while (true) {
                try {
                    HtmlPage page = webClient.getPage(builder.url());
                    // Background jobs only exist once a page has been loaded, so wait here
                    // rather than while the client is being configured.
                    int waitMillis = builder.waitForBackgroundJavaScript();
                    if (waitMillis > 0) {
                        webClient.waitForBackgroundJavaScript(waitMillis);
                    }
                    loaded = true;
                    return page;
                } catch (MalformedURLException e) {
                    // A bad URL is a caller error, not a transient failure.
                    throw e;
                } catch (IOException e) {
                    if (failedAttempts >= builder.retry()) {
                        throw e;
                    }
                    failedAttempts++;
                    // TODO: log e together with failedAttempts before retrying.
                }
            }
        } finally {
            if (!loaded) {
                webClient.close();
            }
        }
    }

    /**
     * Creates a Chrome-emulating {@link WebClient} configured from {@code builder}:
     * optional proxy (with optional credentials), CSS and JavaScript switches, AJAX
     * resynchronization, cookies and default request headers.
     *
     * <p>NOTE(review): cookies are only ever switched on here, never off, so
     * {@code enableCookie(false)} leaves HtmlUnit's own default in place.
     *
     * @param builder client configuration
     * @return a new client; the caller owns it and must close it
     * @throws IllegalArgumentException if cookies are configured but no host can be
     *                                  derived from {@code builder.url()}
     */
    public static WebClient getWebClient(HtmlUnitBuilder builder) {
        WebClient webClient;
        if (isBlank(builder.proxyHost())) {
            webClient = new WebClient(BrowserVersion.CHROME);
        } else {
            webClient = new WebClient(BrowserVersion.CHROME, builder.proxyHost(), builder.proxyPort());
        }
        try {
            configure(webClient, builder);
        } catch (RuntimeException e) {
            // Do not leak a half-configured client.
            webClient.close();
            throw e;
        }
        return webClient;
    }

    /** Applies every builder option to an already constructed client. */
    private static void configure(WebClient webClient, HtmlUnitBuilder builder) {
        // Proxy that requires authentication.
        if (!isBlank(builder.proxyHost()) && !isBlank(builder.username())) {
            ((DefaultCredentialsProvider) webClient.getCredentialsProvider())
                    .addCredentials(builder.username(), builder.password());
        }

        // Basic browser options.
        webClient.getOptions().setThrowExceptionOnScriptError(false);       // tolerate JS errors
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // tolerate non-200 statuses
        webClient.getOptions().setCssEnabled(builder.enableCSS());
        webClient.getOptions().setJavaScriptEnabled(builder.enableJS());
        if (builder.enableAjax()) {
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        }

        if (builder.enableCookie()) {
            webClient.getCookieManager().setCookiesEnabled(true);
            Map<String, String> cookies = builder.cookies();
            if (cookies != null && !cookies.isEmpty()) {
                // Cookie(domain, name, value): the domain must be the target host,
                // otherwise the cookie is never sent with the request.
                String domain = cookieDomain(builder.url());
                for (Map.Entry<String, String> cookie : cookies.entrySet()) {
                    webClient.getCookieManager()
                            .addCookie(new Cookie(domain, cookie.getKey(), cookie.getValue()));
                }
            }
        }

        Map<String, String> headers = builder.headers();
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                webClient.addRequestHeader(header.getKey(), header.getValue());
            }
        }
    }

    /**
     * Extracts the host of {@code url} for use as a cookie domain.
     *
     * @throws IllegalArgumentException if {@code url} is malformed or has no host
     */
    private static String cookieDomain(String url) {
        String host;
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Cannot derive cookie domain from url: " + url, e);
        }
        if (isBlank(host)) {
            throw new IllegalArgumentException("Cannot derive cookie domain from url: " + url);
        }
        return host;
    }

    /**
     * Tells whether {@code string} is null, empty, or made only of the characters
     * accepted by {@link #isWhitespace(int)}.
     *
     * @param string value to test, may be null
     * @return true when there is no non-whitespace character
     */
    private static boolean isBlank(String string) {
        if (string == null) {
            return true;
        }
        for (int i = 0; i < string.length(); i++) {
            if (!isWhitespace(string.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Returns true for space, tab, line feed, form feed and carriage return. */
    private static boolean isWhitespace(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
    }
}

二. HtmlUnitBuilder

package org.spider.htmlunit;


import java.util.HashMap;
import java.util.Map;

/**
 * Fluent configuration holder for an HtmlUnit request: target URL, proxy,
 * feature switches, retry count, request headers and cookies.
 *
 * <p>Setters return {@code this} for chaining; the same-named no-argument
 * methods read the values back. Instances are mutable and not synchronized.
 *
 * <p>Created 2019/2/20 14:40.
 *
 * @author long.li
 */
public class HtmlUnitBuilder {

    private String url;                 // request URL
    private String proxyHost;           // proxy server host
    private int proxyPort;              // proxy server port
    private String username;            // proxy account
    private String password;            // proxy password
    private boolean enableCSS = false;  // CSS support
    private boolean enableJS = false;   // JavaScript support
    private boolean enableAjax = false; // AJAX support
    private boolean enableCookie = false; // cookie support
    private int waitForBackgroundJavaScript = 0; // time to wait for background JavaScript
    private int retry = 0;              // number of retries after a failed request
    private Map<String, String> headers = new HashMap<>(); // request headers, never null
    private Map<String, String> cookies = new HashMap<>(); // cookies (name -> value), never null

    /**
     * Creates a builder with every feature disabled and no headers or cookies.
     *
     * @return a new builder
     */
    public static HtmlUnitBuilder config() {
        return new HtmlUnitBuilder();
    }

    /**
     * @param url request URL
     * @return this builder
     */
    public HtmlUnitBuilder url(String url) {
        this.url = url;
        return this;
    }

    /**
     * @param retry number of additional attempts after a failed request
     * @return this builder
     */
    public HtmlUnitBuilder retry(int retry) {
        this.retry = retry;
        return this;
    }

    /**
     * Configures a proxy that needs no authentication.
     *
     * <p>Credentials left over from an earlier call to the four-argument
     * overload are cleared so they are not applied to this proxy.
     *
     * @param proxyHost proxy host
     * @param proxyPort proxy port
     * @return this builder
     */
    public HtmlUnitBuilder proxy(String proxyHost, int proxyPort) {
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
        this.username = null;
        this.password = null;
        return this;
    }

    /**
     * Configures a proxy that requires authentication.
     *
     * @param proxyHost proxy host
     * @param proxyPort proxy port
     * @param username  proxy account
     * @param password  proxy password
     * @return this builder
     */
    public HtmlUnitBuilder proxy(String proxyHost, int proxyPort,
                                 String username, String password) {
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
        this.username = username;
        this.password = password;
        return this;
    }

    /**
     * @param enableCSS whether CSS support is requested
     * @return this builder
     */
    public HtmlUnitBuilder enableCSS(boolean enableCSS) {
        this.enableCSS = enableCSS;
        return this;
    }

    /**
     * @param enableJS whether JavaScript support is requested
     * @return this builder
     */
    public HtmlUnitBuilder enableJS(boolean enableJS) {
        this.enableJS = enableJS;
        return this;
    }

    /**
     * @param enableAjax whether AJAX support is requested
     * @return this builder
     */
    public HtmlUnitBuilder enableAjax(boolean enableAjax) {
        this.enableAjax = enableAjax;
        return this;
    }

    /**
     * @param enableCookie whether cookie support is requested
     * @return this builder
     */
    public HtmlUnitBuilder enableCookie(boolean enableCookie) {
        this.enableCookie = enableCookie;
        return this;
    }

    /**
     * Replaces the cookie set with a copy of {@code cookies}.
     *
     * @param cookies cookie name to value; {@code null} means no cookies
     * @return this builder
     */
    public HtmlUnitBuilder cookies(Map<String, String> cookies) {
        this.cookies = copyOf(cookies);
        return this;
    }

    /**
     * Replaces the request headers with a copy of {@code headers}.
     *
     * @param headers header name to value; {@code null} means no headers
     * @return this builder
     */
    public HtmlUnitBuilder headers(Map<String, String> headers) {
        this.headers = copyOf(headers);
        return this;
    }

    /**
     * Adds or overwrites a single request header.
     *
     * @param key   header name
     * @param value header value
     * @return this builder
     */
    public HtmlUnitBuilder addHeader(String key, String value) {
        headers.put(key, value);
        return this;
    }

    /**
     * @param waitForBackgroundJavaScript time to wait for background JavaScript
     * @return this builder
     */
    public HtmlUnitBuilder waitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
        this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
        return this;
    }

    /** @return the request URL, or null if none was set */
    public String url() {
        return url;
    }

    /** @return the configured retry count */
    public int retry() {
        return retry;
    }

    /** @return the proxy host, or null if no proxy was set */
    public String proxyHost() {
        return proxyHost;
    }

    /** @return the proxy port */
    public int proxyPort() {
        return proxyPort;
    }

    /** @return the proxy account, or null if the proxy needs no authentication */
    public String username() {
        return username;
    }

    /** @return the proxy password, or null if the proxy needs no authentication */
    public String password() {
        return password;
    }

    /** @return whether CSS support is requested */
    public boolean enableCSS() {
        return enableCSS;
    }

    /** @return whether JavaScript support is requested */
    public boolean enableJS() {
        return enableJS;
    }

    /** @return whether AJAX support is requested */
    public boolean enableAjax() {
        return enableAjax;
    }

    /** @return whether cookie support is requested */
    public boolean enableCookie() {
        return enableCookie;
    }

    /** @return the configured background-JavaScript wait time */
    public int waitForBackgroundJavaScript() {
        return waitForBackgroundJavaScript;
    }

    /** @return the live, mutable header map; never null */
    public Map<String, String> headers() {
        return headers;
    }

    /** @return the live, mutable cookie map; never null */
    public Map<String, String> cookies() {
        return cookies;
    }

    /** Copies {@code source} into a fresh mutable map; null yields an empty map. */
    private static Map<String, String> copyOf(Map<String, String> source) {
        Map<String, String> copy = new HashMap<>();
        if (source != null) {
            copy.putAll(source);
        }
        return copy;
    }
}


posted @ 2019-02-20 20:17  李子君啊  阅读(918)  评论(0编辑  收藏  举报