HtmlUnit的工具类(请求头,JavaScript,AJAX,验证代理服务器)
一. HtmlUnitUtils
package org.spider.htmlunit;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.Cookie;
import org.jsoup.helper.StringUtil;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Static helpers that turn an {@link HtmlUnitBuilder} configuration into a
 * configured HtmlUnit {@link WebClient} and fetch a page with it.
 *
 * @author long.li
 * Date: 2019/2/20 14:40
 */
public class HtmlUnitUtils {

    /**
     * Small demo: fetches one page with JavaScript and cookies enabled.
     *
     * @param args unused
     * @throws Exception if the page cannot be loaded
     */
    public static void main(String[] args) throws Exception {
        HtmlUnitBuilder builder = HtmlUnitBuilder.config()
                // The URL needs an explicit scheme; a bare host name is rejected as malformed.
                .url("http://www.baidu.com")
                .enableJS(true)
                .enableCookie(true);
        HtmlPage page = getPage(builder);
        try {
            page.asText();
        } finally {
            // The client created by getPage is owned by the page; release it when done.
            page.getWebClient().close();
        }
    }

    /**
     * Loads {@code builder.url()} with a client configured from {@code builder}.
     *
     * <p>An {@link IOException} is retried up to {@code builder.retry()} times.
     * A malformed URL and every non-I/O exception are rethrown immediately,
     * because retrying cannot fix them. After a successful load, background
     * JavaScript is given up to {@code builder.waitForBackgroundJavaScript()}
     * milliseconds to finish.
     *
     * <p>The returned page keeps using the client created here; callers that
     * are done with the page can release it via {@code page.getWebClient().close()}.
     *
     * @param builder request and browser configuration
     * @return the loaded page
     * @throws Exception the last failure once no retry is left or allowed
     */
    public static HtmlPage getPage(HtmlUnitBuilder builder) throws Exception {
        WebClient webClient = getWebClient(builder);
        int failedAttempts = 0;
        while (true) {
            try {
                HtmlPage page = webClient.getPage(builder.url());
                // Waiting only makes sense once a page (and its scripts) exists.
                if (builder.waitForBackgroundJavaScript() > 0) {
                    webClient.waitForBackgroundJavaScript(builder.waitForBackgroundJavaScript());
                }
                return page;
            } catch (Exception e) {
                boolean retryable = e instanceof IOException
                        && !(e instanceof java.net.MalformedURLException)
                        && failedAttempts < builder.retry();
                if (!retryable) {
                    // Nobody will ever see this client again, so do not leak it.
                    webClient.close();
                    throw e;
                }
                // A logging hook for "attempt N failed, retrying" would go here.
                failedAttempts++;
            }
        }
    }

    /**
     * Creates a Chrome-flavoured {@link WebClient} configured from {@code builder}:
     * optional (authenticated) proxy, CSS/JavaScript switches, AJAX controller,
     * cookies and extra request headers.
     *
     * @param builder browser configuration
     * @return a new client; the caller is responsible for closing it
     */
    public static WebClient getWebClient(HtmlUnitBuilder builder) {
        WebClient webClient;
        if (isBlank(builder.proxyHost())) {
            webClient = new WebClient(BrowserVersion.CHROME);
        } else {
            webClient = new WebClient(BrowserVersion.CHROME, builder.proxyHost(), builder.proxyPort());
            if (!isBlank(builder.username())) {
                // Proxy that requires authentication.
                ((DefaultCredentialsProvider) webClient.getCredentialsProvider())
                        .addCredentials(builder.username(), builder.password());
            }
        }

        // Basic browser settings.
        // Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Do not throw on failing HTTP status codes.
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setCssEnabled(builder.enableCSS());
        webClient.getOptions().setJavaScriptEnabled(builder.enableJS());

        if (builder.enableAjax()) {
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        }

        // NOTE(review): cookies are only ever switched on here, never off; if the
        // cookie manager is enabled by default, enableCookie(false) has no effect.
        // Confirm the intended default before changing this.
        if (builder.enableCookie()) {
            webClient.getCookieManager().setCookiesEnabled(true);
            Map<String, String> cookies = builder.cookies();
            if (cookies != null) {
                String domain = cookieDomain(builder.url());
                for (Map.Entry<String, String> cookie : cookies.entrySet()) {
                    // Cookie(domain, name, value): the value must be the map value, not the key.
                    webClient.getCookieManager()
                            .addCookie(new Cookie(domain, cookie.getKey(), cookie.getValue()));
                }
            }
        }

        Map<String, String> headers = builder.headers();
        if (headers != null && headers.size() > 0) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                webClient.addRequestHeader(header.getKey(), header.getValue());
            }
        }
        return webClient;
    }

    /**
     * Picks the cookie domain for the request: the host of {@code url} when it
     * can be parsed, otherwise the previously hard-coded {@code "/"}.
     *
     * @param url request URL, may be {@code null} or malformed
     * @return host name of the URL, or {@code "/"} as a fallback
     */
    private static String cookieDomain(String url) {
        if (isBlank(url)) {
            return "/";
        }
        try {
            String host = new java.net.URL(url).getHost();
            return isBlank(host) ? "/" : host;
        } catch (java.net.MalformedURLException e) {
            // getPage will report the bad URL; keep the old placeholder here.
            return "/";
        }
    }

    /**
     * Tells whether a string is {@code null}, empty, or made up only of
     * space, tab, newline, form-feed and carriage-return characters.
     *
     * @param string value to test, may be {@code null}
     * @return {@code true} if there is no non-whitespace character
     */
    private static boolean isBlank(String string) {
        if (string == null || string.length() == 0) {
            return true;
        }
        for (int i = 0; i < string.length(); i++) {
            // Use the local helper so this class does not depend on jsoup internals.
            if (!isWhitespace(string.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param c character (or code point) to test
     * @return {@code true} for space, tab, newline, form feed or carriage return
     */
    private static boolean isWhitespace(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
    }
}
二. HtmlUnitBuilder
package org.spider.htmlunit;
import java.util.HashMap;
import java.util.Map;
/**
* @description:
* @author: long.li
* @date: 2019/2/20 14:40
*/
public class HtmlUnitBuilder {
private String url; //请求URL
private String proxyHost; //代理服务器地址
private int proxyPort; //代理服务器端口
private String username; //代理服务器账户
private String password; //代理服务器密码
private boolean enableCSS = false; //CSS支持
private boolean enableJS = false; //JavaScript支持
private boolean enableAjax = false; //Ajax支持
private boolean enableCookie = false;//cookie支持
private int waitForBackgroundJavaScript = 0; //等待JS加载时间
private int retry = 0; //请求异常重试次数
private Map<String, String> headers = new HashMap<>(); //请求头参数
private Map<String, String> cookies = new HashMap<>(); //cookie
public static HtmlUnitBuilder config() {
return new HtmlUnitBuilder();
}
public HtmlUnitBuilder url(String url) {
this.url = url;
return this;
}
public HtmlUnitBuilder retry(int retry) {
this.retry = retry;
return this;
}
/**
* 不需要验证的代理服务器
*
* @param proxyHost
* @param proxyPort
* @return
*/
public HtmlUnitBuilder proxy(String proxyHost, int proxyPort) {
this.proxyHost = proxyHost;
this.proxyPort = proxyPort;
return this;
}
/**
* 需要验证的代理服务器
*
* @param proxyHost
* @param proxyPort
* @param username
* @param password
* @return
*/
public HtmlUnitBuilder proxy(String proxyHost, int proxyPort,
String username, String password) {
this.proxyHost = proxyHost;
this.proxyPort = proxyPort;
this.username = username;
this.password = password;
return this;
}
public HtmlUnitBuilder enableCSS(boolean enableCSS) {
this.enableCSS = enableCSS;
return this;
}
public HtmlUnitBuilder enableJS(boolean enableJS) {
this.enableJS = enableJS;
return this;
}
public HtmlUnitBuilder enableAjax(boolean enableAjax) {
this.enableAjax = enableAjax;
return this;
}
public HtmlUnitBuilder enableCookie(boolean enableCookie) {
this.enableCookie = enableCookie;
return this;
}
public HtmlUnitBuilder cookies(Map<String, String> cookies){
this.cookies = cookies;
return this;
}
/**
* 设置新的请求头集合
*
* @param headers
* @return
*/
public HtmlUnitBuilder headers(Map<String, String> headers) {
this.headers = headers;
return this;
}
/**
* 添加请求头参数
*
* @param key 键
* @param value 值
* @return
*/
public HtmlUnitBuilder addHeader(String key, String value) {
headers.put(key, value);
return this;
}
public HtmlUnitBuilder waitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
return this;
}
public String url() {
return url;
}
public int retry() {
return retry;
}
public String proxyHost() {
return proxyHost;
}
public int proxyPort() {
return proxyPort;
}
public String username() {
return username;
}
public String password() {
return password;
}
public boolean enableCSS() {
return enableCSS;
}
public boolean enableJS() {
return enableJS;
}
public boolean enableAjax() {
return enableAjax;
}
public boolean enableCookie() {
return enableCookie;
}
public int waitForBackgroundJavaScript() {
return waitForBackgroundJavaScript;
}
public Map<String, String> headers() {
return headers;
}
public Map<String,String> cookies(){
return cookies;
}
}