1、htmlunit简单使用
1、引入依赖
<!-- HtmlUnit: headless browser; provides the WebClient used below to load the page and collect cookies -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.66.0</version>
</dependency>
<!-- Hutool: presumably the source of the HttpUtil helper used in the test snippet - confirm against the imports -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.10</version>
</dependency>
<!-- HTML Parser: provides the org.htmlparser classes used by CustomizeTag and ParseUtil -->
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>2.1</version>
</dependency>
2、封装获取Cookie的方法
/**
 * Loads {@code url} with an HtmlUnit {@link WebClient} (JavaScript enabled), waits for the page
 * scripts to run, issues one more request to the same URL and returns the cookies the client
 * holds for that URL.
 *
 * @param url    address to visit; must be a well-formed URL
 * @param method {@code "GET"} (exact match) sends the follow-up request as GET; any other value
 *               sends it as POST
 * @return the cookies stored for {@code url}, converted to Apache HttpClient cookies
 * @throws RuntimeException wrapping any failure (malformed URL, I/O error on the follow-up
 *                          request, interruption while waiting); if the wait is interrupted the
 *                          thread's interrupt flag is restored before throwing
 */
public List<org.apache.http.cookie.Cookie> getHtmlByWebClient(String url, String method) {
    // try-with-resources closes the client on every exit path.
    try (WebClient webClient = new WebClient()) {
        webClient.getOptions().setTimeout(100000); // timeout
        webClient.getOptions().setJavaScriptEnabled(true); // run page JavaScript
        webClient.getOptions().setThrowExceptionOnScriptError(false); // ignore JavaScript errors
        webClient.setCssErrorHandler(new SilentCssErrorHandler()); // ignore CSS errors
        webClient.getOptions().setCssEnabled(false); // do not process CSS
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: AJAX support
        webClient.getOptions().setRedirectEnabled(true); // follow redirects
        webClient.getCookieManager().setCookiesEnabled(true); // accept cookies

        // Parse once up front so a malformed URL fails immediately instead of after the wait.
        URL target = new URL(url);

        // First visit: lets the site set its cookies on the client.
        try {
            webClient.getPage(target);
        } catch (IOException | FailingHttpStatusCodeException ignored) {
            // Best effort: a failed or non-2xx first load may still have produced cookies,
            // so carry on with the follow-up request.
        }

        // Wait 30 seconds so the WebClient can execute the page's scripts.
        TimeUnit.SECONDS.sleep(30L);

        HttpMethod httpMethod = "GET".equals(method) ? HttpMethod.GET : HttpMethod.POST;
        // The response body is not needed; the request is made for its effect on the cookie store.
        webClient.loadWebResponse(new WebRequest(target, httpMethod));

        Set<com.gargoylesoftware.htmlunit.util.Cookie> cookies = webClient.getCookies(target);
        return com.gargoylesoftware.htmlunit.util.Cookie.toHttpClient(cookies);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can still observe the interruption.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
3、测试
@Test
void name888() {
String url = "https://www.yuque.com/wanqi-1f4b0/vlgn2k/cnf0v6tw2v5viz5t"
List<org.apache.http.cookie.Cookie> get = getHtmlByWebClient(url);
StringBuilder sb = new StringBuilder();
for (org.apache.http.cookie.Cookie cookie : get) {
HttpCookie httpCookie = new HttpCookie( cookie.getName(), cookie.getValue());
sb.append(httpCookie);
sb.append("; ");
}
Map<String, String> headers = new HashMap<>();
headers.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0");
headers.put("Host", "pic.netbian.com");
headers.put("Sec-Fetch-Dest", "document");
headers.put("Sec-Fetch-Mode", "navigate");
headers.put("Sec-Fetch-Site", "cross-site");
headers.put("Upgrade-Insecure-Requests", "1");
headers.put("Pragma", "no-cache");
headers.put("Cookie", sb.toString());
String s1 = HttpUtil.createGet(url)
.addHeaders(headers)
.execute().body();
System.out.println(s1);
}
2、html解析工具htmlparser封装
package com.wanqi.util;
import org.htmlparser.tags.CompositeTag;
/**
 * Customized htmlparser composite tag.
 *
 * <p>Reports the same fixed set of tag names from both {@link #getIds()} and
 * {@link #getEndTagEnders()}.
 *
 * @author wq
 * @version 1.0
 * @since 2020/3/9 17:37
 */
public class CustomizeTag extends CompositeTag {

    /** Tag names returned by {@link #getIds()}. */
    private static final String[] TAG_IDS = {
        "tbody", "b", "strong", "dd", "section", "big"
    };

    /** Tag names returned by {@link #getEndTagEnders()}. */
    private static final String[] END_TAG_ENDERS = {
        "tbody", "b", "strong", "dd", "section", "big"
    };

    /** Creates an empty tag instance. */
    public CustomizeTag() {
        super();
    }

    /** @return the tag names this class declares as its ids */
    @Override
    public String[] getIds() {
        return TAG_IDS;
    }

    /** @return the tag names this class declares as end-tag enders */
    @Override
    public String[] getEndTagEnders() {
        return END_TAG_ENDERS;
    }
}
package com.wanqi.util;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers that extract tags from an HTML string with htmlparser.
 *
 * <p>All list-returning methods return {@code null} (not an empty list) when the parser throws a
 * {@link ParserException}; the single-result methods return {@code null} when nothing matches.
 */
public class ParseUtil {

    /**
     * Extracts the tags whose class is exactly {@code tagType} and, optionally, whose attribute
     * equals a given value.
     *
     * @param <T>            tag type to extract
     * @param html           HTML text to parse
     * @param tagType        tag class; a node is accepted only if its class is exactly this class
     * @param attributeName  attribute to test, or {@code null} to accept every tag of the type
     * @param attributeValue value the attribute must equal
     * @return the matching tags, or {@code null} if parsing fails
     */
    public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        return collectTags(html, node -> node.getClass() == tagType, attributeName, attributeValue, false);
    }

    /**
     * Copies a {@link NodeList} into a typed list.
     *
     * <p>The element cast is unchecked: callers must only pass lists whose nodes are of type
     * {@code T}.
     */
    @NotNull
    @SuppressWarnings("unchecked")
    private static <T extends TagNode> List<T> getTs(NodeList tagList) {
        List<T> tags = new ArrayList<T>(tagList.size());
        for (int i = 0; i < tagList.size(); i++) {
            tags.add((T) tagList.elementAt(i));
        }
        return tags;
    }

    /**
     * Extracts every tag node (of any tag class) whose attribute equals a given value.
     *
     * <p>NOTE(review): {@code tagType} is not used for filtering; it only determines the static
     * element type of the result. Passing anything narrower than {@code TagNode.class} can put
     * tags of other classes into the list and fail with {@code ClassCastException} at the caller.
     *
     * @param <T>            static element type of the result
     * @param html           HTML text to parse
     * @param tagType        not used for filtering (see note above)
     * @param attributeName  attribute to test, or {@code null} to accept every tag node
     * @param attributeValue value the attribute must equal
     * @return the matching tags, or {@code null} if parsing fails
     */
    public static <T extends TagNode> List<T> parseNodes(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        return collectTags(html, node -> node instanceof TagNode, attributeName, attributeValue, false);
    }

    /**
     * Extracts all tags whose class is exactly {@code tagType}.
     *
     * @return the matching tags, or {@code null} if parsing fails
     */
    public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType) {
        return parseTags(html, tagType, null, null);
    }

    /**
     * First result of {@link #parseTags(String, Class, String, String)}.
     *
     * @return the first matching tag, or {@code null} if there is none or parsing fails
     */
    public static <T extends TagNode> T parseTag(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        return firstOrNull(parseTags(html, tagType, attributeName, attributeValue));
    }

    /**
     * First result of {@link #parseNodes(String, Class, String, String)}.
     *
     * @return the first matching tag, or {@code null} if there is none or parsing fails
     */
    public static <T extends TagNode> T parseNode(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        return firstOrNull(parseNodes(html, tagType, attributeName, attributeValue));
    }

    /**
     * First tag whose class is exactly {@code tagType}.
     *
     * @return the first matching tag, or {@code null} if there is none or parsing fails
     */
    public static <T extends TagNode> T parseTag(String html, final Class<T> tagType) {
        return parseTag(html, tagType, null, null);
    }

    /**
     * Extracts every tag node (of any tag class) whose attribute contains a given substring.
     *
     * <p>NOTE(review): as with {@link #parseNodes}, {@code tagType} is not used for filtering.
     *
     * @param <T>            static element type of the result
     * @param html           HTML text to parse
     * @param tagType        not used for filtering
     * @param attributeName  attribute to test, or {@code null} to accept every tag node
     * @param attributeValue substring the attribute must contain; {@code null} matches nothing
     * @return the matching tags, or {@code null} if parsing fails
     */
    public static <T extends TagNode> List<T> parseFuzzyTags(String html, final Class<T> tagType,
            final String attributeName, final String attributeValue) {
        return collectTags(html, node -> node instanceof TagNode, attributeName, attributeValue, true);
    }

    /**
     * First result of {@link #parseFuzzyTags(String, Class, String, String)}.
     *
     * @return the first matching tag, or {@code null} if there is none or parsing fails
     */
    public static <T extends TagNode> T parseFuzzyTag(String html, final Class<T> tagType, final String attributeName,
            final String attributeValue) {
        return firstOrNull(parseFuzzyTags(html, tagType, attributeName, attributeValue));
    }

    /**
     * Finds the first custom tag with the given name.
     *
     * @param html    HTML text to parse
     * @param tagType tag name; must be one of {@link CustomizeTag#getIds()}
     * @return the first matching tag, or {@code null} if there is none, the name is not
     *         supported, or parsing fails
     */
    public static CustomizeTag parseTagsByCustomize(String html, final String tagType) {
        return firstOrNull(parseTagsByCustomizes(html, tagType, null, null));
    }

    /**
     * Finds the first custom tag with the given name whose attribute equals a given value.
     *
     * @param html           HTML text to parse
     * @param tagType        tag name; must be one of {@link CustomizeTag#getIds()}
     * @param attributeName  attribute to test, or {@code null} to accept every tag of that name
     * @param attributeValue value the attribute must equal
     * @return the first matching tag, or {@code null} if there is none, the name is not
     *         supported, or parsing fails
     */
    public static CustomizeTag parseTagsByCustomize(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        return firstOrNull(parseTagsByCustomizes(html, tagType, attributeName, attributeValue));
    }

    /**
     * Finds all custom tags with the given name whose attribute equals a given value.
     *
     * @param html           HTML text to parse
     * @param tagType        tag name; must be one of {@link CustomizeTag#getIds()}
     * @param attributeName  attribute to test, or {@code null} to accept every tag of that name
     * @param attributeValue value the attribute must equal
     * @return the matching tags, or {@code null} if the name is not supported or parsing fails
     */
    public static List<CustomizeTag> parseTagsByCustomizes(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        return collectCustomizeTags(html, tagType, attributeName, attributeValue, false);
    }

    /**
     * Finds the first custom tag with the given name.
     *
     * @param html    HTML text to parse
     * @param tagType tag name; must be one of {@link CustomizeTag#getIds()}
     * @return the first matching tag, or {@code null} if there is none, the name is not
     *         supported, or parsing fails
     */
    public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType) {
        return firstOrNull(parseFuzzyTagsByCustomizes(html, tagType, null, null));
    }

    /**
     * Finds the first custom tag with the given name whose attribute contains a given substring.
     *
     * @param html           HTML text to parse
     * @param tagType        tag name; must be one of {@link CustomizeTag#getIds()}
     * @param attributeName  attribute to test, or {@code null} to accept every tag of that name
     * @param attributeValue substring the attribute must contain; {@code null} matches nothing
     * @return the first matching tag, or {@code null} if there is none, the name is not
     *         supported, or parsing fails
     */
    public static CustomizeTag parseFuzzyTagsByCustomize(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        return firstOrNull(parseFuzzyTagsByCustomizes(html, tagType, attributeName, attributeValue));
    }

    /**
     * Finds all custom tags with the given name whose attribute contains a given substring.
     *
     * @param html           HTML text to parse
     * @param tagType        tag name; must be one of {@link CustomizeTag#getIds()}
     * @param attributeName  attribute to test, or {@code null} to accept every tag of that name
     * @param attributeValue substring the attribute must contain; {@code null} matches nothing
     * @return the matching tags, or {@code null} if the name is not supported or parsing fails
     */
    public static List<CustomizeTag> parseFuzzyTagsByCustomizes(String html, final String tagType, final String attributeName,
            final String attributeValue) {
        return collectCustomizeTags(html, tagType, attributeName, attributeValue, true);
    }

    /** Returns the first element of {@code items}, or {@code null} if the list is null or empty. */
    private static <E> E firstOrNull(List<E> items) {
        return (items == null || items.isEmpty()) ? null : items.get(0);
    }

    /**
     * Decides whether a tag passes the attribute condition.
     *
     * <p>A {@code null} attribute name accepts every tag. A {@code null} expected value never
     * matches, in both modes (previously the fuzzy mode threw {@code NullPointerException}).
     */
    private static boolean attributeMatches(TagNode tag, String attributeName, String attributeValue,
            boolean fuzzy) {
        if (attributeName == null) {
            return true;
        }
        String actual = tag.getAttribute(attributeName);
        if (actual == null || attributeValue == null) {
            return false;
        }
        return fuzzy ? actual.contains(attributeValue) : actual.equals(attributeValue);
    }

    /**
     * Parses {@code html} and collects the nodes accepted by {@code typeFilter} that also pass
     * the attribute condition. {@code typeFilter} must only accept {@link TagNode} instances.
     */
    private static <T extends TagNode> List<T> collectTags(String html, NodeFilter typeFilter,
            String attributeName, String attributeValue, boolean fuzzy) {
        try {
            // Create an HTML parser.
            Parser parser = new Parser();
            parser.setInputHTML(html);
            NodeList tagList = parser.parse((NodeFilter) node ->
                    typeFilter.accept(node)
                            && attributeMatches((TagNode) node, attributeName, attributeValue, fuzzy));
            return getTs(tagList);
        } catch (ParserException ignored) {
            // Parse failures are reported to callers as a null result.
            return null;
        }
    }

    /**
     * Parses {@code html} with {@link CustomizeTag} registered as the prototype for its tag names
     * and collects the tags named {@code tagType} that pass the attribute condition.
     */
    private static List<CustomizeTag> collectCustomizeTags(String html, String tagType,
            String attributeName, String attributeValue, boolean fuzzy) {
        try {
            // Create an HTML parser that builds CustomizeTag nodes for the custom tag names.
            CustomizeTag customizeTag = new CustomizeTag();
            Parser parser = new Parser();
            parser.setInputHTML(html);
            PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
            factory.registerTag(customizeTag);
            parser.setNodeFactory(factory);

            // Only names declared by CustomizeTag are supported; id.equals(tagType) is null-safe.
            String tagName = null;
            for (String id : customizeTag.getIds()) {
                if (id.equals(tagType)) {
                    tagName = id;
                    break;
                }
            }
            if (tagName == null) {
                return null;
            }

            NodeList tagList = parser.parse(new TagNameFilter(tagName));
            List<CustomizeTag> tags = new ArrayList<CustomizeTag>();
            for (int i = 0; i < tagList.size(); i++) {
                CustomizeTag tag = (CustomizeTag) tagList.elementAt(i);
                if (attributeMatches(tag, attributeName, attributeValue, fuzzy)) {
                    tags.add(tag);
                }
            }
            return tags;
        } catch (ParserException ignored) {
            // Parse failures are reported to callers as a null result.
            return null;
        }
    }
}