准备条件
加入依赖jar包
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.15</version>
</dependency>
代码示例
/**
 * Builds the WebClient used by the crawler.
 *
 * The client emulates Chrome, has JavaScript and CSS processing switched off,
 * applies an 8000 ms limit to both the connection timeout and the JavaScript
 * timeout, and does not throw on script errors.
 *
 * @return the configured client
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
private WebClient initWc() throws IOException {
    final int timeoutMillis = 8000;
    WebClient client = new WebClient(BrowserVersion.CHROME);

    // Page rendering features that the crawler does not need.
    client.getOptions().setCssEnabled(false);
    client.getOptions().setJavaScriptEnabled(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setTimeout(timeoutMillis);

    client.setJavaScriptTimeout(timeoutMillis);
    client.setAjaxController(new NicelyResynchronizingAjaxController());
    // NOTE(review): no page has been requested yet at this point, so this wait
    // presumably has nothing to wait for -- confirm whether it is needed here.
    client.waitForBackgroundJavaScript(timeoutMillis);

    // Optional settings left disabled by the original author:
    //   client.setCache(new Cache());
    //   client.getOptions().setThrowExceptionOnFailingStatusCode(false);
    return client;
}
/**
 * Crawls listing pages 1 through 97, flushing {@code riskCompanyDao} after each
 * page, and finally logs the comma-separated numbers of the pages that failed.
 *
 * If the WebClient cannot be created the method logs the failure and returns
 * without crawling. Once a client exists, its windows are always closed on exit.
 */
public void loadData() {
    final int lastPage = 97;

    WebClient wc;
    try {
        wc = initWc();
    } catch (IOException e) {
        // Without a client every page request would fail, so stop here instead
        // of running the loop and hitting a null client in the finally block.
        log.warn("loadData error! ", e);
        return;
    }

    try {
        // Needed when parsing the text out of images.
        IIORegistry registry = IIORegistry.getDefaultInstance();
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());
        registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());

        // loadPage appends the number of every page it failed to process.
        StringBuffer errPage = new StringBuffer();
        for (int pageNo = 1; pageNo <= lastPage; pageNo++) {
            loadPage(pageNo, errPage, wc);
            riskCompanyDao.flush();
        }
        log.info("errPage:" + errPage);
        // To debug a single page, call e.g. loadPage(27, errPage, wc) instead of the loop.
    } catch (Exception e) {
        log.warn("loadData error! ", e);
    } finally {
        wc.closeAllWindows();
    }
}
private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){
HtmlPage page;
try {
String refer="http://www.baidu.com/";
URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html");
WebRequest request=new WebRequest(link);
request.setCharset("UTF-8");
request.setAdditionalHeader("Referer", refer);//设置请求报文头里的refer字段
设置请求报文头里的User-Agent字段
request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
request.setAdditionalHeader("Connection", "keep-alive");
request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");
page = wc.getPage(request);
HtmlPage pageResult = page;
HtmlTable tableResult = (HtmlTable) pageResult.getElementsByTagName("table").get(0);
HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1);
int indexRow = 0;
for ( DomNode node2 : body.getChildNodes() ) {
if (node2 instanceof HtmlTableRow ) {
HtmlTableRow row = (HtmlTableRow) node2;
List<HtmlTableCell> cells = row.getCells();
HtmlTableCell cell0=cells.get(0);
String companyName = cell0.getElementsByTagName("a").get(0).getTextContent();
String industryName = cell0.getElementsByTagName("div").get(0).getTextContent();
industryName = industryName.split(":")[1];
String addr = cell0.getElementsByTagName("div").get(1).getTextContent();
if (addr.split(":").length>1){
addr = addr.split(":")[1];
}else{
addr=null;
}
String mobile =null;
if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){
HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0);
String imgStr =img.getAttribute("src");
imgStr =imgStr.substring(0,imgStr.indexOf("&font=")).replace("fontsize=12", "fontsize=22");
mobile = ImageRead.getImgStr(imgStr);
log.info("mobile:"+mobile);
}
}
indexRow++;
}
} catch (Exception e) {
errPage.append(pageNo).append(",");
log.warn("page error :"+pageNo,e);
}
}
注意事项
普通的 HTTP 连接请求容易被网站拦截，需设置请求报文头（如 Referer、User-Agent、Cookie），模拟浏览器请求
WebClient在请求发起前初始化一次即可
不同浏览器版本返回的 HTML 代码有一定差异，需单独调试
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 25岁的心里话
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现
2018-07-23 (c#) 销毁资源和释放内存
2018-07-23 C#中 ThreadStart和ParameterizedThreadStart区别
2018-07-23 MongoDB的C#封装类
2018-07-23 mongo DB for C#
2018-07-23 C#操作MongoDB
2018-07-23 关于Mongodb的全面总结
2018-07-23 Mongodb下载、安装、配置与使用