Java操作Html

由于工作中需要使用 Java 解析和操作 HTML 页面,所以搜索了一些资料,在这里做个汇总。

一、java解析Html

使用 Java 解析 HTML 需要引入 jsoup 的包;这里的 HttpClient 用来模拟发送 HTTP 请求、获取页面内容。

        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>

代码如下:

/**
 * Downloads the Baidu home page with Apache HttpClient, parses the HTML with
 * jsoup and prints the elements matched by {@code input[id=su]} together with
 * the result of {@code val()} on that selection.
 *
 * <p>In addition to the types the snippet already relied on, this version needs
 * {@code org.apache.http.impl.client.CloseableHttpClient} and
 * {@code org.apache.http.client.methods.CloseableHttpResponse} imported.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    HttpGet httpGet = new HttpGet("https://www.baidu.com/");
    // NOTE(review): Content-Type describes a request body, which a GET does not
    // carry; an Accept header is probably what was intended - confirm before changing.
    httpGet.setHeader("Content-Type","text/plain;charset=UTF-8");

    // try-with-resources closes the response first and then the client, so the
    // connection and the client's connection pool are released on every path.
    try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
         CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {

        HttpEntity httpEntity = httpResponse.getEntity();
        if (httpEntity == null) {
            // EntityUtils.toString rejects a null entity with an unchecked
            // exception, so stop here with a readable message instead.
            System.err.println("Response contained no entity; nothing to parse.");
            return;
        }
        String resHtml = EntityUtils.toString(httpEntity,"UTF-8");

        // Parse the downloaded markup with jsoup and select by CSS query.
        Document doc = Jsoup.parse(resHtml);
        Elements matched = doc.select("input[id=su]");
        System.out.println(matched);
        System.out.println(matched.val());
    } catch (IOException e) {
        e.printStackTrace();
    }
}

我用java代码获取的如下图的元素:

输出结果为:

二、使用htmlunit模拟请求

需要引入下面jar包

        <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.35.0</version>
        </dependency>

代码如下:

package com.wh.utils;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.*;

/**
 * Demo that drives a Baidu search through HtmlUnit and saves the result page.
 *
 * <p>The program loads the Baidu home page, sets a value on the input with id
 * {@code kw}, clicks the input with id {@code su}, and writes the XML of the
 * page returned by the click to a local file.
 *
 * <p>Date: 2019-08-05 21:45
 *
 * @author wanghao
 */
public class HttpAnalysisHtml {

    /** File that receives the serialized page returned by the search click. */
    private static final String OUTPUT_PATH =
            "D:\\repository\\code\\util\\src\\main\\resources\\data\\test002.html";

    /**
     * Runs the search and stores the resulting page at {@link #OUTPUT_PATH}.
     * I/O failures are reported by printing the stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the WebClient when the demo finishes.
        try (WebClient webClient = new WebClient()) {
            // Turn off JavaScript support.
            webClient.getOptions().setJavaScriptEnabled(false);
            // Turn off CSS support.
            webClient.getOptions().setCssEnabled(false);

            // Fetch the page to work on.
            HtmlPage page = (HtmlPage) webClient.getPage("https://www.baidu.com/");
            // Locate the search input box.
            HtmlInput input = (HtmlInput) page.getHtmlElementById("kw");
            // "Type" the query into the input box.
            input.setValueAttribute("王浩");
            // Locate the search button.
            HtmlInput btn = (HtmlInput) page.getHtmlElementById("su");
            // "Click" the button; the click returns the resulting page.
            HtmlPage page2 = btn.click();

            // The writer buffers its output, so it must be closed (and thereby
            // flushed) or the file can end up empty or truncated. The charset is
            // given explicitly because the page contains non-ASCII text and the
            // platform default encoding varies between machines.
            try (FileOutputStream fos = new FileOutputStream(new File(OUTPUT_PATH));
                 OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8")) {
                osw.write(page2.asXml());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

 

 

 

 

 

参考文档:

https://blog.csdn.net/larger5/article/details/79683048

 https://blog.csdn.net/zhanglei500038/article/details/74858395

posted @ 2019-08-05 22:53  苦心明  阅读(6776)  评论(1)  编辑  收藏  举报