JSOUP 简单爬取网页

前情提要:

公司有一个需求:利用 Java 订阅知乎 RSS 的数据源。上网查询资料后发现,Jsoup 是个不错的选择,操作简单方便。

一、maven项目的pom依赖

<!-- jsoup: HTML parsing library used by the code sample below -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>

二、代码案例

package com.rssdemo.utils;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Reads the content behind a web page URL.
 *
 * <p>Demonstrates two approaches: reading the raw response text through
 * {@code java.net.URL}, and fetching/parsing the page with jsoup.
 *
 * @author: zy
 * @Date: 2020/8/14 13:35
 */
public class HtmlContent {

    /**
     * Fetches the same page with both approaches and prints the results to standard output.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or either request fails
     */
    public static void main(String[] args) throws Exception {
        // The URL must carry its protocol (http:// or https://); without it new URL(...) fails with
        // java.net.MalformedURLException: no protocol: www.baidu.com
        String url = "https://www.zhihu.com/question/412188418/answer/1386120965?utm_campaign=rss&utm_medium=rss&utm_source=rss&utm_content=title";

        // Approach 1: read the raw response text line by line.
        HtmlContent htmlContent = new HtmlContent();
        String content = htmlContent.readUrl(url, StandardCharsets.UTF_8);
        System.out.println(content);
        System.out.println("--------------");

        // Approach 2: let jsoup fetch and parse the page, then select elements by class name.
        Document doc = Jsoup.connect(url).get();
        Elements elementsByClass = doc.getElementsByClass("NumberBoard-itemValue");
        // The page may contain no element with this class; guard so that get(0) cannot
        // throw IndexOutOfBoundsException.
        if (elementsByClass.isEmpty()) {
            System.err.println("No element with class \"NumberBoard-itemValue\" found at " + url);
            return;
        }
        // Only the first match is printed; iterate over elementsByClass to print every match.
        String text = elementsByClass.get(0).text();
        System.out.println(text);
    }

    /**
     * Reads the full content of the given URL as text.
     *
     * <p>Every line read is followed by {@code "\r\n"} in the result, including the last one.
     *
     * @param url     the page URL, including its protocol
     * @param charset the character set used to decode the response bytes
     * @return the decoded content with each line terminated by {@code "\r\n"}
     * @throws Exception if the URL is malformed, the connection cannot be opened,
     *                   or reading the response fails
     */
    public String readUrl(String url, Charset charset) throws Exception {
        // try-with-resources closes the reader, and with it the connection's input stream,
        // on both normal completion and failure.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(url).openConnection().getInputStream(), charset))) {
            // StringBuilder suffices: the accumulator never leaves this method.
            StringBuilder result = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(line).append("\r\n"); // terminate every line that was read
            }
            return result.toString();
        }
    }

}
posted @ 2020-10-27 01:20  Jsonring  阅读(117)  评论(0编辑  收藏  举报
分享到: