基于 boilerpipe 算法抓取文章类网页中的文章正文

<dependency>
       <groupId>de.l3s.boilerpipe</groupId>
       <artifactId>boilerpipe</artifactId>
       <!--<version>1.2.0</version>-->
        <version>1.1.0</version>
</dependency>
<dependency>
        <groupId>xerces</groupId>
        <artifactId>xercesImpl</artifactId>
        <version>2.9.1</version>
</dependency>
<dependency>
         <groupId>net.sourceforge.nekohtml</groupId>
         <artifactId>nekohtml</artifactId>
         <version>1.9.13</version>
</dependency>
/**
 * Extracts the main article text from an HTML page using boilerpipe's
 * {@code CANOLA_EXTRACTOR}.
 *
 * @param html the page markup as a string
 * @return the extracted text; {@code html} itself when
 *         {@code StringUtils.isEmpty(html)} is true; or {@code null} when
 *         parsing or extraction throws (the stack trace is printed)
 */
public static String getNewsContent(String html) {
        if (StringUtils.isEmpty(html)) {
            return html;
        }
        String content = null;
        // Encode with the same charset that is declared on the InputSource below.
        // The no-arg getBytes() uses the platform default charset, which corrupts
        // non-ASCII text whenever that default is not UTF-8.
        try (InputStream is = new ByteArrayInputStream(
                html.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
            InputSource inputSource = new InputSource(is);
            inputSource.setEncoding("UTF-8"); // must match the charset used to produce the bytes above
            TextDocument textDocument = new BoilerpipeSAXInput(inputSource).getTextDocument();
            BoilerpipeExtractor extractor = CommonExtractors.CANOLA_EXTRACTOR;
            extractor.process(textDocument);
            content = textDocument.getContent();
        } catch (Exception e) {
            // Best-effort extraction: report the failure and fall through to return null.
            e.printStackTrace();
        }
        return content;
    }

  

posted @ 2020-08-21 13:17  小小爬虫  阅读(184)  评论(0)  编辑  收藏  举报