基于boilerpipe算法抓取文章类网页中文章文本
<dependency> <groupId>de.l3s.boilerpipe</groupId> <artifactId>boilerpipe</artifactId> <!--<version>1.2.0</version>--> <version>1.1.0</version> </dependency> <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> <version>2.9.1</version> </dependency> <dependency> <groupId>net.sourceforge.nekohtml</groupId> <artifactId>nekohtml</artifactId> <version>1.9.13</version> </dependency>
public static String getNewsContent(String html) { if (StringUtils.isEmpty(html)) return html; String content = null; InputStream is = null; try { is = new ByteArrayInputStream(html.getBytes()); InputSource inputSource = new InputSource(is); inputSource.setEncoding("UTF-8"); // 在这里设置你的文本的正确格式 TextDocument textDocument = new BoilerpipeSAXInput(inputSource).getTextDocument(); BoilerpipeExtractor extractor = CommonExtractors.CANOLA_EXTRACTOR; extractor.process(textDocument); content = textDocument.getContent(); }catch (Exception e){ e.printStackTrace(); } finally { if (is!=null){ try { is.close(); } catch (IOException e) { e.printStackTrace(); } } } return content; }
不积跬步无以至千里