利用htmlparser读取html文档的内容
1.添加相关的的jar
htmlparser-2.1.jar
2.方法和代码
public static String readHtml(File html) {
String htmlPath = html.getAbsolutePath();
String text = "";
Parser parser = null;
try {
parser = new Parser(htmlPath);
} catch (Exception e) {
e.printStackTrace();
}
try {
parser.setEncoding("UTF-8");
} catch (Exception e) {
e.printStackTrace();
}
HtmlPage visitor = new HtmlPage(parser);
try {
parser.visitAllNodesWith(visitor);
} catch (Exception e) {
e.printStackTrace();
}
NodeList nodes = visitor.getBody();
int size = nodes.size();
for (int i = 0; i < size; i++) {
Node node = nodes.elementAt(i);
text += node.toPlainTextString();
}
return text;
}