Java 利用 htmlparser 得到网页 HTML 内容
Java 利用 htmlparser 得到网页 HTML 内容：借助 org.htmlparser.Parser 类，我们可以很轻松地取到任意页面的内容（注意：下面的示例使用 TextExtractingVisitor，得到的是从页面中提取出的文本，而不是原始的 HTML 源代码），方法如下：
/**
 * Returns the text extracted from the default page, {@code http://www.lingshij.com}.
 *
 * <p>Kept for existing callers; delegates to {@link #getItemDesc(String)}.
 *
 * @return the text extracted from the page, or an empty string if the page
 *         could not be fetched or parsed
 */
public static String getItemDesc() {
    return getItemDesc("http://www.lingshij.com");
}

/**
 * Fetches the page at {@code path} and returns the text that
 * {@code TextExtractingVisitor} collects while visiting every parsed node.
 *
 * <p>This is a best-effort call: any failure (malformed URL, I/O error,
 * parser error, or unexpected runtime exception) is printed to standard
 * error and an empty string is returned instead of propagating.
 *
 * <p>NOTE(review): the page is always decoded as GBK; pages served in another
 * encoding will presumably come back garbled. Confirm before reusing this
 * for arbitrary sites.
 *
 * @param path absolute URL of the page to fetch; {@code null} or empty
 *             yields an empty result without attempting a connection
 * @return the extracted text, or an empty string on failure
 */
public static String getItemDesc(String path) {
    if (path == null || path.isEmpty()) {
        return "";
    }
    try {
        URLConnection conn = new URL(path).openConnection();
        // Timeouts are in milliseconds; without them a stalled server
        // would block the caller indefinitely.
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(15000);

        Parser parser = new Parser();
        parser.setConnection(conn);
        // Encoding is set after the connection is attached, as in the original.
        parser.setEncoding("GBK");

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        return visitor.getExtractedText();
    } catch (Exception e) {
        // Deliberately broad: the original handled ParserException,
        // MalformedURLException, IOException and Exception identically,
        // so a single handler preserves that behaviour.
        e.printStackTrace();
        return "";
    }
}