htmlparser使用2

import java.net.*;
import java.io.*;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
 
public class Test {
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
test5(readNetFile());// 当然这里可以写成一个链接地址比如将html代替为"http://www.baidu.com"
}
 
public static String readNetFile() {
// System.out
// .println("-------------------------读取网页文件开始---------------------");
URL url = null;
try {
url = new URL(
"http://www.dailyfx.com.hk/calendar/index.php?date=2011-02-14");
} catch (MalformedURLException ex2) {
ex2.printStackTrace();
}
URLConnection conn = null;
BufferedReader br = null;
try {
conn = url.openConnection();
br = new BufferedReader(new InputStreamReader(
conn.getInputStream(), "utf-8"));
} catch (IOException ex1) {
ex1.printStackTrace();
}
 
String temp = null;
StringBuffer sb = new StringBuffer();
try {
temp = br.readLine();
while (temp != null) {
sb.append(temp + "\n");
temp = br.readLine();
}
} catch (IOException ex) {
ex.printStackTrace();
}
// System.out.println(sb.toString());
// System.out
// .println("-------------------------读取网页文件结束---------------------");
return sb.toString();
}
 
// static String html = "http://www.baidu.com";
 
public static void test5(String resource) throws Exception {
Parser myParser = new Parser(resource);
// Parser parser = new Parser(content);
// 设置编码
myParser.setEncoding("utf-8");
// String filterStr = "table";//这里析取得是标签为table的元素
String filterStr = "table";
NodeFilter filter = new TagNameFilter(filterStr);// 过滤这个标签
NodeList tableList = myParser.extractAllNodesThatMatch(filter);// 抽取所有table列表
 
// 只需要读取最后一个表格
for (int i = tableList.size() - 1; i < tableList.size(); i++) {
TableTag table = (TableTag) tableList.elementAt(i);
// 取得表中的行集
TableRow[] rows = table.getRows();
// 遍历每行
for (int r = 0; r < rows.length; r++) {
TableRow tr = rows[r];
TableColumn[] td = tr.getColumns();
// 行中的列
for (int c = 0; c < td.length; c++) {
System.out.print(td[c].toPlainTextString() + " ");
}
System.out.println();
 
}
 
}
}
}
posted @ 2013-05-21 16:53  licomeback  阅读(838)  评论(0编辑  收藏  举报