HtmlparseUtil.java
该类并不是一个通用的工具类，需要按自己的需求另行实现；这里只记录了 htmlparser.jar 包的一些用法，仅此而已！
详细看这里:http://gundumw100.iteye.com/blog/704311
import java.util.*; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * httpclient与htmlparse对网页的解析 * * @author Administrator * */ public class HtmlparseUtil { WebHttpClient util=new WebHttpClient(); /** * 获得网页中的超链接,将href和text保存在Map中:map(href,text) * @param url * @param charset * @return */ public Map<String, String> linkGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); Map<String, String> linkMap = new HashMap<String, String>(); try { //开始解析 Parser parser = Parser.createParser(content, charset); // 过滤出<a></a>标签 NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); NodeList list = parser.extractAllNodesThatMatch(linkFilter); Node node = null; for (int i = 0; i < list.size(); i++) { node = list.elementAt(i); // 获得网页中的链接map(href,text) linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText())); } } catch (ParserException e) { e.printStackTrace(); } return linkMap; } /** * 获得网页<body></body>标签中的内容, 保存在body中 * @param url * @param charset * @return */ public String bodyGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); String body = ""; try { Parser parser = Parser.createParser(content, charset); // 过滤<body></body>标签 NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class); NodeList list = parser.extractAllNodesThatMatch(bodyFilter); Node node = null; for (int i = 0; i < list.size(); i++) { node = list.elementAt(i); // 获得网页内容 保存在content中 body = ((BodyTag) node).getBody(); } } catch (ParserException e) { e.printStackTrace(); } return body; } /** * 过滤出class为term的<span>元素,并获得他们的文本 * @param 
url * @param charset * @return */ public Map<String,String> termGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); Map<String, String> map = new HashMap<String, String>(); try { //开始解析 // 过滤出class为term的<span>元素 Parser parser = Parser.createParser(content, charset); AndFilter filter = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term")); Node node = null; NodeList nodeList = parser.parse(filter); for (int i = 0; i < nodeList.size(); i++) { node = nodeList.elementAt(i); map.put("term", node.toPlainTextString()); } // 过滤出class为start-time的<span>元素 Parser parser2 = Parser.createParser(content, charset); AndFilter filter2 = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time")); NodeList nodeList2 = parser2.parse(filter2); for (int i = 0; i < nodeList2.size(); i++) { node = nodeList2.elementAt(i); map.put("start-time", node.toPlainTextString()); } // 过滤出id为J_SingleEndTimeLabel的<span>元素 Parser parser3 = Parser.createParser(content, charset); AndFilter filter3 = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel")); NodeList nodeList3 = parser3.parse(filter3); for (int i = 0; i < nodeList3.size(); i++) { node = nodeList3.elementAt(i); map.put("end-time", node.toPlainTextString()); } // 过滤出class为box post的<div>元素 Parser parser4 = Parser.createParser(content, charset); AndFilter filter4 = new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post")); NodeList nodeList4 = parser4.parse(filter4); for (int i = 0; i < nodeList4.size(); i++) { node = nodeList4.elementAt(i); String temp=node.toPlainTextString().trim(); temp=temp.substring(10,20).trim(); map.put("pre-term", temp); } // 过滤出class为J_AwardNumber的<span>元素 Parser parser5 = Parser.createParser(content, charset); // AndFilter filter5 = // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber")); NodeList nodeList5 = parser5.parse(new 
HasAttributeFilter("class","J_AwardNumber")); StringBuffer buffer=new StringBuffer(); for (int i = 0; i < nodeList5.size(); i++) { node = nodeList5.elementAt(i); buffer.append(","+node.toPlainTextString()); } buffer.append("|"); // 过滤出class为blue J_AwardNumber的<span>元素 Parser parser6 = Parser.createParser(content, charset); // AndFilter filter6 = // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber")); NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber")); for (int i = 0; i < nodeList6.size(); i++) { node = nodeList6.elementAt(i); buffer.append(node.toPlainTextString()+","); } map.put("numbers", buffer.toString()); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return map; } private String processText(String content){ content=content.trim().replaceAll(" ", ""); // content=content.replaceAll("<p>", "\n"); // content=content.replaceAll("</TD>", ""); // content=content.replaceAll("</div>", ""); // content=content.replaceAll("</a>", ""); // content=content.replaceAll("<a href=.*>", ""); return content; } public static void main(String[] str) { String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1"; HtmlparseUtil util=new HtmlparseUtil(); Map<String,String> map=util.termGet(url, "gb2312"); System.out.println("term="+map.get("term"));//<span class="term">第<em>10074</em>期</span> System.out.println("start-time="+map.get("start-time"));// System.out.println("end-time="+map.get("end-time"));// System.out.println("pre-term="+map.get("pre-term"));// System.out.println("numbers="+map.get("numbers"));// /* Map<String, String> linkMap = util.linkGet(url, "gb2312"); for (String s : linkMap.keySet()) { System.out.println(s + " = " + linkMap.get(s)); //如果是个链接,则再获取它的<body>中的内容 // if (s.startsWith("http")) { // util.bodyGet(s, "gb2312"); // } } */ } }