import java.io.IOException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Small command-line scraper: downloads one shop page over HTTP and prints the
 * contents of every {@code <div>} whose {@code id} starts with {@code review_}.
 */
public class TT {

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2";

    /** Prefix of the {@code id} attribute that marks a review {@code <div>}. */
    private static final String REVIEW_ID_PREFIX = "review_";

    /**
     * Fetches a fixed shop page and prints its reviews to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the page cannot be downloaded
     * @throws HttpException if an HTTP protocol error occurs
     * @throws ParserException if the downloaded page cannot be parsed
     */
    public static void main(String[] args) throws HttpException, IOException, ParserException {
        String resource = getContent("http://www.dianping.com/shop/1968937");
        getReview(resource);
    }

    /**
     * Performs an HTTP GET on {@code url} and returns the response body as a string.
     *
     * @param url the address to fetch
     * @return the response body as returned by {@code GetMethod.getResponseBodyAsString()}
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException if a transport error occurs
     */
    public static String getContent(String url) throws HttpException, IOException {
        HttpClient client = new HttpClient();
        // Set the User-Agent request header on the client.
        client.getParams().setParameter(HttpMethodParams.USER_AGENT, USER_AGENT);

        GetMethod get = new GetMethod(url);
        try {
            client.executeMethod(get);
            // The body must be read before the connection is released.
            return get.getResponseBodyAsString();
        } finally {
            // Always hand the connection back, even when the request fails.
            get.releaseConnection();
        }
    }

    /**
     * Parses {@code resource}, and for every {@code <div>} whose {@code id} starts with
     * {@code review_} prints a numbered separator line followed by the cleaned-up
     * inner HTML of that div.
     *
     * @param resource the input handed to the HTML parser (here: the downloaded page)
     * @throws ParserException if the parser cannot be created or parsing fails
     */
    public static void getReview(String resource) throws ParserException {
        Parser parser = new Parser(resource);
        NodeFilter divFilter = new NodeClassFilter(Div.class);
        NodeList divs = parser.parse(divFilter);

        int count = 0;
        // Strictly less than size(): the last valid index is size() - 1.
        for (int i = 0; i < divs.size(); i++) {
            if (!(divs.elementAt(i) instanceof Div)) {
                continue;
            }
            Div div = (Div) divs.elementAt(i);
            String id = div.getAttribute("id");
            if (id == null || !id.startsWith(REVIEW_ID_PREFIX)) {
                continue;
            }
            System.out.println("--------------------------------" + ++count);
            System.out.println(cleanReviewHtml(div.getChildrenHTML()));
        }
    }

    /** Strips unwanted markup from a review's inner HTML and normalises line breaks and spaces. */
    private static String cleanReviewHtml(String html) {
        // NOTE(review): the leading "//" in the first pattern is matched literally, so it
        // only removes paragraphs preceded by "//"; this looks like a leftover from a
        // commented-out line. Kept as-is; confirm the intended pattern.
        return html.replaceAll("//<p>.*</p>", "")
                .replaceAll("<span.*</span>", "")
                // Turn HTML line breaks into real newlines (was the literal text "/n").
                .replace("<br/>", "\n")
                // Replace non-breaking spaces, both entity and character form, with plain spaces.
                .replace("&nbsp;", " ")
                .replace('\u00a0', ' ');
    }
}