package org.apache.nutch.htmlfilter.my;
import java.io.UnsupportedEncodingException;
import java.util.regex.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.Crawl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.DocumentFragment;
/**
 * HTML parse filter that pulls a handful of fields out of the raw page markup
 * with regular expressions and records them in the parse metadata under the
 * keys {@code p_title}, {@code p_article}, {@code p_site}, {@code p_pubdate},
 * {@code p_refurl} and {@code p_cate}.
 *
 * <p>The patterns are tied to one specific page layout (CSS classes
 * {@code b14c}, {@code h14} and {@code h12}); a field whose pattern does not
 * match is passed to the metadata as {@code null}.
 */
public class MyHtmlParseFilter implements HtmlParseFilter {
  public static final Log LOG = LogFactory.getLog(MyHtmlParseFilter.class);

  /** Captures the text of a {@code <span>} carrying {@code class="b14c"}. */
  private static final Pattern TITLE_PATTERN = Pattern
      .compile("<span .+class=\"b14c\">(.*?)</span>");

  /** Captures the inner markup of a {@code <td>} carrying {@code class="h14"}. */
  private static final Pattern ARTICLE_PATTERN = Pattern
      .compile("<td .*class=\"h14\".*>([\\s\\S]+?)</td>");

  /** Captures the text following the publish-time label inside a {@code h12} font tag. */
  private static final Pattern PUBDATE_PATTERN = Pattern
      .compile("<font class=\"h12\">发布时间:(.*)</font>");

  private Configuration conf;

  /**
   * Extracts the fields from the raw content and adds them to the parse
   * metadata of the entry registered under the content's URL.
   *
   * <p>Any failure during extraction is logged and otherwise ignored, so the
   * parse result is always returned to the caller.
   *
   * @param content     fetched content whose raw bytes are scanned
   * @param parseResult parse result whose metadata is augmented in place
   * @param metaTags    unused
   * @param doc         unused
   * @return the same {@code parseResult} instance that was passed in
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // Nothing to annotate; previously this dereferenced null and threw.
      LOG.warn("No parse entry for " + content.getUrl()
          + "; skipping field extraction");
      return parseResult;
    }
    Metadata md = parse.getData().getParseMeta();
    try {
      // Extract fields; example of pulling out the article body.
      // NOTE(review): assumes the HTML parser recorded the detected charset
      // under Metadata.CHAR_ENCODING_FOR_CONVERSION - confirm for the Nutch
      // version in use. When it is absent the platform default is used, which
      // is what this filter always did before.
      String html = decode(content.getContent(),
          md.get(Metadata.CHAR_ENCODING_FOR_CONVERSION));

      String title = extract(html, TITLE_PATTERN);
      String article = extract(html, ARTICLE_PATTERN);
      String site = "中国公路信息网|行业动态|新通车信息";
      String refurl = null;
      String cate = "1234567";

      // A page without a publish date must not abort the whole extraction:
      // only normalise the date when the pattern actually matched.
      String rawPubdate = extract(html, PUBDATE_PATTERN);
      String pubdate = null;
      if (rawPubdate != null) {
        pubdate = rawPubdate.replace('年', '-').replace('月', '-')
            .replace("日", "");
      }

      md.add("p_title", title);
      md.add("p_article", article);
      md.add("p_site", site);
      md.add("p_pubdate", pubdate);
      md.add("p_refurl", refurl);
      md.add("p_cate", cate);
    } catch (Exception e) {
      // Best effort: keep the parse result usable, but log the full cause
      // (e.getMessage() alone is null for a NullPointerException).
      LOG.warn("Field extraction failed for " + content.getUrl(), e);
    }
    return parseResult;
  }

  /**
   * Decodes raw page bytes into text.
   *
   * @param raw         page bytes
   * @param charsetName charset to decode with, or {@code null} if unknown
   * @return the decoded text; decoded with the platform default charset when
   *         {@code charsetName} is {@code null} or not supported by this JVM
   */
  private static String decode(byte[] raw, String charsetName) {
    if (charsetName != null) {
      try {
        return new String(raw, charsetName);
      } catch (UnsupportedEncodingException e) {
        LOG.warn("Unsupported charset '" + charsetName
            + "'; falling back to the platform default", e);
      }
    }
    return new String(raw);
  }

  /**
   * Returns the trimmed first capture group of the LAST match of {@code p} in
   * {@code html}.
   *
   * @param html text to search
   * @param p    pattern with at least one capture group
   * @return the trimmed group, or {@code null} if the pattern never matches
   */
  private String extract(String html, Pattern p) {
    Matcher match = p.matcher(html);
    String val = null;
    // Keep iterating so that a later match overrides an earlier one.
    while (match.find()) {
      val = match.group(1);
      if (val != null) {
        val = val.trim();
      }
    }
    return val;
  }

  /** Returns the configuration previously supplied through {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }

  /** Stores the configuration for later retrieval through {@link #getConf}. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }
}