Robin's Blog

记录 积累 学习 成长

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

最近项目开发中编写了一个每日笑话功能。 系统每天晚上自动从  Internet  固定网站中获得一条新的笑话并保存下来。笑话可在工作台显示,并支持前后滚动查看。该功能是通过  htmlParser(  一个纯的java写的html解析的库  )技术实现。小编在这里贴出自己写的通过htmlParser解析html文本抓取新闻的案例。实现思路如下:  
  1. 设置网络代理
  2. 分析网站首页的新闻列表,内容为【<div class="hotjd"></div>】所有网页新闻地址的HTML内容。返回NodeList
  3. 提取标题链接标签,获取标题。检查数据库是否已存在该新闻,不存在就提取标题相应内容保存,跳出循环节点。已存在,就提取下一个链接标签
  4. 通过标题标签提取相应的内容
    去除新闻中href包含cheshi.com的<a>标签
    downloadImages方法下载内容中的图片
标签: HTMLParser

代码片段(1)

[文件] HtmlParser.java ~ 8KB    下载(55)

001import java.io.File;
002import java.io.FileNotFoundException;
003import java.io.FileOutputStream;
004import java.io.IOException;
005import java.io.InputStream;
006import java.net.URL;
007import java.sql.Connection;
008import java.sql.DriverManager;
009import java.sql.PreparedStatement;
010import java.sql.ResultSet;
011import java.sql.SQLException;
012 
013import org.apache.log4j.Logger;
014import org.apache.log4j.PropertyConfigurator;
015import org.htmlparser.Node;
016import org.htmlparser.NodeFilter;
017import org.htmlparser.Parser;
018import org.htmlparser.Tag;
019import org.htmlparser.filters.TagNameFilter;
020import org.htmlparser.tags.LinkTag;
021import org.htmlparser.util.NodeIterator;
022import org.htmlparser.util.NodeList;
023import org.htmlparser.util.ParserException;
024import org.htmlparser.util.SimpleNodeIterator;
025 
026/**
027 * 分析www.cheshi.com首页新闻
028 * @author j.li
029 */
030public class HtmlParser {
031    private static Logger logger;
032    private Connection conn = null;
033    private static final String SiteName = "";
034 
035    public void indexNewsContent(String sitepath) throws Exception {
036        logger.info("分析网站【" + sitepath + "】首页的新闻列表,内容为【<div class=\"hotjd\"></div>】所有网页新闻地址的HTML内容。");
037        Parser myParser = new Parser(sitepath);
038        myParser.setEncoding("GBK");
039        NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
040            public boolean accept(Node node) {
041                return ((node instanceof Tag)
042                        && !((Tag)node).isEndTag()
043                        && ((Tag)node).getTagName().equals("DIV")
044                        && ((Tag)node).getAttribute("class") != null
045                        && ((Tag)node).getAttribute("class").equals("w_box"));
046            }
047        });
048        Node node = nodeList.elementAt(1);
049        logger.debug(node.toHtml());
050        extractText(node.toHtml());
051    }
052     
053    public void extractText(String inputHtml) throws Exception {   
054        Parser parser = Parser.createParser(inputHtml, "GBK");
055        TagNameFilter filter = new TagNameFilter("a");
056        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
057        NodeIterator it = nodeList.elements();
058        getConnection();
059        while (it.hasMoreNodes()) {
060            LinkTag node = (LinkTag) it.nextNode();
061            String href = node.getLink();
062            String title = node.getLinkText();
063            logger.info("分析首页新闻【"+title+"】,链接地址【"+href+"】");
064            try {
065                if(!newsExist(title)) {
066                    insertDataBase(title, extractContent(href));
067                else {
068                    logger.info("新闻【"+title+"】数据库中已经存在,忽略进入下一个新闻分析!");
069                }
070            catch (SQLException e) {
071                logger.error("插入数据库新闻记录异常!" + e.getMessage());
072                e.printStackTrace();
073            catch (Exception e) {
074                logger.error(e.getMessage());
075                logger.info("分析新闻【"+title+"】,链接地址【"+href+"】失败,进入下一个新闻分析。");
076                e.printStackTrace();
077            }
078        }
079        closeConnection();
080    }
081 
082    public String extractContent(String content) throws Exception {
083        try {
084            Parser myParser = new Parser(content);
085            myParser.setEncoding("GBK");
086            NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
087                public boolean accept(Node node) {
088                    return ((node instanceof Tag)
089                            && !((Tag)node).isEndTag()
090                            && ((Tag)node).getTagName().equals("DIV")
091                            && ((Tag)node).getAttribute("class") != null
092                            && ((Tag)node).getAttribute("class").equals("cs_content"));
093                }
094            });
095            int size = nodeList.size();
096            Node node = nodeList.elementAt(size - 1);
097            content = node.toHtml();
098            logger.debug("==========extractContent==============");
099            logger.debug(content);
100        catch (Exception pe) {
101            logger.error("分析新闻页面出现异常!" + pe.getMessage() + "原因可能出现于新闻页面不存在<div class=\"cs_content\"></div>标记。");
102            throw pe;
103        }
104        return removeTagA(content);
105    }
106     
107    /**
108     * 去除新闻中href包含cheshi.com的<a>标签
109     * @param content 分析html内容
110     * @return 分析处理后的html内容
111     */
112    public String removeTagA(String content) throws ParserException {
113        Parser myParser = new Parser(content);
114        myParser.setEncoding("GBK");
115        NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("a"));
116        SimpleNodeIterator it = nodeList.elements();
117        while (it.hasMoreNodes()) {
118            LinkTag node = (LinkTag)it.nextNode();
119            logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。");
120            if(node.getLink().indexOf("cheshi.com") > -1)
121                content = content.replace(node.toHtml(), node.getStringText());
122        }
123        logger.debug("==========removeTagA==============");
124        logger.debug(content);
125        return downloadImages(content, "D:\\autodata\\upload\\intersite", SiteName +"upload/intersite");
126    }
127 
128    public String downloadImages(String content, String uploadImgPath, String localhost)throws ParserException {
129        File f = new File(uploadImgPath);
130        if(!f.exists()) {
131            f.mkdirs();
132        }
133        Parser myParser = new Parser(content);
134        myParser.setEncoding("GBK");
135        NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("img"));
136        SimpleNodeIterator it = nodeList.elements();
137        while(it.hasMoreNodes()) {
138            Tag tag = (Tag)it.nextNode();
139            String src = tag.getAttribute("src");
140            String filename = src.substring(src.lastIndexOf("/") + 1);
141            InputStream is = null;
142            FileOutputStream fos = null;
143            try {
144                URL url = new URL(src);
145                is = url.openStream();
146                int bytesRead = 0;
147                byte[] buff = new byte[1024];
148                fos = new FileOutputStream(uploadImgPath+"/"+filename);
149                while((bytesRead = is.read(buff, 0, buff.length)) != -1){
150                    fos.write(buff, 0, bytesRead);
151                }
152                content = content.replace(src, localhost + "/" + filename);
153            catch(FileNotFoundException notFoundException) {
154                notFoundException.printStackTrace();
155            catch(IOException ioe) {
156                ioe.printStackTrace();
157            finally {
158                try {
159                    if(fos != null) fos.close();
160                    if(is != null) is.close();
161                catch(IOException ioe) {
162                    ioe.printStackTrace();
163                }
164            }
165        }
166        logger.debug("=================downloadImages==================");
167        logger.debug(content);
168        return content;
169    }
170     
171    public void getConnection() {
172        try {
173            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
174            String strCon ="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";
175            String strUserName = "sa";
176            String strPWD = "qsyjcsxdl@@@web2009@@@";
177            conn = DriverManager.getConnection(strCon, strUserName, strPWD);
178        catch (java.lang.ClassNotFoundException cnfe) {
179            cnfe.printStackTrace();
180        catch (SQLException se) {
181            se.printStackTrace();
182        }
183    }
184     
185    public void closeConnection() {
186        try {
187            if(conn!= null && !conn.isClosed()) conn.close();
188        catch (SQLException se) {
189            se.printStackTrace();
190        }
191    }
192     
193    public void insertDataBase(String newsTitle, String newsContent) throws SQLException {
194        PreparedStatement pstmt = null;
195        try {
196            pstmt = conn.prepareStatement("INSERT INTO FumNews(NewsTitle, NewsContext, NewsState) values(?, ?, ?)");
197            pstmt.setString(1, newsTitle);
198            pstmt.setString(2, newsContent);
199            pstmt.setInt(31);
200            pstmt.executeUpdate();
201        catch(SQLException e) {
202            throw e;
203        finally {
204            try {
205                if(pstmt != null) pstmt.close();
206            catch(SQLException e) {
207                e.printStackTrace();
208            }
209        }
210    }
211     
212    public boolean newsExist(String title) throws SQLException {
213        PreparedStatement pstmt = null;
214        try {
215            pstmt = conn.prepareStatement("SELECT top 1 NewsId from FumNews where NewsTitle = ?");
216            pstmt.setString(1, title);
217            ResultSet rs = pstmt.executeQuery();
218            return rs.next();
219        catch(SQLException e) {
220            throw e;
221        finally {
222            try {
223                if(pstmt != null) pstmt.close();
224            catch(SQLException e) {
225                e.printStackTrace();
226            }
227        }
228    }
229 
230    public static void main(String[] args) {
231        HtmlParser html = new HtmlParser();
232//      设置代理链接网络
233//      System.getProperties().put("proxySet", "true");
234//      System.getProperties().put("proxyHost", "192.168.99.100");
235//      System.getProperties().put("proxyPort", "80");
236        URL url = html.getClass().getResource("log4j.properties");
237        PropertyConfigurator.configure(url);
238        logger = Logger.getLogger(HtmlParser.class);
239        try {
240            html.indexNewsContent("http://www.cheshi.com/");
241        catch (Exception e) {
242            e.printStackTrace();
243            logger.error("分析网页遇到错误,原因:"+e.getMessage());
244        }
245        logger.info("分析网页内容完成。");
246    }
247}

posted on 2012-04-26 11:58  Robin99  阅读(291)  评论(0编辑  收藏  举报