Some Notes on Web Crawlers and Website Data Scraping

Since the company has been short-handed lately, and themed promotions from the game publishers have been coming in thick and fast, I had to build a small news-collection tool that scrapes news from other sites onto our platform. To sum it up, news scraping comes down to three steps:

1. Simulate an HTTP request to fetch the page content.
2. Filter the page content with regular expressions and pull out the parts we want.
3. Reshape that data into the format our platform needs.
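Before diving into each step, here is a minimal end-to-end sketch of the whole flow. The URL and the deliberately simplified link regex are hypothetical placeholders, not the real ones used later in this post:

    using System;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    class SpiderSketch
    {
        static void Main()
        {
            // 1. Simulate an HTTP request and read the page HTML.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://example.com/news/list.shtml"); // hypothetical URL
            request.Method = "GET";
            string html;
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                html = reader.ReadToEnd();
            }

            // 2. Filter the HTML with a regular expression (simplified pattern).
            string pattern = @"<a[^>]*href=""(?<href>[^""]*)""[^>]*>(?<text>[\s\S]*?)</a>";

            // 3. Reshape each match into the data we need (here we just print it).
            foreach (Match m in Regex.Matches(html, pattern))
            {
                Console.WriteLine(m.Groups["text"].Value + " -> " + m.Groups["href"].Value);
            }
        }
    }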

Simulating an HTTP request to fetch the page content. I won't explain HTTP request simulation in detail here; if your English is up to it, you can read the official documentation for HttpWebRequest. Here is my HTTP request helper:

        /// <summary> 
        /// Simulate an HTTP GET request and return the response body as a string. 
        /// </summary> 
        /// <param name="url">The page URL to request.</param> 
        /// <returns>The response HTML, or an empty string if the URL is empty.</returns> 
        public static string GetHttpRequest(string url)
        {
            if (string.IsNullOrEmpty(url))
                return string.Empty;
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            request.Method = "GET";
            request.ServicePoint.Expect100Continue = false;
            // Dispose the response and reader exactly once, even if reading throws.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }
        /// <summary> 
        /// Simulate an HTTP GET request and return the raw response bytes. 
        /// </summary> 
        /// <param name="url">The resource URL to request.</param> 
        /// <returns>The response body as a byte array, or null if the URL is empty.</returns> 
        public static byte[] GetHttpRequestStream(string url)
        {
            if (string.IsNullOrEmpty(url)) return null;
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            request.CookieContainer = new CookieContainer();
            request.Method = "GET";
            request.ServicePoint.Expect100Continue = false;
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            {
                // Read the body byte by byte; handy for binary content such as images.
                List<byte> bytes = new List<byte>();
                int result;
                while ((result = stream.ReadByte()) != -1)
                {
                    bytes.Add((byte)result);
                }
                return bytes.ToArray();
            }
        }
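
The byte-array overload is what I use for binary resources such as images. A quick usage sketch, where the image URL and the local save path are hypothetical placeholders:

        // Download the raw bytes of an image and save them to disk.
        byte[] imageBytes = PostRegister.Tools.GetHttpRequestStream("http://sw.wanmei.com/resources/JPG/example.jpg"); // hypothetical image URL
        if (imageBytes != null)
        {
            System.IO.File.WriteAllBytes(@"C:\temp\example.jpg", imageBytes); // hypothetical local path
        }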

        /// <summary> 
        /// Simulate an HTTP GET request and decode the response with the given encoding. 
        /// </summary> 
        /// <param name="url">The page URL to request.</param> 
        /// <param name="ec">The text encoding used to decode the response body.</param> 
        /// <returns>The response HTML, or an empty string if the URL is empty.</returns> 
        public static string GetHttpRequest(string url, Encoding ec)
        {
            if (string.IsNullOrEmpty(url)) return string.Empty;
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            request.Method = "GET";
            request.ServicePoint.Expect100Continue = false;
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), ec))
            {
                return reader.ReadToEnd();
            }
        }
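
Many Chinese sites still serve GB2312 rather than UTF-8, which is when the encoding overload earns its keep. A usage sketch; that the target page is actually GB2312-encoded is an assumption you should verify against its meta charset tag:

        // Decode the response with GB2312 explicitly instead of the default encoding.
        string html = PostRegister.Tools.GetHttpRequest("http://example.com/news/list.shtml", Encoding.GetEncoding("gb2312")); // hypothetical URL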


I'm scraping the news from Perfect World's official site; the news list is at: http://sw.wanmei.com/news/gamenews/list.shtml

The request code:

// str holds the HTML content returned by the request

string str = PostRegister.Tools.GetHttpRequest("http://sw.wanmei.com/news/gamenews/list.shtml"); 


Filtering the HTML content with a regular expression:

            string reg = @"<a[^>]*href=(""(?<href>[^""]*)""|'(?<href>[^']*)'|(?<href>[^\s>]*))[^>]*>(?<text>[\s\S]*?)</a>";
            MatchCollection mc = Regex.Matches(str, reg);
            StringBuilder strTitle = new StringBuilder();
            Dictionary<string, string> titleUrlList = new Dictionary<string, string>();
            for (int i = 0; i < mc.Count; i++)
            {
                string href = mc[i].Groups["href"].Value; // the href attribute of the anchor
                string text = mc[i].Groups["text"].Value; // the anchor text, i.e. the content between <a> and </a>
                if (i < mc.Count - 1)
                {
                    // "新闻" means "news": on the target page, each news link follows an anchor
                    // whose text contains that word, opens in _blank, and has no hidefocus attribute.
                    if (text.Contains("新闻") && mc[i + 1].Groups[0].Value.Contains("_blank") && mc[i + 1].Groups[0].Value.Contains("hidefocus") == false)
                    {
                        strTitle.Append("'" + mc[i + 1].Groups["text"].Value + "',"); // build a quoted, comma-separated list for the database query
                        titleUrlList.Add(mc[i + 1].Groups["text"].Value, "http://sw.wanmei.com" + mc[i + 1].Groups["href"].Value); // news title and absolute news link
                    }
                }
            }
            DbClassLibrary.Spiders.CommonSpider commonSpider = new CommonSpider();
            string allTitle = "";
            if (strTitle.ToString() != "")
            {
                allTitle = strTitle.ToString().Substring(0, strTitle.ToString().Length - 1); // trim the trailing comma from the concatenated list
                List<string> allNotExists = commonSpider.GetNotExistsNews(allTitle, 1); // query the titles not yet in our database; gameID = 1 means Shengwang (圣王)
                for (int i = 0; i < allNotExists.Count; i++)
                {
                    listBoxTtitle.Items.Add(allNotExists[i]);
                    listBoxLink.Items.Add(titleUrlList[allNotExists[i]]);
                }
            }
            lblResult.Text = "Found " + listBoxTtitle.Items.Count + " new items";
            if (listBoxTtitle.Items.Count > 0)
                MessageBox.Show("Found new data! " + listBoxTtitle.Items.Count + " new items in total.");
            else
            {
                MessageBox.Show("Sorry, no new data found on the official site for now.");
            }


With the code above, we filter out the titles and links of all the news items that match our conditions. Note: for how these regular expressions work, see my other post: http://www.woaic.com/2012/09/159 The next step is to request each news link and grab the article body; the method is simple too. The key part is extracting the content of the element whose id is article_txt:

        /// <summary> 
        /// Fetch the news detail page at the given url and return the article body. 
        /// </summary> 
        /// <param name="url">The news detail page URL.</param> 
        /// <returns>The inner HTML of the div with id "article_txt", or an empty string if not found.</returns> 
        private string GetHtmlContent(string url)
        {
            string str = PostRegister.Tools.GetHttpRequest(url);
            string reg = @"<div id=""article_txt"">((?!</?div>)[\s\S]*?)</div>";
            Match m = Regex.Match(str, reg);
            if (!m.Success) return "";
            // Rewrite relative image paths into absolute links on the official site.
            string content = m.Groups[0].Value
                .Replace("/resources/JPG", "http://sw.wanmei.com/resources/JPG")
                .Replace("/resources/jpg", "http://sw.wanmei.com/resources/jpg");
            // Strip the wrapping tags: "</div>" is 6 characters, then remove the opening div.
            return content.Substring(0, content.Length - 6).Replace("<div id=\"article_txt\">", "").Trim();
        }
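
That covers steps one and two. The third step, reshaping the data into the format our platform needs, depends entirely on your own storage layer, so I'll only sketch it. The News class and the AddNews call below are hypothetical placeholders, not part of DbClassLibrary:

        // A hypothetical news record; adjust the fields to your own schema.
        public class News
        {
            public int GameId { get; set; }
            public string Title { get; set; }
            public string SourceUrl { get; set; }
            public string Body { get; set; }
        }

        /// <summary> 
        /// For each new item found above, fetch its body and hand it to the data layer. 
        /// </summary> 
        private void SaveNewNews()
        {
            for (int i = 0; i < listBoxTtitle.Items.Count; i++)
            {
                string title = listBoxTtitle.Items[i].ToString();
                string link = listBoxLink.Items[i].ToString();
                News news = new News
                {
                    GameId = 1, // gameID = 1 means Shengwang, as above
                    Title = title,
                    SourceUrl = link,
                    Body = GetHtmlContent(link)
                };
                AddNews(news); // hypothetical persistence call, e.g. an INSERT into our news table
            }
        }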


I hope this helps. ^_^

posted @ 2013-09-15 21:58 keepnode