网站数据采集程序（爬虫）

采集数据无非就是三步，抓取页面，分析数据，入库。

第一步：抓取页面

抓取页面也是在网上找的例子，主要是用到了2个方法

1，获取网站内容；2，清除html标签。具体看代码：

/// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>
        /// The response body, or an empty string when the request or the read fails
        /// for any reason (callers treat an empty result as "nothing to process").
        /// </returns>
        public string GetContentUrl(string url)
        {
            try
            {
                // Pause 500 ms before every request (presumably to throttle the crawl).
                System.Threading.Thread.Sleep(500);
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                // Send a browser-like User-Agent so anti-scraping filters are less likely to reject the request.
                req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215; CrazyCoder.cn;www.aligong.com)";
                req.ReadWriteTimeout = 30000;
                req.Timeout = 300000;
                req.Proxy = null;

                // Nested usings release the response, its stream and the reader even
                // when opening or reading the stream throws half-way through.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader sr = new StreamReader(stream))
                {
                    return sr.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Deliberate best-effort: a failed page must not stop the crawl,
                // but leave a trace of the cause instead of swallowing it silently.
                System.Diagnostics.Debug.WriteLine("GetContentUrl failed for " + url + ": " + ex.Message);
                return string.Empty;
            }
        }

View Code

 /// <summary>
        /// Strips HTML tags and <c>&amp;nbsp;</c> entities from a fragment.
        /// </summary>
        /// <param name="ContentStr">HTML fragment to clean; may be null or empty.</param>
        /// <returns>
        /// The text with every <c>&lt;...&gt;</c> span and every <c>&amp;nbsp;</c> removed,
        /// trimmed of surrounding whitespace. Returns an empty string for null or empty input.
        /// </returns>
        public string ClearLable(string ContentStr)
        {
            if (string.IsNullOrEmpty(ContentStr))
            {
                return string.Empty;
            }

            int begin = ContentStr.IndexOf('<');
            while (begin >= 0)
            {
                // Look for the closing bracket AFTER the opening one; a stray '>' earlier
                // in the text (e.g. "1 > 0 <b>x</b>") must not be paired with this '<'.
                int end = ContentStr.IndexOf('>', begin + 1);
                if (end < 0)
                {
                    // Unterminated '<': nothing more can be stripped, keep the rest as text.
                    break;
                }
                ContentStr = ContentStr.Remove(begin, end - begin + 1);
                // Everything before 'begin' is already free of '<', so resume from there.
                begin = ContentStr.IndexOf('<', begin);
            }

            ContentStr = ContentStr.Replace("&nbsp;", "");
            return ContentStr.Trim();
        }

View Code

第二步：分析数据

先从页面HTML中截取出目标代码段，再用正则表达式对这段代码进行匹配，取得匹配结果的集合，然后放入自己的集合里进行分析。

 /// <summary>
        /// Fetches a list page and returns the <c>&lt;li&gt;</c> fragments found in its house list.
        /// </summary>
        /// <param name="url">Address of the list page.</param>
        /// <returns>
        /// The matched list items; an empty list (never null) when the page could not be
        /// downloaded or is blank, so callers can enumerate the result without a null check.
        /// </returns>
        public List<String> GetListURl(string url)
        {
            string htmlContent = GetContentUrl(url);// download the list page

            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return new List<String>();
            }
            return DealHtmlContentList(htmlContent);// extract the list items
        }
/// <summary>
        /// Cuts the house-list <c>&lt;ul&gt;</c> section out of a page and returns each
        /// <c>&lt;li&gt;...&lt;/li&gt;</c> element inside it.
        /// </summary>
        /// <param name="htmlContent">Full HTML of a list page.</param>
        /// <returns>The matched <c>&lt;li&gt;</c> elements (markup included); empty when the section is absent.</returns>
        private List<String> DealHtmlContentList(string htmlContent)
        {
            List<string> listStr = new List<string>();
            if (string.IsNullOrEmpty(htmlContent))
            {
                return listStr;
            }

            string sLi = "<ul id=\"house-lst\" class=\"house-lst\">";// opening marker of the list section
            string eLi = "</ul>";

            // IndexOf returns 0 when the marker is the very first thing in the page,
            // so "found" is >= 0, not > 0.
            int start = htmlContent.IndexOf(sLi, StringComparison.Ordinal);
            if (start < 0)
            {
                return listStr;
            }

            // Search for the closing tag from the section start instead of allocating a substring.
            int end = htmlContent.IndexOf(eLi, start, StringComparison.Ordinal);
            if (end < 0)
            {
                return listStr;
            }

            string arryLi = htmlContent.Substring(start, end - start);

            // Singleline lets '.' span line breaks inside an item.
            Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
            for (Match mch = regli.Match(arryLi); mch.Success; mch = mch.NextMatch())
            {
                listStr.Add(mch.Value);
            }
            return listStr;
        }

View Code

以上是获取列表页内容的代码：先截取列表所在的那段html代码，再用正则匹配出每个列表项，作为集合返回。这只是列表页的数据。

 /// <summary>
        /// Fetches a detail page and turns it into an INSERT statement template.
        /// </summary>
        /// <param name="url">Address of the detail page.</param>
        /// <returns>
        /// The statement template built by <c>DealHtmlContentDetail</c>; an empty string
        /// (never null) when the page could not be downloaded or is blank, which matches what
        /// <c>DealHtmlContentDetail</c> returns when it finds nothing.
        /// </returns>
        public string GetListDetail(string url) {
            string htmlContent = GetContentUrl(url);// download the detail page
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return string.Empty;
            }
            return DealHtmlContentDetail(htmlContent);// build the SQL template
        }
 9 
/// <summary>
        /// Builds an <c>INSERT INTO LJHostInfo</c> statement template from the first
        /// <c>&lt;ol&gt;</c> section of a detail page.
        /// </summary>
        /// <remarks>
        /// The result still contains the placeholders <c>{0}</c> (Title) and <c>{1}</c>
        /// (AveragePrice); it is meant to be used as a composite format string by the caller.
        /// Scraped values are therefore escaped twice: single quotes are doubled so they cannot
        /// terminate the SQL literal, and braces are doubled so they survive the later formatting.
        /// </remarks>
        /// <param name="htmlContent">Full HTML of a detail page.</param>
        /// <returns>The statement template, or an empty string when the page has no <c>&lt;ol&gt;</c> section.</returns>
        private string DealHtmlContentDetail(string htmlContent) {
            if (string.IsNullOrEmpty(htmlContent))
            {
                return string.Empty;
            }

            const string sDiv = "<ol>";
            const string eDiv = "</ol>";

            // "Found" is >= 0: the section may start at the very first character.
            int start = htmlContent.IndexOf(sDiv, StringComparison.Ordinal);
            if (start < 0)
            {
                return string.Empty;
            }
            int end = htmlContent.IndexOf(eDiv, start, StringComparison.Ordinal);
            if (end < 0)
            {
                return string.Empty;
            }
            string arryDiv = htmlContent.Substring(start, end - start);

            Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
            Regex reglable = new Regex("<label>(.*?)</label>", RegexOptions.Singleline);
            Regex regspan = new Regex("<span class=\"other\">(.*?)</span>", RegexOptions.Singleline);

            string InsertSql = "INSERT INTO LJHostInfo(Title,AveragePrice,";// column list
            string InsertSqlParam = "('{0}','{1}',";// value list; {0}/{1} are filled in by the caller

            for (Match mch = regli.Match(arryDiv); mch.Success; mch = mch.NextMatch())
            {
                Match mlable = reglable.Match(mch.Value);
                if (!mlable.Success)
                {
                    continue;
                }
                Match mspan = regspan.Match(mch.Value);

                string value = ClearLable(mspan.Value);
                string column = null;
                // Some items carry a second label/span pair in the same <li>;
                // this is the column that second value goes to.
                string secondColumn = null;

                switch (ClearLable(mlable.Value))
                {
                    case "建筑年代：":
                        column = "BuildYear";
                        break;
                    case "建筑类型：":
                        column = "BuildType";
                        break;
                    case "物业费用：":
                        column = "PropertyPrice";
                        break;
                    case "物业公司：":
                        column = "PropertyCompany";
                        break;
                    case "开发商：":
                        column = "Developers";
                        break;
                    case "楼栋总数：":
                        column = "FloorNum";
                        secondColumn = "Rate";
                        break;
                    case "房屋总数：":
                        column = "HousesNum";
                        secondColumn = "GreenRates";
                        break;
                    case "所属学区：":
                        column = "SchoolAddress";
                        break;
                    case "附近门店：":
                        column = "NearbyAddress";
                        // The store info is the whole item text minus the label, with spaces removed.
                        value = ClearLable(mch.Value).Replace("附近门店：","").Trim().Replace(" ","");
                        break;
                }

                if (column == null)
                {
                    continue;
                }

                InsertSql += column + ",";
                InsertSqlParam += "'" + EscapeSqlFormatValue(value) + "',";

                if (secondColumn != null && mlable.NextMatch().Success)
                {
                    InsertSql += secondColumn + ",";
                    InsertSqlParam += "'" + EscapeSqlFormatValue(ClearLable(mspan.NextMatch().Value)) + "',";
                }
            }

            InsertSql = InsertSql.TrimEnd(',') + ") ";
            InsertSqlParam = InsertSqlParam.TrimEnd(',') + ")";
            return InsertSql + "VALUES" + InsertSqlParam;
        }

        /// <summary>
        /// Escapes scraped text for embedding in a single-quoted SQL literal inside a composite format string.
        /// </summary>
        private static string EscapeSqlFormatValue(string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return string.Empty;
            }
            return value.Replace("'", "''").Replace("{", "{{").Replace("}", "}}");
        }

View Code

需要注意的就是匹配数据去掉html标签，加入sql语句。重复的匹配再插入

第三步：多线程任务类

 /// <summary>
    /// Worker that processes the items of one list page: for every item it fetches the
    /// detail page, appends the resulting INSERT statement to the form's SQL buffer and
    /// reports progress on the form.
    /// </summary>
    public class ThreadWorker
    {
        private ClumbForm cForm;
        private List<String> list;
        private string siteUrl = "@$#@$#@$#@$@#$#@$#@$#@$";// site root, masked by the author
        private LianJiaCaiJi caiji=new LianJiaCaiJi();
        // Private lock target: locking on 'this' would let outside code contend for the same monitor.
        private readonly object gate = new object();

        /// <summary>
        /// Creates a worker for one list page.
        /// </summary>
        /// <param name="cf">Form that receives progress messages and collects the SQL.</param>
        /// <param name="_list">List-item HTML fragments to process; null is treated as empty.</param>
        public ThreadWorker(ClumbForm cf, List<String> _list)
        {
            cForm = cf;
            list = _list ?? new List<String>();
        }

        /// <summary>
        /// Processes every list item that contains an <c>&lt;h2&gt;</c> heading.
        /// </summary>
        public void StartWorker()
        {
            Regex regh2 = new Regex("<h2>(.*?)</h2>", RegexOptions.Singleline);
            Regex regspan = new Regex("<span class=\"num\">(.*?)</span>", RegexOptions.Singleline);
            foreach (var item in list)
            {
                Match m = regh2.Match(item);
                if (!m.Success)
                {
                    continue;
                }

                lock (gate)
                {
                    Match ms = regspan.Match(item);
                    string title = GetQuotationContent(m.Value, "title");
                    string price = ms.Success ? caiji.ClearLable(ms.Value) : "0.00";

                    cForm.TotalCount += 1;

                    // The detail SQL is a format template with {0} = title and {1} = price.
                    // Skip the append when no template could be built; a null format would throw.
                    string detailSql = caiji.GetListDetail(siteUrl + GetQuotationContent(m.Value, "href"));
                    if (!string.IsNullOrEmpty(detailSql))
                    {
                        cForm.SBINSERTSQL.AppendFormat(detailSql, EscapeSqlLiteral(title), EscapeSqlLiteral(price));
                    }

                    cForm.ShowMsg("已完成：" + title + "小区,价格:"+ price + " 完成时间：" + System.DateTime.Now.ToString());
                    cForm.ShowLableMsg(cForm.TotalCount+"");
                }
            }
            cForm.ShowMsg("已完成第：" + cForm.TotalCount + "页数据采集, 完成时间：" + System.DateTime.Now.ToString());
        }

        /// <summary>
        /// Returns the double-quoted value that follows <paramref name="tag"/> in
        /// <paramref name="content"/>, i.e. the <c>value</c> in <c>tag="value"</c>.
        /// </summary>
        /// <param name="content">Markup to search.</param>
        /// <param name="tag">Attribute name to look for.</param>
        /// <returns>The attribute value, or an empty string when the attribute or its closing quote is missing.</returns>
        private string GetQuotationContent(string content,string tag) {
            if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(tag))
            {
                return "";
            }

            // Test the raw IndexOf result: adding the offset first would turn -1 into a valid-looking index.
            int tagIndex = content.IndexOf(tag, StringComparison.Ordinal);
            if (tagIndex < 0)
            {
                return "";
            }

            // Skip the attribute name plus the two characters =" that follow it.
            int valueStart = tagIndex + tag.Length + 2;
            if (valueStart > content.Length)
            {
                return "";
            }

            int closingQuote = content.IndexOf('"', valueStart);
            if (closingQuote < 0)
            {
                return "";
            }
            return content.Substring(valueStart, closingQuote - valueStart);
        }

        /// <summary>
        /// Doubles single quotes so scraped text cannot terminate the SQL string literal it is placed in.
        /// </summary>
        private static string EscapeSqlLiteral(string value)
        {
            return string.IsNullOrEmpty(value) ? string.Empty : value.Replace("'", "''");
        }
    
    }

View Code

然后是任务执行

 /// <summary>
        /// Handles the "collect" button: validates the page range and starts the crawl on a pool thread.
        /// </summary>
        private void btnCaiJi_Click(object sender, EventArgs e)
        {
            // Reset the UI state
            listBoxMessage.Items.Clear();
            IsComplete = false;

            // TryParse rejects blank AND non-numeric input without throwing.
            int pageStart;
            int pageEnd;
            if (!int.TryParse(txtPageStart.Text, out pageStart) || !int.TryParse(txtPageEnd.Text, out pageEnd))
            {
                MessageBox.Show("请输入采集页数！");
                return;
            }
            if (pageStart > 100) {
                MessageBox.Show("采集页数只能在100以内！");
                return;
            }

            ShowMsg("开始时间：" + System.DateTime.Now.ToString() + " 处理中请等待....");
            _cts = new CancellationTokenSource();
            // Capture the token and the parsed start page here, on the UI thread, so the
            // pool thread neither touches a control nor races with a later reassignment of _cts.
            CancellationToken token = _cts.Token;
            ThreadPool.QueueUserWorkItem(state => CountTo(pageStart, token));

        }
21 
/// <summary>
        /// Crawls the list pages one by one, from <paramref name="countTo"/> up to the
        /// end page entered in the form, handing each page's items to a <c>ThreadWorker</c>.
        /// </summary>
        /// <param name="countTo">First page number to collect; incremented as pages are processed.</param>
        /// <param name="ct">Cancellation token; checked before and after each page download.</param>
        private void CountTo(int countTo, CancellationToken ct)
        {
            // Read the end page once, on the thread that owns the control, instead of
            // touching txtPageEnd from this thread on every iteration.
            string pageEndText = (string)this.Invoke(new Func<string>(() => txtPageEnd.Text));
            int pageEnd;
            bool hasPageEnd = int.TryParse(pageEndText, out pageEnd);

            for (; hasPageEnd && countTo <= pageEnd; countTo++)
            {
                // Check before the download so a cancel request does not cost another page fetch.
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                var pageItems = caiji.GetListURl(string.Format(url, countTo));
                if (ct.IsCancellationRequested)
                {
                    break;
                }
                if (pageItems == null)
                {
                    // Page could not be fetched; move on to the next one.
                    continue;
                }

                tw = new ThreadWorker(this, pageItems);
                // Invoke runs the worker on the thread that created the form's controls.
                this.Invoke(new Action(tw.StartWorker));
                Thread.Sleep(200);
            }
            IsComplete = true;
            ShowMsg("结束时间：" + System.DateTime.Now.ToString() + " 采集完成,总条数："+TotalCount);
        }
43 
/// <summary>
        /// Appends a progress message to the list box and updates the button state.
        /// Can be called from any thread: calls from other threads are marshalled with Invoke.
        /// </summary>
        /// <param name="msg">Message to display.</param>
        public void ShowMsg(string msg)
        {
            try
            {
                if (listBoxMessage.InvokeRequired)
                {
                    // Re-enter this method on the thread that owns the list box.
                    GetMsgDelegate labDele = new GetMsgDelegate(ShowMsg);
                    this.Invoke(labDele, new object[] { msg });
                    return;
                }

                listBoxMessage.Items.Add(msg);
                // Select by index so the list scrolls to the new last row; selecting by item
                // value would pick the first equal entry when the same text was logged before.
                listBoxMessage.SelectedIndex = listBoxMessage.Items.Count - 1;

                // Both buttons are usable only once the crawl has finished.
                btnCaiJi.Enabled = IsComplete;
                btnExceSql.Enabled = IsComplete;
            }
            catch (Exception ex)
            {
                // Best-effort UI update (e.g. the form may be closing); keep the crawl alive
                // but leave a trace instead of swallowing the failure silently.
                System.Diagnostics.Debug.WriteLine("ShowMsg failed: " + ex.Message);
            }
        }

View Code

执行时界面

posted on 2016-04-12 10:01 梨窝★浅笑阅读(1358) 评论(0) 收藏举报

刷新页面返回顶部

网站数据采集程序（爬虫）

导航

公告