网站数据采集程序(爬虫)
采集数据无非就是三步,抓取页面,分析数据,入库。
第一步:抓取页面
抓取页面也是在网上找的例子,主要是用到了2个方法
1,获取网站内容;2,清除html标签。具体看代码:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Downloads the body of the page at the given URL as text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string when the request fails for any
/// reason (invalid URL, network error, timeout, non-success status).
/// </returns>
public string GetContentUrl(string url)
{
    string htmlContent = string.Empty;
    try
    {
        // Throttle requests so the target site is not hammered.
        System.Threading.Thread.Sleep(500);

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        // Present a browser-like User-Agent so simple anti-scraping filters let the request through.
        req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215; CrazyCoder.cn;www.aligong.com)";
        req.ReadWriteTimeout = 30000;
        req.Timeout = 300000;
        req.Proxy = null;

        // Both the response and the reader are disposed even if reading throws,
        // so the underlying connection is always returned to the pool.
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (StreamReader sr = new StreamReader(response.GetResponseStream()))
        {
            htmlContent = sr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best effort: callers treat an empty string as "page unavailable".
        // Leave a trace instead of swallowing the failure silently.
        System.Diagnostics.Debug.WriteLine("GetContentUrl failed for " + url + ": " + ex.Message);
        htmlContent = "";
    }
    return htmlContent;
}
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Removes HTML tags from a fragment, then removes spaces and trims the result.
/// </summary>
/// <param name="ContentStr">HTML fragment; null or empty yields an empty string.</param>
/// <returns>The text with every "&lt;...&gt;" span and every space removed.</returns>
public string ClearLable(string ContentStr)
{
    if (string.IsNullOrEmpty(ContentStr))
    {
        return string.Empty;
    }

    System.Text.StringBuilder text = new System.Text.StringBuilder(ContentStr.Length);
    int pos = 0;
    while (pos < ContentStr.Length)
    {
        int begin = ContentStr.IndexOf('<', pos);
        // Look for the closing bracket only AFTER the opening one: a stray '>'
        // earlier in the text must not be paired with a later '<'.
        int end = begin < 0 ? -1 : ContentStr.IndexOf('>', begin + 1);
        if (end < 0)
        {
            // No complete tag remains; keep the rest verbatim.
            text.Append(ContentStr, pos, ContentStr.Length - pos);
            break;
        }

        text.Append(ContentStr, pos, begin - pos);
        pos = end + 1;
    }

    return text.ToString().Replace(" ", "").Trim();
}
第二步:分析数据
通过html正则模板获取到匹配的正则,然后取得正则匹配的集合。放入自己的集合里分析它
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Downloads a listing page and returns the raw "li" fragments of its house list.
/// </summary>
/// <param name="url">URL of the listing page.</param>
/// <returns>
/// One string per matched list item; an empty list when the page could not be
/// fetched or contains no house list (never null, so callers can iterate directly).
/// </returns>
public List<String> GetListURl(string url)
{
    string htmlContent = GetContentUrl(url);
    if (string.IsNullOrWhiteSpace(htmlContent))
    {
        return new List<String>();
    }

    return DealHtmlContentList(htmlContent);
}

/// <summary>
/// Cuts the house-list "ul" out of the page and splits it into its "li" items.
/// </summary>
/// <param name="htmlContent">Full HTML of the listing page.</param>
/// <returns>The matched "li" fragments, in document order; empty when the list is absent.</returns>
private List<String> DealHtmlContentList(string htmlContent)
{
    const string listOpen = "<ul id=\"house-lst\" class=\"house-lst\">";
    const string listClose = "</ul>";

    List<string> items = new List<string>();

    // IndexOf returns -1 when the marker is missing; 0 is a valid position.
    int start = htmlContent.IndexOf(listOpen, StringComparison.Ordinal);
    if (start < 0)
    {
        return items;
    }

    // Search from the opening marker onward instead of copying the tail of the page.
    int end = htmlContent.IndexOf(listClose, start, StringComparison.Ordinal);
    if (end < 0)
    {
        return items;
    }

    string listHtml = htmlContent.Substring(start, end - start);

    Regex itemPattern = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
    for (Match item = itemPattern.Match(listHtml); item.Success; item = item.NextMatch())
    {
        items.Add(item.Value);
    }

    return items;
}
这是获取网页内容代码,截取到列表页集合那段html代码。匹配正则变成集合返回。这只是列表页的数据
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Downloads a detail page and builds the INSERT statement template for it.
/// </summary>
/// <param name="url">URL of the detail page.</param>
/// <returns>
/// A composite-format string with placeholders {0} (title) and {1} (average price);
/// an empty string when the page has no detail block; null when the page could not be fetched.
/// </returns>
public string GetListDetail(string url)
{
    string htmlContent = GetContentUrl(url);
    if (string.IsNullOrWhiteSpace(htmlContent))
    {
        return null;
    }

    return DealHtmlContentDetail(htmlContent);
}

// Label text (after tag stripping) -> column that receives the item's value.
private static readonly Dictionary<string, string> DetailLabelColumns = new Dictionary<string, string>
{
    { "建筑年代:", "BuildYear" },
    { "建筑类型:", "BuildType" },
    { "物业费用:", "PropertyPrice" },
    { "物业公司:", "PropertyCompany" },
    { "开发商:", "Developers" },
    { "楼栋总数:", "FloorNum" },
    { "房屋总数:", "HousesNum" },
    { "所属学区:", "SchoolAddress" },
    { "附近门店:", "NearbyAddress" }
};

// Items that carry a second label/value pair -> column for that second value
// (plot ratio next to the building count, greening rate next to the house count).
private static readonly Dictionary<string, string> DetailSecondaryColumns = new Dictionary<string, string>
{
    { "楼栋总数:", "Rate" },
    { "房屋总数:", "GreenRates" }
};

/// <summary>
/// Wraps scraped text as a SQL string literal that is safe inside a composite-format template.
/// </summary>
/// <param name="value">Scraped text.</param>
/// <returns>The quoted literal with single quotes doubled and braces escaped.</returns>
private static string QuoteSqlFormatValue(string value)
{
    // NOTE(review): parameterized commands would be the robust fix, but the caller
    // consumes a format string, so escaping is the most that fits this contract.
    // Braces are doubled because the result is later passed to AppendFormat.
    string escaped = (value ?? string.Empty)
        .Replace("'", "''")
        .Replace("{", "{{")
        .Replace("}", "}}");
    return "'" + escaped + "'";
}

/// <summary>
/// Extracts the labelled fields of the detail "ol" block and assembles the INSERT template.
/// </summary>
/// <param name="htmlContent">Full HTML of the detail page.</param>
/// <returns>The INSERT template, or an empty string when the detail block is absent.</returns>
private string DealHtmlContentDetail(string htmlContent)
{
    const string blockOpen = "<ol>";
    const string blockClose = "</ol>";

    // IndexOf returns -1 when the marker is missing; 0 is a valid position.
    int start = htmlContent.IndexOf(blockOpen, StringComparison.Ordinal);
    if (start < 0)
    {
        return string.Empty;
    }

    int end = htmlContent.IndexOf(blockClose, start, StringComparison.Ordinal);
    if (end < 0)
    {
        return string.Empty;
    }

    string detailHtml = htmlContent.Substring(start, end - start);

    Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
    Regex reglable = new Regex("<label>(.*?)</label>", RegexOptions.Singleline);
    Regex regspan = new Regex("<span class=\"other\">(.*?)</span>", RegexOptions.Singleline);

    // Title and AveragePrice come from the listing page; the caller fills {0} and {1}.
    List<string> columns = new List<string> { "Title", "AveragePrice" };
    List<string> values = new List<string> { "'{0}'", "'{1}'" };

    for (Match item = regli.Match(detailHtml); item.Success; item = item.NextMatch())
    {
        Match label = reglable.Match(item.Value);
        if (!label.Success)
        {
            continue;
        }

        string labelText = ClearLable(label.Value);
        string column;
        if (!DetailLabelColumns.TryGetValue(labelText, out column))
        {
            continue;
        }

        Match span = regspan.Match(item.Value);
        string value;
        if (labelText == "附近门店:")
        {
            // The store info is not inside the span, so take the whole item text minus the label.
            value = ClearLable(item.Value).Replace("附近门店:", "").Trim().Replace(" ", "");
        }
        else
        {
            value = ClearLable(span.Value);
        }

        columns.Add(column);
        values.Add(QuoteSqlFormatValue(value));

        string secondaryColumn;
        if (DetailSecondaryColumns.TryGetValue(labelText, out secondaryColumn) && label.NextMatch().Success)
        {
            columns.Add(secondaryColumn);
            values.Add(QuoteSqlFormatValue(ClearLable(span.NextMatch().Value)));
        }
    }

    return "INSERT INTO LJHostInfo(" + string.Join(",", columns) + ") VALUES(" + string.Join(",", values) + ")";
}
需要注意的就是匹配数据去掉html标签,加入sql语句。重复的匹配再插入
第三步:多线程任务类
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Processes the list items of one listing page: fetches each detail page,
/// appends its INSERT statement to the form's SQL buffer and reports progress.
/// </summary>
public class ThreadWorker
{
    private readonly ClumbForm cForm;
    private readonly List<String> list;
    private string siteUrl = "@$#@$#@$#@$@#$#@$#@$#@$"; // site root, deliberately obfuscated by the author
    private LianJiaCaiJi caiji = new LianJiaCaiJi();
    // Private lock target; locking on "this" would expose the lock to outside code.
    private readonly object gate = new object();

    /// <summary>
    /// Creates a worker for one page of list items.
    /// </summary>
    /// <param name="cf">Form that receives counters, SQL text and progress messages.</param>
    /// <param name="_list">Raw "li" fragments of the page; null is treated as an empty page.</param>
    public ThreadWorker(ClumbForm cf, List<String> _list)
    {
        cForm = cf;
        list = _list;
    }

    /// <summary>
    /// Runs the collection for every item of the page.
    /// </summary>
    public void StartWorker()
    {
        Regex regh2 = new Regex("<h2>(.*?)</h2>", RegexOptions.Singleline);
        Regex regspan = new Regex("<span class=\"num\">(.*?)</span>", RegexOptions.Singleline);

        if (list != null)
        {
            foreach (var item in list)
            {
                Match heading = regh2.Match(item);
                if (!heading.Success)
                {
                    continue;
                }

                lock (gate)
                {
                    Match priceMatch = regspan.Match(item);
                    string title = GetQuotationContent(heading.Value, "title");
                    string price = priceMatch.Success ? caiji.ClearLable(priceMatch.Value) : "0.00";

                    string sqlTemplate = caiji.GetListDetail(siteUrl + GetQuotationContent(heading.Value, "href"));
                    if (string.IsNullOrEmpty(sqlTemplate))
                    {
                        // The detail page was unavailable or had no detail block:
                        // AppendFormat would throw on null, and nothing should be counted.
                        cForm.ShowMsg("采集失败:" + title + " 时间:" + System.DateTime.Now.ToString());
                        continue;
                    }

                    cForm.TotalCount += 1;
                    // Title and price land inside quoted SQL literals, so double any single quote.
                    cForm.SBINSERTSQL.AppendFormat(sqlTemplate, EscapeSqlLiteral(title), EscapeSqlLiteral(price));
                    cForm.ShowMsg("已完成:" + title + "小区,价格:" + price + " 完成时间:" + System.DateTime.Now.ToString());
                    cForm.ShowLableMsg(cForm.TotalCount + "");
                }
            }
        }

        cForm.ShowMsg("已完成第:" + cForm.TotalCount + "页数据采集, 完成时间:" + System.DateTime.Now.ToString());
    }

    /// <summary>
    /// Doubles single quotes so the text can sit inside a quoted SQL literal.
    /// </summary>
    /// <param name="value">Scraped text.</param>
    /// <returns>The escaped text; an empty string for null.</returns>
    private static string EscapeSqlLiteral(string value)
    {
        return (value ?? string.Empty).Replace("'", "''");
    }

    /// <summary>
    /// Returns the double-quoted value that follows an attribute name, e.g. the
    /// text between the quotes in title="...".
    /// </summary>
    /// <param name="content">HTML fragment to search.</param>
    /// <param name="tag">Attribute name to look for.</param>
    /// <returns>The attribute value, or an empty string when the attribute or its closing quote is missing.</returns>
    private string GetQuotationContent(string content, string tag)
    {
        if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(tag))
        {
            return "";
        }

        // Test the raw IndexOf result: adding the offset first would hide the -1 "not found" value.
        int tagIndex = content.IndexOf(tag, StringComparison.Ordinal);
        if (tagIndex < 0)
        {
            return "";
        }

        // Skip the attribute name plus the two characters =" that precede the value.
        int valueStart = tagIndex + tag.Length + 2;
        if (valueStart > content.Length)
        {
            return "";
        }

        int closingQuote = content.IndexOf('"', valueStart);
        if (closingQuote < 0)
        {
            return "";
        }

        return content.Substring(valueStart, closingQuote - valueStart);
    }
}
然后是任务执行
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Validates the page range and starts the collection on a thread-pool thread.
/// </summary>
/// <param name="sender">The clicked button.</param>
/// <param name="e">Unused event data.</param>
private void btnCaiJi_Click(object sender, EventArgs e)
{
    // Reset to the initial state.
    listBoxMessage.Items.Clear();
    IsComplete = false;

    int pageStart;
    int pageEnd;
    // TryParse also rejects blank input, and non-numeric text no longer throws FormatException.
    if (!int.TryParse(txtPageStart.Text, out pageStart) || !int.TryParse(txtPageEnd.Text, out pageEnd))
    {
        MessageBox.Show("请输入采集页数!");
        return;
    }

    if (pageStart > 100)
    {
        MessageBox.Show("采集页数只能在100以内!");
        return;
    }

    ShowMsg("开始时间:" + System.DateTime.Now.ToString() + " 处理中请等待....");
    _cts = new CancellationTokenSource();

    // Capture plain values here on the UI thread; the work item must not read
    // controls or re-read the _cts field later from the pool thread.
    CancellationToken token = _cts.Token;
    ThreadPool.QueueUserWorkItem(state => CountTo(pageStart, token));
}

/// <summary>
/// Collects listing pages one after another, from the given page up to the
/// page number entered in the end-page text box.
/// </summary>
/// <param name="countTo">First page number to collect.</param>
/// <param name="ct">Token that stops the loop before the next page when cancelled.</param>
private void CountTo(int countTo, CancellationToken ct)
{
    // Read the end page once, on the UI thread: controls must not be touched
    // from this pool thread, and the bound does not change per iteration.
    string pageEndText = (string)this.Invoke(new Func<string>(() => txtPageEnd.Text));
    int pageEnd;
    if (!int.TryParse(pageEndText, out pageEnd))
    {
        // Nothing sensible to iterate over; fall through to the completion report.
        pageEnd = countTo - 1;
    }

    for (int page = countTo; page <= pageEnd; page++)
    {
        // Check before fetching so a cancelled run does not download one more page.
        if (ct.IsCancellationRequested)
        {
            break;
        }

        var pageItems = caiji.GetListURl(string.Format(url, page));
        if (pageItems == null)
        {
            // The page could not be fetched; skip it instead of crashing the worker.
            continue;
        }

        tw = new ThreadWorker(this, pageItems);
        // Invoke marshals the call onto the thread that created the form.
        // NOTE(review): this runs the per-item downloads on the UI thread — confirm that is intended.
        this.Invoke(new Action(tw.StartWorker));
        Thread.Sleep(200);
    }

    IsComplete = true;
    ShowMsg("结束时间:" + System.DateTime.Now.ToString() + " 采集完成,总条数:" + TotalCount);
}

/// <summary>
/// Appends a progress message to the list box and syncs the buttons with the completion state.
/// Safe to call from any thread.
/// </summary>
/// <param name="msg">Message to display.</param>
public void ShowMsg(string msg)
{
    try
    {
        if (listBoxMessage.InvokeRequired)
        {
            // Re-enter on the UI thread.
            this.Invoke(new GetMsgDelegate(ShowMsg), new object[] { msg });
            return;
        }

        listBoxMessage.Items.Add(msg);
        // Select by index so the list scrolls to the newest row even when an
        // identical message already exists further up.
        listBoxMessage.SelectedIndex = listBoxMessage.Items.Count - 1;

        btnCaiJi.Enabled = IsComplete;
        btnExceSql.Enabled = IsComplete;
    }
    catch (Exception ex)
    {
        // Best effort (e.g. the form is closing), but leave a trace instead of swallowing silently.
        System.Diagnostics.Debug.WriteLine("ShowMsg failed: " + ex.Message);
    }
}
执行时界面