爬虫类与界面的交互

Posted on 2019-05-11 10:44  金色的省略号  阅读(280)  评论(0)  编辑  收藏  举报

把一个在控制台显示结果的C#爬虫程序,改写成带窗体界面的爬虫,并把爬取到的网页地址显示到listBox中:

一、爬虫类

  1.爬虫类与界面建立联系,即在爬虫类增加一个Form类对象字段,爬虫类构造函数初始化Form类对象;

  2.在爬取方法中,通过Form类对象调用Form类的显示方法,把原来输出到控制台的内容改为显示在listBox中;

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Linq;
  4 using System.Text;
  5 using System.Threading.Tasks;
  6 using System.Net;
  7 using System.Collections;
  8 using System.Threading;
  9 using System.Text.RegularExpressions;
 10 using System.IO;
 11 
 12 namespace Crawler
 13 {
 14     class Crawler
 15     {
 16         //字段
 17         private WebClient webClient = new WebClient();
 18         public Hashtable urls = new Hashtable();
 19         private int count = 0;
 20         private Form1 f;
 21                
 22         //构造
 23         public Crawler(Form1 f1)
 24         {
 25             f = f1;
 26         }
 27         //1.方法Crawl
 28         public void Crawl()
 29         {
 30             string disp = "开始爬行了....";
 31             f.addUrl(disp);
 32             while (true)
 33             {
 34                 string current = null;
 35                 foreach (string url in urls.Keys) //找到一个还没有下载过的链接
 36                 {
 37                     if ((bool)urls[url]) continue; //已经下载过的,不再下载
 38                     current = url;
 39                     
 40                 }
 41                 if (current == null || count > 10) break;
 42 
 43                 disp = "爬行" + current + "页面!";
 44                 f.addUrl(disp);
 45 
 46                 string html = DownLoad(current); //下载
 47 
 48                 urls[current] = true;
 49                 count++;
 50 
 51                 Parse(html); //解析,并加入新的链接
 52             }
 53             disp = "爬行结束";
 54             f.addUrl(disp);
 55         }
 56 
 57         //2.方法DownLoad
 58         public string DownLoad(string url)
 59         {
 60             try
 61             {
 62                 HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
 63                 req.Timeout = 30000;
 64                 HttpWebResponse response = (HttpWebResponse)req.GetResponse();
 65                 byte[] buffer = ReadInstreamIntoMemory(response.GetResponseStream());
 66                 string fileName = count.ToString();
 67                 FileStream fs = new FileStream(fileName, FileMode.OpenOrCreate);
 68                 fs.Write(buffer, 0, buffer.Length);
 69                 fs.Close();
 70                 string html = Encoding.UTF8.GetString(buffer);
 71                 return html;
 72             }
 73             catch
 74             {
 75             }
 76             return "";
 77         }
 78 
 79         //3.方法 Parse
 80         public void Parse(string html)
 81         {
 82             string strRef = @"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']";
 83             MatchCollection matches = new Regex(strRef).Matches(html);
 84             foreach (Match match in matches)
 85             {
 86                 strRef = match.Value.Substring(match.Value.IndexOf('=') + 1).Trim('"', '\'', '#', ' ', '>');
 87                 if (strRef.Length == 0) continue;
 88 
 89                 if (urls[strRef] == null) urls[strRef] = false;
 90             }
 91         }
 92 
 93         //4.方法 ReadInstreamIntoMemory
 94         private static byte[] ReadInstreamIntoMemory(Stream stream)
 95         {
 96             int bufferSize = 16384;
 97             byte[] buffer = new byte[bufferSize];
 98             MemoryStream ms = new MemoryStream();
 99             while (true)
100             {
101                 int numBytesRead = stream.Read(buffer, 0, bufferSize);
102                 if (numBytesRead <= 0) break;
103                 ms.Write(buffer, 0, numBytesRead);
104             }
105             return ms.ToArray();
106         }
107     }
108 }
爬虫类

二、窗体类

  button控件Click事件代码下创建爬虫类对象,传递窗体对象,调用爬虫方法。

 1 using System;
 2 using System.Collections.Generic;
 3 using System.ComponentModel;
 4 using System.Data;
 5 using System.Drawing;
 6 using System.Linq;
 7 using System.Text;
 8 using System.Threading.Tasks;
 9 using System.Windows.Forms;
10 using System.Net;
11 using System.Collections;
12 using System.Threading;
13 using System.Text.RegularExpressions;
14 using System.IO;
15 
namespace Crawler
{
    /// <summary>
    /// Main window of the crawler. The start URL is read from <c>textBox1</c>
    /// and progress messages are appended to <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a crawl from the URL typed into <c>textBox1</c>. The crawl runs
        /// on a background thread so the window stays responsive and the list
        /// repaints while pages are downloading. The clicked control is disabled
        /// until the crawl finishes to prevent overlapping crawls.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string startUrl = textBox1.Text.Trim();
            if (startUrl.Length == 0)
            {
                return;
            }

            Crawler myCrawler = new Crawler(this);
            myCrawler.urls.Add(startUrl, false); // seed the crawl table with the start page

            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            Thread worker = new Thread(() =>
            {
                try
                {
                    myCrawler.Crawl();
                }
                catch (Exception ex)
                {
                    // An unhandled exception on a background thread would end the
                    // process; show it in the list instead.
                    addUrl("爬行出错:" + ex.Message);
                }
                finally
                {
                    RunOnUiThread(() =>
                    {
                        if (trigger != null && !trigger.IsDisposed)
                        {
                            trigger.Enabled = true;
                        }
                    });
                }
            });
            worker.IsBackground = true; // do not keep the process alive after the form closes
            worker.Start();
        }

        /// <summary>Clears the output list and the URL box, then focuses the URL box.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            listBox1.Items.Clear();
            textBox1.Text = "";
            textBox1.Focus();
        }

        /// <summary>
        /// Appends <paramref name="disp"/> to <c>listBox1</c> and scrolls to it.
        /// Safe to call from any thread: the list update is marshalled to the
        /// UI thread when needed.
        /// </summary>
        /// <param name="disp">Message to show.</param>
        public void addUrl(string disp)
        {
            RunOnUiThread(() =>
            {
                listBox1.Items.Add(disp);
                // Select the last item to scroll to the bottom, then clear the selection.
                listBox1.SelectedIndex = listBox1.Items.Count - 1;
                listBox1.SelectedIndex = -1;
            });
        }

        /// <summary>
        /// Runs <paramref name="action"/> on the thread that owns this form.
        /// If the form is closed while a background call is being marshalled,
        /// the call is dropped instead of throwing.
        /// </summary>
        private void RunOnUiThread(MethodInvoker action)
        {
            if (!InvokeRequired)
            {
                action();
                return;
            }

            try
            {
                Invoke(action);
            }
            catch (InvalidOperationException)
            {
                // Covers ObjectDisposedException as well. Swallow it only when the
                // form really is gone; anything else is a genuine error.
                if (!IsDisposed && IsHandleCreated)
                {
                    throw;
                }
            }
        }
    }
}
窗体类