把一个C#控制台显示的爬虫程序,改写成界面的爬虫,爬取的网页地址显示到listBox:
一、爬虫类
1.爬虫类与界面建立联系,即在爬虫类增加一个Form类对象字段,爬虫类构造函数初始化Form类对象;
2.在爬取方法中,通过Form类对象调用窗体类的显示方法(addUrl),把原来输出到控制台的内容改为显示在listBox中;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.Collections;
using System.Threading;
using System.Text.RegularExpressions;
using System.IO;

namespace Crawler
{
    /// <summary>
    /// Minimal web crawler that reports its progress to a <see cref="Form1"/>
    /// through <c>addUrl</c> instead of writing to the console.
    /// </summary>
    class Crawler
    {
        // Matches href/src attributes whose quoted value contains no quote, '#' or '>'.
        // Built once and shared, instead of being re-created on every Parse call.
        private static readonly Regex LinkRegex =
            new Regex(@"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']");

        // Crawl stops once count exceeds this value, so at most MaxCount + 1 pages are fetched.
        private const int MaxCount = 10;

        /// <summary>
        /// Discovered links: key is the link text (string), value is a bool that is
        /// false until the link has been crawled and true afterwards.
        /// </summary>
        public Hashtable urls = new Hashtable();

        // Number of pages processed so far; also used as the file name of the next saved page.
        private int count = 0;

        // Form that receives the progress messages.
        private readonly Form1 f;

        /// <summary>Creates a crawler that reports to the given form.</summary>
        /// <param name="f1">Form whose <c>addUrl</c> method displays progress messages.</param>
        /// <exception cref="ArgumentNullException"><paramref name="f1"/> is null.</exception>
        public Crawler(Form1 f1)
        {
            if (f1 == null)
            {
                throw new ArgumentNullException("f1");
            }
            f = f1;
        }

        /// <summary>
        /// Repeatedly picks a link from <see cref="urls"/> that has not been crawled yet,
        /// downloads it, marks it as crawled and adds the links found in it.
        /// Stops when no uncrawled link is left or the page limit is exceeded.
        /// </summary>
        public void Crawl()
        {
            f.addUrl("开始爬行了....");
            while (true)
            {
                // Find one link that has not been downloaded yet.
                string current = null;
                foreach (string url in urls.Keys)
                {
                    if ((bool)urls[url])
                    {
                        continue; // already downloaded
                    }
                    current = url;
                    break; // one candidate is enough; no need to scan the rest of the table
                }

                if (current == null || count > MaxCount)
                {
                    break;
                }

                f.addUrl("爬行" + current + "页面!");

                string html = DownLoad(current);

                // The table is only modified here, after the enumeration above has finished.
                urls[current] = true;
                count++;

                Parse(html); // extract links and register the new ones
            }
            f.addUrl("爬行结束");
        }

        /// <summary>
        /// Downloads <paramref name="url"/>, saves the raw bytes to a file named after the
        /// current page counter, and returns the content decoded as UTF-8.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>The page text, or an empty string if anything fails.</returns>
        public string DownLoad(string url)
        {
            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                req.Timeout = 30000;

                byte[] buffer;
                // Dispose the response and its stream so the connection is released.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream body = response.GetResponseStream())
                {
                    buffer = ReadInstreamIntoMemory(body);
                }

                // WriteAllBytes truncates an existing file, so a shorter page never keeps
                // stale trailing bytes from an earlier run (FileMode.OpenOrCreate did).
                File.WriteAllBytes(count.ToString(), buffer);

                return Encoding.UTF8.GetString(buffer);
            }
            catch (Exception ex)
            {
                // Best effort by design: a page that cannot be fetched or saved is skipped.
                // The failure is traced instead of being silently discarded.
                System.Diagnostics.Trace.WriteLine("DownLoad failed for " + url + ": " + ex.Message);
            }
            return "";
        }

        /// <summary>
        /// Extracts href/src attribute values from <paramref name="html"/> and registers each
        /// one that is not yet known in <see cref="urls"/> as not crawled.
        /// </summary>
        /// <param name="html">Page text; null or empty input registers nothing.</param>
        public void Parse(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            foreach (Match match in LinkRegex.Matches(html))
            {
                string attribute = match.Value;
                // Keep what follows the first '=' and strip quotes, '#', spaces and '>' at both ends.
                string link = attribute.Substring(attribute.IndexOf('=') + 1)
                                       .Trim('"', '\'', '#', ' ', '>');
                if (link.Length == 0)
                {
                    continue;
                }

                if (!urls.ContainsKey(link))
                {
                    urls[link] = false;
                }
            }
        }

        /// <summary>Reads <paramref name="stream"/> to its end and returns all bytes read.</summary>
        private static byte[] ReadInstreamIntoMemory(Stream stream)
        {
            const int bufferSize = 16384;
            byte[] chunk = new byte[bufferSize];
            using (MemoryStream ms = new MemoryStream())
            {
                int numBytesRead;
                while ((numBytesRead = stream.Read(chunk, 0, bufferSize)) > 0)
                {
                    ms.Write(chunk, 0, numBytesRead);
                }
                return ms.ToArray();
            }
        }
    }
}
二、窗体类
在button控件的Click事件处理方法中创建爬虫类对象,把窗体对象(this)传给它,然后调用爬取方法。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Net;
using System.Collections;
using System.Threading;
using System.Text.RegularExpressions;
using System.IO;

namespace Crawler
{
    /// <summary>
    /// Main window: starts a crawl from the address typed into textBox1 and lists the
    /// crawler's progress messages in listBox1.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a crawl at the address in textBox1. The crawl runs on a worker thread so
        /// the window stays responsive and the list box repaints while pages are fetched.
        /// </summary>
        private async void button1_Click(object sender, EventArgs e)
        {
            string startUrl = textBox1.Text.Trim();
            if (startUrl.Length == 0)
            {
                return;
            }

            Crawler myCrawler = new Crawler(this);
            myCrawler.urls.Add(startUrl, false); // register the start page as not crawled yet

            // Disable the control that triggered the crawl so a second crawl cannot be
            // started while this one is still running.
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                await Task.Run(() => myCrawler.Crawl());
            }
            finally
            {
                if (trigger != null && !trigger.IsDisposed)
                {
                    trigger.Enabled = true;
                }
            }
        }

        /// <summary>Clears the result list and the address box, then focuses the address box.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            listBox1.Items.Clear();
            textBox1.Text = "";
            textBox1.Focus();
        }

        /// <summary>
        /// Appends <paramref name="disp"/> to listBox1 and scrolls to it. May be called from
        /// any thread: calls from other threads are forwarded to the UI thread.
        /// </summary>
        /// <param name="disp">Text to append to the list.</param>
        public void addUrl(string disp)
        {
            if (IsDisposed)
            {
                return;
            }

            if (InvokeRequired)
            {
                try
                {
                    // Re-enter this method on the UI thread without blocking the caller.
                    BeginInvoke(new Action<string>(addUrl), disp);
                }
                catch (InvalidOperationException)
                {
                    // The window handle is gone (form closing); the message is dropped.
                }
                return;
            }

            listBox1.Items.Add(disp);
            // Select the last item to scroll the list to the bottom, then clear the selection.
            listBox1.SelectedIndex = listBox1.Items.Count - 1;
            listBox1.SelectedIndex = -1;
        }
    }
}