基于深度优先搜索的蜘蛛程序
这几天发现了一个很好的图片网站,美女特多!就打算下点图片,但是自己一张张手动下载的话,翻来覆去太麻烦,所以找了个蜘蛛程序来帮忙。
随便在网上查了查,就下载了一个名叫 WebSpider 的蜘蛛程序。我仔细研究了一下,感觉作者也只是写着玩,意思意思而已:网页下载下来之后
基本就丢弃了。另外我对它的结构也不太满意,所以动手改了改。
我的大致想法是采用双线程:一个UI线程,一个工作线程。抓取方面采用深度优先搜索,基本思路是:获取当前网页,提取并下载其中的图片,然后用正则表达式匹配出网页中的网址,再对这些网址递归处理。在处理过程中,使用一个集合类来记录已处理过的网址,防止死循环。代码大致如下:
/// <summary>
/// Fetches the page at <c>state.Uri</c>, downloads the images it references and then
/// recursively processes every link in it that has not been visited yet (depth-first).
/// </summary>
/// <param name="state">
/// Descriptor of the page to process. Its status, content and success flags are filled in here.
/// </param>
/// <returns>
/// <c>true</c> when the page was requested and handled without an exception; otherwise <c>false</c>.
/// </returns>
public bool Process( WebPageState state )
{
    // Links are no longer followed once `level` exceeds this value.
    const int MaxLevel = 10;

    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    if (level == 1)
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            HttpWebResponse httpRes = res as HttpWebResponse;
            if (httpRes != null)
            {
                state.StatusCode = httpRes.StatusCode.ToString( );
                state.StatusDescription = httpRes.StatusDescription;
            }
            if (res is FileWebResponse)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            // Null-safe comparison: StatusCode is only assigned above for HTTP and file responses.
            if ("OK".Equals( state.StatusCode ))
            {
                // Dispose the reader (and with it the response stream) as soon as the body is read.
                using (StreamReader sr = new StreamReader( res.GetResponseStream( ) ))
                {
                    state.Content = sr.ReadToEnd( );
                }

                // ---- Step 1: download every image referenced by the page ----
                MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
                for (int k = 0; k < m.Count; k++)
                {
                    // NOTE(review): the download address comes from group 1 while the de-duplication
                    // key comes from the "url" group; confirm both refer to the same capture.
                    string address = m[k].Groups[1].ToString();

                    // A malformed address must not abort the rest of the page, so skip it.
                    Uri imageUri;
                    if (!Uri.TryCreate(state.Uri, m[k].Groups["url"].ToString(), out imageUri))
                    {
                        continue;
                    }

                    if (m_pages.Contains(imageUri.AbsoluteUri))
                    {
                        continue;
                    }

                    m_pages.Add(imageUri.AbsoluteUri);
                    DownloadImage(state.Uri, address);

                    if (this.ContentHandler != null)
                    {
                        // Report progress for this image to the registered handler.
                        state.mes.MaxProgress = m.Count;
                        state.mes.Progress = k + 1;
                        state.mes.Result = state.Uri.AbsoluteUri;
                        state.mes.Status = TaskStatus.Running;
                        state.mes.Message = "当前共有图片下载数"+m.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + address;
                        ContentHandler.Invoke(state);
                    }
                }

                // ---- Step 2: follow the links found in the page, depth-first ----
                Match mm = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
                while (mm.Success)
                {
                    Uri linkUri;
                    if (Uri.TryCreate(state.Uri, mm.Groups["url"].ToString(), out linkUri)
                        && ValidPage(linkUri)
                        && !m_pages.Contains(linkUri.AbsoluteUri))
                    {
                        if (level > MaxLevel)
                        {
                            // Stop following links from this page. Leaving the loop (instead of
                            // returning) keeps the success flag and the final log line consistent.
                            break;
                        }

                        WebPageState child = new WebPageState(linkUri);
                        m_pages.Add(linkUri.AbsoluteUri);

                        // `level` must describe the current recursion depth, so it is restored
                        // when the child call returns, even if that call throws.
                        level++;
                        try
                        {
                            Process(child);
                        }
                        finally
                        {
                            level--;
                        }
                    }

                    mm = mm.NextMatch();
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch( Exception ex )
        {
            HandleException( ex, state );
        }
        finally
        {
            if ( res != null )
            {
                res.Close( );
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine( ex.ToString( ) );
    }
    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}
111 #endregion
112
113
/// <summary>
/// Resolves <paramref name="imgUri"/> against <paramref name="m_bb"/> and, when the resulting
/// address ends in a jpg, jpeg or swf extension, saves it as <c>temp\img{id}.{ext}</c>.
/// A failed download is logged to the console and otherwise ignored (best effort).
/// </summary>
/// <param name="m_bb">Address of the page the image reference was found in.</param>
/// <param name="imgUri">Absolute or relative address of the image.</param>
private void DownloadImage(Uri m_bb,string imgUri)
{
    Uri imageUri = null;

    try
    {
        imageUri = new Uri(m_bb, imgUri);

        // Presumably the text after the last '.' of the address; verify against StrUtil.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // Exact match only: a substring test against "jpg|jpeg|swf" would also accept
        // "", "j", "pg", "g|j" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // Make sure the relative output directory exists before writing into it.
        Directory.CreateDirectory("temp");

        // The file id is consumed only when a download is actually attempted.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Best effort: one failed image must not stop the crawl, but the failure is reported.
        Console.WriteLine(
            "could not download img: {0} ({1})",
            imageUri != null ? imageUri.AbsoluteUri : imgUri,
            ex.Message);
    }
}
现在基本可以下载图片了,不过感觉需要优化的地方还比较多:递归的层级暂时没有控制,性能也一般,代码结构还比较乱,后续再重构!
2 {
3 state.ProcessStarted = true;
4 state.ProcessSuccessfull = false;
5
6 if(level==1)
7 m_baseUri = state.Uri;
8 try
9 {
10 Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );
11
12 WebRequest req = WebRequest.Create( state.Uri );
13 WebResponse res = null;
14
15 try
16 {
17 res = req.GetResponse( );
18
19 if ( res is HttpWebResponse )
20 {
21 state.StatusCode = ((HttpWebResponse)res).StatusCode.ToString( );
22 state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
23 }
24 if ( res is FileWebResponse )
25 {
26 state.StatusCode = "OK";
27 state.StatusDescription = "OK";
28 }
29
30 if ( state.StatusCode.Equals( "OK" ) )
31 {
32 StreamReader sr = new StreamReader( res.GetResponseStream( ) );
33
34 state.Content = sr.ReadToEnd( );
35
36
37 MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
38 string Address;
39 int k=0;
40 for (k = 0; k < m.Count;k++)
41 {
42
43
44 Address = m[k].Groups[1].ToString();
45 Uri uri = new Uri(state.Uri, m[k].Groups["url"].ToString());
46 // statusBar.Text = "Address: " + Address;
47 if (!m_pages.Contains(uri.AbsoluteUri))
48 {
49 m_pages.Add(uri.AbsoluteUri);
50 DownloadImage(state.Uri, Address);
51 if (this.ContentHandler != null)
52 {
53 state.mes.MaxProgress = m.Count;
54
55 state.mes.Progress = k+1;
56 state.mes.Result = state.Uri.AbsoluteUri;
57 state.mes.Status = TaskStatus.Running;
58 state.mes.Message = "当前共有图片下载数"+m.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
59 ContentHandler.Invoke(state);
60 }
61 }
62
63
64 }
65
66 int counter = 0;
67 Match mm= RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
68
69 while (mm.Success)
70 {
71 Uri uri = new Uri(state.Uri, mm.Groups["url"].ToString());
72 if (ValidPage(uri) && !m_pages.Contains(uri.AbsoluteUri))
73 {
74 if (level > 10)
75 return true;
76 counter++;
77 level++;
78 WebPageState statec = new WebPageState(uri);
79 m_pages.Add(uri.AbsoluteUri);
80 Process(statec);
81 }
82
83
84 mm = mm.NextMatch();
85 }
86
87 }
88
89 state.ProcessSuccessfull = true;
90 }
91 catch( Exception ex )
92 {
93 HandleException( ex, state );
94 }
95 finally
96 {
97 if ( res != null )
98 {
99 res.Close( );
100 }
101 }
102 }
103 catch (Exception ex)
104 {
105 Console.WriteLine( ex.ToString( ) );
106 }
107 Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );
108
109 return state.ProcessSuccessfull;
110 }
111 #endregion
112
113
/// <summary>
/// Resolves <paramref name="imgUri"/> against <paramref name="m_bb"/> and, when the resulting
/// address ends in a jpg, jpeg or swf extension, saves it as <c>temp\img{id}.{ext}</c>.
/// A failed download is logged to the console and otherwise ignored (best effort).
/// </summary>
/// <param name="m_bb">Address of the page the image reference was found in.</param>
/// <param name="imgUri">Absolute or relative address of the image.</param>
private void DownloadImage(Uri m_bb,string imgUri)
{
    Uri imageUri = null;

    try
    {
        imageUri = new Uri(m_bb, imgUri);

        // Presumably the text after the last '.' of the address; verify against StrUtil.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // Exact match only: a substring test against "jpg|jpeg|swf" would also accept
        // "", "j", "pg", "g|j" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // Make sure the relative output directory exists before writing into it.
        Directory.CreateDirectory("temp");

        // The file id is consumed only when a download is actually attempted.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Best effort: one failed image must not stop the crawl, but the failure is reported.
        Console.WriteLine(
            "could not download img: {0} ({1})",
            imageUri != null ? imageUri.AbsoluteUri : imgUri,
            ex.Message);
    }
}