基于深度优先搜索的蜘蛛程序
这几天发现一个很好的图片网站,美女特多
![](https://www.cnblogs.com/Emoticons/msn/teeth_smile.gif)
随便在网上查了查,就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下,感觉作者也只是写着玩、意思意思:网页下载下来基本就丢了。另外我对它的结构也不太满意,所以动手改了改。
我大致的想法是采用双线程:一个UI线程,一个工作线程。
1
/// <summary>
/// Fetches the page referenced by <paramref name="state"/>, downloads every image it
/// references that has not been recorded in <c>m_pages</c> yet, and then recursively
/// processes the links found on the page (depth-first).
/// </summary>
/// <param name="state">
/// Page to process. Its status, content, progress (<c>mes</c>) and success fields are
/// updated in place.
/// </param>
/// <returns>
/// <c>true</c> when the page was fetched and handled without an exception; otherwise
/// <c>false</c>. The same value is stored in <c>state.ProcessSuccessfull</c>.
/// </returns>
public bool Process( WebPageState state )
{
    // Deepest recursion level whose links are still followed.
    const int MaxLevel = 10;

    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    // NOTE(review): assumes the owner initialises `level` to 1 before the first call,
    // so that only the root page is remembered as the base URI -- confirm.
    if ( level == 1 )
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            if ( res is HttpWebResponse )
            {
                state.StatusCode = ((HttpWebResponse)res).StatusCode.ToString( );
                state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
            }
            if ( res is FileWebResponse )
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if ( state.StatusCode.Equals( "OK" ) )
            {
                // Dispose the reader as soon as the body has been read instead of
                // leaving it to the finalizer.
                using ( StreamReader sr = new StreamReader( res.GetResponseStream( ) ) )
                {
                    state.Content = sr.ReadToEnd( );
                }

                // Pass 1: download every image referenced by this page.
                MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
                for (int k = 0; k < m.Count; k++)
                {
                    string address = m[k].Groups[1].ToString();
                    Uri imageUri = new Uri(state.Uri, m[k].Groups["url"].ToString());

                    if (!m_pages.Contains(imageUri.AbsoluteUri))
                    {
                        m_pages.Add(imageUri.AbsoluteUri);
                        DownloadImage(state.Uri, address);

                        // Report progress to the subscriber, if any.
                        if (this.ContentHandler != null)
                        {
                            state.mes.MaxProgress = m.Count;
                            state.mes.Progress = k+1;
                            state.mes.Result = state.Uri.AbsoluteUri;
                            state.mes.Status = TaskStatus.Running;
                            state.mes.Message = "当前共有图片下载数"+m.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + address;
                            ContentHandler.Invoke(state);
                        }
                    }
                }

                // Pass 2: follow the page's links depth-first. Links are not followed
                // once the depth limit has been exceeded; the page itself still counts
                // as successfully processed.
                if (level <= MaxLevel)
                {
                    Match mm = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);

                    while (mm.Success)
                    {
                        Uri uri = new Uri(state.Uri, mm.Groups["url"].ToString());
                        if (ValidPage(uri) && !m_pages.Contains(uri.AbsoluteUri))
                        {
                            m_pages.Add(uri.AbsoluteUri);
                            WebPageState statec = new WebPageState(uri);

                            // `level` tracks the current recursion depth, so it must be
                            // restored when the child returns; otherwise it degenerates
                            // into a global page counter and sibling links are cut off.
                            level++;
                            try
                            {
                                Process(statec);
                            }
                            finally
                            {
                                level--;
                            }
                        }

                        mm = mm.NextMatch();
                    }
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch( Exception ex )
        {
            HandleException( ex, state );
        }
        finally
        {
            if ( res != null )
            {
                res.Close( );
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine( ex.ToString( ) );
    }

    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}
111
#endregion
112![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
113![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
114
/// <summary>
/// Downloads a single image or Flash movie referenced by a page into the local
/// "temp" directory. Only references whose extension is exactly jpg, jpeg or swf
/// are downloaded; anything else is ignored. Failures are logged to the console
/// and never propagate to the caller.
/// </summary>
/// <param name="m_bb">URI of the page the reference was found on; relative references are resolved against it.</param>
/// <param name="imgUri">Absolute or page-relative address of the image.</param>
private void DownloadImage(Uri m_bb,string imgUri)
{
    try
    {
        Uri imageUri = new Uri(m_bb, imgUri);

        // NOTE(review): StrUtil.RightLastIndexOf presumably returns the text after the
        // last "." -- confirm against its definition.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLower();

        // Exact comparison: the previous substring test against "jpg|jpeg|swf" also
        // accepted fragments such as "", "j", "pe" or "g|j".
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // DownloadFile does not create missing directories; CreateDirectory is a
        // no-op when the directory already exists.
        System.IO.Directory.CreateDirectory("temp");

        // Take a file id only for files that are actually downloaded.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }

        // The original also carried commented-out code that wrote an HTML viewer
        // entry (<object>/<img>) for each file; that output was already disabled.
    }
    catch (Exception ex)
    {
        // Best effort: one broken image must not stop the crawl, but make the
        // failure visible instead of swallowing it silently.
        Console.WriteLine("could not download img: " + imgUri + " (" + ex.Message + ")");
    }
}
现在基本可以下载图片了,不过感觉要优化的地方还比较多:递归的层级暂时没有控制,性能也一般,代码的结构还是比较乱,后续再重构!
2
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
3
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
4
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
5
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
6
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
7
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
8
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
9
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
10
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
11
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
12
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
13
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
14
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
15
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
16
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
17
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
18
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
19
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
20
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
21
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
22
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
23
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
24
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
25
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
26
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
27
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
28
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
29
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
30
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
31
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
32
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
33
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
34
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
35
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
36
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
37
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
38
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
39
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
40
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
41
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
42
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
43
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
44
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
45
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
46
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
47
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
48
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
49
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
50
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
51
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
52
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
53
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
54
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
55
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
56
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
57
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
58
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
59
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
60
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
61
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
62
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
63
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
64
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
65
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
66
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
67
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
68
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
69
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
70
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
71
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
72
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
73
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
74
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
75
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
76
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
77
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
78
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
79
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
80
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
81
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
82
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
83
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
84
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
85
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
86
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
87
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
88
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
89
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
90
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
91
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
92
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
93
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
94
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
95
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
96
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
97
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
98
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
99
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
100
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
101
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
102
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
103
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
104
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
105
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
106
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
107
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
108
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
109
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
110
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockEnd.gif)
111
![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
112
![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
113
![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
114
![](https://www.cnblogs.com/Images/OutliningIndicators/None.gif)
115
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockStart.gif)
116
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
117
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
118
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
119
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
120
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
121
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
122
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
123
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
124
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
125
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
126
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
127
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
128
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
129
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
130
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
131
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
132
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
133
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
134
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
135
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
136
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
137
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
138
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
139
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
140
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
141
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
142
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
143
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
144
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
145
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
146
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
147
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
148
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
149
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
150
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
151
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
152
![](https://www.cnblogs.com/Images/OutliningIndicators/InBlock.gif)
153
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
154
![](https://www.cnblogs.com/Images/OutliningIndicators/ExpandedBlockEnd.gif)