基于深度优先搜索的蜘蛛程序

这几天发现一个很好的图片网站，美女特多！就打算下点图片，但是自己下载的话，翻来覆去的太麻烦，所以找了个蜘蛛来帮忙。
随便在网上查了查，就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下，感觉作者也是写着玩的，意思意思，网页下载下来基本就丢了，另外结构上感觉不太满意，所以就改了改。
我大致的想法是采用双线程，一个UI线程，一个工作线程；抓取方面采用深度优先搜索，基本思路：得到当前网页，提取并下载图片，然后用正则表达式匹配网址，再递归处理！在处理过程中，使用一个集合类来收集处理过的网址，防止死循环。代码大致如下：

/// <summary>
/// Fetches the page at <c>state.Uri</c>, downloads the images it references and then
/// follows its links depth-first by calling itself for every page not yet visited.
/// </summary>
/// <param name="state">
/// Page to process. Its status, content, progress message and success flag are
/// updated in place.
/// </param>
/// <returns>
/// <c>true</c> when the page was requested and scanned without an exception;
/// otherwise <c>false</c>.
/// </returns>
public bool Process( WebPageState state )
{
    // Largest value of "level" from which the crawl may still descend into a child page.
    const int MaxLevel = 10;

    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    // The page processed at level 1 is remembered as the base URI.
    if ( level == 1 )
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            HttpWebResponse httpRes = res as HttpWebResponse;
            if ( httpRes != null )
            {
                state.StatusCode = httpRes.StatusCode.ToString( );
                state.StatusDescription = httpRes.StatusDescription;
            }

            if ( res is FileWebResponse )
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            // Compare from the literal side so a response type that set no status
            // (neither HTTP nor file) is simply skipped instead of throwing.
            if ( "OK".Equals( state.StatusCode ) )
            {
                // Dispose the reader (and the response stream it wraps) as soon as the body is read.
                using ( StreamReader sr = new StreamReader( res.GetResponseStream( ) ) )
                {
                    state.Content = sr.ReadToEnd( );
                }

                // ---- Pass 1: download every image referenced by the page. ----
                MatchCollection m = RegExUtil.GetMatchRegEx( RegularExpression.SrcExtractor, state.Content, true );

                for ( int k = 0; k < m.Count; k++ )
                {
                    string Address = m[k].Groups[1].ToString( );

                    // A malformed src must not abort the whole page; skip just that entry.
                    Uri uri;
                    if ( !Uri.TryCreate( state.Uri, m[k].Groups["url"].ToString( ), out uri ) )
                    {
                        continue;
                    }

                    // m_pages records every address already handled, images included.
                    if ( m_pages.Contains( uri.AbsoluteUri ) )
                    {
                        continue;
                    }

                    m_pages.Add( uri.AbsoluteUri );
                    DownloadImage( state.Uri, Address );

                    if ( this.ContentHandler != null )
                    {
                        state.mes.MaxProgress = m.Count;
                        state.mes.Progress = k + 1;
                        state.mes.Result = state.Uri.AbsoluteUri;
                        state.mes.Status = TaskStatus.Running;
                        state.mes.Message = "当前共有图片下载数" + m.Count + " 现在正在下载第" + state.mes.Progress.ToString( ) + "图片" + Address;

                        ContentHandler.Invoke( state );
                    }
                }

                // ---- Pass 2: follow the page's links depth-first. ----
                Match mm = RegExUtil.GetMatchRegEx( RegularExpression.UrlExtractor, state.Content );

                while ( mm.Success )
                {
                    Uri uri;
                    if ( Uri.TryCreate( state.Uri, mm.Groups["url"].ToString( ), out uri )
                        && ValidPage( uri )
                        && !m_pages.Contains( uri.AbsoluteUri ) )
                    {
                        // Depth limit reached: stop following links from this page, but
                        // still fall through so the page itself is reported as successful.
                        if ( level > MaxLevel )
                        {
                            break;
                        }

                        m_pages.Add( uri.AbsoluteUri );

                        // "level" tracks the current recursion depth, so it must be
                        // restored once the child page has been processed.
                        level++;
                        try
                        {
                            Process( new WebPageState( uri ) );
                        }
                        finally
                        {
                            level--;
                        }
                    }

                    mm = mm.NextMatch( );
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch ( Exception ex )
        {
            HandleException( ex, state );
        }
        finally
        {
            if ( res != null )
            {
                res.Close( );
            }
        }
    }
    catch ( Exception ex )
    {
        // Failures before the request is issued, or thrown by HandleException itself.
        Console.WriteLine( ex.ToString( ) );
    }

    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}
111

#endregion
112

113

114

/// <summary>
/// Downloads a single image (or Flash file) referenced by a page into the local
/// <c>temp</c> folder as <c>img&lt;id&gt;.&lt;ext&gt;</c>. Only jpg, jpeg and swf files
/// are fetched; anything else is ignored. Failures are logged and never propagate.
/// </summary>
/// <param name="m_bb">URI of the page the reference was found on; used to resolve relative addresses.</param>
/// <param name="imgUri">Image address as it appeared in the page (absolute or relative).</param>
private void DownloadImage(Uri m_bb,string imgUri)
{
    Uri imageUri = null;

    try
    {
        imageUri = new Uri(m_bb, imgUri);

        // NOTE(review): StrUtil.RightLastIndexOf is assumed to return the text after the
        // last "." of the address - confirm against its definition.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // The file id is consumed for every address, including skipped ones, as before.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        // Exact comparison: a substring test against "jpg|jpeg|swf" would also accept
        // "", "j", "pg", "g|j" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // DownloadFile does not create missing folders; this is a no-op when "temp" exists.
        System.IO.Directory.CreateDirectory("temp");

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Best effort: one broken image must not stop the crawl, but it should be visible.
        Console.WriteLine(
            "Could not download image {0}: {1}",
            imageUri != null ? imageUri.AbsoluteUri : imgUri,
            ex.Message);
    }
}

现在基本可以下载图片了，不过感觉要优化的地方较多！递归的层级暂时没有控制，性能也是一般，代码的结构还是比较乱，后续再重构了！

posted on 2009-07-28 16:49 沧海一声笑阅读(590) 评论(2) 收藏举报

刷新页面返回顶部

JACKY

基于深度优先搜索的蜘蛛程序

导航

公告