接触ruby第二天用它写的一个网络爬虫程序
这个程序写得很简单,是刚接触ruby第二天写的,写于2009年12月。主要完成的功能是到斯坦福大学的网站上去收集email地址,默认是10个线程,策略是广度优先,$debug=true时开启调试信息。附件中包含代码和批处理文件。
运行命令为:
ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
其中参数:2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->URL
运行结果输出为文档文件emails_md[max_depth]_mp[max_pages]_[URL].txt
require 'open-uri'

# Usage:
#   ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
# Arguments: max_depth max_pages start_url

# Regular expressions used by the crawler.
$link_regexp = /href\=\"[^\"]*\"/              # href="..."
$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/ # mailto:xx@xxxx"
$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/     # >xx@xx<
$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
$after_at = /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/
$email_regexp = /#{$before_at}\@#{$after_at}/  # xx@xx.xx

# Command-line arguments: max_depth, max_pages, start url.
# (ARGV is never nil in Ruby, so only the length needs checking.)
if ARGV.length < 3
  puts '-- Command --'
  puts 'ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People'
  puts 'help: 2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->url'
  exit(0)
end
$url = ARGV[2]
$max_depth = ARGV[0].to_i
$max_pages = ARGV[1].to_i
# Output file names embed the run parameters; '/' and ':' are not legal
# in file names, so replace them with '_'.
$fname = "emails_md#{$max_depth}_mp#{$max_pages}_#{$url.gsub(/[\/\:]/, '_')}.txt"
$fname_links = "links_md#{$max_depth}_mp#{$max_pages}_#{$url.gsub(/[\/\:]/, '_')}.txt"
$thread_num = 10 # number of worker threads per batch
$debug = true    # print progress information when true

# Shared crawl state (accessed by all worker threads).
$links_stack = []   # FIFO of [depth, url] pairs still to visit
$links_crawled = [] # urls already visited
$emails = []        # unique e-mail addresses found so far
32.class Crawl
33. def initialize url,depth
34. @url=url
35. while @url[-1,1]=='/'
36. @url=@url.slice(0,@url.length-1)
37. end
38. @depth=depth
39. begin
40. @html=open(@url).read
41. rescue
42. @html=''
43. end
44. end
45. def get_links
46. @html.scan($link_regexp) do |match|
47. u=Util.format_url(match,@url)
48. if !(u==nil)&&!$links_crawled.include?(match)&&$links_stack.rassoc(match)==nil
49. $links_stack.push [@depth,u]
50. end
51. end
52. end
53. def get_emails
54. @html.scan($email_regexp_1) do |match|
55. match=Util.format_email(match)
56. if match!=nil&&!$emails.include?(match)
57. $emails.push match
58. msg= match+', '+@url
59. puts msg
60. Util.write($fname,msg+"\r\n")
61. end
62. end
63. @html.scan($email_regexp_2) do |match|
64. match=Util.format_email(match)
65. if match!=nil&&!$emails.include?(match)
66. $emails.push match
67. msg= match+', '+@url
68. puts msg
69. Util.write($fname,msg+"\r\n")
70. end
71. end
72. end
73.end
74.
# Stateless helpers for url/e-mail normalisation and file output.
class Util
  # Turn a raw href attribute (href="...") into an absolute url, or
  # nil when the link should be skipped.
  # url   - the raw match, e.g. 'href="People/faculty"'
  # f_url - url of the page the link was found on
  def Util.format_url url, f_url
    # the crawler treats www-foo.x and foo.x as the same host
    f_url = f_url.gsub(/www\-/, '')
    # strip the surrounding href="..." (6 leading chars, 1 trailing quote)
    url = url[6, url.length - 7]
    # skip style/script/document assets and in-page anchors
    # (e.g. http://www-cs.stanford.edu/People/faculty#Regular%20Faculty)
    return nil if Util.exclude(url).nil? || url.include?('#')
    # make relative links absolute
    if url[0, 4] != 'http'
      url = url.slice(1, url.length - 1) while url.index('/') == 0
      url = f_url + '/' + url
    end
    url
  end

  # Normalise a raw regexp match into a lower-case e-mail address, or
  # nil when the remainder is not a plausible address.
  def Util.format_email email
    # BUG FIX: String#delete removes every occurrence of each
    # *character* in its argument, so delete('mailto:') stripped the
    # letters m,a,i,l,t,o (and ':') out of the address itself,
    # corrupting nearly every result. Remove the surrounding markup
    # characters, then the literal "mailto:" prefix with sub.
    email = email.delete('><"').sub('mailto:', '').strip
    # accept only if the pattern matches the WHOLE remaining string
    if String($email_regexp.match(email)) == email
      email.downcase
    else
      nil
    end
  end

  # Append +msg+ to file +fname+ (block form closes the file even on
  # error).
  def Util.write fname, msg
    File.open(fname, 'a') { |file| file << msg }
  end

  # Return nil when +str+ points at an asset we never want to fetch
  # (stylesheets, scripts, documents, images); otherwise return str.
  def Util.exclude str
    ['css', 'js', 'pdf', 'jpg'].each do |ext|
      suffix = '.' + ext
      return nil if str.length > suffix.length && str[-suffix.length, suffix.length] == suffix
    end
    str
  end
end
# --- main driver: breadth-first, multi-threaded crawl ---------------
$count = 1
# BUG FIX: the original called `Mutex.new.synchronize` inside every
# thread, so no two threads ever shared a lock and the critical
# section was completely unprotected. All workers share this one mutex.
$mutex = Mutex.new
0.upto($max_depth) do |depth|
  puts '~~depth->' + String(depth) if $debug
  if depth == 0
    # seed the queue from the start page
    c = Crawl.new($url, depth + 1)
    c.get_links
    c.get_emails
    # BUG FIX: $links_crawled stores plain urls (see Crawl#get_links),
    # not [depth, url] pairs.
    $links_crawled.push $url
  end
  # breadth first: drain everything queued for the current depth
  until $links_stack.empty?
    if $debug
      puts '~~count->' + String($count) + ',stack->' + String($links_stack.length) + ',crawled->' + String($links_crawled.length) + ',total->' + String($links_crawled.length + $links_stack.length)
      $count = $count + 1
    end
    threads = []
    # one worker per queued link, at most $thread_num
    batch = $links_stack.length >= $thread_num ? $thread_num : $links_stack.length
    deeper = false # set when the queue front already belongs to the next depth
    batch.times do |worker|
      # BUG FIX: the original named this block parameter `i`, shadowing
      # the depth counter, so the depth and page-limit checks compared
      # against the worker index instead of the crawl depth.
      threads << Thread.new(worker) do
        $mutex.synchronize do
          if ($links_crawled.length + $links_stack.length) <= $max_pages && depth != $max_depth
            link = $links_stack.shift # FIFO
            if link.nil?
              # queue drained by an earlier worker of this batch
            elsif link[0] == depth + 1
              # follow links & collect e-mails from pages in the queue
              c = Crawl.new(link[1], depth + 2)
              c.get_links
              c.get_emails
              $links_crawled.push link[1]
            else
              # link belongs to a deeper level: put it back and let the
              # outer loop advance (the original `break`-ed out of the
              # thread here and silently dropped the link)
              $links_stack.unshift link
              deeper = true
            end
          else
            # page budget spent or max depth reached:
            # only read e-mails from the queued pages
            link = $links_stack.shift
            unless link.nil?
              c = Crawl.new(link[1], depth + 2)
              c.get_emails
              $links_crawled.push link[1]
            end
          end
        end
      end
    end
    threads.each { |t| t.join }
    # front of the queue is for the next depth level: stop this batch loop
    break if deeper
  end
end
# NOTE(review): everything from here down duplicates the first listing
# in this file (copy/paste artifact) — one of the two copies should be
# deleted entirely.

# Usage:
#   ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
# Arguments: max_depth max_pages start_url

# Regular expressions used by the crawler.
$link_regexp = /href\=\"[^\"]*\"/              # href="..."
$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/ # mailto:xx@xxxx"
$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/     # >xx@xx<
$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
$after_at = /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/
$email_regexp = /#{$before_at}\@#{$after_at}/  # xx@xx.xx

# Command-line arguments: max_depth, max_pages, start url.
# (ARGV is never nil in Ruby, so only the length needs checking.)
if ARGV.length < 3
  puts '-- Command --'
  puts 'ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People'
  puts 'help: 2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->url'
  exit(0)
end
$url = ARGV[2]
$max_depth = ARGV[0].to_i
$max_pages = ARGV[1].to_i
# Output file names embed the run parameters; '/' and ':' are not legal
# in file names, so replace them with '_'.
$fname = "emails_md#{$max_depth}_mp#{$max_pages}_#{$url.gsub(/[\/\:]/, '_')}.txt"
$fname_links = "links_md#{$max_depth}_mp#{$max_pages}_#{$url.gsub(/[\/\:]/, '_')}.txt"
$thread_num = 10 # number of worker threads per batch
$debug = true    # print progress information when true

# Shared crawl state (accessed by all worker threads).
$links_stack = []   # FIFO of [depth, url] pairs still to visit
$links_crawled = [] # urls already visited
$emails = []        # unique e-mail addresses found so far
32.class Crawl
33. def initialize url,depth
34. @url=url
35. while @url[-1,1]=='/'
36. @url=@url.slice(0,@url.length-1)
37. end
38. @depth=depth
39. begin
40. @html=open(@url).read
41. rescue
42. @html=''
43. end
44. end
45. def get_links
46. @html.scan($link_regexp) do |match|
47. u=Util.format_url(match,@url)
48. if !(u==nil)&&!$links_crawled.include?(match)&&$links_stack.rassoc(match)==nil
49. $links_stack.push [@depth,u]
50. end
51. end
52. end
53. def get_emails
54. @html.scan($email_regexp_1) do |match|
55. match=Util.format_email(match)
56. if match!=nil&&!$emails.include?(match)
57. $emails.push match
58. msg= match+', '+@url
59. puts msg
60. Util.write($fname,msg+"\r\n")
61. end
62. end
63. @html.scan($email_regexp_2) do |match|
64. match=Util.format_email(match)
65. if match!=nil&&!$emails.include?(match)
66. $emails.push match
67. msg= match+', '+@url
68. puts msg
69. Util.write($fname,msg+"\r\n")
70. end
71. end
72. end
73.end
74.
# NOTE(review): duplicate of the Util class defined earlier in this
# file (copy/paste artifact) — one copy should be deleted.
# Stateless helpers for url/e-mail normalisation and file output.
class Util
  # Turn a raw href attribute (href="...") into an absolute url, or
  # nil when the link should be skipped.
  def Util.format_url url, f_url
    # the crawler treats www-foo.x and foo.x as the same host
    f_url = f_url.gsub(/www\-/, '')
    # strip the surrounding href="..." (6 leading chars, 1 trailing quote)
    url = url[6, url.length - 7]
    # skip style/script/document assets and in-page anchors
    return nil if Util.exclude(url).nil? || url.include?('#')
    # make relative links absolute
    if url[0, 4] != 'http'
      url = url.slice(1, url.length - 1) while url.index('/') == 0
      url = f_url + '/' + url
    end
    url
  end

  # Normalise a raw regexp match into a lower-case e-mail address, or
  # nil when the remainder is not a plausible address.
  def Util.format_email email
    # BUG FIX: String#delete removes every occurrence of each
    # *character* in its argument, so delete('mailto:') stripped the
    # letters m,a,i,l,t,o (and ':') out of the address itself. Remove
    # the markup characters, then the literal "mailto:" prefix.
    email = email.delete('><"').sub('mailto:', '').strip
    # accept only if the pattern matches the WHOLE remaining string
    if String($email_regexp.match(email)) == email
      email.downcase
    else
      nil
    end
  end

  # Append +msg+ to file +fname+ (block form closes the file even on
  # error).
  def Util.write fname, msg
    File.open(fname, 'a') { |file| file << msg }
  end

  # Return nil when +str+ points at an asset we never want to fetch;
  # otherwise return str.
  def Util.exclude str
    ['css', 'js', 'pdf', 'jpg'].each do |ext|
      suffix = '.' + ext
      return nil if str.length > suffix.length && str[-suffix.length, suffix.length] == suffix
    end
    str
  end
end
# NOTE(review): duplicate of the main driver defined earlier in this
# file (copy/paste artifact) — one copy should be deleted, otherwise
# the whole crawl runs twice.
$count = 1
# BUG FIX: the original called `Mutex.new.synchronize` inside every
# thread, so the critical section was unprotected. All workers share
# this one mutex.
$mutex = Mutex.new
0.upto($max_depth) do |depth|
  puts '~~depth->' + String(depth) if $debug
  if depth == 0
    # seed the queue from the start page
    c = Crawl.new($url, depth + 1)
    c.get_links
    c.get_emails
    # BUG FIX: $links_crawled stores plain urls, not [depth, url] pairs.
    $links_crawled.push $url
  end
  # breadth first: drain everything queued for the current depth
  until $links_stack.empty?
    if $debug
      puts '~~count->' + String($count) + ',stack->' + String($links_stack.length) + ',crawled->' + String($links_crawled.length) + ',total->' + String($links_crawled.length + $links_stack.length)
      $count = $count + 1
    end
    threads = []
    # one worker per queued link, at most $thread_num
    batch = $links_stack.length >= $thread_num ? $thread_num : $links_stack.length
    deeper = false # set when the queue front already belongs to the next depth
    batch.times do |worker|
      # BUG FIX: the original named this block parameter `i`, shadowing
      # the depth counter, so the depth and page-limit checks compared
      # against the worker index instead of the crawl depth.
      threads << Thread.new(worker) do
        $mutex.synchronize do
          if ($links_crawled.length + $links_stack.length) <= $max_pages && depth != $max_depth
            link = $links_stack.shift # FIFO
            if link.nil?
              # queue drained by an earlier worker of this batch
            elsif link[0] == depth + 1
              # follow links & collect e-mails from pages in the queue
              c = Crawl.new(link[1], depth + 2)
              c.get_links
              c.get_emails
              $links_crawled.push link[1]
            else
              # link belongs to a deeper level: put it back and let the
              # outer loop advance (the original dropped it via `break`)
              $links_stack.unshift link
              deeper = true
            end
          else
            # page budget spent or max depth reached:
            # only read e-mails from the queued pages
            link = $links_stack.shift
            unless link.nil?
              c = Crawl.new(link[1], depth + 2)
              c.get_emails
              $links_crawled.push link[1]
            end
          end
        end
      end
    end
    threads.each { |t| t.join }
    # front of the queue is for the next depth level: stop this batch loop
    break if deeper
  end
end