接触ruby第二天用它写的一个网络爬虫程序

这个程序写的很简单，刚接触ruby第二天写的，写于2009年12月，主要完成的功能是到斯坦福大学的网站上去收集email地址，，默认是10个线程，策略是广度优先，$debug=true时开启调试信息。附件中包含代码和批处理文件。

运行命令为：
ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
其中参数：2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->URL
运行结果输出为文档文件emails_md[max_depth]_mp[max_pages]_[URL].txt

1.require 'open-uri'
2.require 'thread'
3.# run it like this :
4.# ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People
5.# regexp
6.$link_regexp = /href\=\"[^\"]*\"/
7.$email_regexp_1 = /mailto\:[^\@]*\@[^\"]*[\"]/  #mailto:xx@xxxx"
8.$email_regexp_2 = /[\>][^\<]*\@[^\>]*[\<]/  #>xx@xx<
9.$before_at = /[a-zA-Z0-9]+[_?a-zA-Z0-9]+/
10.$after_at = /[a-zA-Z]+[-?a-zA-Z]*\.+[a-zA-Z]+/
11.$email_regexp=/#{$before_at}\@#{$after_at}/ #xx@xx.xx
12.
13.#ARGV
14.if ARGV==nil||ARGV.length<3
15.    puts '-- Command --'
16.    puts 'ruby Crawl.rb 2 1000 http://www-cs.stanford.edu/People'
17.    puts 'help: 2->max_depth, 1000->max_pages, http://www-cs.stanford.edu/People->url'
18.    exit(0)
19.end
20.$url=ARGV[2]
21.$max_depth=ARGV[0].to_i
22.$max_pages=ARGV[1].to_i
23.$fname='emails_md'+String($max_depth)+'_mp'+String($max_pages)+'_'+$url.gsub(/[\/\:]/,'_')+'.txt'
24.$fname_links='links_md'+String($max_depth)+'_mp'+String($max_pages)+'_'+$url.gsub(/[\/\:]/,'_')+'.txt'
25.$thread_num=10
26.$debug=true
27.
28.$links_stack=[]     #fifo #[[depth1,link1],[depth2,link2],[depth3,link3],...]
29.$links_crawled=[]   #[url1,url2,url3,...]
30.$emails=[]      #[email1,email2,email3,...]
31.
32.class Crawl
33.    def initialize url,depth
34.        @url=url
35.        while @url[-1,1]=='/'
36.            @url=@url.slice(0,@url.length-1)
37.        end
38.        @depth=depth
39.        begin
40.        @html=open(@url).read
41.        rescue
42.        @html=''
43.        end
44.    end
45.    def get_links
46.        @html.scan($link_regexp) do |match|
47.            u=Util.format_url(match,@url)
48.            if !(u==nil)&&!$links_crawled.include?(match)&&$links_stack.rassoc(match)==nil
49.                $links_stack.push [@depth,u]
50.            end
51.        end
52.    end
53.    def get_emails
54.        @html.scan($email_regexp_1) do |match|
55.            match=Util.format_email(match)
56.            if match!=nil&&!$emails.include?(match)
57.                $emails.push match
58.                msg= match+', '+@url
59.                puts msg
60.                Util.write($fname,msg+"\r\n")
61.            end
62.        end
63.        @html.scan($email_regexp_2) do |match|
64.            match=Util.format_email(match)
65.            if match!=nil&&!$emails.include?(match)
66.                $emails.push match
67.                msg= match+', '+@url
68.                puts msg
69.                Util.write($fname,msg+"\r\n")
70.            end
71.        end
72.    end
73.end
74.
75.class Util
76.    # format url
77.    def Util.format_url url,f_url
78.        # remove 'www-'
79.        f_url=f_url.gsub(/www\-/, '')
80.        url=url[6,url.length-7]
81.        # exclude css & js & '#'(eg http://www-cs.stanford.edu/People/faculty#Regular%20Faculty)...
82.        if Util.exclude(url)==nil||url.include?('#')
83.            return nil
84.        end
85.        # full path
86.        if url[0,4]!='http'
87.            while url.index('/')==0
88.                url=url.slice(1,url.length-1)
89.            end
90.            url=f_url+'/'+url
91.        end
92.        return url
93.    end
94.    # format email
95.    def Util.format_email email
96.        email=email.delete('>').delete('<').delete('mailto:').delete('"').strip
97.        if String($email_regexp.match(email))== email
98.            return email.downcase
99.        else
100.            return nil
101.        end
102.    end
103.    # write msg to file
104.    def Util.write fname,msg
105.        file=File.new(fname,'a')
106.        file<<msg
107.        file.close()
108.    end
109.    # exclude css & js...
110.    def Util.exclude str
111.        ex=['css','js','pdf','jpg']
112.        ex.each do |e|
113.            index=e.length+1
114.            if str.length>index && str[-index,index]=='.'+e
115.                return nil
116.            end
117.        end
118.        return str
119.    end
120.end
121.$count=1
122.0.upto($max_depth) do |i|
123.    if $debug
124.        puts '~~depth->'+String(i)
125.    end
126.    if i==0
127.        c=Crawl.new($url,i+1)
128.        c.get_links
129.        c.get_emails
130.        $links_crawled.push [i,$url]
131.    end
132.    #breadth first
133.    while $links_stack.length!=0
134.        if $debug
135.            puts '~~count->'+String($count)+',stack->'+String($links_stack.length)+',crawled->'+String($links_crawled.length)+',total->'+String($links_crawled.length+$links_stack.length)
136.            $count=$count+1
137.        end
138.        #Thread.abort_on_exception = true
139.        threads = []
140.        if $links_stack.length/$thread_num>=1
141.            ts=$thread_num
142.        else
143.            ts=$links_stack.length%$thread_num
144.        end
145.        ts.times { |i|
146.            threads << Thread.new(i) {
147.            Mutex.new.synchronize {
148.                if ($links_crawled.length+$links_stack.length)<=$max_pages&&i!=$max_depth
149.                    link=$links_stack.shift     #fifo
150.                    if  link[0]==i+1
151.                        #read links & emails from pages in stack
152.                        c=Crawl.new(link[1],i+2)
153.                        c.get_links
154.                        c.get_emails
155.                        $links_crawled.push link[1]
156.                    else
157.                        break
158.                    end
159.                else
160.                    #only read emails from pages in stack
161.                    link=$links_stack.shift
162.                    c=Crawl.new(link[1],i+2)
163.                    c.get_emails
164.                    $links_crawled.push link[1]
165.                end
166.            }
167.            }
168.        }
169.        threads.each{|t|t.join}
170.    end
171.end

posted @ 2011-10-25 12:52 Bubble 阅读(1136) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

BUBBLE, BUBBLE 欢呼雀跃！

他只是来过，现在还在，不知什么时候走开

接触ruby第二天用它写的一个网络爬虫程序

公告