Perl爬虫研究
这几天忙着做项目和一些W3A的测试,没啥时间研究别的.
今天趁着快放假,也给自己放放假吧.看了下云总写的Perl爬虫,发现有多处不懂.
但是部分地方算是理解了,看来目标还是很遥远的.
给代码加了下注释,不过太累了,准备睡觉了..写了部分,改天补全..
凑合着看吧....
#!/usr/bin/perl
# Multithreaded web crawler: breadth-first walk of a single host,
# de-duplicating discovered URLs with a Bloom filter.
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI::URL;
use Web::Scraper;

# Maximum number of concurrently running worker threads.
my $max_threads = 15;

# Crawl target: first command-line argument, with a fallback default.
my $base_url = $ARGV[0] || 'http://www.icylife.net';

# Only links on this host are followed (same-domain crawl).
my $host = URI::URL->new($base_url)->host;

# Work queue of URLs waiting to be fetched (shared between threads).
my $queue = Thread::Queue->new();

# Counting semaphore capping the number of live worker threads.
my $semaphore = Thread::Semaphore->new($max_threads);

# Binary semaphore used as a mutex around filter/queue updates.
my $mutex = Thread::Semaphore->new(1);

# Bloom filter for probabilistic "have we seen this URL?" checks.
my $filter = shared_clone(
    Bloom::Filter->new( capacity => 1000, error_rate => 0.0001 )
);

# Seed the crawl with the start URL.
$queue->enqueue($base_url);
$filter->add($base_url);

while (1) {
    # Reap any workers that have already finished.
    foreach ( threads->list(threads::joinable) ) {
        $_->join();
    }

    my $pending = $queue->pending();
    if ( $pending == 0 ) {
        my $active = threads->list(threads::running);
        # No queued URLs and no running workers: the crawl is complete.
        if ( $active == 0 ) {
            print "All done!\n";
            last;
        }
        # Workers are still running and may enqueue more URLs; wait.
        else {
            sleep 1;
            next;
        }
    }

    # URLs are waiting: start another worker (blocks at the thread cap).
    $semaphore->down;
    threads->create( \&ProcessUrl );
}

# Join any remaining threads before exiting.
foreach ( threads->list() ) {
    $_->join();
}

# Worker: drain the queue, scrape each page for <a href> links, and
# enqueue unseen same-host links.
sub ProcessUrl {
    my $scraper = scraper {
        process '//a', 'links[]' => '@href';
    };

    while ( my $url = $queue->dequeue_nb() ) {
        my $res;
        eval { $res = $scraper->scrape( URI->new($url) )->{'links'}; };
        if ($@) {
            warn "$@\n";
            next;
        }
        next if !defined $res;

        foreach ( @{$res} ) {
            # BUGFIX: resolve relative links against the current page
            # BEFORE the scheme/host checks. The original checked
            # $link->scheme first, so relative URLs (undef scheme) were
            # dropped with "uninitialized value" warnings.
            my $link = URI::URL->new( $_->as_string, $url )->abs;

            # Only follow http/https links on the same host.
            next if $link->scheme ne 'http' && $link->scheme ne 'https';
            next if $link->host ne $host;

            $link = $link->as_string;

            # Strip any fragment part (everything from the first '#').
            $link =~ s/#.*\z//s;

            # Skip binary / static resources. BUGFIX: the dot is escaped;
            # the original /.(jpg|...)$/ also matched e.g. "foojpg".
            next if $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|doc|js|css|docx|xls|xlsx)$/i;

            # Serialize Bloom-filter + queue updates across workers.
            $mutex->down();
            if ( !$filter->check($link) ) {
                print $filter->key_count(), " ", $link, "\n";
                $filter->add($link);
                $queue->enqueue($link);
            }
            $mutex->up();
        }
    }

    # Release a slot so the main loop can spawn another worker.
    $semaphore->up();
}