Perl爬虫代码

目前在做 Perl 页面爬虫的模块，发现了一些代码，在这里做个详细的分析，把其中好的部分引用过来给自己用。

  1 #!/usr/bin/perl -w
  2 
  3 use strict; 
  4  
  5 use HTTP::Request; 
  6 use HTTP::Status; 
  7 use HTML::LinkExtor; 
  8 use URI::URL; 
  9 use LWP::UserAgent; 
 10 #use Digest::MD5  qw(md5_hex); 
 11  
 12  
 13 use Compress::Zlib; 
 14  
 ####################################################################
 # Parameters Setting
 #
 # $StartUrl        - seed URL, queued when no earlier crawl state covers it
 # $bRestrict       - when true, ValidUrl() only accepts URLs that match an
 #                    entry of @restrictSite
 # @restrictSite    - patterns (interpolated as regexes by ValidUrl) that a
 #                    URL must match to be crawled
 # $bContinueBefore - when true, reload the state files of a previous run
 our $StartUrl = "http://xxx";
 our $bRestrict = 1;
 our @restrictSite = ('cxxx','context:');
 our $bContinueBefore = 1;
 ####################################################################

 print __FILE__,"\n";

 # In-memory crawl state.
 our %img_seen = ();        # image URLs already written to imglist.txt
 our %url_seen = ();        # page URLs that have been queued at some point
 our @url_queue = ();       # FIFO of page URLs still waiting to be fetched
 our %url_processed = ();   # page URLs already taken off the queue / saved

 # Declared for robots.txt handling; not used by the code in this listing.
 our %RobotDisallow = ();
 our %RobotAllow = ();
 our %site_seen = ();

 if ($bContinueBefore) {
     LoadBefore();
 }

 # Seed the crawl. Without this check a resumed run whose state files are
 # empty would start with an empty queue and exit without fetching anything.
 # The start URL is only injected when nothing is pending and it has been
 # neither queued nor processed before.
 if (   !@url_queue
     && !exists $url_seen{$StartUrl}
     && !exists $url_processed{$StartUrl} )
 {
     $url_seen{$StartUrl} = 1;
     push @url_queue, $StartUrl;
 }

 our $pageNum = 0;
 our $BucketNum = 0;

 OpenOutFile();

 # Append-mode log files; LoadBefore() reads the same files on resume.
 open(URLHASH, '>>', 'urlhash.txt')
     or die "Cannot open urlhash.txt for appending: $!";
 open(URLPROCESSED, '>>', 'urlprocessed.txt')
     or die "Cannot open urlprocessed.txt for appending: $!";
 open(URLREDIRECT, '>>', 'urlredirect.txt')
     or die "Cannot open urlredirect.txt for appending: $!";
 open(PAGELIST, '>>', 'pagelist.txt')
     or die "Cannot open pagelist.txt for appending: $!";
 open(IMGLIST, '>>', 'imglist.txt')
     or die "Cannot open imglist.txt for appending: $!";

 # Turn on autoflush for every log handle ($| applies to the currently
 # selected handle, so select each one briefly and then restore).
 for my $log_handle (*URLHASH, *URLPROCESSED, *URLREDIRECT, *PAGELIST, *IMGLIST) {
     my $previous = select($log_handle);
     $| = 1;
     select($previous);
 }

 our $urlhash_log = *URLHASH;
 our $urlprocessed_log = *URLPROCESSED;
 our $urlredirect_log = *URLREDIRECT;
 our $pagelist_log = *PAGELIST;
 our $imglist_log = *IMGLIST;

 # Shared HTTP client: keep-alive connections, 60 second timeout, fixed proxy.
 our $UA = LWP::UserAgent->new(
     keep_alive => 1,
     timeout    => 60,
 );
 $UA->agent('Mozilla/5.0');
 $UA->proxy(['ftp', 'http', 'wais', 'gopher'],'http://jpproxy:80/');

 # One link extractor reused for every page; linkCallback() fills the two
 # scratch arrays below, which the main loop clears before each parse.
 our $linkExtor = HTML::LinkExtor->new(\&linkCallback);
 our @tmpLinks = ();
 our @tmpImgs = ();
 79  
 # Main crawl loop: take URLs off the queue until it is empty, save each
 # fetched page, then harvest its links and images.
 my $url;
 while ( $url = next_url() )
 {
     print {$urlprocessed_log} $url,"\n";

     # get_url() returns undef for failures and for non-HTML resources.
     my $response = get_url( $url );
     next if !defined $response;

     my $base = $response->base->as_string;

     # A base different from the requested URL is treated as a redirect.
     if ( $base ne $url )
     {
         next if !ValidUrl($base);

         print {$urlredirect_log} $url,"\t",$base,"\n";

         $url_seen{$base} ++;
         print {$urlhash_log} $base,"\n";

         # The redirect target was already handled earlier.
         next if exists $url_processed{$base};
     }

     my $contents = $response->content;

     SavePage(\$base,\$contents);
     print {$pagelist_log} $base,"\n";
     $url_processed{$base} ++;

     @tmpLinks = ();
     @tmpImgs = ();
     $linkExtor->parse($contents);
     # Signal end of document. The parser object is shared across pages, so
     # without eof any text it is still buffering (for example an unfinished
     # tag at the end of a page) would be prepended to the next page.
     $linkExtor->eof;

     # Resolve everything the callback collected against the page's base URL.
     foreach my $link (@tmpLinks) {
         $link = URI::URL->new($link,$base)->abs->as_string;
     }

     foreach my $img (@tmpImgs) {
         $img = URI::URL->new($img,$base)->abs->as_string;
     }

     RecordLinks();
     RecordImgs();
 }
143  
144  
145  
# Remove and return the URL at the head of @url_queue.
# Returns undef when the queue is empty, which ends the main crawl loop.
sub next_url
{
    return @url_queue ? shift(@url_queue) : undef;
}
157  
# Fetch a page through the shared user agent $UA.
#
# A HEAD request is sent first; the body is only downloaded with GET when
# the HEAD response succeeded and reported a content type of text/html.
#
# Parameters: the URL to fetch.
# Returns:    the response object of the GET request, or undef when either
#             request fails or the resource is not text/html.
sub get_url
{
    my ($target) = @_;

    my $probe = HTTP::Request->new( 'HEAD', $target );
    return undef unless $probe;

    my $head = $UA->request( $probe );
    return undef unless defined $head && $head->is_success;

    my $type = $head->content_type();
    return undef unless defined $type && $type eq 'text/html';

    my $fetch = HTTP::Request->new( 'GET', $target );
    return undef unless $fetch;

    my $page = $UA->request( $fetch );
    return undef unless defined $page && $page->is_success;

    return $page;
}
183  
# Callback handed to HTML::LinkExtor. It receives the tag name followed by
# attribute/value pairs and sorts the attribute values into the scratch
# arrays: <img> values go to @tmpImgs; <a>, <frame> and <area> values go to
# @tmpLinks. Every other tag is ignored.
sub linkCallback
{
    my ($tag, %attr) = @_;

    if ($tag eq 'img') {
        push @tmpImgs, values %attr;
    }
    elsif ($tag =~ /\A(?:a|frame|area)\z/) {
        push @tmpLinks, values %attr;
    }

    return;
}
197  
# Classify every absolute link in @tmpLinks.
#
#   * Links whose path ends in an image extension are written to the image
#     list (once each, http/https only) and never queued.
#   * Links whose path ends in some other non-page extension are dropped.
#   * Everything else that starts with "http" and passes ValidUrl() is
#     queued for crawling, unless it has been seen before.
#
# The extension is taken from the URL path only. Matching it against the
# whole URL would read the host name of "http://www.example.com" as the
# extension "com" and discard the link, and would misread query strings
# such as "a.php?v=1.5".
sub RecordLinks
{
    foreach my $link (@tmpLinks) {
        # A fragment never identifies a different resource; strip it first so
        # de-duplication and extension detection see the same URL.
        $link =~ s/#.*//;

        my $ext = _link_extension($link);
        if (defined $ext) {
            if ($ext =~ /(html|htm|asp|php|jsp)/i) {
                # Page-like extension: fall through to the queueing logic.
            }
            elsif ($ext =~ /(jpg|jpeg|bmp|png|gif)/i) {
                if ($link =~ /^http/i && !exists $img_seen{$link}) {
                    $img_seen{$link} = 1;
                    print {$imglist_log} $link, "\n";
                }
                next;
            }
            else {
                # Some other file type (archive, media, ...): not crawled.
                next;
            }
        }

        next unless $link =~ /^http/i;
        next unless ValidUrl($link);
        next if exists $url_seen{$link};

        $url_seen{$link} = 1;
        push @url_queue, $link;
        print {$urlhash_log} $link, "\n";
    }
}

# Return the 1-4 character file extension of the URL's path, or undef when
# the path has none. Scheme, host, query string and fragment are ignored.
sub _link_extension
{
    my ($link) = @_;

    my $path = $link;
    $path =~ s/[?#].*//s;                               # drop query and fragment
    $path =~ s{^[A-Za-z][A-Za-z0-9+.\-]*://[^/]*}{};    # drop scheme://host[:port]

    return $path =~ m{/[^/]*\.(\w{1,4})$} ? $1 : undef;
}
244  
# Write every new, acceptable image URL in @tmpImgs to the image list.
# Only URLs starting with "http" that pass ValidImage() and are not yet in
# %img_seen are recorded.
#
# The loop deliberately iterates with $_ : ValidImage() is called while $_
# holds the current image URL.
sub RecordImgs
{
    foreach (@tmpImgs) {
        next unless /^http/i;
        next unless ValidImage($_);
        next if exists $img_seen{$_};

        $img_seen{$_} = 1;
        print $imglist_log $_, "\n";
    }
}
263  
264  
# Rebuild the in-memory crawl state from the log files of a previous run.
#
#   urlprocessed.txt  -> %url_processed (one URL per line)
#   pagelist.txt      -> %url_processed (first whitespace-delimited field)
#   urlhash.txt       -> %url_seen; URLs not yet processed are re-queued
#   imglist.txt       -> %img_seen
#
# Dies, naming the file and the OS error, when a state file cannot be
# opened. Each file is read through its own lexical handle and closed when
# done, and the global $_ is left untouched.
sub LoadBefore
{
    my $fh;

    $fh = _open_state_file('urlprocessed.txt');
    while (my $line = <$fh>) {
        chomp $line;
        $url_processed{$line}++;
    }
    close $fh;

    $fh = _open_state_file('pagelist.txt');
    while (my $line = <$fh>) {
        # Not chomped: the trailing newline is the whitespace that ends the
        # URL field in this pattern.
        if ($line =~ /(\S+)\s/) {
            $url_processed{$1}++;
        }
    }
    close $fh;

    # Must be read after the two files above so that %url_processed is
    # complete before deciding what still needs to be queued.
    $fh = _open_state_file('urlhash.txt');
    while (my $line = <$fh>) {
        chomp $line;
        $url_seen{$line}++;
        push @url_queue, $line if !exists $url_processed{$line};
    }
    close $fh;

    $fh = _open_state_file('imglist.txt');
    while (my $line = <$fh>) {
        chomp $line;
        $img_seen{$line}++;
    }
    close $fh;

    return;
}

# Open a state file for reading and return the handle; die with the file
# name and OS error on failure.
sub _open_state_file
{
    my ($name) = @_;
    open(my $fh, '<', $name)
        or die "LoadBefore: cannot open $name: $!";
    return $fh;
}
296  
297  
# Decide whether a URL may be crawled.
#
# With $bRestrict off every URL is accepted. With it on, the URL must match
# at least one entry of @restrictSite; each entry is interpolated into a
# regular expression.
#
# Returns 1 when the URL is allowed, 0 otherwise.
sub ValidUrl
{
    my ($candidate) = @_;

    return 1 unless $bRestrict;

    for my $site_pattern (@restrictSite) {
        return 1 if $candidate =~ /$site_pattern/;
    }

    return 0;
}
312  
# Decide whether an image URL is worth recording.
#
# Rejects URLs containing a fragment ("#") and the common "spacer.gif"
# layout placeholder. Both checks are made against the argument itself, so
# the result does not depend on whatever the caller happens to have in $_.
#
# Returns 1 when the image is acceptable, 0 otherwise.
sub ValidImage
{
    my ($url) = @_;

    return 0 if $url =~ /#/;
    return 0 if $url =~ /spacer\.gif/;

    return 1;
}
326  
327  
# Fetch the robots.txt of a site through the shared user agent $UA.
#
# "/robots.txt" is appended to the given URL. A HEAD request is sent first
# and the file is only downloaded with GET when the HEAD response succeeded
# and reported a content type of text/plain.
#
# Parameters: the site URL that "/robots.txt" is appended to.
# Returns:    the response object of the GET request, or undef when either
#             request fails or the content type is not text/plain.
sub get_robotstxt
{
    my ($site) = @_;
    my $robots_url = $site . "/robots.txt";

    my $probe = HTTP::Request->new( 'HEAD', $robots_url );
    return undef unless $probe;

    my $head = $UA->request( $probe );
    return undef unless defined $head && $head->is_success;

    my $type = $head->content_type();
    return undef unless defined $type && $type eq 'text/plain';

    my $fetch = HTTP::Request->new( 'GET', $robots_url );
    return undef unless $fetch;

    my $robots = $UA->request( $fetch );
    return undef unless defined $robots && $robots->is_success;

    return $robots;
}
354  
# Advance $BucketNum and (re)open the PAGEBUCKET handle on the matching
# "PageBucket.NNNNN" file in append mode. The handle is switched to binary
# mode and set to autoflush. Dies when the file cannot be opened.
sub OpenOutFile
{
    $BucketNum++;

    my $bucket_file = sprintf("PageBucket.%05d", $BucketNum);
    open(PAGEBUCKET, '>>', $bucket_file) or die;
    binmode(PAGEBUCKET);

    # $| acts on the currently selected handle: select the bucket, enable
    # autoflush, then put the previous default handle back.
    my $previous = select(PAGEBUCKET);
    $| = 1;
    select($previous);
}
363  
# Append one page to the current bucket file (handle PAGEBUCKET).
#
# Record layout, in order:
#   pack('I') length of the URL, the URL,
#   pack('I') length of the compressed body, the zlib-compressed body.
#
# Parameters: reference to the URL string, reference to the page content.
# Side effects: increments $pageNum, prints a progress line every 1000
# pages and switches to a new bucket file every 100000 pages.
sub SavePage
{
    my ($urlR, $contR) = @_;

    my $packed_body = compress($$contR);

    print PAGEBUCKET pack('I', length($$urlR));
    print PAGEBUCKET $$urlR;
    print PAGEBUCKET pack('I', length($packed_body));
    print PAGEBUCKET $packed_body;

    $pageNum++;

    print "$pageNum pages have been crawled!\n" if $pageNum % 1000 == 0;

    OpenOutFile() if $pageNum % 100000 == 0;
}
点开查看代码

 

posted @ 2013-05-15 16:02  墨迹哥's  阅读(395)  评论(0)  编辑  收藏  举报