Perl Crawler Code
I'm currently building a Perl page-crawler module and came across this crawler code, so here is a detailed read-through of it, with notes on the parts worth borrowing for my own use.
#!/usr/bin/perl -w

use strict;

use HTTP::Request;
use HTTP::Status;
use HTML::LinkExtor;
use URI::URL;
use LWP::UserAgent;
#use Digest::MD5 qw(md5_hex);

use Compress::Zlib;

####################################################################
# Parameter settings
our $StartUrl        = "http://xxx";
our $bRestrict       = 1;               # only follow URLs matching @restrictSite
our @restrictSite    = ('cxxx', 'context:');
our $bContinueBefore = 1;               # resume from the logs of a previous run
####################################################################

print __FILE__, "\n";

our %img_seen      = ();   # image URLs already recorded
our %url_seen      = ();   # page URLs already queued
our @url_queue     = ();   # FIFO queue of URLs still to fetch
our %url_processed = ();   # page URLs already fetched and saved

# Declared for robots.txt support, but never filled in or consulted.
our %RobotDisallow = ();
our %RobotAllow    = ();
our %site_seen     = ();

if ($bContinueBefore) {
    &LoadBefore();
} else {
    $url_seen{$StartUrl} = 1;
    push @url_queue, $StartUrl;
}

our $pageNum   = 0;
our $BucketNum = 0;

&OpenOutFile();

open(URLHASH,      ">>urlhash.txt")      or die;
open(URLPROCESSED, ">>urlprocessed.txt") or die;
open(URLREDIRECT,  ">>urlredirect.txt")  or die;
open(PAGELIST,     ">>pagelist.txt")     or die;
open(IMGLIST,      ">>imglist.txt")      or die;

# Turn on autoflush ($| = 1) for each log handle, then restore the
# previously selected default handle (see the note after the loop).
$| = 1, select $_ for select URLHASH;
$| = 1, select $_ for select URLPROCESSED;
$| = 1, select $_ for select URLREDIRECT;
$| = 1, select $_ for select PAGELIST;
$| = 1, select $_ for select IMGLIST;

our $urlhash_log      = *URLHASH;
our $urlprocessed_log = *URLPROCESSED;
our $urlredirect_log  = *URLREDIRECT;
our $pagelist_log     = *PAGELIST;
our $imglist_log      = *IMGLIST;

our $UA = LWP::UserAgent->new(keep_alive => 1,
                              timeout    => 60);
$UA->agent('Mozilla/5.0');
$UA->proxy(['ftp', 'http', 'wais', 'gopher'], 'http://jpproxy:80/');

our $linkExtor = HTML::LinkExtor->new(\&linkCallback);
our @tmpLinks  = ();
our @tmpImgs   = ();

my $url;
while ($url = &next_url()) {
    print $urlprocessed_log $url, "\n";

    #sleep(1000);

    my $response = &get_url($url);
    next unless defined $response;

    # After redirects, the response base may differ from the requested URL.
    my $base = $response->base;
    $base = $base->as_string;
    #$base =~ tr/A-Z/a-z/;

    if ($base ne $url) {
        next unless &ValidUrl($base);

        print $urlredirect_log $url, "\t", $base, "\n";

        $url_seen{$base}++;
        print $urlhash_log $base, "\n";

        next if exists $url_processed{$base};
    }

    my $contents = $response->content;

    #my $digest = md5_hex($base);

    &SavePage(\$base, \$contents);
    print $pagelist_log $base, "\n";
    $url_processed{$base}++;

    # Extract links and images, then absolutize them against the base URL.
    @tmpLinks = ();
    @tmpImgs  = ();
    $linkExtor->parse($contents);

    foreach (@tmpLinks) {
        $_ = URI::URL->new($_, $base)->abs->as_string;
        #$_ =~ tr/A-Z/a-z/;
    }

    foreach (@tmpImgs) {
        $_ = URI::URL->new($_, $base)->abs->as_string;
        #$_ =~ tr/A-Z/a-z/;
    }

    &RecordLinks();
    &RecordImgs();
}
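The autoflush one-liner in the setup is worth unpacking. In

$| = 1, select $_ for select URLHASH;

the inner select makes URLHASH the default output handle and hands the previously selected handle to $_; $| = 1 then enables autoflush on URLHASH, and select $_ switches the default back. It works, but it is cryptic. The same effect is much clearer with IO::Handle, which ships with Perl; a minimal sketch (the lexical handle and file name are just examples):

use IO::Handle;

open(my $log, '>>', 'urlhash.txt') or die "open failed: $!";
$log->autoflush(1);    # same effect as the $| = 1 / select dance
print $log "http://example.com/\n";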
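Another detail: the commented-out Digest::MD5 lines suggest the author considered hashing, but deduplication here is purely by URL string via %url_seen, so the same page reached through different URLs gets stored twice. A hedged sketch of what content-level dedup with md5_hex could look like; the %content_seen hash is my own addition, not part of the original script:

use strict;
use Digest::MD5 qw(md5_hex);

my %content_seen;    # hypothetical: digests of page bodies already saved

# $contents stands in for $response->content in the main loop.
my $contents = "<html><body>hello</body></html>";

my $digest = md5_hex($contents);
unless ($content_seen{$digest}++) {
    # First time this exact body shows up: this is where
    # &SavePage(\$base, \$contents) would run.
    print "new content, saving\n";
}

The subroutines of the original script follow.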
sub next_url
{
    # Return undef to signify that the queue is empty.
    return undef if @url_queue == 0;

    return shift @url_queue;
}

sub get_url
{
    my $url = shift;

    # Issue a cheap HEAD request first, so non-HTML resources are
    # rejected before the full body is downloaded.
    my $request = HTTP::Request->new('HEAD', $url);
    return undef unless $request;

    my $response = $UA->request($request);
    return undef unless defined $response;
    return undef unless $response->is_success;

    my $content_type = $response->content_type();
    return undef unless defined $content_type;

    return undef if 'text/html' ne $content_type;

    # Only now fetch the page itself.
    $request = HTTP::Request->new('GET', $url);
    return undef unless $request;

    $response = $UA->request($request);
    return undef unless defined $response;
    return undef unless $response->is_success;

    return $response;
}

# HTML::LinkExtor invokes this callback for every tag that carries
# link attributes; only the attribute values are kept.
sub linkCallback
{
    my ($tag, %attr) = @_;
    if ($tag eq 'a' || $tag eq 'frame' || $tag eq 'area') {
        push(@tmpLinks, values %attr);
        return;
    }
    if ($tag eq 'img') {
        push(@tmpImgs, values %attr);
        return;
    }
    return;
}

sub RecordLinks
{
    foreach (@tmpLinks) {
        # Classify by file extension: crawl page-like URLs, record
        # image URLs, skip everything else.
        if (/\/.+\.(\w{1,4})$/) {
            if ($1 =~ /(html|htm|asp|php|jsp)/i) {
                # fall through: treat as a crawlable page
            } elsif ($1 =~ /(jpg|jpeg|bmp|png|gif)/i) {
                if (/^http/i) {
                    next if exists $img_seen{$_};

                    $img_seen{$_} = 1;
                    print $imglist_log $_, "\n";
                }
                next;
            } else {
                next;
            }
        }

        #if(/\.(gif|jpg|jpeg|png|xbm|au|wav|mpg|pdf|ps|mp3|mp2|rm|zip|rar|gz|zip)$/i){
        #    next;
        #}

        if (/^http/i) {
            next unless &ValidUrl($_);

            s/#.*//;    # strip the URL fragment

            next if exists $url_seen{$_};

            $url_seen{$_} = 1;
            push @url_queue, $_;
            print $urlhash_log $_, "\n";
        }
    }
}

sub RecordImgs
{
    foreach (@tmpImgs) {
        if (/^http/i) {
            next unless &ValidImage($_);
            next if exists $img_seen{$_};

            $img_seen{$_} = 1;
            print $imglist_log $_, "\n";
        }
    }
}

# Rebuild in-memory state from the log files of a previous run, so a
# crawl can be resumed where it left off.
sub LoadBefore
{
    open(FILE, "urlprocessed.txt") or die;
    while (<FILE>) {
        chomp;
        $url_processed{$_}++;
    }
    close(FILE);

    open(FILE, "pagelist.txt") or die;
    while (<FILE>) {
        if (/(\S+)\s/) {
            $url_processed{$1}++;
        }
    }
    close(FILE);

    open(FILE, "urlhash.txt") or die;
    while (<FILE>) {
        chomp;
        $url_seen{$_}++;
        # Anything seen but not yet processed goes back on the queue.
        if (!exists $url_processed{$_}) {
            push @url_queue, $_;
        }
    }
    close(FILE);

    open(FILE, "imglist.txt") or die;
    while (<FILE>) {
        chomp;
        $img_seen{$_}++;
    }
    close(FILE);
}

sub ValidUrl
{
    my ($url) = shift;
    if ($bRestrict) {
        foreach (@restrictSite) {
            return 1 if $url =~ /$_/;
        }
        return 0;
    } else {
        return 1;
    }
}

sub ValidImage
{
    my ($url) = shift;

    return 0 if $url =~ /#/;
    return 0 if $url =~ /spacer\.gif/;   # skip common 1x1 spacer images

    return 1;
}

# Fetch a site's robots.txt. Note that nothing ever calls this, and
# %RobotDisallow/%RobotAllow are never populated.
sub get_robotstxt
{
    my $url = shift;
    $url .= "/robots.txt";

    my $request = HTTP::Request->new('HEAD', $url);
    return undef unless $request;

    my $response = $UA->request($request);
    return undef unless defined $response;
    return undef unless $response->is_success;

    my $content_type = $response->content_type();
    return undef unless defined $content_type;

    return undef if 'text/plain' ne $content_type;

    $request = HTTP::Request->new('GET', $url);
    return undef unless $request;

    $response = $UA->request($request);
    return undef unless defined $response;
    return undef unless $response->is_success;

    return $response;
}
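A gap worth flagging: get_robotstxt downloads a site's robots.txt, but nothing ever calls it, so this crawler ignores robots.txt entirely. The libwww-perl distribution already includes WWW::RobotRules for the parsing half; a minimal sketch of wiring the two ideas together (the URLs are placeholders, and the one-shot fetch here is my own arrangement, not the original author's):

use strict;
use WWW::RobotRules;
use LWP::Simple qw(get);

# The agent name should match what the crawler sends ($UA->agent).
my $rules = WWW::RobotRules->new('Mozilla/5.0');

my $robots_url = 'http://example.com/robots.txt';
my $robots_txt = get($robots_url);
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;

# Consult the rules before queueing a URL.
my $url = 'http://example.com/private/page.html';
print "disallowed by robots.txt: $url\n" unless $rules->allowed($url);

The last two subroutines handle on-disk storage of the crawled pages.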
# Rotate to a new output bucket; reopening the bareword PAGEBUCKET
# handle implicitly closes the previous bucket file.
sub OpenOutFile
{
    $BucketNum++;
    my $fname = sprintf("PageBucket.%05d", $BucketNum);
    open(PAGEBUCKET, ">>$fname") or die;
    binmode(PAGEBUCKET);
    $| = 1, select $_ for select PAGEBUCKET;
}

# Append one record to the current bucket:
#   [URL length][URL][compressed length][zlib-compressed HTML]
sub SavePage
{
    my ($urlR, $contR) = @_;

    my $data = compress($$contR);
    my $len  = pack('I', length($$urlR));
    print PAGEBUCKET $len;
    print PAGEBUCKET $$urlR;
    $len = pack('I', length($data));
    print PAGEBUCKET $len;
    print PAGEBUCKET $data;

    $pageNum++;
    if ($pageNum % 1000 == 0) {
        print "$pageNum pages have been crawled!\n";
    }
    if ($pageNum % 100000 == 0) {
        &OpenOutFile;
    }
}
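SavePage writes length-prefixed records: a native unsigned-int URL length from pack('I', ...), the URL bytes, a second length field, then the Compress::Zlib-compressed HTML. The script never reads the buckets back, so as a sanity check here is a hedged reader sketch for that format. It assumes pack('I') is 4 bytes and that the file is read on a machine with the same byte order as the writer; the file name is an example:

#!/usr/bin/perl -w
use strict;
use Compress::Zlib;

open(my $fh, '<', 'PageBucket.00001') or die "open failed: $!";
binmode($fh);

my $buf;
while (read($fh, $buf, 4) == 4) {
    my $url_len = unpack('I', $buf);
    read($fh, my $url, $url_len) == $url_len or die "truncated record";

    read($fh, $buf, 4) == 4 or die "truncated record";
    my $data_len = unpack('I', $buf);
    read($fh, my $data, $data_len) == $data_len or die "truncated record";

    my $html = uncompress($data);    # inverse of compress() in SavePage
    printf "%s (%d bytes of HTML)\n", $url, length($html);
}
close($fh);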