[Perl]抓取个人的所有闪存+格式化保存为文本
以下代码保存为utf8文本格式
环境:ActivePerl v5.16 built for MSWin32-x86
两个要调整的地方:
for my $i (17..45) { 这里改成自己对应的页码,比如发了30页的闪存,就改成1..30
还有就是这部分改成自己对应的ID和密码,中文ID还未测试过
tbUserName=>"ID",
tbPassword=>"密码",
抓取的结果参考截图:
=info code: vicyang mail: 523066680@163.com date: 2015-02-07 =cut #!/usr/bin/perl use v5.16; use strict; use utf8; use Encode; use IO::Handle; use LWP::UserAgent; binmode(STDOUT, ":encoding(gbk)"); STDOUT->autoflush(1); my $ua = LWP::UserAgent->new; $ua->cookie_jar( {} ); my $randserial=getRandHex(32); my $response = $ua->get( 'http://passport.cnblogs.com/BotDetectCaptcha.ashx?get=image&c=c_login_logincaptcha&t='.$randserial ); print "$randserial\n"; if ($response->is_success) { # 抓取验证码图片 # open my $check_code_image, "> verifycode.BMP" #默认验证图片的位置在当前目录 or die "$!"; binmode($check_code_image); print $check_code_image $response->content; close $check_code_image; #关闭print对句柄的控制 system("start verifycode.BMP"); } else { print "wrong\n"; } print "please input verifycode:"; my $inp=<STDIN>; $inp=~s/\r?\n$//; print "[$inp]\n"; my $res= $ua->post('http://passport.cnblogs.com/login.aspx?ReturnUrl=http://home.cnblogs.com/ing/', [ __EVENTTARGET=>"", __EVENTARGUMENT=>"", __VIEWSTATE=>"/wEPDwUKLTM1MjEzOTU2MGQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFC2Noa1JlbWVtYmVy4b/ZXiH+8FthXlmKpjSEgi7XBNU=", __VIEWSTATEGENERATOR=>"C2EE9ABB", __EVENTVALIDATION=>"/wEdAAYIqCk3Gcmu25zI9fQWqoC7hI6Xi65hwcQ8/QoQCF8JIahXufbhIqPmwKf992GTkd2Mxo6xcg+Ng5CZxsqMUGnVMKtTyqevv9cjRp4Oh+9VMaKeKEbp39eHc9mbdvkCgxCM74oSoIAJofLsQdCCbtmog/0fDw==", tbUserName=>"改成自己的ID", tbPassword=>"改成自己的密码", chkRemember=>"on", LBD_VCID_c_login_logincaptcha=>$randserial, LBD_BackWorkaround_c_login_logincaptcha=>1, CaptchaCodeTextBox=>$inp, btnLogin=>"登++录", #txtReturnUrl=>"http://home.cnblogs.com/u/paktc/" ], ); my $fh; our $main='http://home.cnblogs.com/ing'; our $limit=70; my $all; my @ech; for my $i (17..45) { print STDOUT "Code: $i\n"; open $fh, ">:encoding(gbk)", "ing$i.txt"; select $fh; $fh->autoflush(1); $res = $ua->get("http://home.cnblogs.com/u/paktc/feed/$i.html") or warn "$!"; unless ($res->is_success) { print "wrong\n"; } $all=$res->content; @ech=($all=~/GetIngRecentComments\((\d+)/gi); for (@ech) { &getDetail($_); } close $fh; } print STDERR "over"; <STDIN>; sub getDetail { our $main; our $limit; my $pagecode=shift; $res = $ua->get($main."/".$pagecode) or warn "$!"; my @all=split( /\r?\n/, decode('utf8', $res->content) ); my ( $p_time, $message, $comment_count, $comment_author, $comment_text, $comment_time ); for (my $i=0; $i<=$#all; $i++) { if ($all[$i]=~/\d+-\d+-\d+ \d+:\d+$/) { $p_time=$&; print "\n$p_time\n"; } if ($all[$i]=~/ing_detail_body">(.*)/i) { $message=$1; while (not $message=~s/<\/div>//i) { $message .= $all[++$i]; } #case <div id="ing_detail_body"><a href="/ing/tag/..." class="ing_tag">[组图]</a> # [标签] $message=~s/<a href=".*class="ing_tag">([^<]+)<\/a>/$1/i; #case 幸运闪 其他情况还未考虑 $message=~s/<img src=.*alt="([^"]+)" title=".*"\/>/☆/i; #case 链接 $message=~s/<a href="([^"]+)" .*<\/a>/<$1>/gi; &charResume(\$message); #必须传入一个引用,否则会崩溃 my @msg=splitText2($message, $limit, 2); print " ", "┈"x($limit/2), "\n"; print decode('gbk', join("", @msg)); print " ", "┈"x($limit/2), "\n"; print STDOUT decode('gbk', join("", @msg)), "\n"; } if ($all[$i]=~/ing_comment_count">(.*)(<\/div>)?/i) { $comment_count=$1; } if ($all[$i]=~/comment_author_.*ing\/">(.*)<\/a>/) { $comment_author = $1; ($comment_text = $all[$i+1]) =~ s/^ +//; $comment_text =~s/<a href="([^"]+)" .*<\/a>/<$1>/gi; #替换链接 &charResume(\$comment_text); $all[$i+2] =~/(\d+-\d+-\d+ \d+:\d+)/; $comment_time = $1; printf(" <%s> ", $comment_author); my $name_length = length(encode('gbk', $comment_author))+7; #length " <%s> " my @msg = splitText2($comment_text, ($limit - $name_length), 0); print decode('gbk', shift @msg); for (@msg) { print " "x$name_length . decode('gbk', $_); } print "\n"; $i+=2; } } } sub splitText2 { my ($text, $limit, $indent) = @_; my ($a, $b, @arr); $b = encode('gbk', $text); while (length($b) > $limit) { ($a, $b) = &cut_gbk($limit, $b); push (@arr, " "x$indent . $a ."\n"); } # At last push (@arr, " "x$indent . $b ."\n"); return @arr; } sub cut_gbk { my ($limit, $gstr) = @_; my $i; my $cut = 0; my $gstr_a; my $gstr_b; foreach $i ( split("", decode('gbk', $gstr)) ) { if (ord($i) < 128) { $cut+=1; } else { $cut+=2; } if ($cut >= $limit) { $gstr_a = substr($gstr, 0, $cut); $gstr_b = substr($gstr, $cut); last; } } return ($gstr_a, $gstr_b); } sub charResume { #传入unicode格式的字符串 my $ref = shift; my $char; my $count=0; ${$ref}=~s/\</</gi; ${$ref}=~s/\>/>/gi; ${$ref}=~s/\"/"/gi; ${$ref}=~s/\ / /gi; ${$ref}=~s/\ / /gi; ${$ref}=~s/\&/\&/gi; ${$ref}=~s/\&/\&/gi; while (${$ref}=~/\&#(\d+);/) { $count++; $char=chr($1); ${$ref}=~s/\&#$1;/$char/g; if ($count > 100) { print STDOUT "DEEP LOOP \n"; sleep 1.0; exit; } } } sub getRandHex { my $str=""; for my $i (1 .. $_[0]) { $str .= sprintf("%x", int(rand(16))); } return $str; }