Perl爬取江西失信执行
#! /usr/bin/perl
#
# Crawler for the listings published on sxr.jxnews.com.cn.
#
# Flow:
#   1. Build the list-page URLs for every category (pushurl).
#   2. Fetch each list page and collect the detail-page links (list1.php?...).
#   3. Fetch each detail page, extract the fields of the "imagetable" table
#      and append them as one "||"-separated record per line to the output file.
use strict;
use warnings;
use Encode qw(encode decode);
binmode(STDIN,  ":encoding(utf8)");
binmode(STDOUT, ":encoding(utf8)");
binmode(STDERR, ":encoding(utf8)");
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::TreeBuilder;

# List-page URLs to crawl; filled by pushurl() and the direct pushes below.
my @urls;

# Detail-page URLs discovered on the list pages.
my @urlset;

# One user agent is shared by every request instead of building one per page.
my $browser = LWP::UserAgent->new();

# Open the output first so that an unwritable path aborts the run before
# any crawling is done. Records are appended to whatever is already there.
my $out_path = "/home/hqh/Desktop/黄启豪/爬虫/江西失信执行/file";
open(my $out_fh, '>>', $out_path)
    or die "Cannot open $out_path for appending: $!";
binmode($out_fh, ":encoding(utf8)");

# First argument is the number of pages of the listing, second is the URL
# prefix the page number is appended to. Single-page listings are pushed as is.
pushurl(2,   "http://sxr.jxnews.com.cn/gy_alllist.php?page=");
pushurl(10,  "http://sxr.jxnews.com.cn/sx_list.php?id=22&page=");
pushurl(23,  "http://sxr.jxnews.com.cn/sx_list.php?id=23&page=");
pushurl(5,   "http://sxr.jxnews.com.cn/sx_list.php?id=24&page=");
pushurl(2,   "http://sxr.jxnews.com.cn/sx_list.php?id=25&page=");
pushurl(5,   "http://sxr.jxnews.com.cn/sx_list.php?id=26&page=");
push(@urls,  "http://sxr.jxnews.com.cn/sx_list.php?id=27");
pushurl(2,   "http://sxr.jxnews.com.cn/sx_list.php?id=28&page=");
pushurl(274, "http://sxr.jxnews.com.cn/sx_list.php?id=29&page=");
push(@urls,  "http://sxr.jxnews.com.cn/sx_list.php?id=30");
pushurl(6,   "http://sxr.jxnews.com.cn/sx_list.php?id=31&page=");
pushurl(177, "http://sxr.jxnews.com.cn/sx_list.php?id=32&page=");
push(@urls,  "http://sxr.jxnews.com.cn/sx_list.php?id=33");
print scalar(@urls), "\n";

# Pass 1: scan every list page for links to detail pages.
foreach my $key (@urls) {
    my $request  = HTTP::Request->new("GET" => $key);
    my $response = $browser->request($request);
    if (!$response->is_success) {
        warn "Skipping list page $key: ", $response->status_line, "\n";
        next;
    }
    my $html = $response->content;

    # Capture the query string (the record id) and join it onto the absolute
    # path; the result is the final detail-page request URL. The capture
    # stops at the closing quote so that several links sharing one source
    # line are not merged into a single bogus URL.
    while ($html =~ m{href="list1\.php\?([^"]*)"\starget}g) {
        my $url1 = "http://sxr.jxnews.com.cn/list1.php?$1";
        print $url1, "\n";
        push(@urlset, $url1);
    }
}

# Pass 2: fetch every detail page and write its record.
for my $detail_url (@urlset) {
    getinfo($detail_url);
}

close($out_fh) or die "Cannot close $out_path: $!";

# getinfo($url)
#
# Fetches one detail page, decodes it from gb2312, and for every
# <table class="imagetable"> whose text matches the expected field labels
# appends the eleven captured fields, joined with "||", as one line to the
# output file. Pages that cannot be fetched are reported on STDERR and
# skipped; tables that do not match the pattern produce no output.
sub getinfo {
    my $url = shift;

    my $request  = HTTP::Request->new("GET" => "$url");
    my $response = $browser->request($request);
    if (!$response->is_success) {
        warn "Skipping detail page $url: ", $response->status_line, "\n";
        return;
    }

    my $html = decode("gb2312", $response->content);
    my $p    = HTML::TreeBuilder->new_from_content($html);

    foreach my $table ($p->look_down(_tag => "table", class => "imagetable")) {
        # This file has no "use utf8", so the pattern below is a byte string;
        # the table text is encoded to UTF-8 bytes to match it, and the
        # captures are decoded back to characters before being written.
        my $temp = encode("utf8", $table->as_text());

        # A single regular expression extracts every field of the table.
        my @fields = $temp =~ /被执行人详细信息被执行人:(.*)年龄:(.*)性别:(.*)身份证号码\/组织机构代码:(.*)地址:(.*)案件详细信息立案日期:(.*)执行依据文号:(.*)执行法院:(.*)法律文书确定的义务:(.*)被执行人履行情况:(.*)被执行人失信情形:(.*)/;
        next unless @fields;

        my $out = join("||", map { decode("utf8", $_) } @fields);
        print {$out_fh} $out, "\n";
    }

    # Release the parse tree explicitly, as HTML::TreeBuilder recommends.
    $p->delete;
    return;
}

# pushurl($len, $url)
#
# Appends one list-page URL per page to @urls by concatenating the page
# number onto $url, for pages 1 through $len.
# NOTE(review): the original appended $len on every iteration, queueing the
# same (last) page $len times. Page numbering is assumed to start at 1 --
# confirm against the site.
sub pushurl {
    my ($len, $url) = @_;
    for my $page (1 .. $len) {
        push(@urls, $url . $page);
    }
    return;
}