推推

导航

Perl爬取江西失信执行

#! /usr/bin/perl
use strict;
use Encode qw(encode decode);
# Put an :encoding(utf8) layer on all three standard streams so that character
# strings are converted to UTF-8 on output and UTF-8 input is decoded on read.
for my $stream (\*STDIN, \*STDOUT, \*STDERR) {
	binmode($stream, ":encoding(utf8)");
}
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::TreeBuilder;

# Listing pages to crawl; filled from the category table below.
my @urls;

# One row per category: [page count, URL].
# Rows with a page count are expanded through pushurl(); rows whose count is
# undef are categories with a single listing URL, which is queued unchanged.
my @categories = (
	[2,     "http://sxr.jxnews.com.cn/gy_alllist.php?page="],
	[10,    "http://sxr.jxnews.com.cn/sx_list.php?id=22&page="],
	[23,    "http://sxr.jxnews.com.cn/sx_list.php?id=23&page="],
	[5,     "http://sxr.jxnews.com.cn/sx_list.php?id=24&page="],
	[2,     "http://sxr.jxnews.com.cn/sx_list.php?id=25&page="],
	[5,     "http://sxr.jxnews.com.cn/sx_list.php?id=26&page="],
	[undef, "http://sxr.jxnews.com.cn/sx_list.php?id=27"],
	[2,     "http://sxr.jxnews.com.cn/sx_list.php?id=28&page="],
	[274,   "http://sxr.jxnews.com.cn/sx_list.php?id=29&page="],
	[undef, "http://sxr.jxnews.com.cn/sx_list.php?id=30"],
	[6,     "http://sxr.jxnews.com.cn/sx_list.php?id=31&page="],
	[177,   "http://sxr.jxnews.com.cn/sx_list.php?id=32&page="],
	[undef, "http://sxr.jxnews.com.cn/sx_list.php?id=33"],
);

for my $category (@categories) {
	my ($pages, $base) = @$category;
	if (defined $pages) {
		pushurl($pages, $base);
	}
	else {
		push(@urls, $base);
	}
}

# Report how many listing pages were queued.
print scalar(@urls), "\n";


# Detail-page URLs discovered on the listing pages, in discovery order.
my @urlset;
{
	# A single user agent is reused for every listing request.
	my $browser = LWP::UserAgent->new();

	foreach my $key (@urls) {
		my $response = $browser->request(HTTP::Request->new("GET" => $key));

		# Do not scan an error page for links; report the failure and move on.
		unless ($response->is_success) {
			warn "Skipping $key: ", $response->status_line, "\n";
			next;
		}

		my $html = $response->content;

		# [^"]* stops at the closing quote of the href. The previous greedy .*
		# would run on to the last '" target' on the same line, merging several
		# links into one bogus URL whenever two links shared a line.
		while ($html =~ /href=\"list1\.php\?([^\"]*)\"\starget/g) {
			# Take the query string (the record id) and join it to the absolute
			# path; the result is the final request address of the detail page.
			my $url1 = "http://sxr.jxnews.com.cn/list1.php?$1";
			print $url1, "\n";
			push(@urlset, $url1);
		}
	}
}

# Open the output file in append mode. The bareword handle FD is kept because
# getinfo() prints to it by name. The three-argument form keeps the mode
# separate from the path, and the check stops the run instead of silently
# discarding every record when the file cannot be opened.
open(FD, ">>", "/home/hqh/Desktop/黄启豪/爬虫/江西失信执行/file")
	or die "Cannot open output file for appending: $!";
binmode(FD, ":encoding(utf8)");

# Fetch and record every detail page (a plain loop, since no result list is needed).
foreach my $detail_url (@urlset) {
	getinfo($detail_url);
}

# Buffered write errors only surface at close time, so check it.
close(FD) or die "Cannot close output file: $!";
# Fetch one detail page and append its record(s) to the FD output handle.
#
# Parameters:
#   $url - absolute URL of the detail page to download.
#
# The response body is decoded as gb2312, every <table class="imagetable"> is
# flattened to text, and the labelled fields are pulled out with a single
# regular expression. Each matching table is written to FD as one line with
# its eleven fields joined by "||". Tables that do not match the expected
# layout are skipped; a failed HTTP request is reported with warn and skipped.
# Returns nothing.
sub getinfo{
	my $url=shift;
	my $browser=LWP::UserAgent->new();
	my $response=$browser->request(HTTP::Request->new("GET"=>$url));
	unless($response->is_success){
		warn "Skipping $url: ",$response->status_line,"\n";
		return;
	}

	my $html=decode("gb2312",$response->content);
	my $p=HTML::TreeBuilder->new_from_content($html);

	foreach my $table ($p->look_down(_tag=>"table",class=>"imagetable")){
		# This file has no "use utf8", so the pattern below is a UTF-8 byte
		# string. The text is therefore matched as UTF-8 bytes and the captured
		# fields are decoded back to characters afterwards.
		my $temp=encode("utf8",$table->as_text());

		# A single regular expression is enough to pull every field out of the
		# table; in list context the match returns the eleven captures, or an
		# empty list when the table has a different layout.
		my @fields=$temp=~/被执行人详细信息被执行人:(.*)年龄:(.*)性别:(.*)身份证号码\/组织机构代码:(.*)地址:(.*)案件详细信息立案日期:(.*)执行依据文号:(.*)执行法院:(.*)法律文书确定的义务:(.*)被执行人履行情况:(.*)被执行人失信情形:(.*)/;
		next unless @fields;

		print FD join("||",map{decode("utf8",$_)}@fields),"\n";
	}

	# Element trees hold parent<->child references and are not reclaimed on
	# their own; delete the tree explicitly so a long crawl does not leak one
	# tree per page.
	$p->delete;
	return;
}



# Queue every page of a paginated listing.
#
# Parameters:
#   $len - number of pages in the listing.
#   $url - listing URL ending in "page=", to which the page number is appended.
#
# Appends one URL per page to the file-level @urls array. The page number is
# taken from the loop counter; appending $len itself (as the code used to)
# queued the same URL $len times and never fetched any other page.
# NOTE(review): pages are assumed to be numbered 1..$len - confirm against the site.
# Returns nothing.
sub pushurl{
	my($len,$url)=@_;
	for my $page (1..$len){
		push(@urls,$url.$page);
	}
	return;
}

  

posted on 2016-03-24 14:55  推推  阅读(431)  评论(0)  编辑  收藏  举报