Perl6 必应抓取(2):最终版
use HTTP::UserAgent;
use URI::Encode;
my $ua = HTTP::UserAgent.new(:user-agent<Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0>);
my $bing_url = 'http://cn.bing.com/search?q=';
my $choose = rx/'<cite>'(.*?)'</cite>'/;#要查的内容
my $filename = ~now.DateTime~'.txt';
$filename = do given $filename {S:g/':'/-/};
my $fp = open $filename, :w;
my $hv;
my $all_data; #进度显示
sub MAIN (Int $page_number) {
say '+';
say '=======================================================================';
say ' By: FireC@t';
say '=======================================================================';
#say '';
my $strings = prompt 'Input String You Want: ';
say 'Search : '~$strings;
$all_data = 10*$page_number;
say 'Data count(10*'~$page_number~') : '~$all_data; #输出数据数目
my $start_time = now.DateTime;
say 'Start Time : '~$start_time;
say '=======================================================================';
$strings = uri_encode($strings);
my $count = 0;
for 1..$page_number {
#每一页的结果调用函数
my $url_end = '&first='~$count;
my $targeturl = $bing_url~$strings~$url_end;
#say $targeturl;
#调用函数查询结果URL
Bing_search($targeturl);
$count += 10;
}
my $end_time = now.DateTime;
say '=======================================================================';
say 'Finish Time : '~$end_time;
say 'Time Use : '~($end_time-$start_time);
say '=======================================================================';
say 'Data save to : '~$filename;
say '=======================================================================';
}
#查询函数
sub Bing_search($url) {
my $html = $ua.get($url).content;#获取结果
loop {
$html ~~ $choose;
last if not $0;
my $swap_ = ~$0;
$html = $/.postmatch;
$swap_ = do given $swap_ {S:g/'<strong>'//};
$swap_ = do given $swap_ {S:g/'</strong>'//};
say '('~$hv~':'~$all_data~')'~$swap_;
$hv++;
$fp.say($swap_);
}
}
说明, 在dos下输入中文, 因为终端编码问题, 程序会报错。
在linux下运行正常, 或dos下设置编为utf8。
用法:
> perl6 bing_s.p6 10
这里的参数 10为页数, 可随意更改。
BUG:
如果bing中的结果只有 100 条, 而我们向他取 1000 条, 这时我们会取到相同的数据。
修复:
在运行前, 用bing的数据库条目与用户输入的对比。 如果用户请求数目超出bing现有数目, 取bing最大值代替用户输入的最大值。
update: 2017/08/25
修复后代码:
use HTTP::UserAgent;
use URI::Encode;
=begin pod
用于国内版bing查询
# by FireC@t
# 2017/08/25
=end pod
my $ua = HTTP::UserAgent.new(:user-agent<Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0>);
my $bing_url = 'http://cn.bing.com/search?q=';
my $choose = rx/'<cite>'(.*?)'</cite>'/;#要查的内容
my $filename = ~now.DateTime~'.txt';
$filename = do given $filename {S:g/':'/-/};
my $fp = open $filename, :w;
my $hv=1;
my $all_data; #进度显示
sub MAIN (Int $page_number) {
say '+';
say '=======================================================================';
say ' By: FireC@t';
say '=======================================================================';
#say '';
my $strings = prompt 'Input String You Want: ';
say 'Search : '~$strings;
say 'User Data Get Page : '~$page_number; #输出数据数目
my $start_time = now.DateTime;
$strings = uri_encode($strings);
#调用用户处理函数, 处理记录数, 防止重复
$all_data = 10*$page_number;#先计算用户实际要求的数目
#如果用户请求数据过多, 提示
#say 'Test all_data ->'~$all_data; # for test;
my $page_number_swap = User_data_chang($strings);
if $page_number_swap < $page_number {
say 'Not enough Data for bing('~$page_number_swap~' pages), page_number change: '~$page_number~' to '~$page_number_swap;
#改写用户要求的实际数目
$all_data = 10*$page_number_swap;
}
my $count = 0;
say 'Start Time : '~$start_time;
say '=======================================================================';
#sleep(100);
for 1..$page_number_swap {
#每一页的结果调用函数
my $url_end = '&first='~$count;
my $targeturl = $bing_url~$strings~$url_end;
say '>> '~$targeturl~'';
#调用函数查询结果URL
Bing_search($targeturl);
$count += 10;
}
my $end_time = now.DateTime;
say '=======================================================================';
say 'Finish Time : '~$end_time;
say 'Time Use : '~($end_time-$start_time);
say '=======================================================================';
say 'Data('~$hv-1~' lines) save to : '~$filename;
say '=======================================================================';
}
#查询函数
sub Bing_search($url) {
my $html = $ua.get($url).content;#获取结果
loop {
$html ~~ $choose;
last if not $0;
my $swap_ = ~$0;
$html = $/.postmatch;
$swap_ = do given $swap_ {S:g/'<strong>'//};
$swap_ = do given $swap_ {S:g/'</strong>'//};
say '('~$hv~':'~$all_data~') '~$swap_;
$hv++; #记录数据数目
$fp.say($swap_);
}
}
#用于处理用户请求记录数
sub User_data_chang($strings){
#获取所有记录数:
my $start_url = $bing_url~$strings;
my $all_result_number = $ua.get($start_url);
$all_result_number ~~ /'sb_count">'(.*?)\s.*?'</span>'/;
if not $0 {
#say 'Not Result';
return 0;
#没有结果, 直接返回0个页面
}
#如果有结果
my $data_number = ~$0;#123,45
#say $data_number; #test
my $bing_all_data = Int($data_number.subst: /','/,'',:g); #获得结果总数
#test
#say $bing_all_data;
#say $all_data;
#$all_data为用户请求总数
if $all_data > $bing_all_data {
#如果用户请求数大于数据已有数目, 那就返回所有请求
#调用分页函数返回一共有多少页
my $user_page = User_page($bing_all_data);
#say 'return page:'~$user_page; sleep(1000);
return $user_page;#返回页数
}else {
#否则返回用户自定义页数
my $user_page = User_page($all_data);
#say 'return page:'~$user_page; sleep(1000);
return $user_page;
}
}
#用于处理页数
sub User_page($data_number) {
my $page_check = ~($data_number/10);
if $page_check.split('.').elems == 2 {
#说明有小数
return $page_check.split('.')[0] + 1;
}else {
#没小数, 取整数
return $page_check.split('.')[0];
}
}