PHP简单爬虫 爬取免费代理ip 一万条

目标站:http://www.xicidaili.com/

代码:

<?php
require 'lib/phpQuery.php';
require 'lib/QueryList.php';
require "db/shared/ez_sql_core.php";  
require "db/mysql/ez_sql_mysql.php";  
require "public/function.php";  
use QL\QueryList;

// Crawl the free-proxy listing under http://www.xicidaili.com/nn/ and store
// every row in MySQL: pages 1..100, table rows 1..100 on each page.
// Constructor arguments are presumably user, password, database name, host
// (ezSQL_mysql is declared in the required ez_sql_mysql.php) — confirm there.
$db = new ezSQL_mysql('root', 'root', 'spider', 'localhost');

$j = 1;
while ($j <= 100) {
    // One download per listing page; the same HTML is handed to every row extraction.
    $pageUrl = "http://www.xicidaili.com/nn/" . $j;
    $pageHtml = curl_request($pageUrl);

    // Row indexes start at 1 (index 0 is presumably the table header — verify against the page).
    $i = 1;
    while ($i <= 100) {
        getIpInfo($pageHtml, $i, $db);
        $i++;
    }

    // Progress line: "<page number>页完成".
    echo "" . $j . "页完成" . PHP_EOL;
    $j++;
}




/**
 * Extract one proxy row from a listing page and insert it into the `ip` table.
 *
 * The row is located with the selector "#ip_list tr:eq($t)"; individual cells
 * are read by position (td 1..6, 8 and 9). If the row does not exist, or its
 * IP cell is empty, nothing is inserted.
 *
 * @param string $html Raw HTML of one listing page.
 * @param int    $t    Index of the <tr> inside #ip_list to read (used in :eq()).
 * @param object $db   Database handle providing escape() and query()
 *                     (the script passes an ezSQL_mysql instance).
 * @return void
 */
function getIpInfo($html,$t,$db){
    $row = "#ip_list tr:eq($t)";
    $rules = array(
        // Each rule: array(selector, 'text' | 'html' [, tag filter, callback]).
        'ip' => array("$row td:eq(1)",'text'),        // IP address
        'port' => array("$row td:eq(2)",'text'),      // port
        'area' => array("$row td:eq(3)",'text'),      // location
        'anonymous' => array("$row td:eq(4)",'text'), // anonymity level
        'type' => array("$row td:eq(5)",'text'),      // protocol type
        'speed' => array("$row td:eq(6)",'html','',function($content){
            // The cell's HTML carries a percentage after a colon (e.g. an
            // inline "width:95%"); map that number to a coarse speed label.
            // A cell without such a percentage is treated as slow instead of
            // raising an undefined-offset notice.
            if(!preg_match('/:\s*(\d+(?:\.\d+)?)\s*%/', (string)$content, $m)){
                return "较慢";
            }
            $num = (float)$m[1];
            if($num >= 80){
                return "很快";
            }
            if($num >= 60){
                return "一般";
            }
            return "较慢";
        }),                                           // speed label
        'chtime' => array("$row td:eq(8)",'text'),    // time alive
        'yztime' => array("$row td:eq(9)",'text'),    // last verified time
    );
    $data = QueryList::Query($html,$rules)->data;
    // Debug dump of the extracted row, kept from the original script.
    print_r($data);

    // Missing row (short page, failed download): skip instead of inserting
    // an all-empty record.
    if(empty($data[0]) || !isset($data[0]['ip']) || trim($data[0]['ip']) === ''){
        return;
    }

    // Scraped text is untrusted: escape every value before it goes into SQL.
    $columns = array('ip','port','area','anonymous','type','speed','chtime','yztime');
    $values = array();
    foreach($columns as $column){
        $value = isset($data[0][$column]) ? $data[0][$column] : '';
        $values[] = "'".$db->escape($value)."'";
    }

    $db->query("INSERT INTO ip (".implode(',', $columns).")
        VALUES (".implode(',', $values).")");
}

结果:

完整项目下载:https://files.cnblogs.com/files/wordblog/spider2.rar

posted @ 2018-06-18 18:30  波罗斯の程序日记  阅读(4394)  评论(0)  编辑  收藏  举报