php curl: scrape page images

Disclaimer

This is shared purely as a technical write-up. Commercial use is prohibited!!! Prohibited!!! Prohibited!!!
Do not reuse it without my permission; violators will be held accountable!!!

Usage

// img
$url='http://xxx.cn/';
$re=curl_img($url);
$re=json_decode($re,true);
if ($re==null || count($re)==0) {
    echo 'fetch failed';
    exit;
}
array_splice($re,2); // keep only the first 2 images
echo curl_downimg($re[0],dirname(__FILE__).DIRECTORY_SEPARATOR.'imgs'.DIRECTORY_SEPARATOR); // single file

// loop over multiple files
//foreach ($re as $v) {
//    echo curl_downimg($v,dirname(__FILE__).DIRECTORY_SEPARATOR.'imgs'.DIRECTORY_SEPARATOR,'GET','1');
//}

// batch download via curl_multi
//curl_downimg_multi($re,dirname(__FILE__).DIRECTORY_SEPARATOR.'imgs'.DIRECTORY_SEPARATOR);

Functions

// ============ imgs ============

function curl_img($url='') {
    $ch=curl_init();
    $array=array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => array(
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9'
        )
    );
    curl_setopt_array($ch,$array);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        $err=curl_error($ch);
        curl_close($ch); // free the handle before returning the error message
        return $err;
    }
    curl_close($ch);
    $r=preg_match_all('/data-src="([^"]*)"/',$output,$arr); // non-greedy match; adjust the rule to fit the target page
    $items=$arr[1];
    $scheme=parse_url($url)['scheme'];
    $url_main=$scheme.'://'.parse_url($url)['host'];
    array_walk($items,function(&$item) use ($url_main,$scheme){
        // normalize protocol-relative (//host/...) and root-relative (/path) URLs to absolute ones
        $item=trim(substr($item,0,2)=='//'?$scheme.':'.$item:(substr($item,0,1)=='/'?$url_main.$item:$item));
    });
    $items=array_values(array_unique(array_filter($items))); // reindex so $re[0] still exists after json_decode
    return json_encode($items,320); // 320 = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES
}
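
The regex above breaks easily (attributes wrapped onto new lines, single quotes, plain src instead of data-src). A DOM-based extractor is usually sturdier; below is a minimal sketch, assuming the raw HTML from curl_exec() is passed in. curl_img_dom is not part of the original code, just an illustration.

// Sketch: extract image URLs with DOMDocument instead of a regex.
// Assumes $html is the page body fetched the same way as in curl_img().
function curl_img_dom($html='') {
    $doc=new DOMDocument();
    libxml_use_internal_errors(true); // real-world pages rarely validate
    $doc->loadHTML($html);
    libxml_clear_errors();
    $items=array();
    foreach ($doc->getElementsByTagName('img') as $img) {
        // prefer the lazy-load attribute, fall back to src
        $src=$img->getAttribute('data-src') ?: $img->getAttribute('src');
        if ($src!=='') {
            $items[]=trim($src);
        }
    }
    return array_values(array_unique($items));
}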

/**
 * @param $url
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = keep the source filename, 2 = random (collision-free) name
 * @return string
 */
function curl_downimg($url='',$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $path_info=pathinfo(parse_url($url,PHP_URL_PATH)); // drop any query string before guessing the extension
    $ext=isset($path_info['extension'])?$path_info['extension']:'jpg';
    $file_path=$type=='1'?$dir.$path_info['filename'].'.'.$ext:$dir.sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000))).'.'.$ext;
    $ch=curl_init();
    $fp=fopen($file_path,'wb');
    $arr=array(
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
//        CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//        CURLOPT_NOPROGRESS => 0,
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    );
    curl_setopt_array($ch,$arr);
    curl_exec($ch);
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);
    fclose($fp); // flush buffered data before checking the size on disk
    clearstatcache(true,$file_path);
    $size=filesize($file_path);
    if ($errno) {
        unlink($file_path);
        return $error;
    } elseif ($info['http_code'] != '200' || $size != $info['size_download']) {
        unlink($file_path);
        return 'incomplete download';
    }
    return 'ok';
}
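
The commented-out CURLOPT_PROGRESSFUNCTION lines refer to a progressCallback that this post never defines. A minimal sketch of what such a callback could look like, matching the signature PHP's curl extension passes (the output format is just an example); enable it by uncommenting the two options above.

// Sketch of a progress callback for CURLOPT_PROGRESSFUNCTION.
// $resource is the curl handle; the remaining arguments are byte counts reported by curl.
function progressCallback($resource,$download_size,$downloaded,$upload_size,$uploaded) {
    if ($download_size > 0) {
        echo sprintf("down %d/%d (%.1f%%)\n",$downloaded,$download_size,$downloaded/$download_size*100);
    }
    return 0; // returning non-zero aborts the transfer
}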

/**
 * @param $arrs
 * @param $dir
 * @param $method
 * @param $type naming rule: 1 = keep the source filename, 2 = random (collision-free) name
 * @return void
 */
function curl_downimg_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $conn=array();
    $file_path=array();
    $fp=array();
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        $path_info=pathinfo(parse_url($v,PHP_URL_PATH)); // drop any query string before guessing the extension
        $ext=isset($path_info['extension'])?$path_info['extension']:'jpg';
        $file_path[$k]=$type=='1'?$dir.$path_info['filename'].'.'.$ext:$dir.sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000))).'.'.$ext;
        $conn[$k]=curl_init();
        $fp[$k]=fopen($file_path[$k],'wb');
        $arr=array(
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
//            CURLOPT_PROGRESSFUNCTION => 'progressCallback',
//            CURLOPT_NOPROGRESS => 0,
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k], // CURLOPT_RETURNTRANSFER is redundant when writing straight to a file
        );
        curl_setopt_array($conn[$k],$arr);
        curl_multi_add_handle($mh,$conn[$k]);
    }
    $active = null;
    $ok=0;
    do {
        curl_multi_exec($mh, $active);
        curl_multi_select($mh, 1); // avoid busy-waiting while transfers are running
        while ($done=curl_multi_info_read($mh)) {
            // handles do not finish in insertion order, so map the handle back to its key
            $k=array_search($done['handle'],$conn,true);
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            fclose($fp[$k]);
            if ($done['result'] !== CURLE_OK || $info['http_code'] != '200') {
                unlink($file_path[$k]); // drop failed or incomplete downloads
                continue;
            }
            ++$ok;
        }
    } while ($active > 0);
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}
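
curl_downimg_multi() opens one connection per URL all at once, so for a long list it may be gentler on the target site to feed it in smaller batches. A sketch using array_chunk, with an arbitrary batch size of 5 and the same $re and directory as in the Usage section:

// Sketch: cap concurrency by downloading in batches of 5.
$dir=dirname(__FILE__).DIRECTORY_SEPARATOR.'imgs'.DIRECTORY_SEPARATOR;
foreach (array_chunk($re,5) as $batch) {
    curl_downimg_multi($batch,$dir);
    echo PHP_EOL;
}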