PHP cURL 抓取页面中的 JS 文件
声明
本文仅用于技术交流与分享,禁止商用!!!禁止商用!!!禁止商用!!!
未经本人允许不得商用,如有发现,违者必究!!!
调用
// Usage example: collect the script URLs referenced by a page, then download them.
$url='http://xxx.cn/';
$re=json_decode(curl_js($url),true);
// curl_js() returns a plain (non-JSON) error message on failure, which does not
// decode to an array; guard with is_array() so count() never sees a non-countable value.
if (!is_array($re) || count($re)===0) {
    echo '获取错误';
    exit;
}
// Keep only the first two entries; array_splice() also renumbers the keys from 0.
array_splice($re,2);
// Downloads are stored in a "js" directory next to this script.
$dir=__DIR__.DIRECTORY_SEPARATOR.'js'.DIRECTORY_SEPARATOR;
echo curl_downjs($re[0],$dir,'GET','1'); // single file
// Alternative: download every entry one after another
//foreach ($re as $v) {
//    echo curl_downjs($v,$dir,'GET','1');
//}
// Alternative: download all entries in parallel
//curl_downjs_multi($re,$dir,'GET','1');
方法
// ============ js ============
/**
 * Fetch a page and collect the `src` URLs of its <script> tags.
 *
 * Protocol-relative (`//host/x.js`), root-relative (`/x.js`) and
 * document-relative (`x.js`) references are resolved against the URL the
 * transfer ended on (after redirects), so every returned entry is absolute.
 *
 * NOTE: TLS peer/host verification is switched off below.
 *
 * @param string $url Address of the page to scan.
 * @return string JSON-encoded list of unique script URLs on success; when the
 *                transfer fails, the cURL error message (plain text, not JSON).
 */
function curl_js($url='') {
    $ch=curl_init();
    curl_setopt_array($ch,[
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => [
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9',
        ],
    ]);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        // Read the message before releasing the handle.
        $error=curl_error($ch);
        curl_close($ch);
        return $error;
    }
    // Relative references must be resolved against the final URL, not the requested one.
    $effective=curl_getinfo($ch,CURLINFO_EFFECTIVE_URL);
    curl_close($ch);
    $base=$effective?$effective:$url;

    // Adjust this pattern to the target markup if needed. `[^>]*?` keeps the
    // match inside a single tag, so several <script> tags on one line are all found.
    preg_match_all('/<script\b[^>]*?\ssrc\s*=\s*(["\'])(.*?)\1/is',(string)$output,$matches);

    $parts=parse_url($base);
    $scheme=isset($parts['scheme'])?$parts['scheme']:'http';
    $origin=$scheme.'://'.(isset($parts['host'])?$parts['host']:'');
    if (isset($parts['port'])) {
        $origin.=':'.$parts['port'];
    }
    // Directory part of the page path, used for document-relative references.
    $path=isset($parts['path'])?$parts['path']:'/';
    $slash=strrpos($path,'/');
    $base_dir=$slash===false?'/':substr($path,0,$slash+1);

    $items=[];
    foreach ($matches[2] as $src) {
        $src=trim($src);
        if ($src==='') {
            continue;
        }
        if (substr($src,0,2)==='//') {
            // Protocol-relative: inherit the page's scheme.
            $src=$scheme.':'.$src;
        } elseif ($src[0]==='/') {
            // Root-relative: prepend scheme, host and port.
            $src=$origin.$src;
        } elseif (!preg_match('/^[a-z][a-z0-9+.\-]*:/i',$src)) {
            // No scheme: relative to the page's directory.
            $src=$origin.$base_dir.$src;
        }
        $items[]=$src;
    }
    // array_values() closes the key gaps left by array_unique() so the result
    // is always encoded as a JSON list rather than an object.
    $items=array_values(array_unique($items));
    return json_encode($items,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
}
/**
 * Download a single file into $dir.
 *
 * The target file is removed again when the transfer fails, the response
 * status is not 200, or the number of bytes on disk differs from what cURL
 * reports as downloaded.
 *
 * NOTE: TLS peer/host verification is switched off below.
 *
 * @param string $url    Address of the file to download.
 * @param string $dir    Target directory including the trailing separator; created when missing.
 * @param string $method HTTP verb, upper-cased and passed to CURLOPT_CUSTOMREQUEST.
 * @param string $type   Naming rule: '1' = keep the source file name, anything else = random name.
 * @return string 'ok' on success, otherwise an error message.
 */
function curl_downjs($url='',$dir='',$method='GET',$type='1') {
    if ($dir!=='' && !is_dir($dir) && !mkdir($dir,0777,true) && !is_dir($dir)) {
        return '无法创建目录';
    }

    // Derive the name from the URL *path* only, so a query string never ends up in the file name.
    $url_path=parse_url($url,PHP_URL_PATH);
    $parts=pathinfo(is_string($url_path)?$url_path:'');
    // Only extensions beginning with "js" are kept (normalised to ".js"); no dangling dot otherwise.
    $ext=(isset($parts['extension']) && strpos($parts['extension'],'js')===0)?'.js':'';
    $name=($type=='1' && isset($parts['filename']))?$parts['filename']:'';
    if ($name==='') {
        // Random name, also used as a fallback when the URL has no usable file name.
        $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
    }
    $file_path=$dir.$name.$ext;

    $fp=fopen($file_path,'w');
    if ($fp===false) {
        return '无法打开文件';
    }
    $ch=curl_init();
    curl_setopt_array($ch,[
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    ]);
    curl_exec($ch);

    // Collect everything needed from the handle, then release both resources
    // on every path before inspecting the file.
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);
    fclose($fp);

    if ($errno) {
        unlink($file_path);
        return $error;
    }
    // The file was just written; drop any cached stat result before measuring it.
    clearstatcache(true,$file_path);
    $size=filesize($file_path);
    if ($info['http_code']!=200 || $size!=$info['size_download']) {
        unlink($file_path);
        return '数据不完整';
    }
    return 'ok';
}
/**
 * Download several files in parallel into $dir and print how many succeeded.
 *
 * A download counts as successful when the transfer finished without a cURL
 * error and the response status is 200; the target file of any other
 * download is deleted. Prints "ok: <count>".
 *
 * Two URLs that map to the same file name (naming rule '1') write to the same
 * file; use the random naming rule when that can happen.
 *
 * NOTE: TLS peer/host verification is switched off below.
 *
 * @param array  $arrs   URLs to download.
 * @param string $dir    Target directory including the trailing separator; created when missing.
 * @param string $method HTTP verb, upper-cased and passed to CURLOPT_CUSTOMREQUEST.
 * @param string $type   Naming rule: '1' = keep the source file name, anything else = random name.
 * @return void
 */
function curl_downjs_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if ($dir!=='' && !is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $conn=[];
    $file_path=[];
    $fp=[];
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        // Derive the name from the URL *path* only, so a query string never ends up in the file name.
        $url_path=parse_url($v,PHP_URL_PATH);
        $parts=pathinfo(is_string($url_path)?$url_path:'');
        // Only extensions beginning with "js" are kept (normalised to ".js"); no dangling dot otherwise.
        $ext=(isset($parts['extension']) && strpos($parts['extension'],'js')===0)?'.js':'';
        $name=($type=='1' && isset($parts['filename']))?$parts['filename']:'';
        if ($name==='') {
            $name=sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
        }
        $target=$dir.$name.$ext;

        $handle=fopen($target,'w');
        if ($handle===false) {
            // Cannot store this download; skip it instead of handing cURL an invalid stream.
            continue;
        }
        $file_path[$k]=$target;
        $fp[$k]=$handle;
        $conn[$k]=curl_init();
        curl_setopt_array($conn[$k],[
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k],
        ]);
        curl_multi_add_handle($mh,$conn[$k]);
    }

    // Plain local counter: a static would leak state into later calls.
    $ok=0;
    $active=null;
    do {
        $status=curl_multi_exec($mh,$active);
        while ($done=curl_multi_info_read($mh)) {
            // Transfers finish in arbitrary order, so look up which entry this handle belongs to.
            $k=array_search($done['handle'],$conn,true);
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            if ($k===false) {
                continue;
            }
            fclose($fp[$k]);
            // The per-transfer outcome is reported in the message's 'result' field.
            if ($done['result']!==CURLE_OK || $info['http_code']!=200) {
                unlink($file_path[$k]);
            } else {
                ++$ok;
            }
            unset($conn[$k],$fp[$k],$file_path[$k]);
        }
        if ($active>0 && curl_multi_select($mh,1.0)===-1) {
            // select() can fail immediately on some builds; back off briefly instead of spinning.
            usleep(1000);
        }
    } while ($active>0 && ($status===CURLM_OK || $status===CURLM_CALL_MULTI_PERFORM));

    // If the loop stopped early, release whatever is still pending and drop the partial files.
    foreach ($conn as $k=>$pending) {
        curl_multi_remove_handle($mh,$pending);
        curl_close($pending);
        fclose($fp[$k]);
        unlink($file_path[$k]);
    }
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}