<?php
// Fetch pages concurrently with cURL multi handles, using forged IP headers,
// proxy IPs and rotating User-Agent strings to reduce the chance of being blocked.

// Demo work list: every job carries the URL to request and an id that is used
// to key the result of that request.
$list = array();
$list[] = array("url"=>"https://www.xxx.com","id"=>1);
$list[] = array("url"=>"https://www.xxx.com","id"=>2);
$list[] = array("url"=>"https://www.xxx.com","id"=>3);
$list[] = array("url"=>"https://www.xxx.com","id"=>4);

// Split the jobs into batches of two; each batch is requested in parallel.
$data = array_chunk($list,2);
// Pool of browser User-Agent strings; one is picked at random for each batch.
$userAgentList = [
    // Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',

    // Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',

    // Safari
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',

    // Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',

    // 360 browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',

    // Taobao browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',

    // Liebao (Cheetah) browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',

    // QQ browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',

    // Sogou browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',

    // Maxthon browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',

    // UC browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
];
$add = array();

// Request the jobs batch by batch; every batch gets its own proxy and User-Agent.
foreach ($data as $key => $arr) {
    $ip = $userAgent = "";

    // Ask the proxy provider for candidate proxies. The helper's result is read
    // as an array whose 'data' entry is a JSON document, which in turn holds a
    // 'data' list of entries with 'ip' and 'port'.
    // NOTE(review): get_file_content() is defined outside this snippet - confirm its return shape.
    $ipArr = get_file_content("http://webapi.http.zhimacangku.com/getip?","num=2");
    $ipArr = (is_array($ipArr) && isset($ipArr['data']) && is_string($ipArr['data']))
        ? json_decode($ipArr['data'], true)
        : null;

    // Use a proxy only when the provider returned a well-formed entry;
    // an empty $ip makes the request go out directly.
    if (is_array($ipArr) && isset($ipArr['data']) && is_array($ipArr['data']) && !empty($ipArr['data'])) {
        $ipKey = array_rand($ipArr['data'], 1);
        $candidate = $ipArr['data'][$ipKey];
        if (is_array($candidate) && isset($candidate['ip'], $candidate['port'])) {
            $ip = $candidate['ip'].":".$candidate['port'];
        }
    }

    // Rotate the User-Agent for every batch.
    $uKey = array_rand($userAgentList, 1);
    $userAgent = $userAgentList[$uKey];

    // $msg maps each job id to the HTTP status code of its request.
    // NOTE(review): http_request_multi is declared below as a static method; this bare
    // call only resolves if a global function of that name exists - confirm.
    $msg = http_request_multi($arr,'','',$ip,$userAgent);
}
/**
 * Request several URLs concurrently through a cURL multi handle and report the
 * HTTP status code obtained for each one.
 *
 * Every request is sent with a freshly generated random IPv4 address in the
 * X-FORWARDED-FOR and CLIENT-IP headers, with TLS verification disabled and a
 * 30-second timeout. Response bodies are fetched but not returned.
 *
 * @param array        $urlArray  Jobs to run; each item is an array with a 'url' key (the address
 *                                to request) and an 'id' key (used as the key in the result).
 * @param array|string $method    Empty, or the string 'get', sends GET requests. Any other
 *                                non-empty value sends POST; when it is an array, $method[$k]
 *                                is the POST body for the job stored at $urlArray[$k].
 * @param array|string $header    Extra request header line(s), e.g. array('Accept: text/html').
 *                                They are sent in addition to the forged IP headers.
 * @param string       $proxy     HTTP proxy as "host:port"; empty to connect directly.
 * @param string       $useragent User-Agent string; empty selects a built-in Firefox one.
 * @return array|false Map of job id => HTTP status code (0 when no response was received),
 *                     listed last job first; false when there were no jobs.
 */
public static function http_request_multi($urlArray,$method='',$header='',$proxy='',$useragent='')
{
    if (!is_array($urlArray) || empty($urlArray)) {
        return false;
    }

    if (!$useragent) {
        $useragent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0';
    }

    // CURLOPT_HTTPHEADER needs an array; accept a single header line given as a string.
    $extraHeaders = $header ? (array) $header : array();

    // Any non-empty $method other than the literal 'get' means POST.
    $sendPost = $method && !(is_string($method) && strtolower($method) === 'get');

    // Walk the jobs last-to-first so the returned map keeps its established order,
    // while preserving keys so non-sequential input arrays work too.
    $jobs = array_reverse($urlArray, true);

    $mh = curl_multi_init();
    $handles = array();
    foreach ($jobs as $k => $job) {
        if (!is_array($job) || !isset($job['url'])) {
            continue;
        }
        $ch = curl_init();
        if ($ch === false) {
            continue;
        }

        // Forge a random client IP for this request.
        $random_ip = rand(1,254).'.'.rand(1,254).'.'.rand(1,254).'.'.rand(1,254);

        // Set all headers in one call: a second CURLOPT_HTTPHEADER call would
        // replace the forged IP headers instead of adding to them.
        $requestHeaders = array_merge(
            array("X-FORWARDED-FOR:{$random_ip}", "CLIENT-IP:{$random_ip}"),
            $extraHeaders
        );

        curl_setopt($ch, CURLOPT_URL, $job['url']);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $requestHeaders);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);

        // NOTE(review): certificate and host verification are switched off, so
        // responses can be tampered with in transit - acceptable only for scraping.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

        if ($proxy) {
            curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
        }

        if ($sendPost) {
            // Only an array carries per-job bodies; indexing a string here would
            // send a single character of the method name as the body.
            $postFields = (is_array($method) && isset($method[$k])) ? $method[$k] : '';
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        }

        curl_multi_add_handle($mh, $ch);
        $handles[$k] = $ch;
    }

    if ($handles) {
        $running = null;
        do {
            $status = curl_multi_exec($mh, $running);
        } while ($status == CURLM_CALL_MULTI_PERFORM);

        // Drive the transfers until all are done, or until 30 waits have timed
        // out without any activity.
        $idleRounds = 0;
        while ($running && $status == CURLM_OK) {
            // Wait once per pass and reuse the result for both checks.
            $ready = curl_multi_select($mh);
            if ($ready === 0) {
                $idleRounds++;
                if ($idleRounds >= 30) {
                    break;
                }
            } elseif ($ready === -1) {
                // select failed; pause briefly before polling again.
                usleep(100);
            }

            do {
                $status = curl_multi_exec($mh, $running);
            } while ($status == CURLM_CALL_MULTI_PERFORM);
        }
    }

    $results = array();
    foreach ($jobs as $k => $job) {
        $id = (is_array($job) && isset($job['id'])) ? $job['id'] : $k;

        // Jobs that could not be started are reported like failed requests.
        if (!isset($handles[$k])) {
            $results[$id] = 0;
            continue;
        }

        $results[$id] = curl_getinfo($handles[$k], CURLINFO_HTTP_CODE);
        curl_multi_remove_handle($mh, $handles[$k]);
        curl_close($handles[$k]);
    }
    curl_multi_close($mh);

    return $results ? $results : false;
}