curl 多线程抓取页面,伪造ip,ip代理,防止被封

// Pages to probe. Each entry pairs the URL to request with the id under which
// its result is reported.
// NOTE(review): the beginning of the first entry is missing from the excerpt;
// it is reconstructed here from the three parallel entries that follow it.
$list = [
    ["url" => "https://www.xxx.com", "id" => 1],
    ["url" => "https://www.xxx.com", "id" => 2],
    ["url" => "https://www.xxx.com", "id" => 3],
    ["url" => "https://www.xxx.com", "id" => 4],
];

// Requests are issued in batches of two URLs.
$data = array_chunk($list, 2);

// Pool of browser User-Agent strings; one is picked at random for every batch.
$userAgentList = [
    // Opera
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    // Firefox
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    // Safari
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    // Chrome
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    // 360 browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    // Taobao browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    // Liebao (Cheetah) browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    // QQ browser
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    // Sogou browser
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    // Maxthon browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    // UC browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
];

// NOTE(review): $add is never read in the visible excerpt; presumably it is
// filled by result-handling code that was left out. Confirm before removing.
$add = [];

foreach ($data as $key => $arr) {
    $ip = $userAgent = "";

    // Fetch proxy IPs for this batch.
    // NOTE(review): get_file_content() is defined elsewhere; the code below
    // assumes it returns an array whose 'data' element holds the raw JSON
    // response body, and that the decoded JSON has a 'data' list of
    // ['ip' => ..., 'port' => ...] records. Confirm against the helper/API.
    $response = get_file_content("http://webapi.http.zhimacangku.com/getip?", "num=2");
    $decoded = (is_array($response) && isset($response['data']) && is_string($response['data']))
        ? json_decode($response['data'], true)
        : null;

    // Only use a proxy when the service actually returned a usable record;
    // otherwise $ip stays empty and the batch is sent without a proxy.
    if (is_array($decoded) && isset($decoded['data']) && is_array($decoded['data']) && !empty($decoded['data'])) {
        $ipKey = array_rand($decoded['data'], 1);
        $candidate = $decoded['data'][$ipKey];
        if (isset($candidate['ip'], $candidate['port'])) {
            $ip = $candidate['ip'] . ":" . $candidate['port'];
        }
    }

    // Rotate the User-Agent on every batch.
    $uKey = array_rand($userAgentList, 1);
    $userAgent = $userAgentList[$uKey];

    // $msg maps each id of this batch to the HTTP status code that came back.
    $msg = http_request_multi($arr, '', '', $ip, $userAgent);
}

/**
 * Fetch several URLs concurrently through the curl_multi API and report the
 * HTTP status code obtained for each of them.
 *
 * Every request carries X-FORWARDED-FOR and CLIENT-IP headers set to a random
 * IPv4-looking address. TLS certificate and host verification are switched
 * off, so responses must not be treated as authenticated.
 *
 * @param array        $urlArray  Requests to run in parallel; each entry is an
 *                                array with a 'url' and an 'id' element.
 *                                Entries missing either element are skipped.
 * @param array|string $method    Empty for GET. An array switches every
 *                                request to POST and supplies the body for the
 *                                entry with the same key in $urlArray. The
 *                                string 'post' (any case) sends a POST with an
 *                                empty body; any other string means GET.
 * @param array|string $header    Extra request header line(s), sent in
 *                                addition to the forged IP headers.
 * @param string       $proxy     HTTP proxy as "host:port"; empty for a direct
 *                                connection.
 * @param string       $useragent User-Agent to send; a Firefox 20 string is
 *                                used when empty.
 * @return array|false Map of id => HTTP status code (0 when no response code
 *                     was received), or false when nothing was requested.
 */
function http_request_multi($urlArray, $method = '', $header = '', $proxy = '', $useragent = '')
{
    if (!is_array($urlArray) || empty($urlArray)) {
        return false;
    }

    $defaultUserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0';
    // CURLOPT_HTTPHEADER expects an array, so a single header string is wrapped.
    $extraHeaders = $header ? (array) $header : [];
    $postBodies = is_array($method) ? $method : null;
    $postWithoutBody = is_string($method) && strcasecmp($method, 'post') === 0;

    $mh = curl_multi_init();
    $handles = [];

    foreach ($urlArray as $i => $item) {
        if (!is_array($item) || !isset($item['url'], $item['id'])) {
            continue;
        }

        $ch = curl_init();

        // Build a random IP address to present in the forwarding headers.
        $random_ip = rand(1, 254) . '.' . rand(1, 254) . '.' . rand(1, 254) . '.' . rand(1, 254);

        curl_setopt($ch, CURLOPT_URL, $item['url']);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        // Set the header list exactly once: a second CURLOPT_HTTPHEADER call
        // would replace the forged IP headers instead of adding to them.
        curl_setopt($ch, CURLOPT_HTTPHEADER, array_merge(
            ["X-FORWARDED-FOR:{$random_ip}", "CLIENT-IP:{$random_ip}"],
            $extraHeaders
        ));
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent ? $useragent : $defaultUserAgent);
        // SECURITY: certificate and host name checks are disabled on purpose
        // here; this leaves the connection open to interception.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

        if ($proxy) {
            curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
        }

        if ($postBodies) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, isset($postBodies[$i]) ? $postBodies[$i] : '');
        } elseif ($postWithoutBody) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, '');
        }

        curl_multi_add_handle($mh, $ch);
        // Kept as pairs so that duplicate ids cannot orphan a handle.
        $handles[] = [$item['id'], $ch];
    }

    $active = null;
    $idleRounds = 0;
    while (true) {
        do {
            $status = curl_multi_exec($mh, $active);
        } while ($status === CURLM_CALL_MULTI_PERFORM);

        if (!$active || $status !== CURLM_OK) {
            break;
        }

        // Wait (up to one second) for activity, once per round.
        $ready = curl_multi_select($mh, 1.0);
        if ($ready === -1) {
            // select failed; back off briefly instead of spinning.
            usleep(100);
        } elseif ($ready === 0 && ++$idleRounds >= 30) {
            // Thirty waits without any activity: give up on what is left.
            break;
        }
    }

    $statusById = [];
    foreach ($handles as $pair) {
        list($id, $ch) = $pair;
        $statusById[$id] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);

    return $statusById ? $statusById : false;
}
posted @ 2019-05-10 18:36  经验源于积累  阅读(312)  评论(0编辑  收藏  举报