使用CURL和火车头软件采集搜狐文章
直接上代码:
/**
 * Perform an HTTP request with cURL and return the response.
 *
 * The request is a GET unless $post is non-empty, in which case it is sent as
 * a POST. Redirects are followed, and a fixed browser User-Agent and Referer
 * are sent with every request.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; leave empty for a GET request.
 *                                   Arrays are URL-encoded with
 *                                   http_build_query(), strings are sent as-is.
 * @param string       $cookie       Cookie header value to send ("a=1; b=2").
 * @param int|bool     $returnCookie When truthy, response headers are captured
 *                                   and the first Set-Cookie pair is returned
 *                                   together with the body.
 *
 * @return string|array The response body; or, when $returnCookie is truthy,
 *                      array('cookie' => string, 'content' => string).
 *                      On a transport error the cURL error message is
 *                      returned as a string (kept for backward compatibility,
 *                      so callers must validate the payload themselves).
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://www.baidu.com/");

    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; a pre-encoded
        // string body is passed through untouched.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }

    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 10);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);

    if (curl_errno($curl)) {
        $error = curl_error($curl);
        // Release the handle on the failure path as well.
        curl_close($curl);
        return $error;
    }

    // Must be read before the handle is closed. When redirects are followed
    // the output holds one header block per hop, so splitting on the first
    // blank line would leave later header blocks inside the "body".
    $headerSize = (int) curl_getinfo($curl, CURLINFO_HEADER_SIZE);
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    $header = (string) substr($data, 0, $headerSize);
    $body   = (string) substr($data, $headerSize);

    // First "name=value" pair of the first Set-Cookie header. Header names
    // are case-insensitive, and a cookie may carry no attributes (no ';').
    $cookieValue = '';
    if (preg_match('/^Set-Cookie:[ \t]*([^;\r\n]*)/mi', $header, $match)) {
        $cookieValue = trim($match[1]);
    }

    return array(
        'cookie'  => $cookieValue,
        'content' => $body,
    );
}

// Collected article records and the "<br/>"-separated list of article URLs.
$caiji_set = [];
$caiji_url = '';

for ($i = 0; $i < 85; $i++) {
    // Alternative account (original note: "page-100"):
    //$url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=NTYzOTU5NjY1OUBzaW5hLnNvaHUuY29t&pageNumber=".$i."&pageSize=10&categoryId=&_=1541053659128";
    // Account currently being collected (original note: "page-85",
    // presumably its page count, matching the loop bound above).
    $url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=cHBhZzU5MTM5NjA2NmVlM0Bzb2h1LmNvbQ==&pageNumber=" . $i . "&pageSize=10&categoryId=&_=1541122188390";

    // Plain GET: the POST argument must stay empty, otherwise the request is
    // sent as a POST.
    $response = curl_request($url);

    // The original pipeline expects a JSON-encoded *string* that itself
    // contains JSON, so unwrap it a second time when that is what arrives.
    // A payload that is already a JSON object is used directly.
    $result = json_decode($response, true);
    if (is_string($result)) {
        $result = json_decode(stripslashes(html_entity_decode($result)), true);
    }

    // Skip pages that failed to download or decode (curl_request() returns
    // an error string on failure, which does not decode to an array).
    if (!is_array($result) || !isset($result['msg'], $result['data']) || !is_array($result['data'])) {
        continue;
    }

    // NOTE(review): the original compared against 'succes'; that literal is
    // kept and the correctly spelled value is accepted too - confirm which
    // one the API really sends.
    if (!in_array($result['msg'], array('succes', 'success'), true)) {
        continue;
    }

    foreach ($result['data'] as $item) {
        if (!is_array($item) || !isset($item['url'])) {
            continue;
        }

        // The API value is prefixed with the scheme, as in the original
        // (presumably it is protocol-relative, "//host/path").
        $article_url = "http:" . $item["url"];

        $caiji_set[] = array(
            "brief"     => urldecode(isset($item['brief']) ? $item['brief'] : ''),
            "thumbnail" => isset($item["thumbnail"]) ? $item["thumbnail"] : '',
            "title"     => urldecode(isset($item['title']) ? $item['title'] : ''),
            "url"       => $article_url,
        );
        $caiji_url .= $article_url . '<br/>';
    }
}

if (!empty($caiji_url)) {
    // Enable to append the URL list to a file instead of only printing it.
    //file_put_contents('./gougou.txt', $caiji_url, FILE_APPEND);
}

var_export($caiji_url);
exit;