使用CURL和火车头软件采集搜狐文章

直接上代码:

/**
 * Perform an HTTP request with cURL and return the response.
 *
 * A GET request is sent unless $post is non-empty, in which case the request
 * becomes a POST. Redirects are followed, a fixed MSIE 10 User-Agent and a
 * Baidu Referer are sent, and the whole transfer is limited to 10 seconds.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST payload; leave empty for a GET request.
 *                                   Arrays/objects are encoded with http_build_query(),
 *                                   strings are sent unchanged as the request body.
 * @param string       $cookie       Value for the Cookie request header; empty for none.
 * @param int|bool     $returnCookie When truthy, response headers are captured and the
 *                                   first Set-Cookie name=value pair is returned with the body.
 *
 * @return string|array The response body, or, when $returnCookie is truthy,
 *                      array('cookie' => string, 'content' => string) where 'cookie'
 *                      is '' if the server set none. If the transfer fails, the cURL
 *                      error message is returned as a string instead.
 */
function curl_request($url,$post='',$cookie='', $returnCookie=0){
	$curl = curl_init();
	curl_setopt($curl, CURLOPT_URL, $url);
	curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
	curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
	curl_setopt($curl, CURLOPT_REFERER, "http://www.baidu.com/");
	if($post) {
		curl_setopt($curl, CURLOPT_POST, 1);
		// http_build_query() only accepts arrays/objects; a pre-encoded string body is passed through.
		$fields = (is_array($post) || is_object($post)) ? http_build_query($post) : (string)$post;
		curl_setopt($curl, CURLOPT_POSTFIELDS, $fields);
	}
	if($cookie) {
		curl_setopt($curl, CURLOPT_COOKIE, $cookie);
	}
	curl_setopt($curl, CURLOPT_HEADER, (bool)$returnCookie);
	curl_setopt($curl, CURLOPT_TIMEOUT, 10);
	curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
	$data = curl_exec($curl);
	if ($data === false || curl_errno($curl)) {
		// Read the message before closing the handle, and close it on this path too.
		$error = curl_error($curl);
		curl_close($curl);
		return $error;
	}
	// Must be read before the handle is closed. With redirects followed, cURL emits one
	// header block per response, so this size is the only reliable header/body boundary.
	$headerSize = curl_getinfo($curl, CURLINFO_HEADER_SIZE);
	curl_close($curl);
	if(!$returnCookie){
		return $data;
	}
	$header = (string)substr($data, 0, $headerSize);
	$body   = (string)substr($data, $headerSize);
	$info = array('cookie' => '', 'content' => $body);
	// First Set-Cookie line only: capture "name=value" up to the first ';' or end of line.
	// Case-insensitive because header names may arrive in lower case.
	if (preg_match('/^Set-Cookie:[ \t]*([^;\r\n]*)/mi', $header, $matches)) {
		$info['cookie'] = trim($matches[1]);
	}
	return $info;
}

// Collected article records. NOTE(review): entries are grouped by each item's index
// within its result page ($k), as in the original; confirm whether a flat list was intended.
$caiji_set = [];
// Every collected article URL, each followed by "<br/>".
$caiji_url = '';
// Number of result pages to request (the feed below is marked "page-85").
$page_count = 85;
for($i=0;$i<$page_count;$i++){
	// Alternative feed (marked "page-100"):
	//$url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=NTYzOTU5NjY1OUBzaW5hLnNvaHUuY29t&pageNumber=".$i."&pageSize=10&categoryId=&_=1541053659128";
	$url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=cHBhZzU5MTM5NjA2NmVlM0Bzb2h1LmNvbQ==&pageNumber=".$i."&pageSize=10&categoryId=&_=1541122188390";
	// Leave the POST argument empty so the request is sent as GET.
	$response = curl_request($url);
	$result = json_decode($response, true);
	if(is_string($result)){
		// The payload was a JSON string that itself contains JSON: decode the inner document.
		$inner = html_entity_decode($result);
		$result = json_decode($inner, true);
		if($result === null){
			// Fall back to the original handling for payloads with stray backslashes.
			$result = json_decode(stripslashes($inner), true);
		}
	}
	// Skip pages that failed to download or decode, or that carry no item list.
	// NOTE(review): 'succes' is kept exactly as in the original; confirm the API's spelling.
	if(!is_array($result) || !isset($result['msg']) || $result['msg'] !== 'succes'){
		continue;
	}
	if(!isset($result['data']) || !is_array($result['data'])){
		continue;
	}
	foreach($result['data'] as $k=>$item){
		if(!is_array($item) || !isset($item['url'])){
			continue;
		}
		// The URL is prefixed with the scheme; presumably the API returns it protocol-relative ("//...").
		$item_url = "http:".$item["url"];
		$caiji_set[$k][] = array(
			"brief" => urldecode(isset($item['brief']) ? $item['brief'] : ''),
			"thumbnail" => isset($item["thumbnail"]) ? $item["thumbnail"] : null,
			// Was urldecode(title): a bare constant, which is an Error on PHP 8.
			"title" => urldecode(isset($item['title']) ? $item['title'] : ''),
			"url" => $item_url
		);
		$caiji_url .= $item_url.'<br/>';
	}
}
// To save the URL list to a file instead, uncomment:
//if(!empty($caiji_url)){ file_put_contents('./gougou.txt', $caiji_url, FILE_APPEND); }
var_export($caiji_url);exit;

 

posted @ 2018-11-02 09:51  心无引擎,眼无流派  阅读(988)  评论(0)  编辑  收藏  举报