php利用curl获取网页title内容

/**$html = curl_get_file_contents($url);
$title = get_title_contents($html);
var_dump($title);*/
/**
 * Fetch a URL with cURL and return the response body, following HTTP
 * redirects manually (CURLOPT_FOLLOWLOCATION is not used).
 *
 * At most three redirects are followed per call; the hop counter is local to
 * each call, so earlier calls cannot exhaust the budget of later ones.
 *
 * @param string $url     URL to download.
 * @param string $referer Referer sent with the first request only; redirect
 *                        hops are requested with an empty referer.
 * @return string|false   Response body without headers. If a redirect response
 *                        has no usable Location header, the body of that
 *                        redirect response is returned. false is returned when
 *                        the transfer fails or the redirect limit is exceeded.
 */
function curl_get_file_contents($url,$referer='') {
	$max_redirects = 3;
	$redirect_codes = array(301, 302, 303, 307, 308);
	$useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

	$redirects = 0;
	while (true) {
		$ch = curl_init();
		curl_setopt($ch,CURLOPT_URL,$url);
		curl_setopt($ch,CURLOPT_HEADER,true);
		// NOTE(review): certificate and host verification are disabled, so HTTPS
		// responses are not authenticated. Kept as-is for compatibility.
		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
		curl_setopt($ch,CURLOPT_USERAGENT,$useragent);
		curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
		curl_setopt($ch,CURLOPT_REFERER,$referer);
		$response = curl_exec($ch);
		$http_code = (int) curl_getinfo($ch,CURLINFO_HTTP_CODE);
		$header_size = (int) curl_getinfo($ch,CURLINFO_HEADER_SIZE);
		$last_url = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL);
		curl_close($ch);

		// Transport-level failure (bad URL, DNS, connect, TLS, ...).
		if ($response === false) {
			return false;
		}

		// Split by the header size reported by cURL instead of the first blank
		// line, so interim header blocks (e.g. "100 Continue") stay out of the body.
		$header = (string) substr($response, 0, $header_size);
		$body = (string) substr($response, $header_size);

		if (!in_array($http_code, $redirect_codes, true)) {
			return $body;
		}

		// Header names are case-insensitive (HTTP/2 sends them in lowercase).
		if (!preg_match('/^Location:[ \t]*(.*?)[ \t]*\r?$/mi', $header, $matches)) {
			return $body;
		}
		$next_url = resolve_redirect_url($matches[1], $last_url);
		if ($next_url === false) {
			return $body;
		}

		if ($redirects++ >= $max_redirects) {
			return false;
		}
		$url = $next_url;
		$referer = '';
	}
}

/**
 * Turn a Location header value into an absolute URL.
 *
 * Absolute URLs are rebuilt from scheme, host, port, path and query.
 * Scheme-relative ("//host/x"), root-relative ("/x"), path-relative ("x") and
 * query-only ("?a=1") values are resolved against $base_url. Dot segments are
 * not normalised, and user-info and fragment parts are dropped.
 *
 * @param string $location Raw Location header value.
 * @param string $base_url URL of the response that carried the header.
 * @return string|false    Absolute URL, or false when it cannot be resolved.
 */
function resolve_redirect_url($location, $base_url) {
	$location = trim($location);
	if ($location === '') {
		return false;
	}
	$target = @parse_url($location);
	if (!is_array($target)) {
		return false;
	}
	$base = @parse_url($base_url);
	$base_ok = is_array($base) && isset($base['scheme']) && isset($base['host']);

	if (isset($target['host'])) {
		// Absolute or scheme-relative target.
		if (isset($target['scheme'])) {
			$scheme = $target['scheme'];
		} elseif ($base_ok) {
			$scheme = $base['scheme'];
		} else {
			return false;
		}
		$host = $target['host'];
		$port = isset($target['port']) ? $target['port'] : null;
		$path = isset($target['path']) ? $target['path'] : '/';
		$query = isset($target['query']) ? $target['query'] : null;
	} elseif (isset($target['scheme']) || !$base_ok) {
		// Scheme without host (e.g. "mailto:") or nothing to resolve against.
		return false;
	} else {
		$scheme = $base['scheme'];
		$host = $base['host'];
		$port = isset($base['port']) ? $base['port'] : null;
		$base_path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';
		$rel_path = isset($target['path']) ? $target['path'] : '';
		$query = isset($target['query']) ? $target['query'] : null;

		if ($rel_path === '') {
			// Query-only or fragment-only reference: keep the base path.
			$path = $base_path;
			if ($query === null && isset($base['query'])) {
				$query = $base['query'];
			}
		} elseif ($rel_path[0] === '/') {
			$path = $rel_path;
		} else {
			$slash = strrpos($base_path, '/');
			$dir = ($slash === false) ? '/' : substr($base_path, 0, $slash + 1);
			$path = $dir . $rel_path;
		}
	}

	return $scheme . '://' . $host
		. ($port !== null ? ':' . $port : '')
		. $path
		. ($query !== null ? '?' . $query : '');
}
/**
 * Extract the text of the <title> element from an HTML document as UTF-8.
 *
 * Only the <head> section is inspected. When <head> declares a charset via a
 * <meta> tag (either <meta charset="..."> or the http-equiv Content-Type
 * form), the title is converted from that charset to UTF-8. Surrounding
 * whitespace is trimmed and HTML entities are decoded.
 *
 * @param string $html Raw HTML document.
 * @return string      The decoded title, or a fixed message string when the
 *                     <head> section or the <title> element cannot be found.
 */
function get_title_contents($html){
	// \b keeps <header> from being mistaken for <head>; the "s" flag lets the
	// section span multiple lines.
	if (!is_string($html) || !preg_match('/<head\b[^>]*>(.*?)<\/head\s*>/is', $html, $head_match)) {
		return "无法解析数据中的 <head> 区段";
	}
	$head = $head_match[1];

	// Accept attributes on <title> and titles that span several lines.
	if (!preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/is', $head, $title_match)) {
		return "无法解析 <title> 的内容";
	}
	$title = trim($title_match[1]);

	// Charset name from <meta>: case-insensitive, optional single or double
	// quotes, and names containing "_" such as Shift_JIS.
	if (preg_match('/<meta\b[^>]*?charset\s*=\s*["\']?\s*([a-zA-Z0-9_:.-]+)/i', $head, $charset_match)) {
		$charset = $charset_match[1];
		if (strcasecmp($charset, 'UTF-8') !== 0 && strcasecmp($charset, 'UTF8') !== 0) {
			// iconv() returns false (and raises a notice/warning) for unknown
			// charsets or invalid byte sequences; keep the raw title then.
			$converted = @iconv($charset, 'UTF-8', $title);
			if ($converted !== false) {
				$title = $converted;
			}
		}
	}

	// Explicit flags so the result does not depend on per-version defaults.
	return html_entity_decode($title, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

  支持https,302跳转

posted on 2017-05-09 09:57  防空洞123  阅读(1642)  评论(0)  编辑  收藏  举报

导航