php抓取网页获取特定信息

1、需求

比如，抓取博客园（https://www.cnblogs.com/）首页的文章标题和作者。
2、编码实现

<?php 

/**
 * Fetch a web page with cURL and return its body converted to UTF-8.
 *
 * The request is sent with browser-like headers; Origin and Referer are
 * derived from $url. The source encoding is detected from the candidate
 * list UTF-8, GBK, GB2312, BIG5.
 *
 * NOTE(review): TLS peer and host verification are switched off below, so the
 * identity of the remote server is NOT checked. Acceptable for a throw-away
 * scraper; do not reuse this function for anything security-sensitive.
 *
 * @param string $url Absolute URL; must contain a scheme and a host.
 * @return string The page body as UTF-8, or '' when the URL is unusable or the
 *                transfer fails (an E_USER_WARNING explains why).
 */
function catch_html($url) {
	$parts = parse_url($url);
	if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
		trigger_error("catch_html: URL must contain a scheme and a host: {$url}", E_USER_WARNING);
		return '';
	}
	$domain = $parts['scheme'].'://'.$parts['host'].'/';
	$headers = array(
		"Accept: application/json, text/javascript, */*; q=0.01",
		"Content-Type: application/x-www-form-urlencoded; charset=UTF-8",
		"Origin: {$domain}",
		"Referer: {$url}",
		// This header is the User-Agent actually sent: a header supplied via
		// CURLOPT_HTTPHEADER replaces the one cURL would generate itself, so a
		// separate CURLOPT_USERAGENT setting would have no effect.
		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
	);

	$curl = curl_init();
	if ($curl === false) {
		trigger_error('catch_html: unable to initialise cURL', E_USER_WARNING);
		return '';
	}
	curl_setopt($curl, CURLOPT_URL, $url); // URL to fetch
	curl_setopt($curl, CURLOPT_HTTPHEADER, $headers); // custom request headers
	curl_setopt($curl, CURLOPT_HEADER, 0); // do NOT include response headers in the returned data
	// Return the response as a string instead of printing it directly.
	curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
	// Advertise compressed transfer; cURL decompresses the body transparently.
	curl_setopt($curl, CURLOPT_ACCEPT_ENCODING, "gzip,deflate");
	// HTTPS: skip certificate and host-name verification (see NOTE in the docblock).
	curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
	curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);

	$data = curl_exec($curl);
	// Read the error text before the handle is released.
	$error = ($data === false) ? curl_error($curl) : '';
	curl_close($curl);

	if ($data === false) {
		trigger_error("catch_html: request to {$url} failed: {$error}", E_USER_WARNING);
		return '';
	}

	// Normalise the body to UTF-8; fall back to the raw bytes if conversion fails.
	$converted = mb_convert_encoding($data, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');

	return ($converted === false) ? $data : $converted;
}

/**
 * Extract the inner HTML of every <$tag> element carrying a given attribute.
 *
 * This is a lightweight regex match, not an HTML parser: the attribute value
 * must be double-quoted and nested elements of the same tag are not balanced
 * (the capture stops at the first closing </$tag>).
 *
 * @param string $html  Markup to search.
 * @param string $tag   Element name, e.g. 'a'. Matched case-insensitively.
 * @param string $attr  Attribute name, e.g. 'class'.
 * @param string $value Exact attribute value to require; '' (default) accepts
 *                      any value as long as the attribute is present.
 * @return array List of inner-HTML strings in document order; empty when
 *               nothing matches or the pattern cannot be evaluated.
 */
function get_tag_data($html,$tag,$attr,$value=''){
	// Quote caller input so characters such as '.', '/' or '(' are literal.
	$tagPattern = preg_quote((string) $tag, '/');
	$attrPattern = preg_quote((string) $attr, '/');
	// Compare against '' explicitly so that a value of "0" is still an exact match.
	$valuePattern = ((string) $value !== '') ? preg_quote((string) $value, '/') : '[^"]*';

	// - "\s" directly after the tag name keeps <a from matching <abbr or <article.
	// - "[^>]" confines the attribute search to the opening tag itself, so a
	//   match cannot begin at an unrelated earlier element.
	// - The "\s" before the attribute name keeps "class" from matching "data-class".
	$regex = sprintf(
		'/<%1$s\s(?:[^>]*?\s)?%2$s\s*=\s*"%3$s"[^>]*>(.*?)<\/%1$s\s*>/is',
		$tagPattern,
		$attrPattern,
		$valuePattern
	);

	$matches = array();
	$found = preg_match_all($regex, (string) $html, $matches, PREG_PATTERN_ORDER);
	if ($found === false || !isset($matches[1])) {
		// Regex engine error (e.g. backtrack limit): report "no results".
		return array();
	}

	return $matches[1]; // inner contents of each matched element
}


// Demo: scrape the post titles and authors from the cnblogs home page.
$url = 'https://www.cnblogs.com/';
$html = catch_html($url);
// Uncomment to dump the raw page while debugging:
//echo $html;die;


// Titles look like:
// <a class="post-item-title" href="https://www.cnblogs.com/lyl-star/p/15410719.html" target="_blank">...</a>
$titles = get_tag_data($html,'a','class','post-item-title');


// Authors look like:
// <a href="https://www.cnblogs.com/lyl-star/" class="post-item-author"><span>lyl-star</span></a>
$authors = get_tag_data($html,'a','class','post-item-author');
// Each match is the inner HTML, e.g. '<span>lyl-star</span>'; strip the <span>
// wrapper. array_map is used instead of a by-reference foreach so that no
// dangling reference to the last element is left behind.
$authors = array_map(function ($author) {
	return str_replace(['<span>', '</span>'], '', $author);
}, $authors);

$list = [
	'titles' => $titles,
	'authors' => $authors,
];

echo '<pre>'; print_r($list); die;
输出结果：
posted @ 2021-10-15 15:43 pine007 阅读(451) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部