PHP 采集常用代码
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The request is sent with a Baiduspider user agent and aborts if the
 * connection cannot be established within 10 seconds.
 *
 * @param string $url  Address to request.
 * @param bool   $gzip When true, request gzip-compressed content and let
 *                     cURL decode it before returning the body.
 * @return string|false Response body, or false when the cURL handle cannot
 *                      be created or the transfer fails.
 */
function curl_get($url, $gzip = false)
{
    $curl = curl_init($url);
    if ($curl === false) {
        return false;
    }

    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    if ($gzip) {
        // Sets the Accept-Encoding header and enables decoding of the response.
        curl_setopt($curl, CURLOPT_ENCODING, "gzip");
    }

    $content = curl_exec($curl);
    curl_close($curl);

    return $content;
}

/**
 * Return the trimmed text between the first occurrence of $before and the
 * first occurrence of $after that follows it.
 *
 * @param string $before Opening delimiter.
 * @param string $after  Closing delimiter.
 * @param string $str    Text to search.
 * @return string The trimmed text between the delimiters, or '' when either
 *                delimiter is empty or cannot be found.
 */
function get_middle($before, $after, $str)/*{{{*/
{
    $before = (string) $before;
    $after = (string) $after;
    $str = (string) $str;

    // An empty delimiter has no meaningful position to search for.
    if ($before === '' || $after === '') {
        return '';
    }

    $start = strpos($str, $before);
    if ($start === false) {
        return '';
    }
    $start += strlen($before);

    // Search from the end of $before so a repeated $before inside the
    // wanted segment does not cut the result short.
    $end = strpos($str, $after, $start);
    if ($end === false) {
        return '';
    }

    return trim(substr($str, $start, $end - $start));
}/*}}}*/
用法
// Download the page, then take the text between <title> and </title>.
$html = curl_get($url);
$title = get_middle('<title>', '</title>', $html);
常用的一些过滤代码
// Remove every <svg ...>...</svg> element: "i" ignores case, "s" lets "." span newlines.
$content = preg_replace('@<svg(.*?)</svg>@is', '', $content);
提取某个变量
<a href="/cat/2546">我是大侠</a>
// Collect every <a href="/cat/ID">TEXT</a> link; the U modifier makes both (.*) groups non-greedy.
preg_match_all("|<a href=\"/cat/(.*)\">(.*)</a>|isU", $html, $daijiejue);

// Only the first match is used. Fall back to '' when no link matched,
// instead of reading an offset that does not exist.
$c1 = isset($daijiejue[1][0]) ? $daijiejue[1][0] : ''; // 2546 for the sample link (group 1 excludes the "/cat/" prefix)
$c2 = isset($daijiejue[2][0]) ? $daijiejue[2][0] : ''; // the anchor text of the sample link