PHP爬虫(3)PHP DOM开源代码里的大坑和字符编码
一、开源代码的问题
在PHP爬虫(2)中介绍了开源工程Sunra.PhpSimple.HtmlDomParser。在实际工作中发现一个问题,例如http://www.163.com的网页数据怎么也抓取不下来。
$url = "http://www.163.com"; $content = Http::request($url); $dom = str_get_html($content);// $dom comes back as false for this page
检查simple_html_dom.php代码发现,
if (empty($str) || strlen($str) > MAX_FILE_SIZE) { $dom->clear(); return false; }
也就是说,str_get_html会先检查待加载字符串的长度:字符串为空或长度超过MAX_FILE_SIZE时,直接返回false。像163首页这样内容较大的页面就会因此解析失败。解决办法是把MAX_FILE_SIZE改大一些,或者去掉这个长度判断。
二、字符编码
网页抓取必然要处理网页内容,网页内容的编码有很多种,常见的UTF-8,GBK,GB2312等。通常处理的过程,首先判断字符编码,再转化成统一编码。
判断编码的代码,
/**
 * Detect which of a fixed list of encodings a byte string is well-formed in.
 *
 * Candidates are tried in this order: UTF-8, GBK, GB2312, GB18030. The first
 * candidate in which $string is valid wins.
 *
 * @param string $string Raw bytes to inspect.
 * @param mixed  $enc    Unused; kept so existing call sites keep working.
 * @param mixed  $ret    When null (the default) the matching encoding name is
 *                       returned; any other value makes a match return true.
 * @return string|bool Encoding name (or true, see $ret) on a match,
 *                     false when no candidate fits.
 */
function ws_mb_detect_encoding ($string, $enc=null, $ret=null)
{
    static $enclist = array('UTF-8', 'GBK', 'GB2312', 'GB18030');

    foreach ($enclist as $item) {
        // Validate directly instead of converting the string onto itself and
        // comparing md5() digests with loose ==: two different digests of the
        // form "0e<digits>" compare equal as numbers under ==.
        if (mb_check_encoding($string, $item)) {
            return $ret === null ? $item : true;
        }
    }

    return false;
}
转化成UTF-8编码
$html = mb_convert_encoding($html,"UTF-8",$enc); //$enc is the value returned by ws_mb_detect_encoding()
下面的代码从一个导航页面抓取链接,提取各链接的站点根地址(去重后最多取31个),再逐个请求这些地址并输出页面的title信息。
<?php
// Crawl a navigation page, collect the site-root URL of every absolute link
// (de-duplicated), then fetch each collected URL and print its <title>.
Vendor('Sunra.PhpSimple.HtmlDomParser');

$url = "http://hao.360.cn/";
$html = file_get_html($url);
if (!$html) {
    // Without a DOM object there is nothing to call find() on.
    exit("无法获取网页数据<br>");
}

$links = $html->find('a');
$array = array();
// Matches the scheme and host of an http/https URL, up to the first slash after the host.
$pattern = "/(http|https):\/\/\S+?\//";

foreach ($links as $l) {
    // Only absolute links whose href starts with "http" are of interest.
    if (strpos($l->href, "http") !== 0) {
        continue;
    }

    $url = $l->href;
    $ret = preg_match($pattern, $url, $m);
    $url = $ret ? $m[0] : $url;

    // in_array() instead of !array_search(): array_search() returns index 0
    // for a match on the first element, which is falsy and let the first URL
    // be appended again and again.
    if (!in_array($url, $array, true)) {
        $array[] = $url;
    }
    if (count($array) > 30) {
        break;
    }
}

foreach ($array as $url) {
    // Try each URL up to three times before giving up on it.
    $html = false;
    $num = 0;
    while (($html === false || $html === '') && $num < 3) {
        $num++;
        $html = \Home\Wsn\Http::request($url);
    }
    if ($html === false || $html === '') {
        echo "无法获取网页数据<br>";
        continue;
    }

    $enc = ws_mb_detect_encoding($html);
    echo $enc."<br>";
    if ($enc === false) {
        echo "编码错误<br>";
        continue;
    } elseif ($enc != 'UTF-8') {
        $html = mb_convert_encoding($html, "UTF-8", $enc);
    }

    $dom = str_get_html($html);
    if (!$dom) {
        // str_get_html() returns false for empty or oversized input; calling
        // find() on that would be a fatal error.
        echo "无法解析网页数据<br>";
        continue;
    }

    $title = $dom->find('title', 0);
    if ($title) {
        echo "标题".$title->innertext."<br>";
    } else {
        echo "没找到标题<br>";
    }
    // Release the parsed tree before moving on to the next page.
    $dom->clear();
    echo "<hr>";
}
?>
附录
封装好的HTTP类中的request方法如下(需要放在自己的Http类里使用),喜欢的同学可以拿去直接使用。
<?php
/**
 * Perform an HTTP request with cURL and return the response body.
 *
 * Connect and total timeouts are both 3 seconds. Only GET, POST and DELETE
 * get method-specific handling; any other value is sent with cURL's defaults.
 *
 * @param string       $url        Target URL.
 * @param array|string $params     Query parameters (GET) or form fields (POST).
 *                                 For GET a pre-built query string is also accepted.
 * @param string       $method     HTTP method, case-insensitive.
 * @param array|bool   $multi      false, or a map of field name => file path to
 *                                 upload as a multipart POST alongside $params.
 * @param array        $extheaders Extra raw header lines to send.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
public static function request($url, $params = array(), $method = 'GET', $multi = false, $extheaders = array())
{
    if (!function_exists('curl_init')) {
        exit('Need to open the curl extension');
    }

    $method = strtoupper($method);
    $ci = curl_init();
    curl_setopt($ci, CURLOPT_USERAGENT, 'PHP-SDK OAuth2.0');
    curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, 3);
    curl_setopt($ci, CURLOPT_TIMEOUT, 3);
    curl_setopt($ci, CURLOPT_RETURNTRANSFER, true);
    // NOTE(review): TLS peer and host verification are switched off, so HTTPS
    // responses are not authenticated. Left as-is to keep existing behaviour.
    curl_setopt($ci, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ci, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ci, CURLOPT_HEADER, false);

    $headers = (array)$extheaders;
    switch ($method) {
        case 'POST':
            curl_setopt($ci, CURLOPT_POST, true);
            if (!empty($params)) {
                if ($multi) {
                    foreach ($multi as $key => $file) {
                        // The "@path" upload syntax is ignored on PHP 7+; use
                        // CURLFile where it exists and fall back on old runtimes.
                        $params[$key] = class_exists('CURLFile', false)
                            ? new \CURLFile($file)
                            : '@' . $file;
                    }
                    // Passing an array makes cURL send multipart/form-data.
                    curl_setopt($ci, CURLOPT_POSTFIELDS, $params);
                    // Empty Expect header suppresses the "100-continue" handshake.
                    $headers[] = 'Expect: ';
                } else {
                    curl_setopt($ci, CURLOPT_POSTFIELDS, http_build_query($params));
                }
            }
            break;

        case 'DELETE':
            curl_setopt($ci, CURLOPT_CUSTOMREQUEST, 'DELETE');
            break;

        case 'GET':
            if (!empty($params)) {
                // Compare against false explicitly: strpos() returns 0, not
                // false, when the "?" is the first character.
                $separator = strpos($url, '?') !== false ? '&' : '?';
                $query = is_array($params) ? http_build_query($params) : $params;
                $url = $url . $separator . $query;
            }
            break;
    }

    curl_setopt($ci, CURLINFO_HEADER_OUT, true);
    curl_setopt($ci, CURLOPT_URL, $url);
    if ($headers) {
        curl_setopt($ci, CURLOPT_HTTPHEADER, $headers);
    }

    $response = curl_exec($ci);
    curl_close($ci);

    return $response;
}
?>
PC端和手机端的技术研发