PHP利用正则匹配 完成数据抓取
// Scrape a remote page and collect the rows of its second <table> into $data.
// Depends on \Helper\CFunctionHelper::DeleteHtml() and the dd() dump helper,
// both of which are defined outside this snippet.
$url = "http://xxxxxxx";

$curl = curl_init($url);
if ($curl === false) {
    die("抓取页面失败");
}
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($curl, CURLOPT_ENCODING, "gzip");
$contents = curl_exec($curl);
// Capture the error text before the handle is closed.
$curlError = curl_error($curl);
curl_close($curl);

// curl_exec() returns false on failure; stop instead of parsing a non-string.
if ($contents === false) {
    die("抓取页面失败:" . $curlError);
}

// Convert to UTF-8, letting mbstring choose the source encoding from the list.
$contents = mb_convert_encoding($contents, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');
$contents = \Helper\CFunctionHelper::DeleteHtml($contents);

if (!preg_match_all("@<table[^>]+>(.*?)</table>@", $contents, $tables)) {
    die("匹配表格失败");
}

// The rows are read from the second matched table, so it must exist.
if (!isset($tables[1][1])) {
    die("匹配表格失败");
}

if (!preg_match_all("@<tr[^>]+>(.*?)</tr>@", $tables[1][1], $tr)) {
    die("匹配tr失败");
}

$data = [];
foreach ($tr[1] as $value) {
    // Remove spaces (plain, raw U+00A0 bytes, and the &nbsp; entity), turn each
    // closing </td> into a '|' separator, then drop the remaining markup.
    $withoutSpaces = str_replace([' ', "\xC2\xA0", '&nbsp;'], '', $value);
    $bonusData = strip_tags(str_replace('</td>', '|', $withoutSpaces));

    $cells = explode('|', $bonusData);
    // Check the column count explicitly rather than relying on the runtime
    // converting an undefined-offset notice into an exception.
    if (count($cells) < 4) {
        die("发生异常:" . "expected 4 columns, got " . count($cells));
    }

    list($issue, $recommend, $kaijiang, $result) = $cells;
    $data[] = [
        'issue'     => $issue,
        'recommend' => $recommend,
        'kaijiang'  => $kaijiang,
        'result'    => $result,
    ];
}

dd($data);
/**
 * Remove tabs, line breaks and spaces from an HTML string.
 *
 * Every tab, carriage return, line feed and space is deleted, wherever it
 * appears (including inside tags), and the result is trimmed. Non-breaking
 * spaces are removed too, both as the raw UTF-8 bytes and as the &nbsp;
 * entity.
 *
 * The earlier version also called preg_replace("\n[^\<]", ...). That pattern
 * has no real delimiters: PHP skips the leading newline and treats "[" / "]"
 * as the delimiters, so it effectively ran /^\</ and deleted the first "<" of
 * the document. The call has been dropped.
 *
 * @param string $str HTML text to compact.
 * @return string The text with the whitespace described above removed.
 */
public static function DeleteHtml($str)
{
    // Literal replacements are enough here; no regular expression is needed.
    $unwanted = ["\t", "\r", "\n", ' ', "\xC2\xA0", '&nbsp;'];

    $compact = str_replace($unwanted, '', trim((string) $str));

    // Trim again: removing characters can expose other trimmable bytes at the ends.
    return trim($compact);
}
不积跬步,无以至千里;