2个爬虫
<?php namespace Util\data;

use Util\data\DbUtil;

/**
 * Crawler for the ifeng.com rolling-news list page.
 *
 * Workflow:
 *   1. download the news list page;
 *   2. collect the article links found in it;
 *   3. download every article and extract its title, body and first picture;
 *   4. store a random sample of the collected articles in the `news` table.
 */
class Index_m
{
    /** List page that seeds the crawl. */
    const LIST_URL = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml';

    /** Picture stored when an article has no usable image URL. */
    const DEFAULT_IMG = 'https://images2018.cnblogs.com/blog/1395514/201805/1395514-20180513105955459-1441660792.jpg';

    /** Upper bound on the number of anchors read from the list page. */
    const MAX_LINKS = 20;

    /** Upper bound on the number of articles written per run. */
    const SAMPLE_SIZE = 6;

    /**
     * Runs one crawl and stores the result.
     *
     * Prints a notice for every picture-gallery page that is skipped.
     *
     * @return array [attempted, inserted, duplicates] as returned by do_sql()
     */
    public function update_m()
    {
        $news = [];

        $listHtml = $this->curl(self::LIST_URL);
        $links = (is_string($listHtml) && $listHtml !== '') ? $this->geturls($listHtml) : [];

        foreach ($links as $link) {
            $html = $this->curl($link);
            if (!is_string($html) || $html === '') {
                // Download failed: there is nothing to parse for this link.
                continue;
            }

            // Picture-gallery pages have no article text.
            if (strpos($html, 'picBoxPrev') !== false) {
                echo '图集被删除';
                continue;
            }

            // Title, with the site suffix that starts at '_凤凰' removed.
            $title = $this->getKeyWord($html, '<title>', '</title>')[0];
            $suffixAt = strpos($title, '_凤凰');
            if ($suffixAt !== false) {
                $title = substr($title, 0, $suffixAt);
            }

            // The body is re-extracted for every article, so an article whose
            // markers are missing can never inherit the previous article's text.
            $body = $this->extractBody($html);

            // Both thresholds are byte lengths (strlen), not character counts.
            if (strlen($body) < 100 || strlen($title) < 20) {
                continue;
            }

            $news[] = [
                'news_title'  => trim($title),
                'news_body'   => trim($body),
                'news_pic'    => trim($this->extractImage($body)),
                // NOTE(review): key spelling kept as-is; presumably it matches
                // the column name in the `news` table - confirm before renaming.
                'news_autuor' => '新闻网',
            ];
        }

        return $this->do_sql($news);
    }

    /**
     * Returns the article text between the first body marker present in the
     * page and the ifeng logo span, or '' when no complete body is found.
     *
     * @param string $html article page
     * @return string
     */
    private function extractBody($html)
    {
        $endMarker = '<span class="ifengLogo"><a';

        foreach (['<!--mainContent begin-->', '<!-- 正文begin -->'] as $beginMarker) {
            if (strpos($html, $beginMarker) !== false) {
                return $this->getKeyWord($html, $beginMarker, $endMarker)[0];
            }
        }

        return '';
    }

    /**
     * Returns the URL of the first <img> in the body, cut right after its
     * first 'gif' / 'jpeg' / 'jpg' occurrence (tested in that order), or the
     * placeholder picture when there is no image, the URL is empty, or the
     * URL contains CJK characters.
     *
     * @param string $body extracted article body
     * @return string
     */
    private function extractImage($body)
    {
        if (strpos($body, '<img src="') === false) {
            return self::DEFAULT_IMG;
        }

        $img = $this->getKeyWord($body, '<img src="', '">')[0];

        foreach (['gif', 'jpeg', 'jpg'] as $extension) {
            $at = strpos($img, $extension);
            if ($at !== false) {
                // Drop everything that follows the extension.
                $img = substr($img, 0, $at + strlen($extension));
                break;
            }
        }

        if ($img === '' || preg_match('/[\x{4e00}-\x{9fa5}]/u', $img) > 0) {
            return self::DEFAULT_IMG;
        }

        return $img;
    }

    /**
     * Writes a random sample of at most SAMPLE_SIZE articles to the `news`
     * table, skipping titles that are already stored. Prints the title of
     * every article for which an insert was attempted.
     *
     * @param array $news rows as built by update_m()
     * @return array [attempted, inserted, duplicates]
     */
    public function do_sql($news)
    {
        $sum = 0;   // rows examined
        $succ = 0;  // rows inserted
        $ready = 0; // rows skipped because the title already exists

        $available = is_array($news) ? count($news) : 0;
        if ($available === 0) {
            return [$sum, $succ, $ready];
        }

        // array_rand() rejects a sample larger than the array and returns a
        // bare key (not an array) when asked for exactly one element.
        $sampleKeys = (array) array_rand($news, min(self::SAMPLE_SIZE, $available));

        foreach ($sampleKeys as $sampleKey) {
            $row = $news[$sampleKey];
            $sum++;

            // De-duplicate on the title.
            $existing = DbUtil::getdb()
                ->table('news')
                ->where(['news_title' => $row['news_title']])
                ->count();
            if ($existing > 0) {
                $ready++;
                continue;
            }

            if (DbUtil::getdb()->table('news')->insert($row)) {
                $succ++;
            }
            print_r($row['news_title']);
        }

        return [$sum, $succ, $ready];
    }

    /**
     * Extracts the article links from the list page.
     *
     * Only the section between '<h2>即时新闻</h2>' and
     * '<div class="clear"></div>' is scanned, and at most MAX_LINKS anchors
     * are read. Scanning stops at the first anchor that cannot be found, so
     * a short list yields a short result.
     *
     * @param string $curl HTML of the list page
     * @return array list of URLs
     */
    public function geturls($curl)
    {
        $section = $this->getKeyWord($curl, '<h2>即时新闻</h2>', '<div class="clear"></div>')[0];

        $urls = [];
        $offset = 0;
        for ($i = 0; $i < self::MAX_LINKS; $i++) {
            list($href, $next) = $this->getKeyWord($section, '<a href="', '" target="_blank">', $offset);
            if ($next === false) {
                break;
            }
            // Continue after this anchor on the next round.
            $offset = $next;
            if ($href !== '') {
                $urls[] = $href;
            }
        }

        return $urls;
    }

    /**
     * Returns the trimmed text found between two delimiters.
     *
     * @param string $info      text to search
     * @param string $first_key opening delimiter
     * @param string $last_key  closing delimiter, searched after the opening one
     * @param int    $index     byte offset at which the search starts (optional)
     * @return array [text, byte offset of the closing delimiter]; the offset
     *               can be passed back as $index to continue scanning. When
     *               either delimiter is missing the result is ['', false].
     */
    public function getKeyWord($info, $first_key, $last_key, $index = 0)
    {
        $notFound = ['', false];

        $info = (string) $info;
        $index = (int) $index;
        // An offset outside the string makes strpos() fail on PHP 8.
        if (abs($index) > strlen($info)) {
            return $notFound;
        }

        $openAt = strpos($info, $first_key, $index);
        if ($openAt === false) {
            return $notFound;
        }

        $contentAt = $openAt + strlen($first_key);
        $closeAt = strpos($info, $last_key, $contentAt);
        if ($closeAt === false) {
            return $notFound;
        }

        return [trim(substr($info, $contentAt, $closeAt - $contentAt)), $closeAt];
    }

    /**
     * Downloads a URL with cURL and returns the response body.
     *
     * Redirects are not followed and the TLS verification options are left at
     * their cURL defaults.
     *
     * @param string $url    address to fetch
     * @param string $coding charset of the page; anything other than UTF-8
     *                       is converted to UTF-8
     * @return string|false response body, or false when the request (or the
     *                      charset conversion) fails
     */
    public function curl($url, $coding = 'utf-8')
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);         // body only, no response headers
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        // Bound the request so one unresponsive host cannot stall the crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $result = curl_exec($ch);
        curl_close($ch);

        if ($result === false) {
            return false;
        }

        // Pages that are not UTF-8 are converted; unconvertible bytes are dropped.
        if (strcasecmp($coding, 'utf-8') !== 0) {
            $result = iconv($coding, 'utf-8//IGNORE', $result);
        }

        return $result;
    }
}
?>