ThinkPHP 搜狐新闻采集代码演示
<?php
namespace Home\Controller;

use Think\Controller;

/**
 * Demo collector for the Sohu domestic-news section.
 *
 * The collection runs as a chain of browser redirects:
 *   1. index()  - stores title / link / summary of every list page, one page per request;
 *   2. index1() - downloads the article body (and its images) of one stored row per request;
 *   3. index2() - renders everything that was collected.
 *
 * Relies on the ThinkPHP helpers M(), U() and C(), on the global function delFile()
 * and on the ROOT constant (reference copies are in the comment at the end of this file).
 */
class CollectController extends Controller
{
    /**
     * Collects one page of the news list (title, link, summary) and saves it.
     *
     * Without an "id" query parameter the run starts from scratch: the table and the
     * image folder are emptied and the landing page is collected. With "id" the matching
     * older list page is collected. When the configured page count (COLLECT_PAGE) is
     * exceeded, or no pagination/maxPage marker is found, body collection starts.
     *
     * @return void
     */
    public function index()
    {
        // Sohu domestic news landing page.
        $homeHtml = $this->fetchPage('http://news.sohu.com/guoneixinwen.shtml');

        // Newest sequence number used in the paged list URLs (null when the marker is missing).
        $num = preg_match('/maxPage = (\d+);/is', $homeHtml, $maxPage) ? (int) $maxPage[1] : null;

        // Pagination words; adjust the pattern to the page being collected.
        // The original compared the capture group with null, which is always true.
        $hasPager = preg_match('/(上一页|下一页|尾页|末页)/', $homeHtml) === 1;

        if (isset($_GET['id'])) {
            // Cast: the value is used in arithmetic and echoed back to the browser.
            $page = (int) $_GET['id'];

            // COLLECT_PAGE is the configured number of list pages to collect.
            if (!$hasPager || $num === null || $page > (int) C('COLLECT_PAGE')) {
                echo '列表采集结束,将继续采集正文内容';
                $this->index1();

                // Stop here: without this the method went on with undefined $url / $id
                // and emitted a second, conflicting redirect.
                return;
            }

            // Paged list of Sohu domestic news.
            $listHtml = $this->fetchPage('http://news.sohu.com/guoneixinwen_' . ($num - $page) . '.shtml');
            $id = $page + 1;
        } else {
            // Fresh run: to avoid duplicates the stored rows and images are wiped.
            // (Remembering the last page and resuming would be possible but more complex.)
            $this->deleteTable('think_news'); // empty the table
            delFile(ROOT . '/z2/Public/Uploads/sohu/'); // empty the image folder

            // The landing page was already downloaded above; no need to fetch it twice.
            $listHtml = $homeHtml;
            $id = 1;
        }

        echo '第' . $id . '页列表采集中......';

        // Capture groups: 1 = link, 2 = title, 3 = summary.
        $preg = '/<h3><span class="com-num"><a target="_blank" href="#">comment num<\/a><\/span><a target="_blank" href="(http:\/\/.*)(?#链接)">(.*)(?#标题)<\/a><\/h3>\s*<p>(.*)(?#简介)<a target="_blank" href="http:\/\/.*">.*<\/a><\/p>/Uims';
        preg_match_all($preg, $listHtml, $arr);

        $news = M('News');
        foreach ($arr[1] as $key => $value) {
            $data = array(
                'title' => $arr[2][$key],
                'url'   => $value,
                'info'  => $arr[3][$key],
            );
            // Only insert when the data object could actually be created.
            if ($news->create($data)) {
                $news->add();
            }
        }

        // Continue with the next list page.
        echo '<script>location.href="' . U('collect/index', array('id' => $id)) . '"</script>';
    }

    /**
     * Collects the article body and its images for one stored row and saves them.
     *
     * The "cid" query parameter is the id of the row handled by the previous request
     * (0 / absent on the first call). When no row with a larger id is left, the
     * browser is sent to index2().
     *
     * @return void
     */
    public function index1()
    {
        $news = M('News');
        $cid = isset($_GET['cid']) ? (int) $_GET['cid'] : 0;

        // Explicit ordering so that "the next row" is well defined.
        $map = array('id' => array('gt', $cid));
        $result = $news->field('id,url')->where($map)->order('id asc')->find();

        if (!$result) {
            echo '正文内容采集结束,以下是采集内容显示';
            echo '<script>location.href="' . U('collect/index2') . '"</script>';
            return;
        }

        $html = $this->fetchPage($result['url']);

        // Article body; stays empty when the page could not be read or has another layout.
        $contents1 = '';
        if (preg_match('/(<div itemprop="articleBody">.*)(?#正文)<!-- seo标签描述 -->/is', $html, $body)) {
            $contents1 = $body[1];
        }

        // Remote image URLs found in the body.
        $path = $this->getPath($contents1);
        if ($path) {
            // Download the images; the returned list is index-aligned with $path.
            $savePath = $this->saveImage($path);
            // Point the body at the local copies.
            $contents1 = str_ireplace($path, $savePath, $contents1);
        }

        $news->where(array('id' => $result['id']))->setField('contents', htmlspecialchars($contents1));
        echo 'id为' . $result['id'] . '的正文内容采集中......';

        // Use the id just processed as the cursor. Incrementing the old cursor by one
        // re-processed the same row whenever ids were not contiguous.
        echo '<script>location.href="' . U('collect/index1', array('cid' => $result['id'])) . '"</script>';
    }

    /**
     * Simple page that displays everything collected.
     *
     * @return void
     */
    public function index2()
    {
        // All rows; the original passed an undefined $map to where().
        $result = M('News')->select();
        $this->assign('aa', $result);
        //$a = $this->buildHtml('1', HTML_PATH . '/collect/', APP_PATH . 'Admin/View/Login/index.html');
        //echo $a;
        $this->display();
    }

    /**
     * Downloads a page and converts it from gb2312 to UTF-8.
     *
     * @param string $url Page address.
     * @return string Converted page, or an empty string when the download failed.
     */
    private function fetchPage($url)
    {
        $raw = file_get_contents($url);
        if ($raw === false) {
            return '';
        }

        return mb_convert_encoding($raw, "UTF-8", "gb2312");
    }

    /**
     * Empties a table.
     *
     * @param string $table Table name; only letters, digits and underscores are accepted.
     * @return void
     * @throws \InvalidArgumentException When the name contains other characters.
     */
    private function deleteTable($table)
    {
        // The name is placed into the statement verbatim, so refuse anything unexpected.
        if (!is_string($table) || !preg_match('/^\w+$/', $table)) {
            throw new \InvalidArgumentException('Invalid table name');
        }

        M()->execute("TRUNCATE TABLE `{$table}`");
    }

    /**
     * Extracts the remote image URLs from a collected article body.
     *
     * @param string|null $contents Collected article body.
     * @return array|false List of distinct image URLs, or false when there is none.
     */
    private function getPath($contents)
    {
        if ($contents === null || $contents === '') {
            return false;
        }

        $preg = '/<img src="(http:\/\/.*)" alt=.*\/>/Uis';
        if (preg_match_all($preg, $contents, $arr)) {
            // Each image only needs to be downloaded and replaced once.
            return array_values(array_unique($arr[1]));
        }

        return false;
    }

    /**
     * Saves remote images locally and returns their local absolute URLs.
     *
     * The result is index-aligned with $path so it can be passed to str_ireplace()
     * together with $path. An image that cannot be downloaded or written keeps its
     * remote URL in the result.
     *
     * @param array $path Remote image URLs.
     * @return array|false Local URLs, or false when $path is not an array.
     */
    private function saveImage($path)
    {
        if (!is_array($path)) {
            return false;
        }

        $dir = './Public/Uploads/sohu'; // local image folder
        if (!is_dir($dir)) {
            mkdir($dir, 0755, true);
        }

        $pathArr = array();
        foreach ($path as $url) {
            // File name taken from the URL path only: no query string, no directory parts.
            $filename = basename((string) parse_url($url, PHP_URL_PATH));
            $savePath = $dir . '/' . $filename;

            $img = ($filename === '') ? false : file_get_contents($url);

            // file_put_contents() overwrites; the original opened the file in append
            // mode, which corrupted an image that was saved more than once.
            if ($img === false || file_put_contents($savePath, $img) === false) {
                $pathArr[] = $url;
                continue;
            }

            // Absolute local URL: host prefix plus the save path without its leading "./".
            $pathArr[] = 'http://localhost:9096/z2/' . substr($savePath, 2);
        }

        return $pathArr;
    }
}

/*
z2 is my app name.

Related code

// Deletes the files inside a folder; put it in the common functions file.
// NOTE(review): deldir() is not defined in this snippet - presumably another
// project helper (or meant to be a recursive delFile() call); confirm.
function delFile($dir) {
    $dh = opendir($dir);
    while ($file = readdir($dh)) {
        if ($file != "." && $file != "..") {
            $fullpath = $dir . "/" . $file;
            if (!is_dir($fullpath)) {
                unlink($fullpath);
            } else {
                deldir($fullpath);
            }
        }
    }
    closedir($dh);
}

// Entry file
define('ROOT',$_SERVER['DOCUMENT_ROOT']);

// Config file
'COLLECT_PAGE'=> '10', // number of news list pages to collect

// Local image folder: './Public/Uploads/sohu'

// index2 template code
// NOTE(review): htmlString_decode is not a built-in PHP function; the contents are
// stored with htmlspecialchars(), so this presumably stands for
// htmlspecialchars_decode or a project helper - confirm.
<volist name='aa' id='vo'>
<div style="width:800px;margin:0 auto;">
<h2>{$vo.id}.{$vo.title}</h2>
{$vo.contents|htmlString_decode}
<p>原文链接:<a href="{$vo.url}" target="blank">{$vo.url}</a></p>
</div><br><br><br><br><br><br><br>
</volist>

// Table structure
--
-- `think_news`
--
CREATE TABLE IF NOT EXISTS `think_news` (
  `id` int(5) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `url` varchar(255) DEFAULT NULL,
  `info` text,
  `contents` text,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC ;
*/