采集文章

 <?php
 // Scrape every article linked from a list page and save each article
 // (its <title> element plus the main body markup) as ./collect/<n>.html,
 // where <n> is the 1-based position of the link in the list.

 // 1. Address of the list page to scrape.
 $url = 'http://www.cmstop.com/news/1.shtml';
 // Directory that receives the saved articles.
 $saveDir = './collect/';

 // 2. Download the list page; file_get_contents() returns false on failure.
 $str = file_get_contents($url);
 if ($str === false) {
     error_log('Failed to download list page: ' . $url);
     exit(1);
 }

 // 3. Regex isolating the region of the page that holds the article links.
 $pattern_qu = '/<ul\s+class=\"txt-list-a\s+lh-56\s+cor-333\s+fz-18\">(.*?)<div\s+class=\"page-box\s+clear\s+ov\s+page\"/Ss';
 // 4. Extract the link region. Stop when it is missing (e.g. the page
 //    layout changed) instead of reading an undefined $match_url[1].
 if (preg_match($pattern_qu, $str, $match_url) !== 1) {
     error_log('Article list region not found in: ' . $url);
     exit(1);
 }

 // 5. Regex capturing the href of each article link.
 $pattern_url = '/<a\s+href=\"(.*?)\"\s+title/S';
 // 6. Collect every article address inside the link region.
 if (!preg_match_all($pattern_url, $match_url[1], $match) || empty($match[1])) {
     error_log('No article links found in: ' . $url);
     exit(1);
 }

 // Make sure the output directory exists before writing into it.
 if (!is_dir($saveDir) && !mkdir($saveDir, 0777, true) && !is_dir($saveDir)) {
     error_log('Cannot create output directory: ' . $saveDir);
     exit(1);
 }

 // Both article regexes are loop-invariant, so they are defined once here.
 // Regex capturing the article body.
 $con_pattern = '/<div\s+class=\"ad\"><\/div>(.*?)<span\s+id=\"supports\"\s+class=\"praise\"/Ss';
 // Regex capturing the whole <title> element.
 $title_pattern = '/<title>(.*?)<\/title>/Ss';

 // 7. Visit every article address that was found.
 foreach ($match[1] as $k => $v) {
     // File names are 1-based and follow the link's position in the list,
     // so a skipped article leaves a gap rather than renumbering the rest.
     $num = $k + 1;

     // 7.1 Download the article page; skip it when the download fails.
     $content = file_get_contents($v);
     if ($content === false) {
         error_log('Skipping article ' . $num . ', download failed: ' . $v);
         continue;
     }

     // 7.2 Extract the article body; without it there is nothing to save.
     if (preg_match($con_pattern, $content, $newCon) !== 1) {
         error_log('Skipping article ' . $num . ', body not found: ' . $v);
         continue;
     }

     // 7.3 Extract the title element; a missing title is saved as empty.
     $titleHtml = '';
     if (preg_match($title_pattern, $content, $newTitle) === 1) {
         $titleHtml = $newTitle[0];
     }

     // 7.4 Assemble the page: title element, charset declaration, body.
     $newStr = $titleHtml . '<meta charset="utf-8" />' . $newCon[1];

     // 7.5 Write the result to its numbered file and report write failures.
     if (file_put_contents($saveDir . $num . '.html', $newStr) === false) {
         error_log('Failed to write article ' . $num . ' to ' . $saveDir);
     }
 }

 

posted on 2018-05-11 14:27  xuxxnb  阅读(431)  评论(0)  编辑  收藏  举报

导航