采集文章
<?php
// Scrape the article list page at $url, follow every article link found in
// the list region, and save each article's <title> element plus its body HTML
// to ./collect/<n>.html, where <n> is the 1-based position of the link.

// 1. Address of the list page to collect from.
$url = 'http://www.cmstop.com/news/1.shtml';

// 2. Download the list page; file_get_contents() returns false on failure.
$str = file_get_contents($url);
if ($str === false) {
    throw new RuntimeException("Unable to download list page: {$url}");
}

// 3. Pattern isolating the region of the list page that holds the article links.
$pattern_qu = '/<ul\s+class=\"txt-list-a\s+lh-56\s+cor-333\s+fz-18\">(.*?)<div\s+class=\"page-box\s+clear\s+ov\s+page\"/Ss';

// 4. Extract the link region; without it there is nothing to collect.
if (preg_match($pattern_qu, $str, $match_url) !== 1) {
    throw new RuntimeException("Article list region not found in: {$url}");
}

// 5. Pattern capturing the href of every article link inside the region.
$pattern_url = '/<a\s+href=\"(.*?)\"\s+title/S';

// 6. Collect all article addresses (capture group 1).
if (preg_match_all($pattern_url, $match_url[1], $match) === false) {
    throw new RuntimeException('Matching article links failed, PCRE error code ' . preg_last_error());
}

// Patterns for the article body and the article title. They do not depend on
// the article being processed, so they are defined once outside the loop.
$con_pattern = '/<div\s+class=\"ad\"><\/div>(.*?)<span\s+id=\"supports\"\s+class=\"praise\"/Ss';
$title_pattern = '/<title>(.*?)<\/title>/Ss';

// Make sure the output directory exists before writing into it. The trailing
// is_dir() check tolerates another process creating it at the same moment.
$outputDir = './collect';
if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
    throw new RuntimeException("Unable to create output directory: {$outputDir}");
}

$num = 1;

// 7. Visit every article address that was found.
foreach ($match[1] as $v) {
    // The file number follows the link position, so a skipped article leaves
    // a gap instead of renumbering the articles after it.
    $target = $outputDir . '/' . $num . '.html';
    $num++;

    // 7.1 Download the article page.
    $content = file_get_contents($v);
    if ($content === false) {
        trigger_error("Skipping article, download failed: {$v}", E_USER_WARNING);
        continue;
    }

    // 7.2 Match the article body and 7.3 the article title; skip the article
    // if either is missing rather than writing an incomplete file.
    if (preg_match($con_pattern, $content, $newCon) !== 1
        || preg_match($title_pattern, $content, $newTitle) !== 1
    ) {
        trigger_error("Skipping article, title or body not found: {$v}", E_USER_WARNING);
        continue;
    }

    // 7.4 Assemble the output: the full <title>...</title> element (match 0),
    // a charset declaration, then the captured body HTML.
    $newStr = $newTitle[0] . '<meta charset="utf-8" />' . $newCon[1];

    // 7.5 Save the assembled page.
    if (file_put_contents($target, $newStr) === false) {
        trigger_error("Unable to write article file: {$target}", E_USER_WARNING);
    }
}