文章采集代码
<?php
// Article scraper.
//
// Downloads a listing page, extracts the article links found inside its
// <ul class="list2 clearfix"> element, downloads every linked article and
// stores "<title>" + a UTF-8 <meta> tag + the article body as
// ./collect/1.html, ./collect/2.html, ...
//
// Articles that cannot be downloaded or that do not match the expected
// markup are skipped with a warning instead of producing a broken file.

// 1. Address of the listing page to scrape.
$url = 'http://www.zgjiemeng.com/dongwu/';

// 2. Download the listing page.
$str = file_get_contents($url);
if ($str === false) {
    exit('Failed to download the listing page: ' . $url);
}

echo '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';

// 3. Pattern for the region of the page that holds the article links.
$pattern_qu = '/<ul class=\"list2 clearfix\">([\S\s]*?)<\/ul>/';

// 4. Isolate that region; $match_url[1] is the inner HTML of the <ul>.
if (preg_match($pattern_qu, $str, $match_url) !== 1) {
    exit('Could not find the article list in: ' . $url);
}

// 5. Pattern for the article links. The href attribute is accepted at any
//    position inside the <a> tag, so both <a href="..." title="..."> and
//    <a target="_blank" title="..." href="..."> are matched.
$pattern_url = '/<a\s+(?:[^>]*?\s)?href=\"([^\"]*)\"/i';

// 6. Collect every article address from the link region.
if (!preg_match_all($pattern_url, $match_url[1], $match)) {
    exit('No article links found in: ' . $url);
}

// Pieces of the listing URL used to turn relative hrefs into absolute ones.
$urlParts = parse_url($url);
$scheme   = isset($urlParts['scheme']) ? $urlParts['scheme'] : 'http';
$origin   = $scheme . '://'
    . (isset($urlParts['host']) ? $urlParts['host'] : '')
    . (isset($urlParts['port']) ? ':' . $urlParts['port'] : '');
$basePath = isset($urlParts['path']) ? $urlParts['path'] : '/';
$baseDir  = substr($basePath, 0, strrpos($basePath, '/') + 1);

// Make sure the output directory exists before writing into it.
$outDir = './collect/';
if (!is_dir($outDir) && !mkdir($outDir, 0777, true) && !is_dir($outDir)) {
    exit('Failed to create the output directory: ' . $outDir);
}

// 7.2 Pattern for the article body (between the ad <div> and the praise <span>).
$con_pattern = '/<div\s+class=\"ad\"><\/div>(.*?)<span\s+id=\"supports\"\s+class=\"praise\"/Ss';
// 7.3 Pattern for the article title.
$title_pattern = '/<title>(.*?)<\/title>/Ss';

$num = 1;

// 7. Visit every article address that was found.
foreach ($match[1] as $v) {
    // Resolve the href against the listing page when it is not absolute.
    $articleUrl = trim($v);
    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $articleUrl) !== 1) {
        if (strpos($articleUrl, '//') === 0) {
            // Protocol-relative link.
            $articleUrl = $scheme . ':' . $articleUrl;
        } elseif (strpos($articleUrl, '/') === 0) {
            // Root-relative link.
            $articleUrl = $origin . $articleUrl;
        } else {
            // Link relative to the listing page's directory.
            $articleUrl = $origin . $baseDir . $articleUrl;
        }
    }

    // 7.1 Download the article page; file_get_contents() already emits a
    //     warning on failure, so just move on to the next article.
    $content = file_get_contents($articleUrl);
    if ($content === false) {
        continue;
    }

    // 7.4 / 7.5 Extract the body and the title; skip pages with other markup.
    if (preg_match($con_pattern, $content, $newCon) !== 1
        || preg_match($title_pattern, $content, $newTitle) !== 1
    ) {
        trigger_error('Unexpected article markup, skipped: ' . $articleUrl, E_USER_WARNING);
        continue;
    }

    // 7.6 Assemble the output: the full <title> element, a charset
    //     declaration, then the article body.
    $newStr = $newTitle[0] . '<meta charset="utf-8" />' . $newCon[1];

    // 7.7 Save it under the next sequential file name.
    if (file_put_contents($outDir . $num . '.html', $newStr) === false) {
        trigger_error('Failed to write article: ' . $outDir . $num . '.html', E_USER_WARNING);
        continue;
    }
    $num++;
}