PHP代码-数据爬取(a标签和a标签所对应的内容)
/**
 * Scrape the article list at http://chuangye.yjbys.com/zhengce/, download
 * every linked article, and bulk-insert the results into `cxpt_user_article`.
 *
 * For each list entry whose link starts with "h" and whose raw title is
 * longer than 21 bytes, the article page is fetched, the
 * `<div class="content">` block is extracted, converted from GBK to UTF-8,
 * trimmed at the `a("content_body")` script marker and stripped of the two
 * ad placeholders. Articles whose page cannot be fetched or whose content
 * block cannot be found are skipped instead of being stored without content.
 *
 * This is a debugging entry point: it dumps the result of addAll() and the
 * last SQL statement, then terminates the script.
 *
 * NOTE(review): M() is defined outside this block — presumably the ThinkPHP
 * model factory; confirm against the framework version in use.
 *
 * @return void The method never returns normally; it ends with die.
 */
public function export()
{
    // Each article is fetched over HTTP one by one, so allow a long run.
    set_time_limit(1000);

    $listHtml = file_get_contents('http://chuangye.yjbys.com/zhengce/');
    if ($listHtml === false) {
        // Treat an unreachable list page as an empty list; the final dump
        // below still runs so the caller sees the outcome.
        $listHtml = '';
    }

    // Group 1 = link URL, group 2 = remaining <a> attributes, group 3 = link text.
    $linkPattern = '/<\/span><a href="(.*)" (.*)>(.*)</isU';
    preg_match_all($linkPattern, $listHtml, $links);

    $bodyMarker = '<script type="text/javascript">a("content_body");</script>';
    $adBlocks = array(
        '<div class="ad_top_left"><script type="text/javascript">a("content_1");</script></div>',
        '<div class="ad_top_left2"><script type="text/javascript">a("content_2");</script></div>',
    );

    $articles = array();
    foreach ($links[1] as $i => $url) {
        $rawTitle = $links[3][$i];

        // Keep only links starting with "h" whose title exceeds 21 bytes.
        // The length is deliberately measured in bytes on the unconverted
        // title, exactly as the filter was originally written.
        if (substr($url, 0, 1) !== 'h' || strlen($rawTitle) <= 21) {
            continue;
        }

        $pageHtml = file_get_contents($url);
        if ($pageHtml === false) {
            // Page could not be downloaded: skip rather than store a row
            // with no content.
            continue;
        }

        // Greedy match with /s: runs from the content div to the LAST </div>
        // of the page, so the result always ends with "</div>".
        if (!preg_match('/<div class=\"content\">(.*)<\/div>/s', $pageHtml, $contentMatch)) {
            continue;
        }

        // iconv() returns false on an illegal byte sequence; fall back to
        // mb_convert_encoding() (already used for the title) so one bad
        // byte does not wipe out the whole article.
        $content = iconv("gbk", "utf-8", $contentMatch[0]);
        if ($content === false) {
            $content = mb_convert_encoding($contentMatch[0], "UTF-8", 'gbk');
        }

        // Cut everything from the content_body script onwards and re-close
        // the div. When the marker is absent strpos() yields false, and
        // passing that to substr() would discard the entire content, so the
        // full block is kept in that case.
        $cutAt = strpos($content, $bodyMarker);
        if ($cutAt !== false) {
            $content = substr($content, 0, $cutAt) . "</div>";
        }

        // Remove the two ad placeholders, in the same order as before.
        $content = str_replace($adBlocks, "", $content);

        $articles[] = array(
            'art_url'     => $url,
            'art_title'   => mb_convert_encoding($rawTitle, "UTF-8", 'gbk'),
            'art_content' => html_entity_decode($content),
            'state'       => 0,
            'type'        => 4,
            'userid'      => 4,
        );
    }

    $article = M('cxpt_user_article');
    var_dump($article->addAll($articles));
    echo $article->getLastSql();
    die;
}