PHP采集入库教程
<?php
/**
 * Scrapes thread lists from external forums and inserts each thread as a
 * topic plus first post into the phpBB tables, using phpBB's $db object.
 */
define('IN_PHPBB', true);

// Remove the execution time limit and keep running if the client disconnects.
set_time_limit(0);
ignore_user_abort(true);

$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/functions_user.' . $phpEx);
include($phpbb_root_path . 'includes/functions_module.' . $phpEx);
include($phpbb_root_path . 'includes/functions_display.' . $phpEx);
include($phpbb_root_path . 'includes/functions_privmsgs.' . $phpEx);

/**
 * Scrape sources, one entry per source:
 *
 *   title_url        URL of the thread-list page to scrape
 *   web_url          scheme + host prepended to every thread link
 *   file_name        suffix appended to every thread link
 *   pattern_title    regex for the list page: group 1 = link, group 2 = title
 *   pattern_content  regex for the thread page that matches the post body
 *   charset          character encoding of the scraped site
 *   forum_id         id of the forum that receives the topics
 *   post_approved    0 = not yet approved, 1 = approved
 */
$data = array(
    'kds' => array(
        'title_url'       => 'http://club.pchome.net/forum_1_15.html',
        'web_url'         => 'http://club.pchome.net',
        'file_name'       => '',
        'pattern_title'   => '/\<li class=\"i3\"\>[\s]*\<a[\s\S]*?href\=\"(.*?)\"[\s\S]*?\>(.*?)\<\/a\>[\s\S]*?\<\/li\>/',
        'pattern_content' => '/\<div id=\"__Message_\d*\"\>(.*?)\<\/div>/',
        'charset'         => 'gbk',
        'forum_id'        => 26,
        'post_approved'   => 1
    ),
    'cttj' => array(
        'title_url'       => 'http://www.dianping.com/group/meishi123/',
        'web_url'         => 'http://www.dianping.com',
        'file_name'       => '',
        'pattern_title'   => '/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
        'pattern_content' => '/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
        'charset'         => 'UTF-8',
        'forum_id'        => 22,
        'post_approved'   => 1
    ),
    'crwq' => array(
        'title_url'       => 'http://www.dianping.com/group/kitchen',
        'web_url'         => 'http://www.dianping.com',
        'file_name'       => '',
        'pattern_title'   => '/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
        'pattern_content' => '/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
        'charset'         => 'UTF-8',
        'forum_id'        => 160,
        'post_approved'   => 1
    ),
    'pcwh' => array(
        'title_url'       => 'http://www.dianping.com/group/sh5757',
        'web_url'         => 'http://www.dianping.com',
        'file_name'       => '',
        // The class name is "B con" on a single line, as in the two entries
        // above; a line break inside the pattern can never match because
        // caiji() strips newlines from the page before matching.
        'pattern_title'   => '/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
        'pattern_content' => '/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
        'charset'         => 'UTF-8',
        'forum_id'        => 21,
        'post_approved'   => 1
    )
);

// Scrape every configured source in turn.
foreach ($data as $source) {
    caiji(
        $source['title_url'],
        $source['web_url'],
        $source['file_name'],
        $source['pattern_title'],
        $source['pattern_content'],
        $source['charset'],
        $source['forum_id'],
        $source['post_approved']
    );
}
unset($data);

echo'<br/>meishi';

/**
 * Scrapes one thread-list page, downloads the linked threads in parallel and
 * stores every thread whose title is not yet present as a new topic + post.
 *
 * Threads are skipped (nothing is inserted) when the content pattern does not
 * match, when charset conversion fails, or when a topic with the same title
 * already exists. A progress marker is echoed for every inserted thread.
 *
 * @param string $title_url       URL of the thread-list page
 * @param string $web_url         prefix prepended to each scraped link
 * @param string $file_name       suffix appended to each scraped link
 * @param string $pattern_title   regex with group 1 = link, group 2 = title
 * @param string $pattern_content regex matching the post body on a thread page
 * @param string $charset         source encoding, converted to UTF-8 via iconv
 * @param int    $forum_id        target forum id
 * @param int    $post_approved   value stored in posts.post_approved
 * @return void
 */
function caiji($title_url, $web_url, $file_name, $pattern_title, $pattern_content, $charset, $forum_id, $post_approved = 0)
{
    global $db;

    // Fixed author data written into every inserted row.
    $topic_poster = 2;
    $topic_first_poster_name = 'abc';
    $poster_id = 53;

    // The first list entries are skipped.
    // NOTE(review): presumably these are pinned/announcement rows - confirm.
    $first_index = 5;

    // Fetch the list page; a failed download leaves nothing to do.
    $list_html = file_get_contents($title_url);
    if ($list_html === false) {
        return;
    }

    // Drop all line breaks so the single-line patterns can span the markup.
    $list_html = str_replace(array("\r", "\n"), '', $list_html);

    preg_match_all($pattern_title, $list_html, $title);
    if (empty($title[1]) || !isset($title[2])) {
        return;
    }
    $count = count($title[1]);

    // Queue one transfer per thread on a shared multi handle.
    $mh = curl_multi_init();
    $handles = array();

    for ($i = $first_index; $i < $count; $i++) {
        $content_url = $web_url . $title[1][$i] . $file_name;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $content_url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        curl_multi_add_handle($mh, $ch);
        $handles[] = $ch;
    }

    // Drive all transfers to completion, polling every 0.25 s.
    $running = null;
    do {
        curl_multi_exec($mh, $running);
        if ($running > 0) {
            usleep(250000);
        }
    } while ($running > 0);

    foreach ($handles as $i => $ch) {
        // $handles is 0-based while the titles it belongs to start at
        // $first_index, so map back to the matching title row.
        $title_index = $i + $first_index;

        $page = (string) curl_multi_getcontent($ch);

        // Detach the handle right away so every skip path below releases it.
        curl_multi_remove_handle($mh, $ch);

        $page = str_replace(array("\r", "\n", "\t"), '', $page);

        // Without a body there is nothing worth posting.
        if (!preg_match($pattern_content, $page, $content)) {
            continue;
        }

        // Keep only a small set of harmless tags from the matched fragment.
        $post_text = strip_tags($content[0], '<a><img><br><p>');

        // Convert to UTF-8 before any escaping: byte-wise escaping of GBK
        // text can corrupt characters whose trail byte is a backslash.
        $subject = iconv($charset, 'UTF-8', $title[2][$title_index]);
        $post_text = iconv($charset, 'UTF-8', $post_text);
        if ($subject === false || $post_text === false) {
            continue;
        }

        $time = time();

        // Skip threads whose title has already been imported.
        $sql_check = 'SELECT topic_title FROM ' . TOPICS_TABLE . "
            WHERE topic_title = '" . $db->sql_escape($subject) . "'";
        $result = $db->sql_query($sql_check);
        $existing = $db->sql_fetchrow($result);
        $db->sql_freeresult($result);

        if ($existing && $existing['topic_title'] != '') {
            continue;
        }

        // sql_build_array() quotes and escapes every value for the driver.
        $topic_row = array(
            'topic_poster'              => (int) $topic_poster,
            'topic_time'                => $time,
            'topic_last_view_time'      => $time,
            'forum_id'                  => (int) $forum_id,
            'icon_id'                   => 0,
            'topic_approved'            => 1,
            'topic_title'               => $subject,
            'topic_first_poster_name'   => $topic_first_poster_name,
            'topic_first_poster_colour' => 'AA0000',
            'topic_type'                => 0,
            'topic_time_limit'          => 0,
            'topic_attachment'          => 0,
        );
        $db->sql_query('INSERT INTO ' . TOPICS_TABLE . ' ' . $db->sql_build_array('INSERT', $topic_row));
        $topic_id = $db->sql_nextid();

        // NOTE(review): post_checksum and bbcode_uid are fixed placeholder
        // values carried over unchanged - confirm they are acceptable.
        $post_row = array(
            'forum_id'         => (int) $forum_id,
            'poster_id'        => (int) $poster_id,
            'icon_id'          => 0,
            'poster_ip'        => '192.168.10.22',
            'post_time'        => $time,
            'post_approved'    => (int) $post_approved,
            'enable_bbcode'    => 1,
            'enable_smilies'   => 1,
            'enable_magic_url' => 1,
            'enable_sig'       => 1,
            'post_username'    => '',
            'post_subject'     => $subject,
            'post_text'        => $post_text,
            'post_checksum'    => '921d0e37730d722e4475373dcc96bb0d',
            'post_attachment'  => 0,
            'bbcode_bitfield'  => '',
            'bbcode_uid'       => '2vnw1nj7',
            'post_postcount'   => 1,
            'post_edit_locked' => 0,
            'topic_id'         => (int) $topic_id,
        );
        $db->sql_query('INSERT INTO ' . POSTS_TABLE . ' ' . $db->sql_build_array('INSERT', $post_row));

        // Progress marker for each imported thread.
        echo $i.'===';
    }

    curl_multi_close($mh);
}

echo "结束";

// Close our DB connection.
if (!empty($db)) {
    $db->sql_close();
}
?>