PHP采集入库教程

<?php
// Bootstrap: mark this script as a phpBB entry point, lift the runtime
// limits (a scrape can run for a long time and must survive the client
// disconnecting), then load the phpBB core and the helper libraries.
define('IN_PHPBB', true);
ignore_user_abort(true);
set_time_limit(0);    // 0 = no execution time limit

// Root path and file extension are derived the same way phpBB's own
// entry scripts do, so the includes below follow this file's extension.
$phpbb_root_path = defined('PHPBB_ROOT_PATH') ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);

include($phpbb_root_path . 'common.' . $phpEx);

// Helper libraries, loaded in the original order.
foreach (array('functions_user', 'functions_module', 'functions_display', 'functions_privmsgs') as $phpbb_library) {
 include($phpbb_root_path . 'includes/' . $phpbb_library . '.' . $phpEx);
}
unset($phpbb_library);


// Scrape configuration
/**
 * One entry per source site. Every entry carries the same keys:
 *
 * title_url       listing page to scrape
 * web_url         site domain, prefixed to every extracted thread link
 * file_name       suffix appended to every extracted thread link
 * pattern_title   regex applied to the listing page (group 1 = link, group 2 = title)
 * pattern_content regex applied to each thread page
 * charset         encoding of the source site
 * forum_id        id of the forum the threads are added to
 * post_approved   approval flag: 0 = not approved, 1 = approved
 **/

// The three dianping.com groups differ only in listing URL and target
// forum; everything else comes from this template. The placeholder keys
// keep the key order identical across all entries.
$dianping_template = array(
 'title_url'       => '',
 'web_url'         => 'http://www.dianping.com',
 'file_name'       => '',
 'pattern_title'   => '/\<a class=\"B con\".*?href=\"(.*?)\".*?\>(.*?)\<\/a\>/',
 'pattern_content' => '/\<div id=\"mainNoteInfo\" class=\"noteInfo\">(.*?)\<\/div\>/',
 'charset'         => 'UTF-8',
 'forum_id'        => 0,
 'post_approved'   => 1,
);

$data = array(
 'kds' => array(
  'title_url'       => 'http://club.pchome.net/forum_1_15.html',
  'web_url'         => 'http://club.pchome.net',
  'file_name'       => '',
  'pattern_title'   => '/\<li class=\"i3\"\>[\s]*\<a[\s\S]*?href\=\"(.*?)\"[\s\S]*?\>(.*?)\<\/a\>[\s\S]*?\<\/li\>/',
  'pattern_content' => '/\<div id=\"__Message_\d*\"\>(.*?)\<\/div>/',
  'charset'         => 'gbk',
  'forum_id'        => 26,
  'post_approved'   => 1,
 ),
 'cttj' => array_merge($dianping_template, array(
  'title_url' => 'http://www.dianping.com/group/meishi123/',
  'forum_id'  => 22,
 )),
 'crwq' => array_merge($dianping_template, array(
  'title_url' => 'http://www.dianping.com/group/kitchen',
  'forum_id'  => 160,
 )),
 'pcwh' => array_merge($dianping_template, array(
  'title_url' => 'http://www.dianping.com/group/sh5757',
  'forum_id'  => 21,
 )),
);
unset($dianping_template);

// Run the scraper once per configured source, in configuration order.
foreach ($data as $source_name => $source) {
 caiji(
  $source['title_url'],
  $source['web_url'],
  $source['file_name'],
  $source['pattern_title'],
  $source['pattern_content'],
  $source['charset'],
  $source['forum_id'],
  $source['post_approved']
 );
}
// The configuration is not needed once every source has been processed.
unset($data);
echo'<br/>meishi';

/**
 * Scrape one listing page and import every new thread into phpBB.
 *
 * Fetches $title_url, extracts (link, title) pairs with $pattern_title,
 * downloads every linked page in parallel through curl_multi, extracts the
 * body with $pattern_content, and inserts one row into TOPICS_TABLE plus one
 * row into POSTS_TABLE for each title that is not stored yet. Prints
 * "<n>===" after each imported thread, where <n> is the position of that
 * thread among the downloaded pages.
 *
 * A thread is skipped (nothing inserted) when its page does not match
 * $pattern_content, when iconv() cannot convert its title or body, or when
 * a topic with the same title already exists.
 *
 * @param string $title_url       Listing page URL.
 * @param string $web_url         Prefix prepended to each extracted link.
 * @param string $file_name       Suffix appended to each extracted link.
 * @param string $pattern_title   PCRE; group 1 is the link, group 2 the title.
 * @param string $pattern_content PCRE matched against each thread page.
 * @param string $charset         Source charset handed to iconv().
 * @param int    $forum_id        Forum the topics and posts are filed under.
 * @param int    $post_approved   Value stored in the post_approved column.
 * @return void
 */
function caiji($title_url,$web_url,$file_name,$pattern_title,$pattern_content,$charset,$forum_id,$post_approved=0)
{
 global $db;

 // Fixed author data written into every imported topic and post.
 $topic_poster = 2;
 $topic_first_poster_name = 'abc';
 $poster_id = 53;
 // The first five matches are never imported (presumably pinned or
 // navigation links on the source sites -- TODO confirm).
 $skip = 5;

 // Fetch the listing page; without it there is nothing to import.
 $listing = file_get_contents($title_url);
 if ($listing === false) {
  return;
 }
 // Flatten to a single line so "." in the patterns can span what used to
 // be line breaks. Both CR and LF are removed (CRLF pages included).
 $listing = str_replace(array("\r", "\n"), '', $listing);

 if (!preg_match_all($pattern_title, $listing, $title)) {
  return;
 }
 $count = count($title[1]);
 if ($count <= $skip) {
  return;
 }

 // Queue one download per thread. Handles are keyed by the index into
 // $title so each page stays paired with the title it was listed under.
 $mh = curl_multi_init();
 $handles = array();
 for ($i = $skip; $i < $count; $i++) {
  $ch = curl_init();
  curl_setopt($ch, CURLOPT_URL, $web_url . $title[1][$i] . $file_name);
  curl_setopt($ch, CURLOPT_HEADER, 0);
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  curl_setopt($ch, CURLOPT_TIMEOUT, 30);
  curl_multi_add_handle($mh, $ch);
  $handles[$i] = $ch;
 }

 // Drive all transfers to completion. curl_multi_select() blocks until
 // there is activity (at most 0.25s); when it reports -1 fall back to a
 // plain sleep so the loop never spins.
 $running = null;
 do {
  $status = curl_multi_exec($mh, $running);
  if ($running > 0 && curl_multi_select($mh, 0.25) === -1) {
   usleep(250000);
  }
 } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

 $position = 0;
 foreach ($handles as $index => $ch) {
  $current = $position++;

  // Take the body, then release the handle right away so that no
  // early "continue" below can leak it.
  $page = curl_multi_getcontent($ch);
  curl_multi_remove_handle($mh, $ch);
  curl_close($ch);

  $page = str_replace(array("\r", "\n", "\t"), '', (string) $page);
  if (!preg_match($pattern_content, $page, $content)) {
   // Download failed or the page layout changed: do not insert an
   // empty post.
   continue;
  }

  // Convert to UTF-8 first; escaping is left to the DBAL afterwards so
  // multi-byte source charsets cannot break out of the SQL string.
  $subject = iconv($charset, 'UTF-8', $title[2][$index]);
  $text = iconv($charset, 'UTF-8', strip_tags($content[0], '<a><img><br><p>'));
  if ($subject === false || $text === false) {
   continue;
  }

  // Skip threads that were already imported (matched by exact title).
  $sql_check = 'SELECT topic_title FROM ' . TOPICS_TABLE . " WHERE topic_title = '" . $db->sql_escape($subject) . "'";
  $result = $db->sql_query($sql_check);
  $existing = $db->sql_fetchrow($result);
  $db->sql_freeresult($result);
  if (isset($existing['topic_title']) && $existing['topic_title'] != '') {
   continue;
  }

  $time = time();

  // NOTE(review): only the columns listed here are populated; any
  // first/last-post bookkeeping columns keep their defaults --
  // presumably a resync is run afterwards, confirm.
  $topic_row = array(
   'topic_poster'              => $topic_poster,
   'topic_time'                => $time,
   'topic_last_view_time'      => $time,
   'forum_id'                  => (int) $forum_id,
   'icon_id'                   => 0,
   'topic_approved'            => 1,
   'topic_title'               => $subject,
   'topic_first_poster_name'   => $topic_first_poster_name,
   'topic_first_poster_colour' => 'AA0000',
   'topic_type'                => 0,
   'topic_time_limit'          => 0,
   'topic_attachment'          => 0,
  );
  $db->sql_query('INSERT INTO ' . TOPICS_TABLE . ' ' . $db->sql_build_array('INSERT', $topic_row));
  $topic_id = $db->sql_nextid();

  $post_row = array(
   'forum_id'         => (int) $forum_id,
   'poster_id'        => $poster_id,
   'icon_id'          => 0,
   'poster_ip'        => '192.168.10.22',
   'post_time'        => $time,
   'post_approved'    => (int) $post_approved,
   'enable_bbcode'    => 1,
   'enable_smilies'   => 1,
   'enable_magic_url' => 1,
   'enable_sig'       => 1,
   'post_username'    => '',
   'post_subject'     => $subject,
   'post_text'        => $text,
   'post_checksum'    => '921d0e37730d722e4475373dcc96bb0d',
   'post_attachment'  => 0,
   'bbcode_bitfield'  => '',
   'bbcode_uid'       => '2vnw1nj7',
   'post_postcount'   => 1,
   'post_edit_locked' => 0,
   'topic_id'         => $topic_id,
  );
  $db->sql_query('INSERT INTO ' . POSTS_TABLE . ' ' . $db->sql_build_array('INSERT', $post_row));

  // Progress marker for the imported thread.
  echo $current.'===';
 }

 curl_multi_close($mh);
}

echo "结束";
// Release the database connection if the bootstrap created one.
if (isset($db) && $db)
{
 $db->sql_close();
}
 
?>

 

posted @ 2014-03-06 02:51  某人2013  阅读(725)  评论(0)  编辑  收藏  举报