新闻文章信息采集功能的实现

一共涉及以下几个文件:

data_article_gather.php//显示出要获取哪个网页的文章列表

View Code
<?php
// Page bootstrap: load the shared admin helpers, set the default timezone,
// then run the admin check before any markup is sent.
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
checkadmin();// Verifies that the user is logged in. The original note says this function lives in sysfunction.php — presumably pulled in by adminfunction.php; TODO confirm.
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>data_article_manage</title>
<link rel="stylesheet" href="css/admin_center.css" type="text/css" />
<script>
// Background colour that changeto() paints on the cells of the hovered row.
var  highlightcolor='#eafcd5';
// Original author's note: clickcolor only works here when it is given as a Windows system colour code.
var  clickcolor='#51b2f6';
// Mouseover handler of the listing table: highlights the row under the pointer.
// Reads the implicit global `event` through its legacy `srcElement` property.
// Side effects: stores the hovered row in the global `source` and its cells in
// the global `cs` (neither is declared with var); changeback() reads both.
function  changeto(){
source=event.srcElement;
// Nothing to do when the event target is a row or the table itself.
if  (source.tagName=="TR"||source.tagName=="TABLE")
return;
// Climb to the enclosing cell, then one more level to that cell's parent (the row).
// NOTE(review): if the target is not inside a TD, parentElement eventually
// becomes null and this loop throws.
while(source.tagName!="TD")
source=source.parentElement;
source=source.parentElement;
cs  =  source.children;
//alert(cs.length);
// Paint every cell unless the row has id "nc" or its second cell already
// carries the highlight or click colour.
if  (cs[1].style.backgroundColor!=highlightcolor&&source.id!="nc"&&cs[1].style.backgroundColor!=clickcolor)
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor=highlightcolor;
}
}

// Mouseout handler of the listing table: removes the hover highlight.
// Depends on the globals `source` and `cs` that changeto()/clickto() assign,
// and on the legacy `event.fromElement` / `event.toElement` properties.
function  changeback(){
// Ignore moves that stay inside the element being left or inside the
// remembered row, and rows with id "nc".
if  (event.fromElement.contains(event.toElement)||source.contains(event.toElement)||source.id=="nc")
return
// Clear the cells unless the pointer landed on the row itself or the row's
// second cell carries the click colour.
if  (event.toElement!=source&&cs[1].style.backgroundColor!=clickcolor)
//source.style.backgroundColor=originalcolor
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor="";
}
}

// Click handler that toggles the click colour on the row containing the
// event target. Like changeto(), it stores the row in the global `source`
// and its cells in the global `cs`.
// NOTE(review): no onclick attribute referencing this function is visible in
// this page's markup.
function  clickto(){
source=event.srcElement;
// Nothing to do when the event target is a row or the table itself.
if  (source.tagName=="TR"||source.tagName=="TABLE")
return;
// Climb to the enclosing cell, then to that cell's parent (the row).
while(source.tagName!="TD")
source=source.parentElement;
source=source.parentElement;
cs  =  source.children;
//alert(cs.length);
// Rows whose second cell is not in the click colour (and whose id is not
// "nc") get the click colour; every other row is reset to no colour.
if  (cs[1].style.backgroundColor!=clickcolor&&source.id!="nc")
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor=clickcolor;
}
else
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor="";
}
}
</script>
</head>
<body>
<table width="100%" border="0" align="center" cellpadding="0" cellspacing="0">
  <tr>
    <td height="30">
    <table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="15" height="30"><img src="images/main_01.gif" width="15" height="30" /></td>
        <td width="1101" background="images/main_02.gif"><img src="images/center_ico01.gif" width="16" height="16" /> <span class="STYLE1">文章管理</span></td>
        <td width="281" background="images/main_02.gif">
        <table border="0" align="right" cellpadding="0" cellspacing="0">
        </table>
        </td>
        <td width="14"><img src="images/main_03.gif" width="14" height="30" /></td>
     </tr>
    </table></td>
  </tr>
  <tr>
    <td>
    <table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="9" background="images/main_04.gif">&nbsp;</td>
        <td bgcolor="#f3ffe3"><table width="99%" border="0" align="center" cellpadding="0" cellspacing="1" bgcolor="#c0de98" onmouseover="changeto()"  onmouseout="changeback()">
          <tr>
            <td width="50" height="26" background="images/main_05.gif" class="STYLE2">编号</td>
            <td width="450" height="26" background="images/main_05.gif" class="STYLE2">取文章链接的地址</td>
            <td width="200" background="images/main_05.gif" class="STYLE2">网站说明</td>
            <td height="26" background="images/main_05.gif" class="STYLE2">操作</td>
          </tr>
          <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2">
                1
                </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <a href="http://news.ef360.com/lady/">http://news.ef360.com/lady/</a> 
                </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2">
                华衣网 女装资讯
                </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5">
                <a href="data_article_gather_num.php?id=1" target="centerFrame">[开始获取内容]</a>
                </td>
         </tr>
          <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2">
                2
                </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <a href="http://www.chaoliu1.net/fushi/nvshi/">http://www.chaoliu1.net/fushi/nvshi/</a> 
                </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2">
                第一潮流网 潮流服饰 女式服装
                </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5">
                <a href="data_article_gather_num.php?id=2" target="centerFrame">[开始获取内容]</a>
                </td>
         </tr>
          <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2">
                3
                </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <a href="http://www.nz86.com/popular/">http://www.nz86.com/popular/</a> 
                </td>
                <td width="200" height="30" bgcolor="#FFFFFF" class="STYLE2">
                中国女装网 时尚快递 潮流搭配
                </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5">
                <a href="data_article_gather_num.php?id=3" target="centerFrame">[开始获取内容]</a>
                </td>
         </tr>
        </table></td>
        <td width="9" background="images/main_06.gif"></td>
      </tr>
    </table></td>
  </tr>
  <tr>
    <td height="29"><table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="15" height="29"><img src="images/main_07.gif" width="15" height="29" /></td>
        <td width="100%" background="images/main_08.gif" style="padding-left:150px;">
        </td>
      </tr>
    </table></td>
  </tr>
</table>
</body>
</html>

data_article_gather_num.php//将某个网页链接上的全部文章列表获取到并显示

View Code
<?php
// Page bootstrap: load the shared admin helpers, set the default timezone,
// then run the admin check before anything else happens.
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
checkadmin();// Verifies that the user is logged in. The original note says this function lives in sysfunction.php — presumably pulled in by adminfunction.php; TODO confirm.
set_time_limit(0);// Lift the script execution time limit — presumably because the remote page fetches below can be slow.
/**
 * Extracts every occurrence of a literal pattern from a text and renders each
 * occurrence through an output template.
 *
 * Pattern syntax ($canshustr):
 *   - "[参数]" marks a capture; it splits the pattern into literal segments.
 *   - "(*)" inside a segment is a wildcard: the pieces around it must appear
 *     in order, with anything in between.
 * The text found between segment k and segment k+1 replaces "[参数k]" in a
 * copy of $mubiao. Scanning continues after each complete match until a piece
 * can no longer be found.
 *
 * @param string $yuanma    Text to scan; an empty value yields an empty array.
 * @param string $canshustr Pattern; must contain "[参数]".
 * @param string $mubiao    Output template; must contain "[参数1]".
 * @return array One filled-in template per match, in document order. When a
 *               required placeholder is missing, a message is echoed and an
 *               empty array is returned.
 */
function canshujiequ($yuanma,$canshustr,$mubiao){
    if($yuanma=='')return array();

    // strpos() returns 0 for a placeholder at the very start of the string,
    // so the "not found" test has to be strict (=== false).
    if(strpos($canshustr,'[参数]')===false||strpos($mubiao,'[参数1]')===false)
    {
        echo '参数或组合字符串格式不对';
        return array();
    }

    $segments=explode('[参数]',$canshustr);
    $sourceLen=strlen($yuanma);
    $results=array();
    $cursor=0;    // where the next search starts
    $prevEnd=0;   // end offset of the previously matched segment
    $curStart=0;  // start offset of the segment being matched
    $curEnd=0;    // end offset of the segment being matched

    while($cursor<$sourceLen){
        $passStart=$cursor;
        $filled=$mubiao;
        $segmentNo=0;
        foreach($segments as $segment){
            if($segment=='')continue;
            $segmentNo++;
            $tokenNo=0;
            foreach(explode('(*)',$segment) as $token){
                if($token=='')continue;
                $tokenNo++;
                // Running out of text, or failing to find a piece, ends the
                // whole scan; a partially matched occurrence is discarded.
                if($cursor>=$sourceLen)return $results;
                $hit=strpos($yuanma,$token,$cursor);
                if($hit===false)return $results;
                $cursor=$hit+strlen($token);
                if($tokenNo==1)$curStart=$hit;
                $curEnd=$cursor;
            }
            if($segmentNo>1){
                // Capture = text between the previous segment and this one.
                $captured=substr($yuanma,$prevEnd,$curStart-$prevEnd);
                $filled=str_replace('[参数'.($segmentNo-1).']',$captured,$filled);
            }
            $prevEnd=$curEnd;
        }
        // A pattern without any literal piece consumes nothing; stop instead
        // of looping forever.
        if($cursor==$passStart)break;
        $results[]=$filled;
    }
    return $results;
}
// Listing controller: downloads the index page selected by ?id= and fills
// $list_href (article URLs) and $list_name (article titles), which the
// template further down renders as one table row per article.
$jieguo1="";
$jieguo2="";
$list_href=array();
$list_name=array();
// A request without ?id= now falls through to the "not found" branch instead
// of raising an undefined-index notice.
$sel=isset($_GET["id"])?$_GET["id"]:"";
if($sel==1)
{
    $source=file_get_contents("http://news.ef360.com/lady/");//fetch the data source (URL)
    $a='<ul class="ul_text_1 f14 arr1" style="padding:15px 0;">[参数]</ul>';
    $b="&nbsp;[参数1]&nbsp";
    // First pass: cut out each article <ul> block from the raw page.
    $jieguo1=canshujiequ($source,$a,$b);
    for($i=0;$i<count($jieguo1);$i++)
    {
        // Each block is converted to UTF-8 once and scanned twice: for the
        // article links and for the link texts.
        $source=iconv("GB2312","UTF-8//IGNORE",$jieguo1[$i]);
        $a='http://news.ef360.com/Articles/[参数].html';
        $b="http://news.ef360.com/Articles/[参数1].html";
        $list=canshujiequ($source,$a,$b);
        for($m=0;$m<count($list);$m++)
        {
            $list_href[]=$list[$m];
        }
        $c='_blank">[参数]</a></li>';
        $d="&nbsp;[参数1]&nbsp";
        $list=canshujiequ($source,$c,$d);
        for($m=0;$m<count($list);$m++)
        {
            $list_name[]=$list[$m];
        }
    }
}
else
if($sel==2)
{
    $source=file_get_contents("http://www.chaoliu1.net/fushi/nvshi/");//fetch the data source (URL)
    $source=iconv("GB2312","UTF-8//IGNORE",$source);
    $a='href="http://www.chaoliu1.net/fushi/nvshi/[参数].html';
    $b="http://www.chaoliu1.net/fushi/nvshi/[参数1].html";
    $list_href=canshujiequ($source,$a,$b);
    $a='" class="title">[参数]</A>';
    $b="&nbsp;[参数1]&nbsp;";
    $list_name=canshujiequ($source,$a,$b);
}
else
if($sel==3)
{
    // NOTE(review): this branch looks unfinished — it fetches a sina.com.cn
    // page, but the link pattern targets nz86.com and hard-codes one article
    // title, so it will rarely (if ever) match. Confirm the intended source.
    $source=file_get_contents("http://eladies.sina.com.cn/fa/zhuangban/");//fetch the data source (URL)
    $a='<span class="l">
                                                    <a href="http://www.nz86.com/article/[参数]/" target="_blank" title="小外套度暖春 轻松显瘦">小外套度暖春 轻松显瘦</a>
                                                    </span>';
    $b="http://eladies.sina.com.cn/fa/2013/[参数1].shtml";
    $c='.shtml" target="_blank" title="[参数]"><img src="';
    $d="&nbsp;[参数1]&nbsp;";
    $jieguo1=canshujiequ($source,$a,$b);
    $jieguo2=canshujiequ($source,$c,$d);
    // The template only reads $list_href/$list_name; previously the results
    // of this branch were computed and then never shown.
    $list_href=$jieguo1;
    $list_name=$jieguo2;
}
else
{
    echo "未发现有此链接";
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>data_article_manage</title>
<link rel="stylesheet" href="css/admin_center.css" type="text/css" />
<script>
// Background colour that changeto() paints on the cells of the hovered row.
var  highlightcolor='#eafcd5';
// Original author's note: clickcolor only works here when it is given as a Windows system colour code.
var  clickcolor='#51b2f6';
// Mouseover handler of the listing table: highlights the row under the pointer.
// Reads the implicit global `event` through its legacy `srcElement` property.
// Side effects: stores the hovered row in the global `source` and its cells in
// the global `cs` (neither is declared with var); changeback() reads both.
function  changeto(){
source=event.srcElement;
// Nothing to do when the event target is a row or the table itself.
if  (source.tagName=="TR"||source.tagName=="TABLE")
return;
// Climb to the enclosing cell, then one more level to that cell's parent (the row).
// NOTE(review): if the target is not inside a TD, parentElement eventually
// becomes null and this loop throws.
while(source.tagName!="TD")
source=source.parentElement;
source=source.parentElement;
cs  =  source.children;
//alert(cs.length);
// Paint every cell unless the row has id "nc" or its second cell already
// carries the highlight or click colour.
if  (cs[1].style.backgroundColor!=highlightcolor&&source.id!="nc"&&cs[1].style.backgroundColor!=clickcolor)
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor=highlightcolor;
}
}

// Mouseout handler of the listing table: removes the hover highlight.
// Depends on the globals `source` and `cs` that changeto()/clickto() assign,
// and on the legacy `event.fromElement` / `event.toElement` properties.
function  changeback(){
// Ignore moves that stay inside the element being left or inside the
// remembered row, and rows with id "nc".
if  (event.fromElement.contains(event.toElement)||source.contains(event.toElement)||source.id=="nc")
return
// Clear the cells unless the pointer landed on the row itself or the row's
// second cell carries the click colour.
if  (event.toElement!=source&&cs[1].style.backgroundColor!=clickcolor)
//source.style.backgroundColor=originalcolor
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor="";
}
}

// Click handler that toggles the click colour on the row containing the
// event target. Like changeto(), it stores the row in the global `source`
// and its cells in the global `cs`.
// NOTE(review): no onclick attribute referencing this function is visible in
// this page's markup.
function  clickto(){
source=event.srcElement;
// Nothing to do when the event target is a row or the table itself.
if  (source.tagName=="TR"||source.tagName=="TABLE")
return;
// Climb to the enclosing cell, then to that cell's parent (the row).
while(source.tagName!="TD")
source=source.parentElement;
source=source.parentElement;
cs  =  source.children;
//alert(cs.length);
// Rows whose second cell is not in the click colour (and whose id is not
// "nc") get the click colour; every other row is reset to no colour.
if  (cs[1].style.backgroundColor!=clickcolor&&source.id!="nc")
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor=clickcolor;
}
else
for(i=0;i<cs.length;i++){
    cs[i].style.backgroundColor="";
}
}
</script>
<script>
// Mirrors the state of the header checkbox ("checkboxid_a") onto every row
// checkbox named "checkboxid[]": all checked or all unchecked.
function SelectAll() {
  var master = document.getElementsByName("checkboxid_a")[0];
  var rowBoxes = document.getElementsByName("checkboxid[]");
  var idx = 0;
  while (idx < rowBoxes.length) {
    // The header box is only dereferenced when there is at least one row.
    rowBoxes[idx].checked = master.checked ? true : false;
    idx += 1;
  }
}
</script>
</head>
<body>
<table width="100%" border="0" align="center" cellpadding="0" cellspacing="0">
  <tr>
    <td height="30">
    <table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="15" height="30"><img src="images/main_01.gif" width="15" height="30" /></td>
        <td width="1101" background="images/main_02.gif"><img src="images/center_ico01.gif" width="16" height="16" /> <span class="STYLE1">文章管理</span></td>
        <td width="281" background="images/main_02.gif">
        <table border="0" align="right" cellpadding="0" cellspacing="0">
        </table>
        </td>
        <td width="14"><img src="images/main_03.gif" width="14" height="30" /></td>
     </tr>
    </table></td>
  </tr>
  <tr>
    <td>
    <table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="9" background="images/main_04.gif">&nbsp;</td>
        <td bgcolor="#f3ffe3"><table width="99%" border="0" align="center" cellpadding="0" cellspacing="1" bgcolor="#c0de98" onmouseover="changeto()"  onmouseout="changeback()">
          <tr>
            <td width="50" height="26" background="images/main_05.gif" class="STYLE2">
            <input name="checkboxid_a" type="checkbox" value="" onclick="SelectAll()" />
            </td>
            <td width="450" height="26" background="images/main_05.gif" class="STYLE2">取文章链接的地址</td>
            <td width="463" background="images/main_05.gif" class="STYLE2">网站说明</td>
            <td width="159" height="26" background="images/main_05.gif" class="STYLE2">操作</td>
          </tr>
          <?php
          // Selection form: one row per scraped article; the checked URLs are
          // posted as checkboxid[] to data_article_gather_born.php.
          // The id and every URL are escaped for their output context, because
          // they come from the query string and from third-party pages.
          ?>
          <form action="data_article_gather_born.php?id=<?php echo (isset($_GET["id"])&&is_scalar($_GET["id"]))?urlencode($_GET["id"]):"";?>" name="form1" method="post">
          <?php for($n=0;$n<count($list_href);$n++){
              $href=htmlspecialchars($list_href[$n],ENT_QUOTES,'UTF-8');
              // The two lists are scraped independently and may differ in length.
              // NOTE(review): titles are emitted as HTML on purpose (they carry
              // &nbsp; padding), so scraped markup is still not escaped here.
              $name=isset($list_name[$n])?$list_name[$n]:'';
          ?>
          <tr>
                <td width="50" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <input type="checkbox" name="checkboxid[]" id="<?php echo $n;?>" value="<?php echo $href;?>" />
                </td>
                <td width="450" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <a href="<?php echo $href;?>"><?php echo $href;?></a> 
                </td>
                <td width="463" height="30" bgcolor="#FFFFFF" class="STYLE2">
                <?php echo $name;?>                </td>
                <td height="30" bgcolor="#FFFFFF" class="STYLE5">
                <a href="#">[生成网站数据]</a>
                </td>
         </tr>
         <?php }?>
         </form>
        </table></td>
        <td width="9" background="images/main_06.gif"></td>
      </tr>
    </table></td>
  </tr>
  <tr>
    <td height="29"><table width="100%" border="0" cellspacing="0" cellpadding="0">
      <tr>
        <td width="15" height="29"><img src="images/main_07.gif" width="15" height="29" /></td>
        <td width="100%" background="images/main_08.gif" style="padding-left:150px;">
        <input name="" type="button" value="开始生成" onclick="form1.submit()"/>
        </td>
      </tr>
    </table></td>
  </tr>
</table>

</body>
</html>

data_article_gather_born.php//将获取到的内容填到数据库中

View Code
<?php
// Page bootstrap: load the shared admin helpers, set the default timezone,
// then run the admin check before anything else happens.
require_once '../include/adminfunction.php';
date_default_timezone_set('PRC');
checkadmin();// Verifies that the user is logged in. The original note says this function lives in sysfunction.php — presumably pulled in by adminfunction.php; TODO confirm.
/**
 * Extracts every occurrence of a literal pattern from a text and renders each
 * occurrence through an output template.
 *
 * Pattern syntax ($canshustr):
 *   - "[参数]" marks a capture; it splits the pattern into literal segments.
 *   - "(*)" inside a segment is a wildcard: the pieces around it must appear
 *     in order, with anything in between.
 * The text found between segment k and segment k+1 replaces "[参数k]" in a
 * copy of $mubiao. Scanning continues after each complete match until a piece
 * can no longer be found.
 *
 * @param string $yuanma    Text to scan; an empty value yields an empty array.
 * @param string $canshustr Pattern; must contain "[参数]".
 * @param string $mubiao    Output template; must contain "[参数1]".
 * @return array One filled-in template per match, in document order. When a
 *               required placeholder is missing, a message is echoed and an
 *               empty array is returned.
 */
function canshujiequ($yuanma,$canshustr,$mubiao){
    if($yuanma=='')return array();

    // strpos() returns 0 for a placeholder at the very start of the string,
    // so the "not found" test has to be strict (=== false).
    if(strpos($canshustr,'[参数]')===false||strpos($mubiao,'[参数1]')===false)
    {
        echo '参数或组合字符串格式不对';
        return array();
    }

    $segments=explode('[参数]',$canshustr);
    $sourceLen=strlen($yuanma);
    $results=array();
    $cursor=0;    // where the next search starts
    $prevEnd=0;   // end offset of the previously matched segment
    $curStart=0;  // start offset of the segment being matched
    $curEnd=0;    // end offset of the segment being matched

    while($cursor<$sourceLen){
        $passStart=$cursor;
        $filled=$mubiao;
        $segmentNo=0;
        foreach($segments as $segment){
            if($segment=='')continue;
            $segmentNo++;
            $tokenNo=0;
            foreach(explode('(*)',$segment) as $token){
                if($token=='')continue;
                $tokenNo++;
                // Running out of text, or failing to find a piece, ends the
                // whole scan; a partially matched occurrence is discarded.
                if($cursor>=$sourceLen)return $results;
                $hit=strpos($yuanma,$token,$cursor);
                if($hit===false)return $results;
                $cursor=$hit+strlen($token);
                if($tokenNo==1)$curStart=$hit;
                $curEnd=$cursor;
            }
            if($segmentNo>1){
                // Capture = text between the previous segment and this one.
                $captured=substr($yuanma,$prevEnd,$curStart-$prevEnd);
                $filled=str_replace('[参数'.($segmentNo-1).']',$captured,$filled);
            }
            $prevEnd=$curEnd;
        }
        // A pattern without any literal piece consumes nothing; stop instead
        // of looping forever.
        if($cursor==$passStart)break;
        $results[]=$filled;
    }
    return $results;
}

/**
 * Downloads an image and stores it on disk.
 *
 * Only .gif, .jpg and .bmp sources are accepted (the extension is everything
 * from the last "." of $url, compared case-insensitively); for anything else
 * a message is echoed and false is returned.
 *
 * @param string $url      Location of the image; an empty value returns false.
 * @param string $filename Target path. When empty, a name is derived from the
 *                         current timestamp plus the source extension, with a
 *                         numeric suffix if that file already exists.
 * @return string|false The path written, or false when the URL is empty, the
 *                      format is unsupported, or the download/write fails.
 */
function GrabImage($url, $filename=""){
    if($url == ""){return false;}
    $ext = strrchr($url, ".");// extension of the image, including the dot
    $extLower = strtolower((string)$ext);
    if($extLower != ".gif" && $extLower != ".jpg" && $extLower != ".bmp"){echo "格式不支持!";return false;}
    if($filename == ""){
        // Timestamp-based name; two downloads within the same second must not
        // end up in the same file.
        $stamp = time();
        $filename = $stamp.$ext;
        for($n = 1; file_exists($filename); $n++){
            $filename = $stamp."_".$n.$ext;
        }
    }
    $img = file_get_contents($url);
    if($img === false){return false;}
    // Overwrite rather than append: appending to an existing file would glue
    // two images together and corrupt both.
    if(file_put_contents($filename, $img) === false){return false;}
    return $filename;
}
// Import controller: for every article URL posted as checkboxid[], download
// the page, extract title / source / body with canshujiequ() and insert one
// row into `women`.`article`.
//
// Hardening compared with the first version:
//   - a missing ?id= or a non-array checkboxid no longer raises notices;
//   - only URLs that start with the prefix the listing page generates for the
//     selected site are fetched, so a forged POST cannot make the server read
//     local files or arbitrary hosts;
//   - scraped values are escaped before they are concatenated into the SQL;
//   - a failed download is reported and skipped instead of being stored as an
//     empty article.
// NOTE(review): the print_r()/echo "ok" debug output is kept as-is; it prints
// scraped markup unescaped.
$sel=isset($_GET["id"])?$_GET["id"]:"";
$url_list=array();
if(isset($_POST["checkboxid"])&&is_array($_POST["checkboxid"]))
{
    $url_list=array_values($_POST["checkboxid"]);
}
if($sel==1)
{
    for($i=0;$i<count($url_list);$i++)
    {
        $url=$url_list[$i];
        if(!is_string($url)||strpos($url,"http://news.ef360.com/Articles/")!==0){continue;}
        $contents=file_get_contents($url);
        if($contents===false)
        {
            echo "<script>alert('数据生成失败,可能是此篇文章已经存在');</script>";
            continue;
        }
        // Convert to UTF-8 and turn site-relative image paths into absolute ones.
        $contents=str_replace("/EditManager/File/News/","http://news.ef360.com/EditManager/File/News/",iconv("GBK", "UTF-8//IGNORE", $contents));
        /*`Article_id`  `Article_name``Last_time` `Article_source``Article_desc``Article_content``image`
  `Image_url` `Category_id` */
        //1.title 2.time 3.source 4.remark 5.article body 6.category
        $c='<h1 class="news_title">[参数]</h1>';
        $d="&nbsp;[参数1]&nbsp";
        $title=canshujiequ($contents,$c,$d);//print_r($title);
        $a='<span id="btn_message" class="btn_message"></span>[参数]</div>';
        $b="&nbsp;[参数1]&nbsp";
        $source=canshujiequ($contents,$a,$b);//print_r($source);
        $a='<div class="content">[参数]</div>
      
      <div class="tagbar">
';
        $b="&nbsp;[参数1]&nbsp";
        $content=canshujiequ($contents,$a,$b);
        $shijian=time();
        // A pattern that did not match yields an empty field, as before, but
        // without an undefined-offset notice.
        $titleText=isset($title[0])?$title[0]:'';
        $sourceText=isset($source[0])?$source[0]:'';
        $contentText=isset($content[0])?$content[0]:'';
        $sql="INSERT INTO  `women`.`article` (`Article_id` ,`Article_name` ,`Last_time` ,`Article_source` ,`Article_desc` ,`Article_content` ,`image` ,`Image_url` ,`Category_id`)
VALUES (NULL ,  '".mysql_real_escape_string($titleText)."',  '".$shijian."',  '".mysql_real_escape_string($sourceText)."',  '0',  '".mysql_real_escape_string($contentText)."',  '',  '',  '3')";
        print_r($content);
        echo "ok";
        if(mysql_query($sql))
        {
            echo "<script>alert('数据生成成功,您可以在文章资讯列表中查看');</script>";
        }
        else{echo "<script>alert('数据生成失败,可能是此篇文章已经存在');</script>";}
    }
}
else if($sel==2)
{
    for($i=0;$i<count($url_list);$i++)
    {
        $url=$url_list[$i];
        if(!is_string($url)||strpos($url,"http://www.chaoliu1.net/fushi/nvshi/")!==0){continue;}
        print_r($url);
        $contents=file_get_contents($url);
        if($contents===false)
        {
            echo "<script>alert('数据生成失败,可能是此篇文章已经存在');</script>";
            continue;
        }
        $contents=iconv("GB2312","UTF-8//IGNORE",$contents);
        /*`Article_id`  `Article_name``Last_time` `Article_source``Article_desc``Article_content``image`
  `Image_url` `Category_id` */
        //1.title 2.time 3.source 4.remark 5.article body 6.category
        $c='<H2><STRONG>[参数]</STRONG></H2>';
        $d="&nbsp;[参数1]&nbsp";
        $title=canshujiequ($contents,$c,$d);//print_r($title);
        $a='<P class=title-bt>[参数] <SPAN>-</SPAN>点击';
        $b="&nbsp;[参数1]&nbsp";
        $source=canshujiequ($contents,$a,$b);//print_r($source);
        $a='<script src="http://cpro.baidustatic.com/cpro/ui/c.js" type="text/javascript"></script>
[参数]<DIV class=pg>';
        $b="&nbsp;[参数1]&nbsp";
        $content=canshujiequ($contents,$a,$b);
        $shijian=time();
        $titleText=isset($title[0])?$title[0]:'';
        $sourceText=isset($source[0])?$source[0]:'';
        $contentText=isset($content[0])?$content[0]:'';
        print_r($titleText);echo "<br/>";
        print_r($sourceText);echo "<br/>";
        print_r(htmlspecialchars($contentText));echo "<br/>";
        // The single-to-double quote replacement is kept so stored bodies look
        // the same as before; escaping is what makes the statement safe.
        $sql="INSERT INTO  `women`.`article` (`Article_id` ,`Article_name` ,`Last_time` ,`Article_source` ,`Article_desc` ,`Article_content` ,`image` ,`Image_url` ,`Category_id`)
VALUES (NULL ,  '".mysql_real_escape_string($titleText)."',  '".$shijian."',  '".mysql_real_escape_string($sourceText)."',  '0',  '".mysql_real_escape_string(str_replace("'",'"',$contentText))."',  '',  '',  '3')";
        print_r($content);
        echo "ok";
        if(mysql_query($sql))
        {
            echo "<script>alert('数据生成成功,您可以在文章资讯列表中查看');</script>";
        }
        else{echo "<script>alert('数据生成失败,可能是此篇文章已经存在');</script>";}
    }
}
?>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>无标题文档</title>
</head>

<body>
</body>
</html>

文章表数据结构:


 

 

posted @ 2013-04-24 16:37  芭菲雨  阅读(343)  评论(0)  编辑  收藏  举报