(phpQuery)对网站产品信息采集代码的优化

a.要采集的源链接:

http://www.prospecbio.com/Recombinant_Proteins/

b.具体要求:

接下来就是采集代码的编写。

对于:b-(1)中,代码如下:

<?php
/**
 * Step b-(1): collect the first link found inside every ".Body" element of
 * the category page and append it, as an absolute URL, to Url.txt.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$url = "http://www.prospecbio.com/Hormones/";
echo "当前的URL:", $url, "<br/>";

// Load the remote page as the current phpQuery document.
phpQuery::newDocumentFile($url);

// The hrefs are prefixed with the site root before being stored.
$siteRoot = 'http://www.prospecbio.com';

foreach (pq(".Body") as $bodyNode) {
    // First anchor in the first cell of the first row of the first table.
    $firstCell = pq($bodyNode)->eq(0)->find("table")->eq(0)->find("tr")->eq(0)->find("td")->eq(0);
    $href = trim($firstCell->find("a")->eq(0)->attr('href'));

    // Nothing usable in this element: move on to the next one.
    if ($href == '') {
        continue;
    }

    // One URL per line, CRLF-terminated.
    file_put_contents('Url.txt', $siteRoot.$href."\r\n", FILE_APPEND);
}

?>

对于:b-(2)中代码如下:

<?php
/**
 * Step b-(2): for the row of table `url_a` whose id is given by ?id=N
 * (default 1), fetch the stored page and append the product links found in
 * its ".Body" elements to Url_a.txt. The page that follows this block
 * redirects to ?id=N+1, so the script walks the table one row per request
 * until id exceeds 14.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

// intval() guarantees $id is an integer, so it is safe to concatenate below.
$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

if ($id > 14) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// The legacy mysql_* extension was removed in PHP 7; mysqli is the drop-in
// procedural replacement. Reporting is switched off so that failures keep
// returning false (PHP 8.1+ would otherwise throw) and the die() messages
// below still fire.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

$sql = 'select url from url_a where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

// A failed query or a missing row leaves $url empty instead of raising
// warnings on a false/null result.
$url = '';
$query = mysqli_query($conn, $sql);
if ($query !== false) {
    $res = mysqli_fetch_assoc($query);
    if (is_array($res) && isset($res['url'])) {
        $url = trim($res['url']);
    }
    mysqli_free_result($query);
}
// The connection is no longer needed; do not hold it during the HTTP fetch.
mysqli_close($conn);

echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // No exit here: the redirect emitted after this block must still run so
    // that a gap in the id sequence does not stop the crawl.
    echo "skip: no url stored for id=".$id;
    echo '<br/>';
} else {
    phpQuery::newDocumentFile($url);

    $head = 'http://www.prospecbio.com';

    foreach (pq(".Body") as $k => $li) {
        // The first ".Body" element is deliberately skipped (original: $k > 0).
        if (!($k > 0)) {
            continue;
        }

        // Link in the first cell of the SECOND row of the first table.
        // attr() yields null when no anchor exists, hence the string cast.
        $tr = trim((string) pq($li)->eq(0)->find('table')->eq(0)->find('tr')->eq(1)->find('td')->eq(0)->find('a')->attr('href'));
        if ($tr === '') {
            continue;
        }

        // Leading dots are stripped (e.g. "./x/" becomes "/x/") before the
        // site root is prepended.
        $path = $head.ltrim($tr, ".")."\r\n";

        var_dump($path);
        echo '<br/>';

        file_put_contents('Url_a.txt', $path, FILE_APPEND);
    }
}

?>
<script>
// Reload this page for the next id as soon as the current one has rendered.
function JumpUrl() {
    location.href = '?id=<?php echo ($id+1);?>';
}
// Pass the function itself: a string argument is evaluated like eval().
setTimeout(JumpUrl, 0);
</script>

对于:b-(3)中,代码如下:

<?php
/**
 * Step b-(3): for the row of table `url_b` whose id is given by ?id=N
 * (default 1), fetch the product page, extract name, prices, catalogue
 * number and the labelled attribute rows, and append one CSV line to
 * Neurotrophins.csv. The page that follows this block redirects to ?id=N+1,
 * so the script processes one product per request until id exceeds 63.
 *
 * CSV column order: product_url, product_name, price, cata_num, source,
 * appearance, formulation, stability, solubility, purity, amino_acid,
 * bio_activity.
 */
header('Content-Type:text/html;charset=gb2312');
include './phpQuery/phpQuery.php';
set_time_limit(100000);

/**
 * Convert UTF-8 text to GBK.
 *
 * @param string $text UTF-8 input.
 * @return string GBK text, or the unchanged input when iconv() fails, so a
 *                single unconvertible character does not blank the field.
 */
function toGbk($text)
{
    $converted = iconv('utf-8', 'gbk', $text);
    return ($converted === false) ? $text : $converted;
}

// intval() guarantees $id is an integer, so it is safe to concatenate below.
$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

if ($id > 63) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// The legacy mysql_* extension was removed in PHP 7; mysqli is the drop-in
// procedural replacement. Reporting is switched off so that failures keep
// returning false (PHP 8.1+ would otherwise throw) and the die() messages
// below still fire.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

$sql = 'select url from url_b where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

// A failed query or a missing row leaves $url empty instead of raising
// warnings on a false/null result.
$url = '';
$query = mysqli_query($conn, $sql);
if ($query !== false) {
    $res = mysqli_fetch_assoc($query);
    if (is_array($res) && isset($res['url'])) {
        $url = trim($res['url']);
    }
    mysqli_free_result($query);
}
// The connection is no longer needed; do not hold it during the HTTP fetch.
mysqli_close($conn);

echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // No exit here: the redirect emitted after this block must still run so
    // that a gap in the id sequence does not stop the crawl.
    echo "skip: no url stored for id=".$id;
    echo '<br/>';
} else {
    phpQuery::newDocumentFile($url);

    // Collected product fields; insertion order defines the CSV columns.
    $arr = array();
    $arr['product_url'] = $url;

    // --- Product name ---
    // NOTE(review): name and catalogue number are written without GBK
    // conversion, as before; confirm whether they should be converted too.
    $product_name = trim((string) pq('#PageHeader')->eq(0)->find('span')->eq(0)->html());
    echo "产品名称:";
    var_dump($product_name);
    $arr['product_name'] = $product_name;
    echo '<br/>';

    // --- Price: label text of the first three rows, joined with "/" ---
    $prices = array();
    for ($row = 0; $row < 3; $row++) {
        $prices[] = trim((string) pq('.ProductsColumnPrice')->find('table')->eq(0)->find('tr')->eq($row)->find('td')->eq(1)->find('label')->eq(0)->html());
    }
    // The conversion result used to be discarded, leaving price in UTF-8
    // while the attribute fields below were GBK; it is now actually applied.
    $price = toGbk(implode('/', $prices));
    echo "产品价格:";
    var_dump($price);
    $arr['price'] = $price;
    echo '<br/>';

    // --- Catalogue number: first value cell on the page ---
    $cata_num = trim((string) pq('.ItemRowLastCellStyle')->eq(0)->html());
    echo "产品 Catalogue Number:";
    var_dump($cata_num);
    $arr['cata_num'] = $cata_num;
    echo '<br/>';

    // --- Labelled attribute rows ---
    // Every column exists even when the page lacks the corresponding row.
    $arr['source'] = '';
    $arr['appearance'] = '';
    $arr['formulation'] = '';
    $arr['stability'] = '';
    $arr['solubility'] = '';
    $arr['purity'] = '';
    $arr['amino_acid'] = '';
    $arr['bio_activity'] = '';

    // Page label => (CSV column, caption echoed for debugging).
    $fieldByLabel = array(
        'Source'              => array('source', "产品 Source:"),
        'Physical Appearance' => array('appearance', "产品 Physical Appearance:"),
        'Formulation'         => array('formulation', "产品 Formulation:"),
        'Stability'           => array('stability', "产品 Stability:"),
        'Purity'              => array('purity', "产品 Purity:"),
        'Amino acid sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Amino Acid Sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Solubility'          => array('solubility', "产品 Solubility :"),
        'Biological Activity' => array('bio_activity', "产品 Biological Activity:"),
    );

    foreach (pq('.ItemRowFirstCellStyle') as $k => $li) {
        $label = trim((string) pq($li)->eq(0)->find('span')->eq(0)->html());

        if (isset($fieldByLabel[$label])) {
            list($column, $caption) = $fieldByLabel[$label];

            // The value is read from the ".ItemRowLastCellStyle" element at
            // the same index as the label cell.
            $value = trim((string) pq('.ItemRowLastCellStyle')->eq($k)->find('span')->eq(0)->html());
            $value = toGbk(strip_tags($value));

            echo $caption;
            var_dump($value);
            $arr[$column] = $value;
            echo '<br/>';
        }

        echo '<br/>';
    }

    // --- Append the record to the CSV file ---
    $handle = fopen('Neurotrophins.csv', 'a');
    if ($handle === false) {
        echo "cannot open Neurotrophins.csv for appending";
        echo '<br/>';
    } else {
        fputcsv($handle, $arr);
        fclose($handle);
    }
}
?>

<script>
// Once the current product has been processed, load the page for the next id.
function JumpUrl() {
    window.location.href = '?id=<?php echo ($id + 1); ?>';
}
window.setTimeout(JumpUrl, 0);
</script>

说明,此次采集对phpQuery方法采集数据做了局部的优化,使我对此方法有了更高的认识。好方法是成功的一半。

同时也有部分不足,毕竟该采集方法是针对源码的代码处理,采集代码根据页面的排版决定的,所以并不是通用型,

这在以后的学习过程中,还要继续优化和完善。学无止境,加油!

 

posted @ 2016-04-05 10:12  侠岚之弋痕夕  阅读(279)  评论(0)  编辑  收藏  举报
We cannot choose where we start, but we can choose the destination we pursue!