(phpQuery)对网站产品信息采集代码的优化
a.要采集的源链接:
http://www.prospecbio.com/Recombinant_Proteins/
b.具体要求:
接下来就是采集代码的编写。
对于:b-(1)中,代码如下:
<?php
// Step b-(1): load the category page and, for every ".Body" element, take the
// href of the first link found in the first cell of its first table row.
// Each non-empty href is prefixed with the site root and appended to Url.txt.
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$url = "http://www.prospecbio.com/Hormones/";
echo "当前的URL:";
echo $url."<br/>";

phpQuery::newDocumentFile($url);

// The hrefs are written out relative to this root.
$siteRoot = 'http://www.prospecbio.com';

foreach (pq(".Body") as $bodyNode) {
    $firstCell = pq($bodyNode)->eq(0)
        ->find("table")->eq(0)
        ->find("tr")->eq(0)
        ->find("td")->eq(0);
    $href = trim($firstCell->find("a")->eq(0)->attr('href'));

    // Nothing to record when the element has no link in that position.
    if ($href == '') {
        continue;
    }

    // One URL per line, CRLF-terminated.
    file_put_contents('Url.txt', $siteRoot.$href."\r\n", FILE_APPEND);
}
?>
对于:b-(2)中代码如下:
<?php
/*
 * Step b-(2): read the page URL stored in table `url_a` for the current id,
 * load that page with phpQuery, and append the product-list links found in
 * its ".Body" elements to Url_a.txt. The page then redirects itself to
 * ?id=<id+1> until id exceeds 14.
 *
 * Changes from the first version:
 *  - uses mysqli instead of the mysql_* extension (removed in PHP 7.0);
 *  - a failed query or a missing row no longer feeds an empty URL to phpQuery;
 *    the id is skipped and the redirect chain continues.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;
if ($id > 14) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// PHP 8.1+ makes mysqli throw by default; switch that off so the
// "or die" handlers below keep reporting failures as before.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

// $id went through intval() above, so concatenating it is injection-safe.
$sql = 'select url from url_a where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

$query = mysqli_query($conn, $sql);
$res = $query ? mysqli_fetch_assoc($query) : null;
$url = (is_array($res) && isset($res['url'])) ? trim($res['url']) : '';
if ($query) {
    mysqli_free_result($query);
}
// The database is not needed past this point.
mysqli_close($conn);

echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // No row (or an empty url) for this id: skip scraping, keep the chain going.
    echo "未找到 id=".$id." 对应的 url, 跳过";
    echo '<br/>';
} else {
    phpQuery::newDocumentFile($url);

    $head = 'http://www.prospecbio.com';
    foreach (pq(".Body") as $k => $li) {
        // The link sits in the first cell of the SECOND row of the first table.
        $tr = pq($li)->eq(0)
            ->find('table')->eq(0)
            ->find('tr')->eq(1)
            ->find('td')->eq(0)
            ->find('a')->attr('href');
        $tr = trim((string) $tr);

        // The first ".Body" element (index 0) is deliberately ignored.
        if ($tr !== '' && $k > 0) {
            // Strip leading dots so relative hrefs start at the site root.
            $tr = ltrim($tr, ".");
            $path = $head.$tr."\r\n";
            var_dump($path);
            echo '<br/>';
            file_put_contents('Url_a.txt', $path, FILE_APPEND);
        }
    }
}
?>
<script>
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
// Pass the function itself; a string argument would be evaluated as code.
setTimeout(JumpUrl,0);
</script>
对于:b-(3)中,代码如下:
<?php
/*
 * Step b-(3): read the product URL stored in table `url_b` for the current id,
 * scrape the product page with phpQuery, and append one row to
 * Neurotrophins.csv. The page then redirects itself to ?id=<id+1> until id
 * exceeds 63.
 *
 * CSV column order: product_url, product_name, price, cata_num, source,
 * appearance, formulation, stability, solubility, purity, amino_acid,
 * bio_activity.
 *
 * Changes from the first version:
 *  - uses mysqli instead of the mysql_* extension (removed in PHP 7.0);
 *  - the price is now really converted to GBK (the iconv() results used to be
 *    thrown away, so only the detail fields were converted);
 *  - a failed iconv() keeps the original text instead of writing an empty cell;
 *  - a missing row / unopenable CSV file is reported instead of silently
 *    producing a broken row;
 *  - the eight copy-pasted detail-field branches are driven by one lookup table.
 */
header('Content-Type:text/html;charset=gb2312');
include './phpQuery/phpQuery.php';
set_time_limit(100000);

/**
 * Convert UTF-8 text to GBK.
 *
 * @param mixed $text Text to convert; cast to string first.
 * @return string The GBK text, or the unconverted input when iconv() fails.
 */
function prospec_to_gbk($text)
{
    $text = (string) $text;
    if ($text === '') {
        return '';
    }
    $converted = iconv('utf-8', 'gbk', $text);
    return $converted === false ? $text : $converted;
}

/**
 * Read the price label from one row of the first ".ProductsColumnPrice" table.
 *
 * @param int $row Zero-based row index inside the price table.
 * @return string Trimmed label HTML ('' when the row does not exist).
 */
function prospec_price_cell($row)
{
    $html = pq('.ProductsColumnPrice')
        ->find('table')->eq(0)
        ->find('tr')->eq($row)
        ->find('td')->eq(1)
        ->find('label')->eq(0)
        ->html();
    return trim((string) $html);
}

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;
if ($id > 63) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// PHP 8.1+ makes mysqli throw by default; switch that off so the
// "or die" handlers below keep reporting failures as before.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

// $id went through intval() above, so concatenating it is injection-safe.
$sql = 'select url from url_b where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

$query = mysqli_query($conn, $sql);
$res = $query ? mysqli_fetch_assoc($query) : null;
$url = (is_array($res) && isset($res['url'])) ? trim($res['url']) : '';
if ($query) {
    mysqli_free_result($query);
}
// The database is not needed past this point.
mysqli_close($conn);

echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // No row (or an empty url) for this id: write nothing, keep the chain going.
    echo "未找到 id=".$id." 对应的 url, 跳过";
    echo '<br/>';
} else {
    phpQuery::newDocumentFile($url);

    // Collected product data; key order defines the CSV column order.
    $arr = array();
    $arr['product_url'] = $url;

    // Product name.
    // NOTE(review): product_name and cata_num are written without GBK
    // conversion, exactly as before -- confirm whether that is intended.
    $product_name = trim((string) pq('#PageHeader')->eq(0)->find('span')->eq(0)->html());
    echo "产品名称:";
    var_dump($product_name);
    $arr['product_name'] = $product_name;
    echo '<br/>';

    // Price: the first three rows of the price table, joined with "/".
    $price = prospec_price_cell(0)."/".prospec_price_cell(1)."/".prospec_price_cell(2);
    $price = prospec_to_gbk($price);
    echo "产品价格:";
    var_dump($price);
    $arr['price'] = $price;
    echo '<br/>';

    // Catalogue number: raw HTML of the first value cell.
    $cata_num = trim((string) pq('.ItemRowLastCellStyle')->eq(0)->html());
    echo "产品 Catalogue Number:";
    var_dump($cata_num);
    $arr['cata_num'] = $cata_num;
    echo '<br/>';

    // Detail fields default to '' so every CSV row has the same columns.
    $arr['source'] = '';
    $arr['appearance'] = '';
    $arr['formulation'] = '';
    $arr['stability'] = '';
    $arr['solubility'] = '';
    $arr['purity'] = '';
    $arr['amino_acid'] = '';
    $arr['bio_activity'] = '';

    // Page label => array(CSV column, caption echoed in the debug output).
    $fieldMap = array(
        'Source'              => array('source', "产品 Source:"),
        'Physical Appearance' => array('appearance', "产品 Physical Appearance:"),
        'Formulation'         => array('formulation', "产品 Formulation:"),
        'Stability'           => array('stability', "产品 Stability:"),
        'Purity'              => array('purity', "产品 Purity:"),
        'Amino acid sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Amino Acid Sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Solubility'          => array('solubility', "产品 Solubility :"),
        'Biological Activity' => array('bio_activity', "产品 Biological Activity:"),
    );

    // Label cell number $k is paired with value cell number $k.
    foreach (pq('.ItemRowFirstCellStyle') as $k => $li) {
        $label = trim((string) pq($li)->eq(0)->find('span')->eq(0)->html());

        if (isset($fieldMap[$label])) {
            $column = $fieldMap[$label][0];
            $caption = $fieldMap[$label][1];

            $value = pq('.ItemRowLastCellStyle')->eq($k)->find('span')->eq(0)->html();
            $value = trim((string) $value);
            $value = strip_tags($value);
            $value = prospec_to_gbk($value);

            echo $caption;
            var_dump($value);
            $arr[$column] = $value;
            echo '<br/>';
        }
        echo '<br/>';
    }

    // Append the row to the CSV file.
    $handle = fopen('Neurotrophins.csv', 'a');
    if ($handle === false) {
        // Stop here so the lost row is noticed instead of silently skipped.
        die("无法打开 Neurotrophins.csv");
    }
    fputcsv($handle, $arr);
    fclose($handle);
}
?>
<script>
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>
说明,此次采集对phpQuery方法采集数据做了局部的优化,使我对此方法有了更高的认识。好方法是成功的一半。
同时也有部分不足,毕竟该采集方法是针对源码的代码处理,采集代码是根据页面的排版决定的,所以并不是通用型,
这在以后的学习过程中,还要继续优化和完善。学无止境,加油!
起点在哪,或许选择不了。重要的是,你追求的终点在哪!