PHP 采集 DEMO 一个
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>alibaba采集</title>
<?php
/**
 * Alibaba company-contact scraper (demo).
 *
 * For the listing page selected by ?page=N this script:
 *   1. downloads the listing page,
 *   2. collects every "<something>.en.alibaba.com" link found in it,
 *   3. downloads each company's contactinfo.html page,
 *   4. extracts the contact fields with regular expressions, and
 *   5. inserts one row per company into the `alibaba` table.
 * It then emits a meta-refresh to the next page, or prints "OVER!" once
 * the last page has been processed.
 *
 * NOTE(review): the database credentials are hard-coded below, exactly as in
 * the original demo; move them to configuration before real use.
 */

set_time_limit(0);
error_reporting(0);
ini_set('html_errors', '0');
ini_set('display_errors', '0');

/** Last listing page to process; reaching it ends the refresh chain. */
define('ALIBABA_LAST_PAGE', 29);

/** User-Agent header sent with every request. */
define(
    'ALIBABA_USER_AGENT',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
);

/**
 * Returns a 26-character string drawn from [0-9a-z].
 *
 * The script itself no longer calls this function; it is kept so that any
 * other code relying on it keeps working. The explicit mt_srand() reseed
 * from the original was dropped because PHP seeds the generator itself.
 *
 * @return string
 */
function _rand()
{
    $length = 26;
    $chars = '0123456789abcdefghijklmnopqrstuvwxyz';
    $max = strlen($chars) - 1;
    $string = '';
    for ($i = 0; $i < $length; $i++) {
        $string .= $chars[mt_rand(0, $max)];
    }
    return $string;
}

/**
 * Downloads a URL with cURL.
 *
 * @param string $url Absolute URL to fetch.
 * @return string Response body, or '' when the transfer failed.
 */
function alibaba_fetch($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_USERAGENT, ALIBABA_USER_AGENT);
    // The script runs without a time limit, so bound each transfer to keep
    // one stalled host from hanging the whole run.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $body = curl_exec($ch);
    // The handle is released when $ch goes out of scope at return.
    return is_string($body) ? $body : '';
}

/**
 * Extracts one contact field from a contact-info page.
 *
 * @param string $pattern PCRE pattern whose first capture group is the value.
 * @param string $html    Page markup to search.
 * @return string The captured text with tags stripped, trimmed and
 *                HTML-encoded (the stored format the original script used),
 *                or '' when the pattern does not match.
 */
function alibaba_field($pattern, $html)
{
    if (preg_match($pattern, $html, $match) !== 1) {
        return '';
    }
    return htmlspecialchars(trim(strip_tags($match[1])), ENT_COMPAT, 'UTF-8');
}

// --- Database -------------------------------------------------------------
// Errors are checked by hand so behaviour is the same whether or not the
// running PHP version makes mysqli throw by default.
mysqli_report(MYSQLI_REPORT_OFF);
// The "p:" host prefix requests a persistent connection (was mysql_pconnect).
$db = mysqli_connect('p:localhost', 'root', 'password', 'company');
if (!$db) {
    die('Could not connect' . mysqli_connect_error());
}
mysqli_set_charset($db, 'utf8');

// Prepared once, executed per company. Scraped text is never concatenated
// into the SQL, so quotes in the remote markup cannot break or inject into it.
$insert = mysqli_prepare(
    $db,
    'INSERT INTO alibaba (Name, Company, Address, City, Province, Region, Zip, Tel, Phone, Fax, Web)'
    . ' VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
);
if (!$insert) {
    die('Could not prepare insert: ' . mysqli_error($db));
}
// bind_param binds by reference: assigning these variables inside the loop
// and calling execute sends the current values.
mysqli_stmt_bind_param(
    $insert,
    'sssssssssss',
    $name,
    $company,
    $address,
    $city,
    $province,
    $region,
    $zip,
    $tel,
    $phone,
    $fax,
    $website
);

// --- Paging ---------------------------------------------------------------
// Force the page to a positive integer: it is placed in the remote URL and
// echoed back into the meta-refresh tag below.
$page = isset($_GET['page']) ? (int) $_GET['page'] : 1;
if ($page < 1) {
    $page = 1;
}

// NOTE(review): the run of dashes after "CN" is reproduced exactly as it
// appears in the source. It looks like ASCII hyphens mangled by a blog
// engine's typography filter - confirm against the real listing URL.
$listingUrl = 'http://www.alibaba.com/corporations/jiangmen/CN——————————–/' . $page . '.html';
$listing = alibaba_fetch($listingUrl);

// Capture everything between href= and ".en.alibaba.com" (the scheme plus
// the company sub-domain).
preg_match_all('/href\s*=\s*["|\']?([^\s"\'>]*)\.en\.alibaba\.com"/i', $listing, $links);
// A listing may link the same company more than once; visit each only once.
$stores = isset($links[1]) ? array_unique($links[1]) : array();

foreach ($stores as $store) {
    $contactHtml = alibaba_fetch($store . '.en.alibaba.com/contactinfo.html');
    if ($contactHtml === '') {
        // Nothing was downloaded; do not store an all-blank row.
        continue;
    }

    $name     = alibaba_field('/contactName(.*?)<\/a>/s', $contactHtml);
    $company  = alibaba_field('/Company Name:(.*?)<\/td>/s', $contactHtml);
    $address  = alibaba_field('/Street Address:(.*?)<\/td>/s', $contactHtml);
    $city     = alibaba_field('/City:(.*?)<\/td>/s', $contactHtml);
    $province = alibaba_field('/Province\/State:(.*?)<\/td>/s', $contactHtml);
    $region   = alibaba_field('/Country\/Region:(.*?)<\/td>/s', $contactHtml);
    $zip      = alibaba_field('/Zip:(.*?)<\/td>/s', $contactHtml);
    $tel      = alibaba_field('/Telephone:(.*?)<\/td>/s', $contactHtml);
    $phone    = alibaba_field('/Mobile Phone:(.*?)<\/td>/s', $contactHtml);
    $fax      = alibaba_field('/Fax:(.*?)<\/td>/s', $contactHtml);
    $website  = alibaba_field('/Website:(.*?)<\/td>/s', $contactHtml);

    if (!mysqli_stmt_execute($insert)) {
        // Keep going with the remaining companies, but leave a trace.
        error_log('alibaba insert failed for ' . $store . ': ' . mysqli_stmt_error($insert));
    }
}

mysqli_stmt_close($insert);

if ($page >= ALIBABA_LAST_PAGE) {
    echo 'OVER!';
    exit();
} else {
    // Chain to the next listing page via a client-side refresh.
    echo "<meta http-equiv=refresh content='0; url=alibaba.php?page=" . ($page + 1) . "'>";
}
?>
</head>
<body>
</body></html>