php爬虫 curl 拼多多 京东评论采集
PDD评论:需要登录,需要添加头信息
请求头需要带上登录后获取的 AccessToken:$header[] = 'AccessToken: <登录后的 token>';
接口地址:'http://apiv4.yangkeduo.com/reviews/'.$goods_id.'/list?size=10&page='.$page
JD评论:
接口地址(带 callback 参数,返回的是 JSONP,需去掉 fetchJSON_comment98(...) 外层后再 json_decode):'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId='.$goods_id.'&score='.$score.'&sortType='.$sortType.'&page='.$i.'&pageSize=10&isShadowSku=0&rid=0&fold=1'
$sortType = 5;//排序
$score = 3;//0全部,1 差评,2中评,3好评,4带图评论,5追评
请求头需要带上登录后获取的 AccessToken:$header[] = 'AccessToken: <登录后的 token>';
接口地址:'http://apiv4.yangkeduo.com/reviews/'.$goods_id.'/list?size=10&page='.$page
JD评论:
接口地址(带 callback 参数,返回的是 JSONP,需去掉 fetchJSON_comment98(...) 外层后再 json_decode):'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId='.$goods_id.'&score='.$score.'&sortType='.$sortType.'&page='.$i.'&pageSize=10&isShadowSku=0&rid=0&fold=1'
$sortType = 5;//排序
$score = 3;//0全部,1 差评,2中评,3好评,4带图评论,5追评
/**
 * Fetch one page of JD product comments and decode the response.
 *
 * The request URL carries a `callback=fetchJSON_comment98` parameter, so the
 * endpoint answers with JSONP (`fetchJSON_comment98({...});`) rather than bare
 * JSON. The callback wrapper is stripped before decoding; decoding the raw
 * body directly always yields null.
 *
 * Side effects: turns on error display/reporting, sends a
 * `Content-Type: text/html; charset=utf-8` response header, and echoes the
 * detected encoding name when the body is detected as UTF-8.
 *
 * @return array|null Decoded comment payload, or null when the HTTP request
 *                    fails, the body is empty, or the body is not valid JSON.
 */
public function spider(){
    ini_set("display_errors", "On"); // show errors
    error_reporting(E_ALL);          // report every error level
    header("Content-Type: text/html; charset=utf-8");

    // Spoofed client-IP headers plus the Referer / User-Agent sent with the request.
    $header = static::header();
    $header[] = 'Referer: https://item.jd.com/4995961.html';
    // Browser identification.
    $header[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36';

    $url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=4995961&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): TLS peer/host verification is disabled, as in the original;
    // consider enabling it where a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    // Bound the request so a stalled connection cannot hang the caller forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // To avoid an IP ban from repeated crawling, a proxy can be configured here
    // through CURLOPT_PROXY (proxy address) and CURLOPT_PROXYPORT (proxy port).
    $output = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on transport failure; nothing to decode then.
    if ($output === false || $output === '') {
        return null;
    }

    $encode = mb_detect_encoding($output, array("ASCII", 'UTF-8', "GB2312", "GBK", 'BIG5'));
    if ($encode == 'UTF-8') {
        echo $encode;
    } elseif ($encode !== false) {
        // Only convert when detection succeeded: passing false as the source
        // encoding makes mb_convert_encoding() fail.
        $output = mb_convert_encoding($output, 'UTF-8', $encode);
    }

    $result = json_decode(self::stripJsonpCallback($output), true);

    return is_array($result) ? $result : null;
}

/**
 * Remove a JSONP wrapper such as `fetchJSON_comment98( ... );` from a body.
 *
 * A body that already starts with `{` or `[`, or that does not look like
 * `identifier( ... )`, is returned trimmed but otherwise untouched.
 *
 * @param string $body Raw response body.
 * @return string The JSON text inside the callback, or the trimmed input.
 */
private static function stripJsonpCallback($body){
    $body = trim($body);
    if ($body === '' || $body[0] === '{' || $body[0] === '[') {
        return $body;
    }

    $open  = strpos($body, '(');
    $close = strrpos($body, ')');
    if ($open === false || $close === false || $close < $open) {
        return $body;
    }

    // Everything before the first "(" must be a plain callback identifier.
    if (!preg_match('/^[A-Za-z_$][\w$.]*\s*$/', substr($body, 0, $open))) {
        return $body;
    }

    return substr($body, $open + 1, $close - $open - 1);
}

/**
 * Build request headers that carry a random IP address taken from a table of
 * address ranges in mainland China.
 *
 * The same address is placed in the CLIENT-IP, X-FORWARDED-FOR, VIA and
 * REMOTE_ADDR headers.
 *
 * @return array List of "Name: value" header lines suitable for CURLOPT_HTTPHEADER.
 */
public static function header(){
    // Ranges are signed 32-bit integers (as produced by ip2long() on 32-bit
    // builds), so the upper half of the address space appears as negatives.
    $ip_long = array(
        array(607649792, 608174079),     // 36.56.0.0-36.63.255.255
        array(1038614528, 1039007743),   // 61.232.0.0-61.237.255.255
        array(1783627776, 1784676351),   // 106.80.0.0-106.95.255.255
        array(2035023872, 2035154943),   // 121.76.0.0-121.77.255.255
        array(2078801920, 2079064063),   // 123.232.0.0-123.235.255.255
        array(-1950089216, -1948778497), // 139.196.0.0-139.215.255.255
        array(-1425539072, -1425014785), // 171.8.0.0-171.15.255.255
        array(-1236271104, -1235419137), // 182.80.0.0-182.92.255.255
        array(-770113536, -768606209),   // 210.25.0.0-210.47.255.255
        array(-569376768, -564133889),   // 222.16.0.0-222.95.255.255
    );

    // Derive the upper bound from the table so adding/removing a range stays safe.
    $rand_key = mt_rand(0, count($ip_long) - 1);
    $ip = long2ip(mt_rand($ip_long[$rand_key][0], $ip_long[$rand_key][1]));

    $headers = array(
        'CLIENT-IP'       => $ip,
        'X-FORWARDED-FOR' => $ip,
        'VIA'             => $ip,
        'REMOTE_ADDR'     => $ip,
    );

    $headerArr = array();
    foreach ($headers as $n => $v) {
        $headerArr[] = $n . ': ' . $v;
    }

    return $headerArr;
}