尝试抓取大众点评网站数据

  使用PHP单线程抓取,速度比较慢,可以抓取所有的团购信息;店铺信息也可以抓取;

  公司测试产品需要使用一些数据,所以试着抓取了一下,感觉主要就是写正则。不知道别人是怎么样的实现思路,感觉使用PHP多线程速度应该会更快吧。

  我主要是抓评论跟一些图片,但是其他思路基本一样。按理来说,只要能显示出到网页上的,都可以抓下来。

  我抓取的思路是第一步获取所有的city信息即test_get_city_info;

         第二步通过city的url抓取每个city的每个类别的商品团购信息,即test_get_web_info;

         第三步通过团购信息页面,抓取商品商户的信息,即test_get_detail。

  说明下,代码仅供学习研究。不要用于其他用途!

  未解决ip被封问题,尝试了代理,但效果极其不理想。

  

<?php

    class IndexAction extends Action {


        /*
            封装curl方法
        */
        private function curl_self($url) {
            $cookie_jar     = tempnam(APP_PATH . 'Public/cookie', 'cookie');
       //iP可以做出随机的,基本没有什么意义,大众获取的是外网ip,抓取商铺地址使用此方法会出现问题,需要重新封装头信息,
$header = array( 'CLIENT-IP:58.68.44.61', 'X-FORWARDED-FOR:58.68.44.61', ); $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_HEADER, 0); curl_setopt($ch, CURLOPT_COOKIEJAR,$cookie_jar); curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); curl_setopt($ch, CURLOPT_HTTPHEADER, $header); curl_setopt($ch,CURLOPT_USERAGENT,"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"); $result_web = curl_exec($ch); curl_close($ch); return $result_web; } private function echo_info($str) { $cache = str_repeat(' ', 64000); return $str . $cache . "<br />\n\t"; } /* 从大众点评上获取店铺信息 CREATE TABLE `ty_dazh_shopinfo` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `url` char(200) NOT NULL DEFAULT '', `city_id` int(10) unsigned NOT NULL DEFAULT '0' COMMENT '关联city表的城市id', `is_do` tinyint unsigned not null default 0 comment '是否处理', 'type' char(10) not null default '' comment '类型', 'city_name' char(50) not null default '' comment '城市名称', PRIMARY KEY (`id`), key (is_do) ) ENGINE=MyISAM DEFAULT CHARSET=utf8; */ public function test_get_web_info() { set_time_limit(0); ob_end_clean(); header('Content-type:text/html;Charset=UTF-8'); //绝对刷出 ob_implicit_flush(); //拼接分类url $dazh_city = M('dazh_city'); //每次取一条,用js刷新页面 $dazh_city_info = $dazh_city->field('id,city_spell,city_name')->where(array('is_do' => 0))->limit(1)->select(); if($dazh_city_info === null) die('所有城市已经处理完毕'); echo $this->echo_info('开始循环读取城市列表'); foreach ($dazh_city_info as $key => $value) { echo $this->echo_info('当前城市' . $value['city_spell']); //拼接分类url $classify = array(); $classify = array( 0 => 'http://t.dianping.com/list/'.$value['city_spell'].'-category_1',//美食 1 => 'http://t.dianping.com/list/'.$value['city_spell'].'-category_3',//休闲娱乐 2 => 'http://t.dianping.com/movie/' .$value['city_spell'],//电影 3 => 'http://t.dianping.com/hotel/' . $value['city_spell'],//酒店 4 => 'http://t.dianping.com/travel/' . 
$value['city_spell'],//旅游 5 => 'http://t.dianping.com/list/'.$value['city_spell'].'-category_5',//丽人 6 => 'http://t.dianping.com/list/'.$value['city_spell'].'-category_10',//生活服务 7 => 'http://t.dianping.com/goods/' . $value['city_spell'],//商品 8 => 'http://t.dianping.com/wedding/'.$value['city_spell'].'-category_8',//结婚 9 => 'http://t.dianping.com/wedding/'.$value['city_spell'].'-category_9',//亲子 ); $arr_info = array('美食','休闲娱乐','电影','酒店','旅游','丽人','生活服务','商品','结婚','亲子'); echo $this->echo_info('开始循环读取分类信息'); //读取分类信息 foreach ($classify as $k => $v) { echo $this->echo_info('<span style="color:blue;font-weight:bold;border:2px solid #ccc;">当前分类为:'.$arr_info[$k].'</span>'); //读取首页,获取分页数量 $classify_info = $this->curl_self($v); //开始匹配分页数量 $pattern_page = '/<div\s*class="tg-paginator\s*Fix"\s*id="paginator"\s*>([\s\S]*)<\/div>/iU'; $matches_pages = array(); $pattern_nums = preg_match_all($pattern_page, $classify_info, $matches_pages); if($pattern_nums === false) die('匹配分页数量出错,当前key为' . $k ); else if($pattern_nums == 0) { $max_pages = 1; }else { //匹配出最大的数字,即最大的页面值 $pattern_page_max = '/title="(.*?)"/iU'; $matches_pages_max = array(); $pattern_page_max_nums = preg_match_all($pattern_page_max, $matches_pages[1][0], $matches_pages_max); if($pattern_page_max_nums === false) die('匹配出最大的数字,即最大的页面值出错,当前key为' . $k); else if($pattern_page_max_nums == 0) { $max_pages = 1; }else { rsort($matches_pages_max[1]); $max_pages = $matches_pages_max[1][0]; } } echo $this->echo_info($arr_info[$k] . '分类下最大页码为' . $max_pages); echo $this->echo_info('开始循环读取分类下的店铺信息'); //开始读取每个分类下所有的信息 for($i = 0;$i<$max_pages;$i++) { echo $this->echo_info('当前分页为' . $i); //每次读取一页,将店铺的url存储到表中 // $file_dir = APP_PATH . 'Public/temp'; if($k == 2 || $k == 4 || $k == 7) { $i = $i + 1; $url = $v . '?pageno=' . $i; }else { $url = $v . '?pageIndex=' . 
$i; } echo $this->echo_info($url); $result_web_r = $this->curl_self($url); //大众店铺页码编码为utf8,不用转码 //匹配出内容区域 $matches_r = array(); if($k == 0 || $k == 1){ $pattern = '/<li\s*class="tg-floor-item"\s*data-type="floor-item"\s*data-eval-config="\{\'dealId\':\'([\d]*)\'\}">/iU'; $nums_o = preg_match_all($pattern, $result_web_r, $matches_r); if($nums_o === false) { die('匹配内容出错'); } if($nums_o == 0) //如果没有匹配到,放弃 continue; }else if($k == 2) { $pattern = '/<li\s*class="J_floor_box\s*floor-box"[\s\S*]+>([\s\S]*)<\/li>/iU'; $matches_r_on_ok= array(); $nums_o = preg_match_all($pattern, $result_web_r, $matches_r_on_ok); if($nums_o === false) { die('匹配内容出错 电影'); } if($nums_o == 0) //如果没有匹配到,放弃 continue; $matches_r[1] = array(); foreach ($matches_r_on_ok[1] as $k_no_ok => $v_no_ok) { //匹配唯一id $pattern = '/<a\s*href="\/deal\/(.*?)"\s*target="_blank"/iU'; $matches_r_ok = array(); $nums_t = preg_match_all($pattern, $v_no_ok, $matches_r_ok); if($nums_t === false) die('匹配内容出错 电影'); foreach ($matches_r_ok[1] as $k_ok => $v_ok) { $matches_r[1][] = $v_ok; } } }else if($k == 3 || $k == 5 || $k == 6 || $k == 8 || $k ==9) { $pattern = '/<ul\s*class="tg-floor-list\s*Fix\s*tg-floor-list-freak"\s*>([\s\S]*)<\/ul>\s*<\/div>\s*<\/div>\s*<div\s*class="tg-paginator\s*Fix"\s*id="paginator"\s*>/iU'; $matches_no_ok = array(); $nums_o = preg_match_all($pattern, $result_web_r, $matches_no_ok); if($nums_o === false) die('匹配内容出错 333'); if($nums_o == 0) //如果没有匹配到,放弃 continue; $matches_r[1] = array(); //匹配唯一id $pattern = '/href="\/deal\/([\d]+)"/Ui'; $matches_ok = array(); $nums_t = preg_match_all($pattern, $matches_no_ok[1][0], $matches_ok); if($nums_t === false) die('匹配内容出错 333333'); //数组去重 $matches_r[1] = array(); $matches_r[1] = array_unique($matches_ok[1]); }else if($k == 4){ $pattern = '/<div\s*class="floor-item">\s*<ul\s*class="Fix"\s*>([\s\S]*)<\/ul>\s*<\/div>/iU'; $matches_no_ok = array(); $nums_o = preg_match_all($pattern, $result_web_r, $matches_no_ok); if($nums_o === false) die('匹配内容出错 4444'); 
if($nums_o == 0) //如果没有匹配到,放弃 continue; $matches_r[1] = array(); //匹配唯一id $pattern = '/href="\/deal\/([\d]+)"/iU'; $matches_ok = array(); $nums_t = preg_match_all($pattern, $matches_no_ok[1][0], $matches_ok); $matches_r[1] = $matches_ok[1]; }else if($k == 7) { $pattern = '/<li\s*class="floor-box\s*J_floor_box">([\s\S]*)<\/li>/iU'; $matches_r_on_ok = array(); $nums_o = preg_match_all($pattern, $result_web_r, $matches_r_on_ok); if($nums_o === false) { die('匹配内容出错 77'); } if($nums_o == 0) //如果没有匹配到,放弃 continue; $matches_r[1] = array(); foreach ($matches_r_on_ok[1] as $k_no_ok => $v_no_ok) { //匹配唯一id $pattern = '/<a\s*href="\/deal\/([\d]+)"\s*target="_blank"/iU'; $matches_r_ok = array(); $nums_t = preg_match_all($pattern, $v_no_ok, $matches_r_ok); if($nums_t === false) die('匹配内容出错 7777'); foreach ($matches_r_ok[1] as $k_ok => $v_ok) { $matches_r[1][] = $v_ok; } } } //拼接url $dazh_shopinfo = M('dazh_shopinfo'); if(!empty($matches_r[1])) { foreach ($matches_r[1] as $key => $value_id) { $shop_url = ''; $shop_url = 'http://t.dianping.com/deal/' . $value_id; $city_id = $value['id']; $insert_res = $dazh_shopinfo->add(array('url' => $shop_url,'city_id' => $city_id,'type' => $arr_info[$k],'city_name' => $value['city_name'])); if(!$insert_res) die('添加数据错误'); } } echo $this->echo_info('分页' . $i .'内容读取完毕'); } } echo $this->echo_info('<span style="color:red;font-weight:bold;">城市' . $value['city_spell'] . 
'读取完毕</span>'); $dazh_city->where(array('id' => $value['id']))->save(array('is_do' => 1)); } echo '<script type="text/javascript">location.reload(true);</script>'; } /* 从大众店铺获取城市列表,以及url,拼音 Create Table: CREATE TABLE `ty_dazh_city` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `city_name` char(50) NOT NULL DEFAULT '' COMMENT '城市名称', `city_spell` char(50) NOT NULL DEFAULT '' COMMENT '城市拼音', `city_url` char(200) NOT NULL DEFAULT '' COMMENT '城市url', `is_do` tinyint not null default 0 comment '是否使用', PRIMARY KEY (`id`), key (`is_do`) ) ENGINE=MyISAM DEFAULT CHARSET=utf8; */ public function test_get_city_info() { $dazh_city = M('dazh_city'); $url = 'http://t.dianping.com/citylist'; $result_web = $this->curl_self($url); //开始匹配内容 $pattern = '/<div\s*class="cityes"\s*>([\S\s]*)<\/div>/Ui'; $nums_o = preg_match_all($pattern, $result_web, $matches); if($nums_o == 0) die('没有匹配到内容,失败'); //循环数据入库 foreach ($matches[1] as $key => $value) { //匹配出a标签,获取内容,拼装入库 if(empty($value)) continue; $pattern_a = '/<a\s*href="([\s\S]*)"\s*title="([\s\S]*)"\s*>[\s\S]*<\/a>/iU'; $nums_t = preg_match_all($pattern_a, $value, $matches_a); if($nums_t == 0)//如果没有匹配到,那么放弃 continue; foreach ($matches_a[1] as $k => $v) { $city_name = $matches_a[2][$k]; $city_spell = trim(str_replace('/', '', $v)); $city_url = 'http://t.dianping.com' . $v; $insert_res = $dazh_city->add(array( 'city_name' => $city_name, 'city_spell' => $city_spell, 'city_url' => $city_url )); if(!$insert_res) die('insert 数据错误,' . 
$dazh_city->getDbError()); } } } /* 获取大众点评商品详情 create table ty_dazh_shop_detail( id int unsigned not null auto_increment, adress char(255) not null default '' comment '店铺地址', lat decimal(12,8) not null default 0 comment '维度', lng decimal(12,8) not null default 0 comment '经度', title char(100) not null default '' comment '标题', taste tinyint not null default 0 comment '口味', setting tinyint not null default 0 comment '环境', service tinyint not null default 0 comment '服务', content text comment '评论内容', img_info varchar(500) not null default 0 comment '图片路径', primary key (id)) partition by range(id) ( partition p0 values less than (100000), partition p1 values less than (300000), partition p2 values less than (500000), partition p3 values less than (700000), partition p4 values less than (800000), partition p5 values less than maxvalue ); alter table ty_dazh_shop_detail charset = utf8; ALTER TABLE `tuyou`.`ty_dazh_shop_detail` CHANGE COLUMN `adress` `adress` char(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '店铺地址', CHANGE COLUMN `title` `title` char(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '标题', CHANGE COLUMN `content` `content` text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '评论内容', CHANGE COLUMN `img_info` `img_info` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '0' COMMENT '图片路径'; */ public function test_get_detail() { header('Content-type:text/html;Charset=UTF-8'); $dazh_shopinfo = M('dazh_shopinfo'); $dazh_shop_detail = M('dazh_shop_detail'); $dazh_shopinfo_info = $dazh_shopinfo->field('id,url')->where(array('is_do' => 0))->find(); //获取商品唯一id标识符 $url_info = explode('/', $dazh_shopinfo_info['url']); $url_id = $url_info[count($url_info) - 1]; //读取内容 $web_info = $this->curl_self($dazh_shopinfo_info['url']); //匹配出title $pattern_h1 = '/<h1\s*class="title"\s*>([\s\S]*)<\/h1>/'; $nums_h1 = preg_match_all($pattern_h1, $web_info, $matches_h1); if($nums_h1 
=== false) die('匹配出title出错'); //过滤特殊字符 $matches_h1[1][0] = preg_replace("/\n/", '', $matches_h1[1][0]); $matches_h1[1][0] = preg_replace("/\t/", '', $matches_h1[1][0]); $title = preg_replace("/\s/", '', $matches_h1[1][0]); //判断是否有评论 $pattern_comment = '/<span>暂无评价<\/span>/iU'; $nums_com = preg_match_all($pattern_comment, $web_info, $matches_com); if($nums_com === false) die('匹配评论数量错误'); //如果有评论 if($nums_com == 0) { //提取评论(拿到店铺信息,拼接店铺url,进入店铺网址,爬取信息) //1,获取对应参数,url:http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=2022338&action=region $url_ajax = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId='.$url_id.'&action=region'; $ajax_need_info = $this->curl_self($url_ajax); $ajax_need_info = json_decode($ajax_need_info); if($ajax_need_info->code == 200) { //观察结构可以发现,subs多个表示多个地区的店铺,只拿一个即可 $regionId = $ajax_need_info->msg->region[0]->subs[0]->id; $cityId = $ajax_need_info->msg->region[0]->id; $action = 'shops'; $page = 1; //获取店铺id,经纬度 $url_shopinfo = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId='.$url_id.'&cityId='.$cityId.'&action='.$action.'&regionId='.$regionId.'&page='.$page; $shopinfo = $this->curl_self($url_shopinfo); $shopinfo = json_decode($shopinfo); if($shopinfo->code == 200) { foreach ($shopinfo->msg->shops as $key => $value) { $flag_is_get_img = 0;//是否2个评论都没有图片 $insert_data = array(); $insert_data['adress'] = $value->address . $value->shopName; $insert_data['lat'] = $value->glat; $insert_data['lng'] = $value->glng; $insert_data['title'] = $title; $shop_url = 'http://www.dianping.com/shop/' . 
$value->shopId; //读取店铺内信息 $shop_info_need = $this->curl_self($shop_url); //开始匹配评论区块 $pattern = '/<li\s*class="comment-item"[\s\S]*>([\s\S]*)<\/li>/iU'; $matches_no_ok = array(); $nums_o = preg_match_all($pattern,$shop_info_need,$matches_no_ok); //开始取详情 for($i =0;$i <3;$i++) { //口味 $pattern_taste = '/<span\s*class="item">口味:([\d]+)<\/span>/iU'; $nums_taste = preg_match_all($pattern_taste,$matches_no_ok[1][$i], $matches_taste); if($nums_taste) //如果有值拿值,没有就算了 $insert_data['taste'] = $matches_taste[1][0]; //环境 $pattern_setting = '/<span\s*class="item">环境:([\d]+)<\/span>/iU'; $nums_setting = preg_match_all($pattern_setting,$matches_no_ok[1][$i], $matches_setting); if($nums_setting) //如果有值拿值,没有就算了 $insert_data['setting'] = $matches_setting[1][0]; //服务 $pattern_service = '/<span\s*class="item">服务:([\d]+)<\/span>/iU'; $nums_service = preg_match_all($pattern_service,$matches_no_ok[1][$i], $matches_service); if($nums_service) //如果有值拿值,没有就算了 $insert_data['service'] = $matches_service[1][0]; //评论内容 $pattern_content = '/<p\s*class="desc">([\s\S]*)<\/p>/iU'; $nums_content = preg_match_all($pattern_content,$matches_no_ok[1][$i], $matches_content); if($nums_content){ $insert_data['content'] = $matches_content[1][0]; if(mb_strlen($insert_data['content']) > 90) continue; } //评论图片 $pattern_pics = '/<img\s*src=".*?"\s*data-lazyload="(.*?)"\s*alt=".*?"\s*>\s*<\/a>/iU'; $nums_pics = preg_match_all($pattern_pics,$matches_no_ok[1][$i], $matches_pics); //存储图片 if($nums_pics) { $flag_is_get_img = 1; foreach ($matches_pics[1] as $k_pic => $v_pic) { $file = file_get_contents($v_pic); //创建目录,000/000/000/wenjian.jpg $dir_total = str_pad($value->shopId, 9, '0', STR_PAD_LEFT); $dir_arr = str_split($dir_total,3); $file_path = C('FILE_UPLOAD_PATH') . '/dzh/' . $dir_arr[0] .'/'. $dir_arr[1] .'/'. $dir_arr[2] . 
'/'; if(!is_dir($file_path)) @mkdir($file_path,0777,true); $img_url = $file_path.uniqid().'.jpg'; $fort = file_put_contents($img_url, $file); $img_url_i = str_replace(APP_PATH, '', $img_url); if($fort === false) die('存储图片失败'); else $img_url_info[] = $img_url_i; } $insert_data['img_info'] = serialize($img_url_info); } //如果没有图片,即$flag_is_get_img为0,那么提取图片 if($flag_is_get_img == 0 && $i == 2) { //提取图片 $patter_photos = '/<img\s*itemprop="photo"\s*src="([\s\S]*)"[\s\S]*>/iU'; $nums_photos = preg_match_all($patter_photos,$shop_info_need, $matches_photos); $file = file_get_contents($matches_photos[1][0]); //创建目录,000/000/000/wenjian.jpg $dir_total = str_pad($value->shopId, 9, '0', STR_PAD_LEFT); $dir_arr = str_split($dir_total,3); $file_path = C('FILE_UPLOAD_PATH') . '/dzh/' . $dir_arr[0] .'/'. $dir_arr[1] .'/'. $dir_arr[2] . '/'; if(!is_dir($file_path)) @mkdir($file_path,0777,true); $img_url = $file_path.uniqid().'.jpg'; $fort = file_put_contents($img_url, $file); $img_url_i = str_replace(APP_PATH, '', $img_url); if($fort === false) die('存储图片失败'); else $img_url_info[] = $img_url_i; $insert_data['img_info'] = serialize($img_url_info); } //数据入库 $insert_res = $dazh_shop_detail->add($insert_data); if(!$insert_res) die('insert error ' . $dazh_shop_detail->getDbError()); } } } } //更改状态,刷新页面 $dazh_shopinfo->where(array('id' => $dazh_shopinfo_info['id']))->save(array('is_do' => 1)); echo '<script type="text/javascript">location.reload(true);</script>'; die('ok 1'); }else { //更改状态,刷新页面 $dazh_shopinfo->where(array('id' => $dazh_shopinfo_info['id']))->save(array('is_do' => 1)); echo '<script type="text/javascript">location.reload(true);</script>'; } } /* 地址转换为经纬度 ak EkmwkVyCUSGRADzjE5P7aXA4,自己申请 */ public function test_address_to_nums($address,$city) { //拼接url $url = 'http://api.map.baidu.com/geocoder/v2/?ak=EkmwkVyCUSGRADzjE5P7aXA4&callback=renderOption&output=json&address='.$address.'&city=' . 
$city; $info = file_get_contents($url); //匹配出json $pattern = '/\(([\s\S]*)\)/iU'; $nums_o = preg_match_all($pattern, $info, $matches); $result = (array)json_decode($matches[1][0]); //拼接数据 $data = array(); if($result['status'] != 0) $data = array('lng' => 0,'lat' => 0); else { $data['lng'] = $result['result']->location->lng; $data['lat'] = $result['result']->location->lat; } return $data; } }

 

posted @ 2015-01-10 00:03  栋的博客  阅读(1884)  评论(0编辑  收藏  举报
深入理解php php扩展开发 docker mongodb