pyspider爬取TripAdvisor

 1 #!/usr/bin/env python
 2 # -*- encoding: utf-8 -*-
 3 # Created on 2017-06-11 10:10:53
 4 # Project: london
 5 
 6 from pyspider.libs.base_handler import *
 7 import pymongo
 8 
 9 
class Handler(BaseHandler):
    """pyspider handler that crawls TripAdvisor's London attraction listings.

    The crawl starts at the London attractions index page, follows every
    listing link and the "next" pagination link, extracts a few fields from
    each detail page, and inserts every non-empty result into the ``london``
    collection of the ``trip`` database on the local MongoDB server.
    """

    crawl_config = {
    }
    # Created once, when the class body is executed, and shared through the
    # class attributes below.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the London attractions index page."""
        self.crawl(
            'https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-London_England.html',
            callback=self.index_page,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every listing on an index page, then the next index page.

        Args:
            response: the fetched index page; ``response.doc`` is queried
                with CSS selectors.
        """
        for each in response.doc('.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)
        next_page = response.doc('.pagination .nav.next').attr.href
        # When no "next" link matches (e.g. on the last page) the href is
        # None; only schedule a follow-up crawl when there is a real URL.
        if next_page:
            self.crawl(next_page, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the attraction fields from a detail page.

        Args:
            response: the fetched detail page.

        Returns:
            dict with the keys ``name``, ``url``, ``comment``, ``address``,
            ``phone``, ``duration`` and ``instruction``; every value except
            ``url`` is the text of the matching element(s).
        """
        # NOTE(review): the last two selectors depend on the exact position
        # of nested <div>s (nth-child), so they are sensitive to page layout
        # changes -- confirm they still match before relying on them.
        return {
            "name": response.doc('h1').text(),
            "url": response.url,
            'comment': response.doc('.heading_ratings .taLnk').text(),
            'address': response.doc('.addressReset > span.format_address').text(),
            'phone': response.doc('.phoneNumber').text(),
            'duration': response.doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(5) > div > div:nth-child(1)').text(),
            'instruction': response.doc('#MAP_AND_LISTING > div.main_section.listingbar > div > div.above_fold_listing_details > div > div:nth-child(6) > div > b').text(),
        }

    def on_result(self, result):
        """Store a callback's result in MongoDB; empty results are skipped."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert one result document into the ``london`` collection.

        Args:
            result: the document (dict) to insert.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document insert.
        if self.db['london'].insert_one(result).inserted_id:
            print('saved to mongo', result)
45     

 

posted @ 2017-06-11 15:15  道高一尺  阅读(717)  评论(0)  编辑  收藏  举报