通过Selenium爬取去哪儿网酒店数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'Fade Zhao'
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pymongo import MongoClient

from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
class QunaSpider(object):
    """Selenium-driven crawler that collects hotel listings from qunar.com.

    ``crawl`` opens the site in Chrome, ``get_hotel`` submits the hotel search
    form and walks the result pages, and ``save_data`` stores each page of
    parsed listings in the local MongoDB collection ``mydb.hotel``.
    """

    # Placeholder stored when a listing carries no review score.
    _NO_SCORE = '暂无评分'

    def get_hotel(self, driver, toCity, stop_Page=3, starData=None, endData=None):
        """Search hotels in ``toCity`` and save up to ``stop_Page`` result pages.

        Args:
            driver: Selenium WebDriver already showing the site's home page.
            toCity: City name typed into the search form; the result page's
                title is expected to contain it.
            stop_Page: Maximum number of result pages to process.
            starData: Check-in date text for the form; left untouched if None.
            endData: Check-out date text for the form; left untouched if None.

        The driver is always quit before this method returns or raises.
        """
        # Imported locally so the module's import block stays untouched.
        from selenium.common.exceptions import WebDriverException

        try:
            self._submit_search(driver, toCity, starData, endData)
            page_num = 0
            while page_num < stop_Page:
                try:
                    WebDriverWait(driver, 5).until(
                        EC.title_contains(toCity)
                    )
                except WebDriverException as e:
                    print(e)
                    break
                # Scroll to the bottom via JS so the lazily loaded items render.
                driver.execute_script('''
                        window.scrollTo(0,document.body.scrollHeight);
                                     ''')
                # Wait 5 seconds for the Ajax content to load.
                time.sleep(5)
                # Parse the rendered page.
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                data_list = [
                    self._parse_hotel(item)
                    for item in soup.find_all(class_='item_hotel_info')
                ]
                # Persist this page's listings to MongoDB.
                self.save_data(data_list)

                try:
                    # Pass a locator (not an element) so the wait really polls
                    # until the "next" button appears and becomes visible.
                    next_page = WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.CSS_SELECTOR, '.item.next'))
                    )
                    next_page.click()
                    page_num += 1
                except WebDriverException as e:
                    print('错误:', e)
                    break
                time.sleep(5)
        finally:
            # Release the browser even when parsing or saving fails.
            driver.quit()

    def _submit_search(self, driver, toCity, starData, endData):
        """Switch to the hotel tab, fill in the search form and submit it."""
        # Switch to the hotel search panel.
        driver.find_element(By.XPATH, '//*[@id="js_nva_cgy"]/li[2]').click()
        time.sleep(1)
        # City input box.
        city_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toCity']")
        # Check-in date input.
        startDate_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='fromDate']")
        # Check-out date input.
        endDate_ele = driver.find_element(By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toDate']")

        city_ele.clear()
        city_ele.send_keys(toCity)
        # send_keys(None) raises, so only type the dates that were supplied.
        if starData is not None:
            startDate_ele.send_keys(starData)
        if endData is not None:
            endDate_ele.send_keys(endData)
        # Submit button.
        driver.find_element(By.XPATH, "//*[@id='js_hotel_searchbox']//button").click()

    @staticmethod
    def _normalize_score(score_text):
        """Return the part of ``score_text`` before '/', or the no-score placeholder."""
        if score_text is None or not score_text.strip():
            return QunaSpider._NO_SCORE
        return score_text.split('/')[0]

    def _parse_hotel(self, item):
        """Build the record dict for one ``item_hotel_info`` element."""
        title = item.find('span', class_='hotel_item')
        name = title.a.get_text()
        hotel_url = title.a['href']
        hotel_type = title.em.get_text()
        hotel_address = (
            item.find('span', class_='area_contair')
            .get_text().replace('\n', '').replace('\t', '').strip()
        )
        # The score node may be absent; look it up step by step instead of
        # calling get_text() on a possible None.
        facilities = item.find('td', class_='hotel_facilities')
        score_ele = facilities.find('p', class_='score') if facilities is not None else None
        common_score = self._normalize_score(
            score_ele.get_text() if score_ele is not None else None
        )
        print(common_score)
        hotel_price = item.find('div', class_='hotel_price').b.get_text()

        print('hotel_price=', hotel_price)
        print('name=', name)
        print('url=', hotel_url)
        print('hotel_type=', hotel_type)
        return {
            'name': name,
            'url': hotel_url,
            'score': common_score,
            'type': hotel_type,
            'price': hotel_price,
            'address': hotel_address,
        }

    def save_data(self, data):
        """Insert a list of hotel dicts into the local MongoDB ``mydb.hotel``.

        An empty ``data`` is a no-op (a bulk insert of nothing is an error in
        PyMongo). A single dict is accepted and stored as one document.
        """
        if not data:
            return
        if isinstance(data, dict):
            data = [data]
        # The context manager closes the client instead of leaking one
        # connection pool per saved page.
        with MongoClient('localhost', 27017) as conn:
            # mydb / hotel are created automatically on first insert.
            conn.mydb.hotel.insert_many(data)

    def crawl(self, root_url, to_city):
        """Open ``root_url`` in Chrome and crawl hotels in ``to_city``.

        Searches for a one-night stay from today to tomorrow and processes up
        to 4 result pages.
        """
        current_day = datetime.date.today()
        today = current_day.strftime('%Y-%m-%d')
        tomorrow = (current_day + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        driver = webdriver.Chrome()
        try:
            driver.set_page_load_timeout(20)
            driver.get(root_url)
            driver.maximize_window()
            driver.implicitly_wait(10)
        except Exception:
            # get_hotel has not taken over the driver yet, so close it here.
            driver.quit()
            raise
        self.get_hotel(driver, to_city, 4, today, tomorrow)

if __name__ == '__main__':
    # Script entry point: crawl Hangzhou hotels starting from the Qunar home page.
    QunaSpider().crawl('https://www.qunar.com/', '杭州')
    # TODO: still needs refinement.

 

posted @ 2017-12-11 01:08  LeeeetMe  阅读(250)  评论(0)  编辑  收藏  举报