# Scrape hotel data from qunar.com with Selenium (通过Selenium爬取去哪网酒店数据)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Fade Zhao'

import time
import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class QunaSpider(object):
    """Scrape hotel listings from qunar.com with Selenium and store them in MongoDB."""

    def get_hotel(self, driver, toCity, stop_Page=3, starData=None, endData=None):
        """Search hotels for ``toCity`` and scrape up to ``stop_Page`` result pages.

        :param driver: Selenium WebDriver already on the qunar.com home page.
        :param toCity: destination city name (used both in the search form and
            to confirm the results page loaded via its title).
        :param stop_Page: maximum number of result pages to scrape.
        :param starData: check-in date string (YYYY-MM-DD).
        :param endData: check-out date string (YYYY-MM-DD).
        """
        # Switch to the hotel-search tab on the home page.
        driver.find_element(By.XPATH, '//*[@id="js_nva_cgy"]/li[2]').click()
        time.sleep(1)
        # Destination-city input.
        city_ele = driver.find_element(
            By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toCity']")
        # Check-in date input.
        startDate_ele = driver.find_element(
            By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='fromDate']")
        # Check-out date input.
        endDate_ele = driver.find_element(
            By.XPATH, "//div[@id='hotelsearch_panel']//input[@name='toDate']")
        city_ele.clear()
        city_ele.send_keys(toCity)
        startDate_ele.send_keys(starData)
        endDate_ele.send_keys(endData)
        # Submit the search form.
        driver.find_element(By.XPATH, "//*[@id='js_hotel_searchbox']//button").click()

        page_num = 0
        while page_num < stop_Page:
            try:
                # Confirm the results page for the requested city loaded.
                WebDriverWait(driver, 5).until(EC.title_contains(toCity))
            except Exception as e:
                print(e)
                break
            # Scroll to the bottom so lazily loaded results are rendered.
            driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            # Give the Ajax requests time to finish.
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            data_list = []
            for item in soup.find_all(class_='item_hotel_info'):
                hotel_data = {}
                title = item.find('span', class_='hotel_item')
                name = title.a.get_text()
                hotel_url = title.a['href']
                hotel_type = title.em.get_text()
                hotel_address = (item.find('span', class_='area_contair')
                                 .get_text().replace('\n', '').replace('\t', '').strip())
                # BUGFIX: the old code called .get_text() first and then tested
                # the text for None — get_text() never returns None, and a hotel
                # without a rating made find() return None and crash with
                # AttributeError. Test the element itself instead.
                score_ele = item.find('td', class_='hotel_facilities').find('p', class_='score')
                if score_ele is None:
                    common_score = '暂无评分'
                else:
                    common_score = score_ele.get_text().split('/')[0]
                print(common_score)
                hotel_price = item.find('div', class_='hotel_price').b.get_text()
                hotel_data['name'] = name
                hotel_data['url'] = hotel_url
                hotel_data['score'] = common_score
                hotel_data['type'] = hotel_type
                hotel_data['price'] = hotel_price
                hotel_data['address'] = hotel_address
                print('hotel_price=', hotel_price)
                print('name=', name)
                print('url=', hotel_url)
                print('hotel_type=', hotel_type)
                data_list.append(hotel_data)
            # Persist this page's results to MongoDB.
            self.save_data(data_list)
            try:
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(
                        driver.find_element(By.CSS_SELECTOR, '.item.next'))
                )
                next_page.click()
                page_num += 1
            except Exception as e:
                # No clickable "next" link — assume we reached the last page.
                print('错误:', e)
                break
            time.sleep(5)
        driver.quit()

    def save_data(self, data):
        """Bulk-insert a list of hotel dicts into the local MongoDB.

        :param data: list of documents; an empty list is silently ignored
            (insert_many raises on an empty batch).
        """
        if not data:
            return
        conn = MongoClient('localhost', 27017)
        try:
            db = conn.mydb  # database is created on first insert if missing
            # BUGFIX: Collection.insert() was removed in pymongo 4;
            # insert_many is the supported bulk-insert API.
            db.hotel.insert_many(data)
        finally:
            # BUGFIX: the old code leaked the MongoClient connection.
            conn.close()

    def crawl(self, root_url, to_city):
        """Open ``root_url`` in Chrome and scrape hotels for ``to_city``
        for a one-night stay starting today."""
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        driver = webdriver.Chrome()
        driver.set_page_load_timeout(20)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver, to_city, 4, today, tomorrow)


if __name__ == '__main__':
    url = 'https://www.qunar.com/'
    spider = QunaSpider()
    spider.crawl(url, '杭州')
    # TODO: still needs polishing (有待完善)