携程——多线程。
# -*- coding: utf-8 -*-
"""Multi-threaded scraper for Ctrip (hotels.ctrip.com) Beijing hotel pages.

The work is organised as a pipeline of daemon-thread pools connected by
queues:

    url_queue -> html_queue -> content_queue
              -> details_html_queue -> details_content_queue -> xiechen.csv

Every stage puts its output on the next queue *before* marking its input
item as done, so joining the queues in pipeline order waits for all work.
"""
import csv
import os
import threading
import time
from queue import Queue

import requests
from fake_useragent import UserAgent
from lxml import etree
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

ua = UserAgent()

# Seconds to wait for a list-page response; without a timeout a stalled
# connection would block its worker (and therefore url_queue.join()) forever.
REQUEST_TIMEOUT = 10

# Serialises CSV appends: several save_data threads share one output file.
_CSV_LOCK = threading.Lock()


class XieCheng:
    """Scrape hotel list pages, render each detail page, and save rows to CSV."""

    CSV_PATH = 'xiechen.csv'
    CSV_FIELDS = ['id', 'name', 'price', 'bed']

    def __init__(self):
        self.start_url = "http://hotels.ctrip.com/hotel/beijing1/p{}"
        self.details_url = "http://hotels.ctrip.com"  # prefix for detail-page hrefs
        self.headers = {"User-Agent": ua.random}
        # Not used by the pipeline itself; kept so existing callers that
        # read these attributes keep working.
        self.data_list = []
        self.details_list = []
        self.url_queue = Queue()              # list-page URLs
        self.html_queue = Queue()             # list-page HTML
        self.content_queue = Queue()          # detail-page URLs
        self.details_html_queue = Queue()     # rendered detail-page HTML
        self.details_content_queue = Queue()  # parsed rows ready to be saved

    @staticmethod
    def _consume(source, handler):
        """Run ``handler`` on every item taken from ``source``, forever.

        ``task_done()`` is called for every item, even when the handler
        raises. Otherwise a single failed item would leave the queue's
        unfinished-task counter above zero and ``Queue.join()`` in ``run``
        would never return.
        """
        while True:
            item = source.get()
            try:
                handler(item)
            except Exception as exc:  # worker boundary: report and keep going
                print("worker error in {}: {!r}".format(handler.__name__, exc))
            finally:
                source.task_done()

    def list_url(self, first_page=1, last_page=566):
        """Queue the list-page URLs for pages ``first_page``..``last_page``.

        The defaults reproduce the original hard-coded ``range(1, 567)``.
        """
        for page in range(first_page, last_page + 1):
            self.url_queue.put(self.start_url.format(page))

    def get_url(self):
        """Worker loop: download list pages named in ``url_queue``."""
        self._consume(self.url_queue, self._fetch_list_page)

    def _fetch_list_page(self, url):
        """Fetch one list page and queue its HTML when the status is 200."""
        print(url)
        try:
            response = requests.get(
                url=url, headers=self.headers, timeout=REQUEST_TIMEOUT)
        except RequestException as exc:
            print("list page request failed: {} ({!r})".format(url, exc))
            return
        if response.status_code == 200:
            self.html_queue.put(response.content.decode())
        else:
            print("list page skipped: {} returned HTTP {}".format(
                url, response.status_code))

    def extract_data(self):
        """Worker loop: pull detail-page URLs out of list-page HTML."""
        self._consume(self.html_queue, self._extract_detail_urls)

    def _extract_detail_urls(self, html_str):
        """Queue an absolute detail-page URL for each hotel link found."""
        print('----' * 20)
        tree = etree.HTML(html_str)
        hrefs = tree.xpath("//div[@id='hotel_list']/div/ul/li[2]/h2/a/@href")
        for href in hrefs:
            url_str = self.details_url + href
            print(url_str)
            self.content_queue.put(url_str)

    def driver_get(self):
        """Worker loop: render detail pages with a headless browser."""
        self._consume(self.content_queue, self._render_detail_page)

    def _render_detail_page(self, details_url_s):
        """Load one detail page in PhantomJS and queue the rendered HTML.

        NOTE(review): ``webdriver.PhantomJS`` is deprecated in Selenium and,
        as far as I know, absent from Selenium 4 - confirm the pinned
        version before upgrading.
        """
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Use a random User-Agent for each browser session.
        dcap["phantomjs.page.settings.userAgent"] = ua.random
        # Skip image loading.
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        try:
            driver.get(details_url_s)
            print(details_url_s)
            print("数据请求中。。。。。")
            time.sleep(3)  # fixed pause before reading page_source
            details_html_str = driver.page_source
        finally:
            # Always shut the browser down, otherwise a failed page load
            # leaves an orphaned PhantomJS process behind.
            driver.quit()
        self.details_html_queue.put(details_html_str)

    def driver_data(self):
        """Worker loop: parse rendered detail pages into row dicts."""
        self._consume(self.details_html_queue, self._parse_detail_page)

    def _parse_detail_page(self, details_html_str):
        """Build the rows for one hotel and queue them for saving.

        The first row holds the hotel's ``name`` and ``id``; it is followed
        by one row per ``tr[@data-disable='0']`` element holding ``price``
        and ``bed``. Each value is the raw XPath result list, or ``None``
        when nothing matched.
        """
        tree = etree.HTML(details_html_str)
        rows = [{
            'name': tree.xpath(
                "//*[@id='J_htl_info']/div[1]/h2[1]/text()") or None,
            'id': tree.xpath("//a[@id='linkViewMap']/@data-hotelid") or None,
        }]
        for room in tree.xpath(".//tr[@data-disable='0']"):
            rows.append({
                'price': room.xpath(
                    ".//span[@class='base_price']/text()") or None,
                'bed': room.xpath(".//td[@class='col3']/text()") or None,
            })
        print(rows)
        self.details_content_queue.put(rows)

    def save_data(self):
        """Worker loop: append parsed rows to the CSV file."""
        self._consume(self.details_content_queue, self._append_rows)

    def _append_rows(self, details_data):
        """Append one hotel's rows to the CSV, writing the header only once."""
        with _CSV_LOCK:
            # Emit the header only for a new/empty file, not before every
            # batch as the original did.
            need_header = (not os.path.exists(self.CSV_PATH)
                           or os.path.getsize(self.CSV_PATH) == 0)
            # newline='' is what the csv module expects; without it extra
            # blank lines appear on Windows.
            with open(self.CSV_PATH, 'a', encoding='utf-8', newline='') as f:
                writer = csv.DictWriter(f, self.CSV_FIELDS)
                if need_header:
                    writer.writeheader()
                writer.writerows(details_data)
        print("数据保存完成。。。。。。")

    def run(self):
        """Fill the URL queue, start every worker pool, and wait for all work."""
        # Fill the queue before any worker starts so url_queue.join() cannot
        # return early on a still-empty queue.
        self.list_url()

        # (worker loop, thread count). get_url now loops over the queue, so
        # a small pool replaces the original one-thread-per-page approach.
        stages = [
            (self.get_url, 10),
            (self.extract_data, 10),
            (self.driver_get, 7),
            (self.driver_data, 5),
            (self.save_data, 3),
        ]
        for target, count in stages:
            for _ in range(count):
                worker = threading.Thread(target=target)
                # Daemon threads: the endless worker loops must not keep the
                # process alive once the main thread has finished.
                worker.daemon = True
                worker.start()

        # Join in pipeline order; see the module docstring for why this
        # is sufficient.
        for q in [self.url_queue, self.html_queue, self.content_queue,
                  self.details_html_queue, self.details_content_queue]:
            q.join()

        print("主线程结束")


if __name__ == '__main__':
    xc = XieCheng()
    xc.run()
感觉多线程和 PhantomJS 部分还不够完善，想再多写一些判断和反爬措施。