爬虫 - 斗鱼房间信息


'''爬取斗鱼直播所有的房间信息,
https://www.douyu.com/gapi/rkc/directory/0_0/1 还可直接获取json数据
'''
import time

from selenium import webdriver
from fake_useragent import UserAgent
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

class DouYuRoom(object):
    """Collect the game category and room URL of rooms in Douyu's directory.

    A Selenium-driven browser opens the directory page, reads the room cards
    shown on the current page, and then keeps clicking the pagination control
    to walk through the following pages.

    Attributes:
        url: Address of the directory listing that ``run`` opens.
        driver: Selenium WebDriver used for every page access.
        content_list: All room records gathered so far by ``run``.
        page_load_wait: Seconds to sleep after the initial page load.
        page_turn_wait: Seconds to sleep after each "next page" click.
    """

    # Room cards (<li>) of the listing on the current page.
    _ROOM_ITEMS_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li'
    # Pagination control clicked to advance to the next page.
    # NOTE(review): this locator is purely positional (9th pager item);
    # confirm it still targets the "next page" control and that it is absent
    # on the last page, otherwise cap the crawl with ``run(max_pages=...)``.
    _NEXT_PAGE_XPATH = '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span'
    # Locators evaluated relative to a single room card.
    _ZONE_XPATH = './/span[@class="DyListCover-zone"]'
    _LINK_XPATH = './/a'

    def __init__(self, driver=None, page_load_wait=10, page_turn_wait=5):
        """Set up the scraper.

        Args:
            driver: Optional ready-made Selenium WebDriver. When omitted, a
                new local Chrome session is started.
            page_load_wait: Seconds to sleep after opening ``url``.
            page_turn_wait: Seconds to sleep after clicking "next page".
        """
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome() if driver is None else driver
        self.content_list = []
        self.page_load_wait = page_load_wait
        self.page_turn_wait = page_turn_wait

    def get_content_list(self):
        """Read the room cards on the page currently shown by the driver.

        Returns:
            tuple: ``(next_url, content_list)``. ``next_url`` is the element
            to click for the next page, or ``None`` when the locator matches
            nothing. ``content_list`` is a list of
            ``{'game': ..., 'room_url': ...}`` dicts, one per room card.
        """
        content_list = []
        for li in self.driver.find_elements(By.XPATH, self._ROOM_ITEMS_XPATH):
            zones = li.find_elements(By.XPATH, self._ZONE_XPATH)
            links = li.find_elements(By.XPATH, self._LINK_XPATH)
            if not zones or not links:
                # A card lacking the category label or a link is skipped so
                # that one odd list item does not abort the whole crawl.
                continue
            content_list.append({
                'game': zones[0].text,
                'room_url': links[0].get_attribute('href'),
            })

        next_buttons = self.driver.find_elements(
            By.XPATH, self._NEXT_PAGE_XPATH)
        next_url = next_buttons[0] if next_buttons else None
        print('next_url', next_url)
        return next_url, content_list

    def save_content(self, content):
        """Persist one page of room records.

        Placeholder hook: it currently does nothing. ``run`` calls it once
        per page with that page's list of room dicts.

        Args:
            content: List of room dicts produced by ``get_content_list``.
        """
        pass

    def run(self, max_pages=None):
        """Open the directory and scrape page after page.

        Each page is printed, handed to ``save_content`` and appended to
        ``content_list``. Crawling stops when no "next page" control is
        found or, if given, after ``max_pages`` pages.

        Args:
            max_pages: Optional upper bound on the number of pages to
                scrape. ``None`` (the default) means no limit.
        """
        try:
            self.driver.get(self.url)
            # Fixed pause to let the listing render before reading it.
            time.sleep(self.page_load_wait)

            pages_done = 0
            while True:
                next_url, content_dic = self.get_content_list()
                print(content_dic)
                # Each page is saved as soon as it has been read.
                self.save_content(content_dic)
                self.content_list.extend(content_dic)
                pages_done += 1

                if next_url is None:
                    break
                if max_pages is not None and pages_done >= max_pages:
                    break

                next_url.click()
                time.sleep(self.page_turn_wait)
        finally:
            # Always end the browser session, even when scraping fails, so
            # no browser/driver process is left running.
            self.driver.quit()


if __name__ == '__main__':
    # Script entry point: build the scraper and start the crawl.
    DouYuRoom().run()


posted on 2019-10-24 13:08  Afrafre  阅读(251)  评论(0)  编辑  收藏  举报