Selenium + WebDriver 爬取斗鱼主播信息(实操)

from selenium import webdriver
import time
from bs4 import BeautifulSoup


class douyuSelenium():
    """Scrape room titles and viewer counts from Douyu's room directory.

    Intended call order: ``setUp()`` -> ``testDouyu()`` -> ``shutdown()``.
    """

    # Marker that shows up in the page source when the pager's "next"
    # button is disabled, i.e. the last directory page has been reached.
    _LAST_PAGE_MARKER = 'shark-pager-disable-next'

    # Browser handle; None until setUp() runs and again after shutdown().
    driver = None

    def setUp(self):
        """Start the PhantomJS browser session used for scraping."""
        # NOTE(review): relies on the Selenium 3 era API (PhantomJS driver,
        # find_element_by_*); believed removed in Selenium 4 - confirm
        # before upgrading the dependency.
        self.driver = webdriver.PhantomJS()

    def testDouyu(self):
        """Print the title and popularity of every room, page by page.

        Loads the directory page, prints one line per room, then clicks
        the "next page" control until the last-page marker is found.
        Requires ``setUp()`` to have been called first.
        """
        self.driver.get('https://www.douyu.com/directory/all')
        while True:
            # Fixed pause to give the page time to render after load/click.
            time.sleep(2)

            # Take ONE snapshot per page and use it for both parsing and
            # the last-page check, so both see the same DOM state and the
            # source is only transferred from the browser once.
            page_html = self.driver.page_source
            soup = BeautifulSoup(page_html, 'lxml')

            # Room titles and viewer counts on the current page; they are
            # paired positionally.
            titles = soup.find_all('h3', {'class': 'ellipsis'})
            nums = soup.find_all('span', {'class': 'dy-num fr'})
            for title, num in zip(titles, nums):
                print("房间标题:{}\t人气:{}".format(
                    title.get_text().strip(), num.get_text().strip()))

            # Stop once the pager reports there is no next page.
            if self._LAST_PAGE_MARKER in page_html:
                break

            # Otherwise advance to the next page.
            self.driver.find_element_by_class_name('shark-pager-next').click()

    def shutdown(self):
        """Quit the browser if one is running.

        Safe to call when ``setUp()`` never ran and safe to call more
        than once: the driver is quit at most one time.
        """
        print('加载完成...')
        if self.driver is None:
            return
        try:
            self.driver.quit()
        finally:
            # Drop the handle even if quit() fails so a retry does not
            # touch a dead session.
            self.driver = None



if __name__ == '__main__':
    # Script entry point: start the browser, scrape every directory page,
    # and always close the browser afterwards.
    douyu = douyuSelenium()
    douyu.setUp()
    try:
        douyu.testDouyu()
    finally:
        # Quit the browser even if scraping raised, so no browser process
        # is left running after a failure.
        douyu.shutdown()

 

posted @ 2018-03-14 19:54  Bob__Zhang  阅读(293)  评论(0)  编辑  收藏  举报