selenium,webdriver爬取斗鱼主播信息 实操
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class douyuSelenium():
    """Print room titles and popularity counts from Douyu's room directory.

    Intended call order: ``setUp()``, then ``testDouyu()``, then
    ``shutdown()`` (always, so the browser process is released).
    """

    def setUp(self):
        """Start the browser and store it on ``self.driver``."""
        # NOTE(review): PhantomJS support was dropped in Selenium 4; this
        # call only works with older Selenium releases -- confirm the pinned
        # version before upgrading.
        self.driver = webdriver.PhantomJS()

    def testDouyu(self):
        """Walk the directory page by page and print each room's info.

        For every page, pairs each room title with its popularity counter
        and prints one line per room. Stops when the page source contains
        the ``shark-pager-disable-next`` marker, which the original author
        notes appears on the last page.
        """
        self.driver.get('https://www.douyu.com/directory/all')
        while True:
            # Fixed pause before reading the page, presumably to let the
            # dynamically rendered content finish loading.
            time.sleep(2)

            # Read the source once and reuse it for both parsing and the
            # last-page check below.
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')

            # Room titles and viewer counts on the current page.
            titles = soup.find_all('h3', {'class': 'ellipsis'})
            nums = soup.find_all('span', {'class': 'dy-num fr'})
            for title, num in zip(titles, nums):
                info = ("房间标题:" + title.get_text().strip() + '\t'
                        + "人气:" + num.get_text().strip())
                print(info)

            # The disabled "next" marker means there is no further page.
            if 'shark-pager-disable-next' in page_source:
                break

            # Advance to the next page. The generic locator API is used
            # because the find_element_by_* helpers were removed in
            # Selenium 4.
            next_page = self.driver.find_element(
                By.CLASS_NAME, 'shark-pager-next')
            next_page.click()

    def shutdown(self):
        """Print the completion message and quit the browser."""
        print('加载完成...')
        self.driver.quit()


if __name__ == '__main__':
    douyu = douyuSelenium()
    douyu.setUp()
    try:
        douyu.testDouyu()
    finally:
        # Always release the browser process, even if scraping fails.
        douyu.shutdown()