Python 之 Selenium + PhantomJS 斗鱼抓取案例
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

# Page that lists every live room, and the CSS selector of its "next page" button.
START_URL = "https://www.douyu.com/directory/all"
NEXT_BUTTON_SELECTOR = ".dy-Pagination-next"
# Directory the per-page screenshots are written to.
SCREENSHOT_DIR = "./douyu"


def _is_last_page(driver):
    """Return True when the "next page" button is marked aria-disabled="true"."""
    button = driver.find_element_by_css_selector(NEXT_BUTTON_SELECTOR)
    # get_attribute() returns None when the attribute is absent; treat that
    # as "not disabled" instead of failing on `"true" in None`.
    return "true" in (button.get_attribute("aria-disabled") or "")


def _scrape_current_page(driver):
    """Print every room on the page currently loaded and return how many were printed."""
    soup = BeautifulSoup(driver.page_source, "lxml")
    names = soup.find_all("h2", attrs={"class": "DyListCover-user"})
    looks = soup.find_all("span", attrs={"class": "DyListCover-hot"})
    page_count = 0
    for name, look in zip(names, looks):
        print("房间:" + name.get_text() + "\t人数:" + look.get_text())
        page_count += 1
    return page_count


def main():
    """Walk every page of the room directory, printing rooms and saving screenshots.

    Each page is parsed and screenshotted BEFORE the pagination button is
    inspected, so the final page (where the button is disabled) is included.
    A WebDriver failure ends the crawl instead of being retried forever, and
    the browser process is always shut down.
    """
    count = 0  # pages processed
    total = 0  # rooms printed across all pages

    # save_screenshot() cannot create missing directories.
    if not os.path.isdir(SCREENSHOT_DIR):
        os.makedirs(SCREENSHOT_DIR)

    driver = webdriver.PhantomJS()
    try:
        driver.get(START_URL)
        # Give the JavaScript-rendered listing time to appear.
        time.sleep(5)

        while True:
            page_count = _scrape_current_page(driver)
            total += page_count
            count += 1
            print("==================第%s页,每页%s条数据==================" % (str(count), str(page_count)))

            # Keep a screenshot of every page.
            driver.save_screenshot("./douyu/douyu_%s.png" % str(count))

            # Stop only after the current page has been handled.
            if _is_last_page(driver):
                break

            driver.find_element_by_css_selector(NEXT_BUTTON_SELECTOR).click()
            # Wait for the next page's data to load.
            time.sleep(3)
    except WebDriverException as e:
        # Selenium errors are WebDriverException subclasses, not RuntimeError.
        # Report and stop; looping again would repeat the same failure.
        print(e)
    finally:
        # Always terminate the PhantomJS process.
        driver.quit()

    print("总页数:" + str(count))
    print("总房间数:" + str(total))


if __name__ == '__main__':
    main()
结果如图: