爬虫案例-使用selenium模拟点击动态页面
爬取斗鱼上正在直播的主播名、直播分区、直播标题以及直播热度等信息,并以 JSON Lines 的形式(每行一个 JSON 对象)追加写入本地的 douyu.json 文件,代码如下:
# coding:utf-8
# Scrape the Douyu "all live rooms" directory with headless Chrome and append
# one JSON object per room (keys: username, zone, title, hot) to douyu.json.
import json
import unittest

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class DouyuSpider(unittest.TestCase):
    """Crawl the Douyu live directory page by page and store it as JSON Lines.

    The crawl is written as a ``unittest.TestCase``: ``setUp`` creates the
    browser and opens the output file, ``testDouyu`` performs the crawl, and
    ``tearDown`` releases both resources.
    """

    # (output key, tag name, CSS class) for every field taken from a room card.
    _FIELDS = (
        ('username', 'div', 'DyListCover-userName'),  # streamer name
        ('zone', 'span', 'DyListCover-zone'),  # live category
        ('title', 'h3', 'DyListCover-intro'),  # stream title
        ('hot', 'span', 'DyListCover-hot'),  # popularity figure
    )

    def setUp(self):
        """Start a headless Chrome session and open ``douyu.json`` for appending."""
        options = ChromeOptions()
        options.add_argument('--headless')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36')
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_argument("disable-blink-features=AutomationControlled")
        # A value of 2 for 'notifications' is passed as a Chrome content-setting
        # preference. NOTE(review): presumably this blocks notification
        # prompts -- confirm against Chrome's preference documentation.
        prefs = {'profile.default_content_setting_values': {'notifications': 2}}
        options.add_experimental_option('prefs', prefs)

        self.driver = Chrome(options=options)
        self.url = 'https://www.douyu.com/directory/all'
        try:
            # Records are dumped with ensure_ascii=False, so the file must be
            # UTF-8 regardless of the platform's default encoding.
            self.f = open('douyu.json', 'a', encoding='utf-8')
        except OSError:
            # tearDown is not run when setUp fails; do not leak the browser.
            self.driver.quit()
            raise

    def testDouyu(self):
        """Visit every directory page and write one JSON line per room card."""
        self.driver.get(self.url)
        while True:
            # Block (up to 10 s) until the pager control is present in the DOM;
            # the located element is what gets clicked to advance.
            # NOTE(review): if the "previous page" control carries the same
            # class, the first match may be that one -- confirm on the live page.
            next_button = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'dy-Pagination-item-custom')))

            # Take one snapshot and use it for both parsing and the last-page
            # check so the two always describe the same page state.
            page_html = self.driver.page_source
            soup = BeautifulSoup(page_html, 'lxml')
            cards = soup.select(
                'div[class="layout-Module-container layout-Cover ListContent"] '
                'li[class="layout-Cover-item"]')
            for card in cards:
                record = self._extract_record(card)
                if record is None:
                    # Card lacks one of the expected fields; skip it rather
                    # than abort the whole crawl.
                    continue
                self.f.write(json.dumps(record, ensure_ascii=False) + '\n')

            # The "next" control is rendered with the disabled class on the
            # last page, which ends the crawl.
            if 'dy-Pagination-disabled dy-Pagination-next' in page_html:
                break
            # NOTE(review): nothing here waits for the new page's rooms to
            # replace the old ones after the click -- verify that the next
            # snapshot cannot be taken before the list has refreshed.
            next_button.click()

    @classmethod
    def _extract_record(cls, card):
        """Return the card's fields as a dict, or ``None`` if any is missing.

        ``card`` is a BeautifulSoup element for one room entry. The first
        element matching each (tag, class) pair in ``_FIELDS`` supplies the
        text for the corresponding key.
        """
        record = {}
        for key, tag, css_class in cls._FIELDS:
            node = card.find(tag, {'class': css_class})
            if node is None:
                return None
            record[key] = node.text
        return record

    def tearDown(self):
        """Quit the browser and close the output file."""
        try:
            self.driver.quit()
        finally:
            # Close the file even when shutting the browser down raised.
            self.f.close()
        print('执行结束')


if __name__ == '__main__':
    unittest.main()