学习selenium

import requests
import bs4
from selenium.webdriver.common.by import By
import pandas as pd
# Location of the Edge WebDriver executable. A raw string is used so the
# backslashes in the Windows path are taken literally rather than parsed as
# (invalid) escape sequences such as "\P" or "\M".
driverPath = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
# Site root; hrefs scraped from the index page are resolved against it.
url = 'https://so.gushiwen.cn/'
# Path of the index page, relative to the site root.
url2 = 'gushi/tangshi.aspx'

# Download the index page with plain HTTP and parse it. A timeout keeps the
# script from hanging forever on a stalled connection, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
msg = requests.get(url=url + url2, timeout=30)
msg.raise_for_status()
soup = bs4.BeautifulSoup(msg.text, "html.parser")

# Every element with class "typecont" is searched for links below.
titles = soup.find_all(class_='typecont')

result = []
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.edge.service import Service

# Selenium 4 takes the driver executable through a Service object; passing the
# path as the first positional argument is no longer supported.
driver = webdriver.Edge(service=Service(executable_path=driverPath))
try:
    for title in titles:
        # href=True skips anchors without a target, which would otherwise
        # raise KeyError on item['href'].
        list_name = title.find_all('a', href=True)
        for item in list_name:
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the site root without producing a doubled slash.
            driver.get(urljoin(url, item['href']))

            # Text of the first element with class "contson" on the page
            # loaded in the browser.
            a = driver.find_element(By.CLASS_NAME, 'contson').text

            result.append((item.text, a))
finally:
    # Always shut the browser down, even if a page fails to load or the
    # expected element is missing.
    driver.quit()

# One row per link: (link text, scraped text).
result = pd.DataFrame(result, columns=['诗名', '诗词'])
print(result)

有个爬取《唐诗三百首》的任务,但目标页面的内容是通过 script 动态加载的,普通的 requests.get 无法获取,因此引入 selenium 库来模拟浏览器访问。不过,由于每个页面都需要通过浏览器打开,运行时间大大增加。

posted @ 2024-03-20 19:48  子过杨梅  阅读(8)  评论(0)  编辑  收藏  举报