# Selenium example: crawling Maoyan movies (original title: Selenium-[实例]猫眼电影爬取)
import random
import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
# Filesystem path of the chromedriver executable handed to ChromeService.
# NOTE: the name is spelled "DIRVER" (sic); main() refers to it by this spelling.
DIRVER_PATH = r'C:\Users\Administrator\Desktop\chromedriver.exe'
# Path of the JavaScript file used to get past Selenium detection; main()
# reads it and registers it to be evaluated on every new document.
STEALTH_JS = r'C:\Users\Administrator\Desktop\stealth.min.js'
def _probe_film_rating(driver, film_card, listing_handle):
    """Open one film's detail tab, report whether a rating element exists, then return to the listing.

    Args:
        driver: The running Chrome WebDriver, currently focused on the listing tab.
        film_card: The listing-page element that is clicked to open the film.
        listing_handle: Window handle of the listing tab to switch back to.
    """
    # Imported locally: the file's import header does not provide these names.
    from selenium.common.exceptions import NoSuchElementException, TimeoutException
    from selenium.webdriver.support.ui import WebDriverWait

    print("点击电影item元素")
    ActionChains(driver).move_to_element(film_card).click().perform()

    try:
        # The detail tab may not be registered the instant the click returns.
        WebDriverWait(driver, 5).until(lambda d: len(d.window_handles) > 1)
    except TimeoutException:
        # No detail tab appeared. Closing the current window here would close
        # the listing itself, so leave this film alone.
        return

    print("切换到最后的标签页")
    # Pick the tab by identity rather than by position: the order of
    # window_handles is not guaranteed.
    detail_handle = [h for h in driver.window_handles if h != listing_handle][-1]
    driver.switch_to.window(detail_handle)

    print("查找评分的元素")
    try:
        # The rating is font-obfuscated, so the plan is to screenshot this
        # element and pass the image to OCR.
        # TODO: screenshot the located element and hand it to OCR.
        driver.find_element(By.XPATH, "//span[@class='index-left info-num ']/span")
    except NoSuchElementException:
        print("暂无评分")
    else:
        print("可以获取到评分元素")

    # Close the detail tab and go back to the listing tab.
    driver.close()
    driver.switch_to.window(listing_handle)


def main():
    """Crawl the Maoyan film listing and probe each film's detail page for its rating element.

    Starts Chrome through the driver at ``DIRVER_PATH`` with automation
    switches disabled, registers the script stored at ``STEALTH_JS`` to be
    evaluated on every new document, and opens the listing page.  For every
    listing page it clicks each film card, checks the detail tab for the rating
    element, and pauses 1-3 seconds between films.  Pagination follows the
    "下一页" link until a page has no such link.  The browser is always shut
    down on exit, including when an error occurs.
    """
    # Read the stealth script first so a missing or unreadable file fails
    # before any browser process is started.
    # Assumes the script file is UTF-8 encoded -- TODO confirm.
    with open(STEALTH_JS, encoding="utf-8") as f:
        js = f.read()

    service = ChromeService(executable_path=DIRVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # Keep Selenium from being detected.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.execute_cdp_cmd(
            cmd="Page.addScriptToEvaluateOnNewDocument",
            cmd_args={
                "source": js
            }
        )
        # Open the listing page.
        driver.get("https://www.maoyan.com/films?showType=3")
        driver.maximize_window()
        # Implicit wait: element lookups retry for up to 5 seconds.
        driver.implicitly_wait(5)

        listing_handle = driver.current_window_handle
        while True:
            film_cards = driver.find_elements(
                By.XPATH,
                '//div[@class="movies-list"]/dl/dd/div[@class="movie-item film-channel"]',
            )
            for film_card in film_cards:
                _probe_film_rating(driver, film_card, listing_handle)
                # Short random pause between films.
                time.sleep(random.randint(1, 3))

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises; use it to detect the last page. Looking the
            # link up after scraping means the last page is processed too.
            next_links = driver.find_elements(By.LINK_TEXT, "下一页")
            if not next_links:
                break
            print("点击下一页")
            next_links[0].click()

        # Pause before the browser is shut down.
        time.sleep(10)
    finally:
        # Always release the browser and the chromedriver process.
        driver.quit()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# This article comes from cnblogs (博客园), author: 蕝戀. When reposting, please credit the original link: https://www.cnblogs.com/juelian/p/17559584.html