爬虫--selenium模块

selenium模块与爬虫的关系:

--方便便捷的获取网站中动态加载的数据

--便捷实现模拟登录

什么是selenium模块

--基于浏览器自动化的一个模块.python代码调用浏览器,浏览器根据代码完成自动化操作.

selenium使用流程:

--环境安装:pip install selenium

--下载浏览器的驱动程序(每个浏览器不一样)

　　--chrome版本:https://chromedriver.storage.googleapis.com/index.html

　　　　　　　　 https://registry.npmmirror.com/binary.html?path=chromedriver/

--实例化一个浏览器对象

--编写基于浏览器自动化的操作代码

　　--发起请求:get(url)

　　--标签定位:find系列方法

　　--标签交互:send_keys('xxx')

　　--执行js:execute_script('js代码')

　　--前进,后退:back(),forward()

　　--关闭浏览器:quit()

--selenium处理iframe

　　--目标标签如果在iframe标签之中,则必须使用switch_to.frame(id)

　　--动作链(拖动):from selenium.webdriver import ActionChains

　　　　--实例化一个动作链对象:action=ActionChains(bro)

　　　　--click_and_hold(div):点击且长按

　　　　--move_by_offset(x,y)

　　　　--perform()让动作链执行

　　　　--action.release():释放鼠标按键(同样需要再调用perform()才会真正执行)

selenium可以获取到经过二次请求的渲染后的数据,这是不同于其他抓取模块的

以前述案例(抓取梨视频)为例selenium基础代码:

from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Start Chrome through a Service object that carries the driver path.
# The older `executable_path=` keyword was deprecated in Selenium 4 and is
# rejected by newer releases; the other examples in these notes already use
# Service, so this one now matches them.
browser = webdriver.Chrome(service=Service('chromedriver.exe'))
try:
    # Ask the browser to load the target URL.
    browser.get('https://www.bilibili.com/video/BV1VS4y1q7GM?spm_id_from=333.851.b_7265636f6d6d656e64.1')
    # Grab the source of the page as the browser currently sees it.
    page_text = browser.page_source
    # Parse the markup and look for the <video> element's src attribute.
    tree = etree.HTML(page_text)
    video_sources = tree.xpath('//video/@src')
    # The page may not expose a <video src=...>; fall back to None instead of
    # raising IndexError on an empty result list.
    video_url = video_sources[0] if video_sources else None
    # Keep the window open briefly before shutting down.
    sleep(5)
finally:
    # Always close the browser, even if loading or parsing failed.
    browser.quit()

selenium基础代码2:

from selenium import webdriver
# selenium 4.0版本以后重新封装了Service用来放驱动路径
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep

# Selenium 4 takes the driver path through a Service object.
# Before 4.0 the equivalent call was:
#     bro = webdriver.Chrome(executable_path='chromedriver.exe')
driver_service = Service("chromedriver.exe")
bro = webdriver.Chrome(service=driver_service)

# Load the page.
bro.get('https://www.taobao.com')

# Locating elements (the find_* family).
# find_element returns a single element object, not a list, e.g.:
#     tg = bro.find_element(By.ID, 'element-id')
#     tg = bro.find_element(By.NAME, 'element-name')
# find_elements returns a list, so index into it. Other locator strategies:
# By.CLASS_NAME, By.CSS_SELECTOR, By.XPATH, By.TAG_NAME, By.LINK_TEXT,
# By.PARTIAL_LINK_TEXT, e.g.:
#     tg = bro.find_elements(By.LINK_TEXT, 'Java')[0]            # exact link text
#     tg = bro.find_elements(By.PARTIAL_LINK_TEXT, 'AI')[0]      # partial link text
#     tg = bro.find_elements(By.TAG_NAME, 'tag-name')[0]
#     tg = bro.find_elements(By.CSS_SELECTOR, '.search-button-text')[0]
button_matches = bro.find_elements(By.CLASS_NAME, 'search-button-text')
search_button = button_matches[0]
input_matches = bro.find_elements(By.XPATH, '//input')
keyword_box = input_matches[0]

# Interact with the element: type the search keyword.
keyword_box.send_keys('打印')

# Run a piece of JavaScript in the page: scroll to the bottom.
scroll_to_bottom_js = 'window.scrollTo(0,document.body.scrollHeight)'
bro.execute_script(scroll_to_bottom_js)
sleep(2)

# Click the search button.
search_button.click()

# Navigate back to the previous page, then forward again.
bro.back()
bro.forward()

# Shut the browser down.
bro.quit()

selenium与iframe标签

如果定位的标签是包含在iframe标签之中的,selenium无法直接获取该标签

需要使用switch_to方法

# Switch into the iframe by passing an identifier for it (here a placeholder
# string standing in for the iframe's id).
bro.switch_to.frame('iframe的id名称')

bro.switch_to.frame('iframe名称') # Same call with a placeholder for the iframe's name; switches the scope used for locating elements
# Look the element up after the switch.
div = bro.find_element(By.ID,'aaa')

selenium与动作链:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep

# Goal: drag the target div to a given position.
service = Service("chromedriver.exe")
bro = webdriver.Chrome(service=service)
try:
    bro.get('https://www.taobao.com')

    # The target element lives inside an iframe, so switch the lookup scope
    # into that frame before locating it.
    bro.switch_to.frame('iframe001')
    div = bro.find_element(By.ID, 'mydiv')

    # Build an action chain bound to this browser.
    action = ActionChains(bro)

    # Queue "press and hold the mouse button on the element".
    action.click_and_hold(div)

    # Imitate a human drag: move in small steps with short pauses.
    for _ in range(5):
        action.move_by_offset(16, 0).perform()  # 16 pixels to the right per step
        sleep(0.3)

    # release() only queues the action; without perform() the mouse button
    # would never actually be released.
    action.release().perform()
finally:
    # Close the browser even if locating or dragging failed.
    bro.quit()

selenium无可视化界面的操作,以及selenium规避被检测到的风险:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# 实现无可视化界面模块
from selenium.webdriver.chrome.options import Options
# 实现规避检测以及无可视化浏览器
from selenium.webdriver import ChromeOptions
from time import sleep

# Headless (no visible window) configuration, the older way.
chrome_options = Options()
for flag in ('--headless', '--disable-gpu'):
    chrome_options.add_argument(flag)

# Newer way: headless plus hiding the automation switch to lower the chance
# of being detected.
option = ChromeOptions()
for flag in ('--headless', '--disable-gpu'):
    option.add_argument(flag)
option.add_experimental_option('excludeSwitches', ['enable-automation'])

service = Service('./chromedriver.exe')
bro = webdriver.Chrome(service=service, options=option)

# A headless browser runs without showing a window; PhantomJS is another
# example of a headless browser.
bro.get('https://www.baidu.com')
rendered_html = bro.page_source
print(rendered_html)
bro.quit()

需求:验证码截图及保存

# Goal: capture an element of the page (e.g. a captcha) as its own image file.
# Pillow does the cropping; the original snippet used Image without importing it.
from PIL import Image

page_shot_name = 'selenium的淘宝页面截图.png'
code_img_name = '聚划算div截图.png'

service = Service("chromedriver.exe")
bro = webdriver.Chrome(service=service)
try:
    bro.get('https://www.taobao.com')

    # Screenshot of the whole page.
    bro.save_screenshot(page_shot_name)

    # Locate the element whose area should be cut out.
    div = bro.find_elements(By.CLASS_NAME, 'grid-content')[0]

    # Top-left corner and width/height of the element, read while the
    # browser session is still alive.
    location = div.location
    size = div.size
finally:
    # The browser is no longer needed once the screenshot and geometry exist.
    bro.quit()

# Crop box as (left, upper, right, lower).
left = int(location['x'])
upper = int(location['y'])
crop_box = (
    left,
    upper,
    int(location['x'] + size['width']),
    int(location['y'] + size['height']),
)

# Open the full-page screenshot, crop the element's area and save it; the
# context manager closes the source image file afterwards.
with Image.open(page_shot_name) as page_img:
    frame = page_img.crop(crop_box)
    frame.save(code_img_name)