python模块之selenium

配置

- 建议安装 Chrome 浏览器
- 安装与 Chrome 浏览器版本对应的驱动：<http://chromedriver.storage.googleapis.com/index.html> 或 <https://googlechromelabs.github.io/chrome-for-testing/#stable>

思路：
- selenium直接解析tag的数据
- 先用selenium获取动态加载完成后的page_source，再使用xpath(etree)解析数据

python代码

# pip install selenium
# 导包
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select 
from selenium.webdriver import ActionChains
import time

# Build the Chrome options, launch the browser and open the target page.
opt = Options()
# Command-line flags passed to Chrome; uncomment the optional ones as needed.
chrome_flags = [
    # UA,                # custom user-agent flag
    # "--headless",      # headless browser (no visible window)
    # "--disable-gpu",
    "--window-size=4000,1600",  # large window so elements do not stack on each other
]
for flag in chrome_flags:
    opt.add_argument(flag)
# Instantiate the browser object with those options.
bro = Chrome(options=opt)
# Global implicit wait: a find call continues as soon as the element has
# loaded and otherwise waits up to 10 s. It only applies to find operations;
# after an action such as a click you still have to sleep yourself.
bro.implicitly_wait(10)

url = '我是要访问的网址'
bro.get(url)

# Parse the page
page_text = bro.page_source  # page source including dynamically loaded data
# The By locator constants are upper-case: By.xpath raises AttributeError,
# By.XPATH is the correct name.
tag = bro.find_element(By.XPATH, 'xxx ')
href = tag.get_property("href")
# To match several tags use find_elements (plural), which returns a list.
tag_list = bro.find_elements(By.XPATH, 'xxx ')
for tag in tag_list:
    content = tag.text
    href = tag.get_property("href")

# Flatten the browser cookies into a plain {name: value} dictionary.
cookies = bro.get_cookies()
dic = {item['name']: item['value'] for item in cookies}
print(dic)  # cookies in a form that can be reused by a crawler

# Save a screenshot of a single element (e.g. a captcha dialog)
# find_element() needs a locator strategy and a value; replace the 'xxx '
# placeholder with the real XPath of the element to capture.
code_tag = bro.find_element(By.XPATH, 'xxx ')
code_tag.screenshot('./code.png')  # save the element screenshot to the given path
png_bytes = code_tag.screenshot_as_png  # the same screenshot as PNG bytes

# Quit the browser
bro.quit()

滑动页面/滑动翻页,划动至末页

# Scroll the page / scroll-to-load pagination, down to the last page
# Option 1: send the END key to <body> several times, pausing between presses
for i in range(4):
    bro.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    # Only the `time` module is imported, so the call must be time.sleep();
    # a bare sleep() raises NameError.
    time.sleep(3)
# Option 2: run JavaScript that scrolls to y = 9000 px; may not work on every page
bro.execute_script('document.documentElement.scrollTo(0,9000)')
# time.sleep(1)

节点交互

# Interacting with nodes
# NOTE: input_tag, search_tag and element_tag are not defined in this snippet;
# presumably they are elements located earlier with find_element.
# Type text into the element
input_tag.send_keys('24寸自行车')
# Type text and then press Enter
input_tag.send_keys('24寸自行车',Keys.ENTER)

# Click a button
search_tag.click()
# If a direct click is not possible (e.g. the element is covered), click via JavaScript
bro.execute_script("arguments[0].click();", element_tag)

# Run JavaScript code
bro.execute_script('JS代码' )

窗口操作

# Window handling
# Switch between windows
bro.switch_to.window(bro.window_handles[-1])   # enter the new window (last handle in the list)
bro.close()  # close the new window once it has been parsed
bro.switch_to.window(bro.window_handles[0])   # switch back to the old window -- do not forget this step

# Switch into an iframe
# The driver object in these notes is `bro`; `web` is not defined in the
# visible snippets and would raise NameError.
iframe_tag = bro.find_element(By.XPATH, '//div[@class="QUI_POP_CONT"]/iframe')
bro.switch_to.frame(iframe_tag)
bro.switch_to.parent_frame()  # switch back to the parent frame when done

# Drive a drop-down: locate the <select> tag and wrap it in Select
select_tag = bro.find_element(By.XPATH, '//*[@id="OptionDate"]')
sel = Select(select_tag)
option_count = len(sel.options)
for i in range(option_count):  # one pass per available option
    sel.select_by_index(i)  # switch to the option at this index
    time.sleep(1)  # presumably gives the page time to update -- confirm
    table = bro.find_element(By.XPATH, '//*[@id="TableList"]/table')

动作链 ActionChains

# ActionChains: queued mouse actions
from selenium.webdriver import ActionChains
# Instantiate the action-chain object for this browser
action = ActionChains(bro)
div_tag = bro.find_element(By.ID, 'draggable')
# 1. Press and hold the mouse button on the element. Actions are only queued
#    until perform() is called, so perform() makes the press happen now.
action.click_and_hold(div_tag).perform()

# 2. Drag by (x, y) pixels in small steps
for i in range(5):
    action.move_by_offset(10, 10).perform()  # perform() runs the queued actions immediately
    # Only the `time` module is imported; a bare sleep() raises NameError.
    time.sleep(0.5)

# 3. Release the mouse button. release is a method: without the parentheses
#    nothing is queued and the button is never released.
action.release().perform()

# move_by_offset moves relative to the CURRENT mouse position
# (it is not measured from a corner of the screen).
action.move_by_offset(10, 10)
action.perform()

# move_to_element_with_offset measures the offset from the element itself:
# from its in-view center point in Selenium >= 4.3, from its top-left corner
# in older releases -- never from the bottom-left corner.
x, y = 10, 10  # example offsets in pixels; replace with the real values
action.move_to_element_with_offset(code_tag, x, y)
action.perform()

规避操作

在浏览器 F12 的 Console 中输入 `window.navigator.webdriver` 可以确认是否会被检测：由 selenium 驱动的浏览器返回 true，说明网站能识别出自动化程序；此时需要添加下面的 options 进行规避。

# Option 1 (preferred)
from selenium.webdriver.chrome.options import Options
opt = Options()
# Exclude the "enable-automation" switch from Chrome's start-up switches
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Disable the Blink "AutomationControlled" feature
opt.add_argument('--disable-blink-features=AutomationControlled')
# Pass the options object built above; the name `option` is never defined
# and would raise NameError.
bro = Chrome(options=opt)

# Option 2: inject stealth.min.js before any page script runs
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
bro = Chrome(options=chrome_options)
# Selenium should run this JS file before it opens any page.
# The stealth.min.js file must exist next to the script. An explicit encoding
# avoids decode errors where the locale default is not UTF-8 (e.g. GBK on
# Chinese Windows).
with open('./stealth.min.js', encoding='utf-8') as f:
    js = f.read()
# Inject the JS to get around detection.
# execute_cdp_cmd sends a Chrome DevTools Protocol command to the browser;
# Page.addScriptToEvaluateOnNewDocument registers the script so that it is
# evaluated in every new document.
bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
url = 'https://www.taobao.com/'
bro.get(url)