使用selenium模拟登陆12306以及滑块验证
selenium是一个自动化测试工具,利用它可以驱动浏览器执行特定的动作,如点击,下拉等操作,同时还可以获取浏览器当前呈现的页面源码,做到可见即可爬。常常被运用于爬取javascript动态渲染的页面。
下面是其简单用法:
通过浏览器发起请求获取响应页面源码数据然后利用xpath进行数据提取
import time

from lxml import etree
from selenium import webdriver

# Locations used by this snippet, collected in one place.
DRIVER_PATH = './chromedriver'
TARGET_URL = 'http://www.lvse.cn/xiaohua/'
LISTING_XPATH = '//div[@id="slisting"]'

# Start a Chrome instance driven by the local chromedriver binary.
bro = webdriver.Chrome(executable_path=DRIVER_PATH)

# Have the browser request the target page.
bro.get(TARGET_URL)

# page_source is the source of the page as currently held by the browser.
page_text = bro.page_source

# Parse the source with lxml and select the listing container(s) via XPath.
tree = etree.HTML(page_text)
li_list = tree.xpath(LISTING_XPATH)
动作链以及iframe处理:
通过动作链可以完成滑块滑动的操作
from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# The target element lives inside an iframe, so the driver's lookup scope
# must be switched into that frame before the element can be located.
bro.switch_to.frame('iframeResult')
div = bro.find_element_by_id('draggable')

# Build an action chain bound to this browser.
action = ActionChains(bro)

# Press and hold the mouse button on the draggable element (queued; it is
# sent to the browser by the first perform() below).
action.click_and_hold(div)

for _ in range(5):
    # move_by_offset(x, y): x is the horizontal offset, y the vertical one.
    # perform() sends the queued actions to the browser immediately.
    action.move_by_offset(17, 0).perform()
    sleep(0.2)

# Release the mouse button. release() only queues the action, so perform()
# is required; without it the button is never released and the drag never
# completes.
action.release().perform()
实现无可视化界面以及实现规避检测:
from time import sleep

# ChromeOptions is used for the anti-detection setting,
# Options for the headless (no visible window) setting.
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options

# Headless mode: run Chrome without a visible window.
chrome_options = Options()
for flag in ('--headless', '--disable-gpu'):
    chrome_options.add_argument(flag)

# Anti-detection: exclude Chrome's 'enable-automation' switch.
# NOTE: this snippet only builds the option objects; no driver is created here.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
12306自动登录操作:
实现原理:利用selenium打开登录页并对整个页面截图,再用PIL从截图中裁剪出验证码图片,提交给超级鹰识别并返回坐标;然后利用selenium按坐标点击图片验证码,最后输入账号密码、点击登录并拖动滑块完成验证。
from time import sleep

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver import ChromeOptions

from chaojiying import Chaojiying_Client

# Anti-detection: exclude Chrome's 'enable-automation' switch.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# The options object must be handed to the driver, otherwise the setting
# above is never applied to the launched browser.
bro = webdriver.Chrome(executable_path='./chromedriver', options=option)

# [x, y] click offsets (relative to the captcha element) filled by ensure_list().
all_list = []


def get_url():
    """Open the 12306 login page and click the second login tab."""
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    sleep(1)
    bro.find_element_by_xpath('/html/body/div[2]/div[2]/ul/li[2]').click()
    # Short pause before the page is screenshotted by save().
    # NOTE(review): assumes the captcha image needs a moment to render - tune as needed.
    sleep(1)


def save():
    """Screenshot the page to ./aaa.png and return the captcha bounding box.

    Returns:
        A (left, upper, right, lower) tuple of ints built from the
        location and size of the 'J-loginImgArea' element.
    """
    code_img_ele = bro.find_element_by_id('J-loginImgArea')
    bro.save_screenshot('./aaa.png')
    location = code_img_ele.location
    size = code_img_ele.size
    rangel = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )
    return rangel


def crop():
    """Crop the captcha out of a fresh screenshot and save it as code.png.

    Returns:
        The file name of the cropped captcha image.
    """
    rangel = save()
    code_img_name = 'code.png'
    # Context manager closes the screenshot file once the crop is written.
    with Image.open('./aaa.png') as screenshot:
        screenshot.crop(rangel).save(code_img_name)
    return code_img_name


def get_track():
    """Send the cropped captcha to the Chaojiying service.

    Returns:
        The 'pic_str' field of the service response. ensure_list() parses
        it as 'x,y' pairs separated by '|'.
    """
    code_img_name = crop()
    # Chaojiying account name, password and software id.
    chaojiying = Chaojiying_Client('username', 'passwd', 'id')
    with open(code_img_name, 'rb') as img_file:
        im = img_file.read()
    # 9004 is the captcha type code passed to PostPic.
    result = chaojiying.PostPic(im, 9004)['pic_str']
    return result


def ensure_list():
    """Recognise the captcha and click every returned coordinate.

    Parsed coordinates are appended to the module-level ``all_list``.

    Raises:
        ValueError: if the service returned an empty or malformed result.
    """
    code_img_ele = bro.find_element_by_id('J-loginImgArea')
    result = get_track()
    if not result:
        raise ValueError('captcha service returned no coordinates')

    # split('|') also handles a single 'x,y' answer (yields one item),
    # so no separate branch is needed for the no-'|' case.
    for pair in result.split('|'):
        parts = pair.split(',')
        all_list.append([int(parts[0]), int(parts[1])])

    for x, y in all_list:
        print(x, y)
        ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
        sleep(0.5)


def login():
    """Fill in the credentials, submit, drag the slider, then quit the browser."""
    try:
        bro.find_element_by_id('J-userName').send_keys('username')  # 12306 account
        sleep(1)
        bro.find_element_by_id('J-password').send_keys('passwd')  # 12306 password
        sleep(1)
        bro.find_element_by_id('J-login').click()

        # Evade Selenium detection: make navigator.webdriver report false.
        script = 'Object.defineProperty(navigator,"webdriver",{get:() => false,});'
        bro.execute_script(script)
        sleep(3)

        hold_div = bro.find_element_by_xpath('//*[@id="nc_1_n1z"]')
        action = ActionChains(bro)
        action.click_and_hold(hold_div)
        action.move_by_offset(400, 0).perform()
        # release() only queues the action; perform() is needed to let go
        # of the slider.
        action.release().perform()
        sleep(5)
    finally:
        # Always shut the browser down, even if a step above fails.
        bro.quit()


# Each step calls the previous one itself (ensure_list -> get_track -> crop
# -> save), so only the outermost steps are invoked here; calling the inner
# ones as well would repeat the screenshot and the captcha request.
get_url()
ensure_list()
login()
当然,可用并不一定适用。splash也可以进行动态渲染,并且程序不会阻塞,但是与浏览器进行交互也就略显麻烦;selenium通常也是用于通过验证码。使用js代码也可以与浏览器进行交互,我也是比较推荐这种方式的。