4 - Web Scraping - Selenium
Introduction to Selenium
A module for browser automation
How Selenium relates to scraping
- 1. Conveniently captures dynamically loaded data
    - whatever is visible on the page can be obtained
- 2. Conveniently implements simulated login
Drawbacks of Selenium:
- low efficiency (slow)
Advantages:
- what you see is what you can get
Basic usage of Selenium
Install: pip install selenium
Prepare a browser driver executable in advance
Download the Chrome driver: http://chromedriver.storage.googleapis.com/index.html
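The driver version must match the installed Chrome version. As an optional alternative to a manual download, the third-party webdriver-manager package can fetch a matching driver automatically; a minimal sketch, assuming `pip install webdriver-manager` has been run:

```python
# Optional alternative: let webdriver-manager download a matching driver.
# Assumes `pip install webdriver-manager` has been run.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

bro = webdriver.Chrome(executable_path=ChromeDriverManager().install())
bro.get('https://www.baidu.com')
bro.quit()
```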
```python
from selenium import webdriver
from time import sleep

# 1. Instantiate a browser object
bro = webdriver.Chrome(executable_path='./chromedriver')
# 2. Issue a request
bro.get('https://www.jd.com')
# 3. Locate elements
search_input = bro.find_element_by_xpath('//*[@id="key"]')
search_input.send_keys('macPro')
btn = bro.find_element_by_xpath('/html/body/div[1]/div[4]/div/div[2]/div/div[2]/button')
btn.click()
sleep(2)
# 4. Execute JS code (scroll to the bottom of the page)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
# 5. Get the source of the current page (including dynamically loaded content)
page_text = bro.page_source
print(page_text)
sleep(3)
bro.quit()
```
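The fixed sleep() calls above are a blunt way to wait for the page to render; Selenium's explicit waits are usually more reliable. A minimal sketch, reusing the `bro` object and the `#key` search box from the example above:

```python
# A more robust alternative to fixed sleeps: explicit waits.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the search box is present in the DOM.
search_input = WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'key'))
)
```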
Fetching dynamically loaded data
Scrape the company names from the first 5 pages
```python
from selenium import webdriver
from time import sleep
from lxml import etree

url = 'http://125.35.6.84:81/xk/'
bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get(url)
sleep(2)
page_text_list = [bro.page_source]  # page 1 is already loaded
# Click "next page" 4 more times, collecting 5 pages in total
for i in range(4):
    nextPage_a = bro.find_element_by_xpath('//*[@id="pageIto_next"]')
    nextPage_a.click()
    sleep(2)
    page_text_list.append(bro.page_source)
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]  # first company name on the page
    print(name)
    sleep(1)
bro.quit()
```
Action chains
Perform a series of consecutive actions
```python
from selenium import webdriver
from selenium.webdriver import ActionChains  # import the action chain class
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
sleep(1)
# Note: if the target element lives inside a sub-page rendered by an iframe,
# regular element location will fail.
# Fix: switch into the iframe first, as below.
bro.switch_to.frame('iframeResult')  # pass the id of the iframe tag
div_tag = bro.find_element_by_id('draggable')
# Instantiate an action chain object bound to the given browser
action = ActionChains(bro)
action.click_and_hold(div_tag)  # click and hold the given element
for i in range(5):
    action.move_by_offset(40, 30).perform()  # perform() executes the chained actions immediately
    sleep(0.5)
sleep(3)
bro.quit()
'''
Note:
action.move_by_offset(10, 20)  # offset is relative to the current mouse position
action.move_to_element_with_offset(img_tag, x, y)  # offset is relative to the element's top-left corner
'''
```
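Once the work inside the iframe is done, switch back to the top-level document with Selenium's standard API:

```python
# Switch back out of the iframe to the top-level page
bro.switch_to.default_content()
```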
Getting cookies
browser.get_cookies()
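get_cookies() returns a list of cookie dicts. A common use is copying the cookies captured after a Selenium login into a requests.Session; a minimal sketch, assuming `bro` is an already logged-in browser object:

```python
# Copy Selenium's cookies into a requests session so later requests
# carry the login state. Assumes `bro` is an already logged-in driver.
import requests

session = requests.Session()
for cookie in bro.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])

# Subsequent requests made with `session` reuse the captured cookies.
```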
Headless browsers
A browser without a visible UI
Headless Chrome
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# Create an options object that tells Chrome to start without a visible window
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Create the browser object
browser = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options)

# Browse
url = 'http://www.baidu.com/'
browser.get(url)
time.sleep(3)
print(browser.page_source)
browser.save_screenshot('baidu.png')  # take a screenshot
browser.quit()
```
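Note: on newer Selenium releases the `chrome_options` keyword is deprecated in favor of `options` (an assumption about the installed version; the snippet above targets Selenium 3):

```python
# Newer Selenium versions accept `options=` instead of the deprecated `chrome_options=`
browser = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
```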
Simulated login to 12306
Chaojiying (超级鹰)
```python
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
```
```python
def transform_code_img(img_path, img_type):
    # Fill in your Chaojiying account, password and software ID
    # (User Center >> Software ID) before running.
    chaojiying = Chaojiying_Client('chaojiying_account', 'password', 'soft_id')
    im = open(img_path, 'rb').read()  # path to the local captcha image
    return chaojiying.PostPic(im, img_type)['pic_str']
```
Simulated login
```python
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep
from PIL import Image  # image-processing module: pip install Pillow

bro = webdriver.Chrome(executable_path='./chromedriver')
url = 'https://kyfw.12306.cn/otn/login/init'
bro.get(url)
sleep(2)

# Handle the captcha. Do NOT request the captcha image URL separately:
# that would fetch a different captcha than the one shown for this login.
bro.save_screenshot('main.png')

# Work out the region to crop
# The captcha image element
img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = img_tag.location  # coordinates of the image's top-left (starting) corner
size = img_tag.size          # width and height of the image
# Crop box: top-left and bottom-right corner coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))

# Crop the captcha out of the full-page screenshot
i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('./code.png')

# Recognize the captcha
result = transform_code_img('./code.png', 9004)
print(result)  # coordinates to click inside the captcha
# "x1,y1|x2,y2|x3,y3" -> [[x1,y1],[x2,y2],[x3,y3]]
# Convert the returned string into a list of coordinate pairs
all_list = []
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)

# Use action chains to click the captcha at each coordinate
for loc in all_list:  # one click per coordinate pair
    x = loc[0]
    y = loc[1]
    ActionChains(bro).move_to_element_with_offset(img_tag, x, y).click().perform()
    sleep(1)

# Enter username and password
sleep(2)
bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('123456')
sleep(1)
# Click the login button
bro.find_element_by_id('loginSub').click()
sleep(3)
bro.quit()
```
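One caveat with the screenshot-and-crop approach: on high-DPI (e.g. Retina) displays, save_screenshot captures at the device pixel ratio, so the crop box computed from location and size may land on the wrong region. A hedged sketch of the adjustment, querying the ratio via JavaScript:

```python
# On high-DPI screens the screenshot is scaled; scale the crop box to match.
ratio = bro.execute_script('return window.devicePixelRatio')
rangle = tuple(int(v * ratio) for v in rangle)
frame = Image.open('./main.png').crop(rangle)
frame.save('./code.png')
```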
JS anti-scraping
How to use the PyExecJS utility:
- Prerequisite: a working Node.js environment must be installed on the machine
- Install the package: pip install PyExecJS (a quick sanity check follows this list)
- Save the JS code you want to execute into a .js source file
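A quick sanity check that PyExecJS can find the Node.js runtime (the evaluated expression is arbitrary):

```python
# Verify PyExecJS is installed and can locate a JS runtime
import execjs

print(execjs.get().name)     # name of the detected runtime, e.g. "Node.js (V8)"
print(execjs.eval('1 + 2'))  # -> 3
```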
- 1. Paste the deobfuscated code from the target site into a jsCode.js file
- 2. Add a custom function getPostParamCode to that JS file; it builds and returns the dynamically encrypted parameter for the POST request:
```javascript
function getPostParamCode(method, city, type, startTime, endTime) {
    var param = {};
    param.city = city;
    param.type = type;
    param.startTime = startTime;
    param.endTime = endTime;
    return getParam(method, param);
}
```
- 3. In the Python source file, use PyExecJS to execute the custom function defined in step 2, then use the requests library to send a POST request carrying the dynamically encrypted parameter:

```python
# Call getPostParamCode to build the dynamically changing request parameter
import execjs
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# 1. Get a JS runtime
node = execjs.get()

# Params
method = 'GETCITYWEATHER'
city = '北京'
type = 'HOUR'
start_time = '2018-01-25 00:00:00'
end_time = '2018-01-25 23:00:00'

# 2. Compile the JS source file
file = 'jsCode.js'
ctx = node.compile(open(file, encoding='utf-8').read())

# 3. eval executes the compiled JS function
js = 'getPostParamCode("{0}", "{1}", "{2}", "{3}", "{4}")'.format(method, city, type, start_time, end_time)
params = ctx.eval(js)

url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
# The response data comes back encrypted
code_response_text = requests.post(url, data={'d': params}, headers=headers).text
print(code_response_text)
```
- 4. Finally, call the decodeData() method defined in the JavaScript to decrypt the response:
```python
# Decrypt the encrypted response data
import execjs
import requests

node = execjs.get()

# Params
method = 'GETCITYWEATHER'
city = '北京'
type = 'HOUR'
start_time = '2018-01-25 00:00:00'
end_time = '2018-01-25 23:00:00'

# Compile the JS source file
file = 'jsCode.js'
ctx = node.compile(open(file, encoding='utf-8').read())

# Build the encrypted request parameter
js = 'getPostParamCode("{0}", "{1}", "{2}", "{3}", "{4}")'.format(method, city, type, start_time, end_time)
params = ctx.eval(js)

# Send the POST request
url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
response_text = requests.post(url, data={'d': params}).text

# Decrypt the encrypted response data
js = 'decodeData("{0}")'.format(response_text)
decrypted_data = ctx.eval(js)
print(decrypted_data)
```