Using the selenium module in web scraping
1. Basic concepts
1. The selenium module
A module built on browser automation.
2. How it relates to web scraping
It makes it easy to capture dynamically loaded data (what you can see, you can get).
It can also simulate logging in.
3. Environment setup
pip3 install selenium
A quick demonstration:
from selenium import webdriver
from time import sleep

# The argument is the path to your browser driver; the r'' prefix prevents
# character escaping in the path string
driver = webdriver.Chrome(r'chromedriver.exe')
# Open the Baidu homepage with get()
driver.get("http://www.baidu.com")
# Find the "设置" (Settings) link on the page and click it
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)
# In the settings panel, open "搜索设置" (Search Settings)
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)
# Select "show 50 results per page"
m = driver.find_element_by_id('nr')
sleep(2)
m.find_element_by_xpath('.//option[3]').click()
sleep(2)
# Click the save-settings button
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)
# Handle the popup alert: accept() confirms, dismiss() cancels
driver.switch_to.alert.accept()
sleep(2)
# Find Baidu's search box and type the keyword 美女
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)
# Click the search button
driver.find_element_by_id('su').click()
sleep(2)
# On the results page, find the "美女_百度图片" link and open it
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)
# Close the browser
driver.quit()
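One caveat: the snippets in this section use the Selenium 3 locator API (find_element_by_id, find_elements_by_link_text, and so on), which was removed in Selenium 4. On Selenium 4, the equivalent calls go through the By class, as in this sketch:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # Selenium 4 can locate the driver itself via Selenium Manager
driver.get('http://www.baidu.com')
# find_element(By.<strategy>, value) replaces the find_element_by_* family
driver.find_element(By.ID, 'kw').send_keys('selenium')
driver.find_element(By.ID, 'su').click()
driver.quit()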
2. Basic usage
from time import sleep
from selenium import webdriver

bro = webdriver.Chrome(executable_path="chromedriver.exe")
# Navigate to the target URL
bro.get("https://www.jd.com/")
sleep(2)
# Locate the search box
search_input = bro.find_element_by_id("key")
# Type the keyword into the search box
search_input.send_keys("苹果")
# Locate the search button
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
# Click the search button
btn.click()
sleep(2)
# Execute JavaScript (scroll to the bottom of the page)
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
# Grab the page source, including dynamically loaded content
page_text = bro.page_source
print(page_text)
# Quit the browser
bro.quit()
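If you only need the rendered page source and not a visible window, Chrome can run headless. A minimal sketch, assuming a Chrome build with headless support (the flags are standard Chrome options):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome without opening a browser window
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

bro = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
bro.get('https://www.jd.com/')
# The dynamically rendered source is still available headlessly
print(bro.page_source[:200])
bro.quit()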
from time import sleep
from selenium import webdriver
from lxml import etree

bro = webdriver.Chrome(executable_path="chromedriver.exe")
bro.get("http://125.35.6.84:81/xk/")
sleep(2)

# Collect the source of the first page, then click through the next three pages
page_text = bro.page_source
page_text_list = [page_text]
for i in range(3):
    bro.find_element_by_id("pageIto_next").click()  # click "next page"
    sleep(2)
    page_text_list.append(bro.page_source)

# Parse each collected page with lxml
for page_text in page_text_list:
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="gzlist"]/li')
    for li in li_list:
        title = li.xpath('./dl/@title')[0]
        num = li.xpath('./ol/@title')[0]
        print(title, num)

bro.quit()
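The fixed sleep(2) pauses above are the simplest way to wait for content to load, but an explicit wait is more robust: it polls until a condition holds and fails fast otherwise. A sketch using Selenium's built-in WebDriverWait against the same pageIto_next button:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('http://125.35.6.84:81/xk/')
# Block for up to 10 seconds until the "next page" button is clickable,
# instead of sleeping for a fixed interval
wait = WebDriverWait(bro, 10)
next_btn = wait.until(EC.element_to_be_clickable((By.ID, 'pageIto_next')))
next_btn.click()
bro.quit()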
3. Action chains
A sequence of continuous actions.
When locating elements, if the target tag turns out to sit inside an iframe, you must first perform a fixed operation before locating it: bro.switch_to.frame('id'), where 'id' is the iframe's id attribute.
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# The draggable div lives inside an iframe, so switch into it first
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')
# Dragging = click-and-hold + move
action = ActionChains(bro)
action.click_and_hold(div_tag)
for i in range(5):
    # perform() executes the queued actions immediately
    action.move_by_offset(17, 5).perform()
    sleep(0.5)
action.release().perform()
sleep(3)
bro.quit()
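When you need to locate elements outside the iframe again, switch back to the top-level document with switch_to.default_content(), a standard Selenium call; continuing from the example above:

# After finishing work inside the iframe, return to the top-level document
bro.switch_to.default_content()
# Elements in the outer page can now be located again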
4. Simulating a 12306 login
# Cjy.py
import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a misrecognized captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
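A minimal usage sketch for the client above; the credentials are placeholders for your own Chaojiying account, and 9004 is the click-coordinates captcha type used later in this section:

from Cjy import Chaojiying_Client

# Placeholder credentials: username, password, and software ID from your account
chaojiying = Chaojiying_Client('username', 'password', 'soft_id')
im = open('code.png', 'rb').read()       # raw bytes of the captcha image
result = chaojiying.PostPic(im, 9004)    # 9004 = click-coordinates captcha type
print(result['pic_str'])                 # e.g. "55,70|267,133"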
Simulated login:
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)
bro.save_screenshot('main.png')

# Locate the captcha image and compute its bounding box on the screenshot
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_tag.location
size = code_img_tag.size
# Crop region: (left, top, right, bottom)
rangle = (
    int(location['x']), int(location['y']),
    int(location['x'] + size['width']), int(location['y'] + size['height']))

i = Image.open('./main.png')
frame = i.crop(rangle)
frame.save('code.png')

def get_text(imgPath, imgType):
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']

# The platform returns coordinates like "55,70|267,133";
# parse them into [[55, 70], [267, 133]]
result = get_text('./code.png', 9004)
all_list = []
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)

# Click each returned coordinate, offset from the captcha image's top-left corner
for a in all_list:
    x = a[0]
    y = a[1]
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

bro.find_element_by_id('username').send_keys('123456')
sleep(1)
bro.find_element_by_id('password').send_keys('67890000000')
sleep(1)
bro.find_element_by_id('loginSub').click()
sleep(5)
bro.quit()
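Note that the crop coordinates above assume the screenshot maps 1:1 to page pixels; on high-DPI displays the crop can land in the wrong place. Assuming a Selenium version where WebElement.screenshot is available (3.x and later), the captcha element can also be captured directly, skipping the manual crop:

# Screenshot just the captcha element instead of cropping a full-page screenshot
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
code_img_tag.screenshot('code.png')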
A related example: downloading videos from Pear Video with requests and lxml:

import requests
from lxml import etree
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
for li in li_list:
    # Build each video's detail-page URL and output filename
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    detail_page_text = requests.get(detail_url, headers=headers).text
    # The real video URL is embedded in the page's JavaScript; extract it with a regex
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page_text, re.S)[0]
    # Download the video bytes and write them to disk
    video_data = requests.get(video_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(video_data)
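Reading .content buffers the entire video in memory before writing. For large files, requests' streaming mode is gentler; a sketch reusing video_url, title, and headers from the loop above:

import requests

# Stream the download in 1 MB chunks instead of buffering the whole video
with requests.get(video_url, headers=headers, stream=True) as resp:
    with open(title, 'wb') as fp:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            fp.write(chunk)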
5. Scraping data from mobile devices
1. Fiddler is a packet-capture tool that works as a proxy server. Alternatives include:
- Charles (青花瓷)
- mitmproxy
- Configuration: enable it to capture HTTPS traffic
  - Tools -> Options -> HTTPS -> install the certificate
- Put the phone and the computer running Fiddler on the same network segment (turn on the PC's Wi-Fi hotspot and connect the phone to it)
- On the phone, visit Fiddler's ip+port, e.g. 192.168.14.110:50816, and on that page click the link to download the certificate
- Install and trust the certificate on the phone
- Set the phone's network proxy: enable the proxy and point it at the PC's IP address and Fiddler's own port (a quick sanity check from Python is sketched below)
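Once traffic flows through Fiddler, any HTTP client pointed at the same proxy is captured too. A sanity-check sketch in Python, where 192.168.14.110:50816 is the example ip+port from above (substitute your own):

import requests

# Route the request through the Fiddler proxy so it appears in the capture window
proxies = {
    'http': 'http://192.168.14.110:50816',
    'https': 'http://192.168.14.110:50816',
}
resp = requests.get('http://www.baidu.com', proxies=proxies)
print(resp.status_code)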