Selenium
1. Introduction
Selenium can be regarded as one of the best tools for getting around anti-crawler measures: it is essentially equivalent to a real browser visit, it can load dynamically rendered data, and it spares you from handling cookies by hand. Its one major drawback is efficiency, so Selenium is best reserved for crawling sites that place heavy restrictions on crawlers.
2. Basic usage
# -*- coding: utf8 -*-
# selenium + chromedriver: fetching dynamically loaded data.
# selenium is essentially a robot: it can simulate what a human does in a browser,
# such as clicking, typing text, deleting cookies, and so on.
# chromedriver is the driver program that controls the Chrome browser; each browser
# has its own driver:
# 1. Chrome:  https://sites.google.com/a/chromium.org/chromedriver/downloads
# 2. Firefox: https://github.com/mozilla/geckodriver/releases
# 3. Edge:    https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver
# 4. Safari:  https://webkit.org/blog/6900/webdriver-support-in-safari-10/
# Install selenium:
#     pip3 install selenium
# Install chromedriver: after downloading, place it in a pure-English directory that
# does not require special permissions.
import time

from selenium import webdriver

driver_path = r'D:\chromedriver\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')
# print(driver.page_source)

time.sleep(5)
# driver.close()  # close only the current page
# driver.quit()   # quit the whole browser
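Note that these notes use the Selenium 3 style API (executable_path, find_element_by_*). In Selenium 4 and later, executable_path was removed in favour of a Service object and the find_element_by_* helpers were replaced by find_element(By..., ...). A minimal sketch of the equivalent setup, assuming the same local chromedriver path:

# Selenium 4+ style setup (sketch; assumes chromedriver at the same local path)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(r'D:\chromedriver\chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.get('https://www.baidu.com')
driver.quit()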
3. Finding elements
# -*- coding: utf8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By

driver_path = r'D:\chromedriver\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

# Locating elements
'''
1. find_element_by_id: find an element by its id
   inputTag = driver.find_element_by_id('kw')
   inputTag = driver.find_element(By.ID, 'kw')
2. find_element_by_class_name: find an element by its class name
   submitTag = driver.find_element_by_class_name('su')
   submitTag = driver.find_element(By.CLASS_NAME, 'su')
3. find_element_by_name: find an element by the value of its name attribute
   submitTag = driver.find_element(By.NAME, 'su')
   submitTag = driver.find_element_by_name('su')
4. find_element_by_tag_name: find an element by its tag name
   submitTag = driver.find_element_by_tag_name('div')
   submitTag = driver.find_element(By.TAG_NAME, 'div')
5. find_element_by_xpath: find an element using XPath syntax
   submitTag = driver.find_element_by_xpath('//div')
   submitTag = driver.find_element(By.XPATH, '//div')
6. find_element_by_css_selector: find an element using a CSS selector
   submitTag = driver.find_element(By.CSS_SELECTOR, 'div')
   submitTag = driver.find_element_by_css_selector('div')
Every locator comes as find_element_by_... and find_elements_by_...:
the former returns the first match, the latter returns a list of all matches.
'''
# inputTag = driver.find_element_by_id('kw')
# inputTag = driver.find_element_by_name('wd')
# inputTag = driver.find_element_by_class_name('s_ipt')
# inputTag = driver.find_element_by_xpath('//input[@id="kw"]')
# inputTag = driver.find_element_by_css_selector('.quickdelete-wrap > input')
inputTag = driver.find_elements_by_css_selector('.quickdelete-wrap > input')[0]
inputTag.send_keys('python')

# 1. If you only want to parse data out of the page, it is better to hand the page
#    source to lxml: lxml is implemented in C, so parsing is considerably faster.
# 2. If you want to interact with an element (type text into an input, click a
#    button, ...), you must use the element-finding methods provided by Selenium.
from lxml import etree
from selenium.webdriver.common.by import By  # locate elements with By

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')
inputTag = driver.find_element(By.ID, 'kw')
inputTag.send_keys('python')
# element.get_attribute('innerHTML') returns the HTML inside an element
# element.get_attribute('outerHTML') returns the element including its own tag
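As the comments above recommend, for pure extraction you can pass driver.page_source to lxml instead of using Selenium's finders. A minimal sketch, reusing the driver created above (the XPath is only illustrative and Baidu's markup may change):

from lxml import etree

# Parse the rendered page with lxml; fast, but read-only (no clicking or typing).
html = etree.HTML(driver.page_source)
# Illustrative XPath: read the name attribute of the search box.
names = html.xpath('//input[@id="kw"]/@name')
print(names)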
4. Working with form elements
# -*- coding: utf8 -*-
# Common form elements:
#   button
#   checkbox
#   select (drop-down list)
#   input
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)

# input: typing and clearing text
# driver.get('https://www.baidu.com')
# inputTag = driver.find_element(By.ID, 'kw')
# inputTag.send_keys('python')   # type "python" into the box
# time.sleep(5)
# inputTag.clear()               # clear what was typed

# checkbox: tick it by clicking
# driver.get('https://www.douban.com')
# rememberBtn = driver.find_element_by_name('remember')
# rememberBtn.click()

# select: wrap the element in Select before using it
# driver.get('http://www.dobai.cn/')
# selectBtn = Select(driver.find_element_by_name('jumpMenu'))
# selectBtn.select_by_index(1)                      # select by index
# selectBtn.select_by_value('http://m.95xiu.com/')  # select by value
# selectBtn.select_by_visible_text('95秀客户端')      # select by visible text
# selectBtn.deselect_all()                          # clear all selections

# button: click events
driver.get('https://www.baidu.com')
inputTag = driver.find_element_by_id('kw')
inputTag.send_keys('python')
submitTag = driver.find_element_by_id('su')
time.sleep(5)
submitTag.click()
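Since the demo site used for the Select example may no longer be reachable, here is a self-contained sketch that runs the same calls against an inline data: URL page; the option values and texts are invented purely for illustration:

from selenium.webdriver.support.ui import Select

# A tiny inline page with a <select>, so the example does not depend on an external site.
driver.get('data:text/html,'
           '<select name="jumpMenu">'
           '<option value="a">OptionA</option>'
           '<option value="b">OptionB</option>'
           '</select>')
selectTag = Select(driver.find_element_by_name('jumpMenu'))
selectTag.select_by_index(1)                 # by position
selectTag.select_by_value('a')               # by the value attribute
selectTag.select_by_visible_text('OptionB')  # by the visible text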
5. Action chains
# -*- coding: utf8 -*-
# Action chains
# Sometimes an operation on a page takes several steps; in that case you can use the
# mouse action-chain class ActionChains to string the steps together.
# Action chains are not used very often in crawlers.
import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

inputTag = driver.find_element_by_id('kw')
submitBtn = driver.find_element_by_id('su')

actions = ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag, 'python')
actions.move_to_element(submitBtn)
actions.click()
actions.perform()

# There are more mouse-related operations:
# click_and_hold(element): press the mouse button without releasing it
# context_click(element):  right-click
# double_click(element):   double-click
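A short sketch exercising the other mouse operations listed above, reusing the elements from the example (purely illustrative; these actions have no useful effect on the Baidu page):

actions = ActionChains(driver)
actions.double_click(inputTag)      # double-click the input box
actions.context_click(inputTag)     # right-click it
actions.click_and_hold(submitBtn)   # press and hold on the button
actions.release(submitBtn)          # release the button again
actions.perform()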
6. Working with cookies
# -*- coding: utf8 -*-
# Cookie operations
# 1. Get all cookies:
#      driver.get_cookies()
# 2. Get a cookie by its name:
#      value = driver.get_cookie(key)
# 3. Delete all cookies:
#      driver.delete_all_cookies()
# 4. Delete a single cookie:
#      driver.delete_cookie(key)
import time

from selenium import webdriver

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)

# This only returns the cookies of the page currently loaded (https://www.baidu.com);
# cookies belonging to other pages are not visible here.
driver.get('https://www.baidu.com')
# for cookie in driver.get_cookies():
#     print(cookie)
print(driver.get_cookie('PSTM'))
# driver.delete_cookie('PSTM')
# print(driver.get_cookie('PSTM'))
# Delete all cookies:
# driver.delete_all_cookies()
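Besides reading and deleting cookies, you can also set one with add_cookie; a minimal sketch continuing from the code above (the cookie name and value are made up):

# Add a cookie for the current domain; the dict needs at least 'name' and 'value'.
driver.add_cookie({'name': 'my_test_cookie', 'value': 'hello'})
print(driver.get_cookie('my_test_cookie'))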
7. Implicit and explicit waits
# -*- coding: utf8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)

# Page waits:
# More and more pages use Ajax, so the program cannot know in advance when a given
# element has finished loading. If the page takes too long and a DOM element is not
# there yet, but the code goes ahead and uses that WebElement, an exception is raised
# (in Python, NoSuchElementException). To solve this, Selenium provides two kinds of
# waits: implicit waits and explicit waits.

# 1. Implicit wait: call driver.implicitly_wait(seconds). Before reporting that an
#    element is unavailable, the driver keeps retrying for up to that many seconds.
driver.get('https://www.douban.com/')
# Without a wait, this fails immediately:
# driver.find_element_by_id('shdiasjdsdas')
# With a wait, it only fails after 20 seconds:
# driver.implicitly_wait(20)
# driver.find_element_by_id('shdiasjdsdas')

# 2. Explicit wait: only fetch the element once a given condition holds, with a
#    maximum time; if the condition is not met within that time, an exception is
#    raised. Explicit waits are the smarter of the two.
# WebDriverWait(driver, 10).until(
#     # Wait for an element to be present; the condition takes a single argument,
#     # so the locator is passed in as a tuple.
#     EC.presence_of_element_located((By.ID, 'asdasdasdasda'))
# )
# If the element can be found right away, the full 10 seconds are not spent waiting.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'anony-book'))
)
print(element)
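presence_of_element_located is just one of the conditions in expected_conditions; a short sketch of two other commonly used ones, reusing the driver above (the locator and the title fragment are illustrative):

# Wait until an element is actually clickable, not merely present in the DOM.
btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, 'anony-book'))
)
# Wait until the page title contains a given substring.
WebDriverWait(driver, 10).until(EC.title_contains('豆瓣'))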
8. Opening and switching between windows
# -*- coding: utf8 -*-
# Switching pages:
# Sometimes the window contains several tabs and you need to switch between them.
# Selenium provides switch_to.window for this; the handle of the page you want to
# switch to can be found in driver.window_handles.
from selenium import webdriver
import time

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

# Open a Douban page in a new tab.
driver.execute_script("window.open('https://www.douban.com')")
# The driver still points at Baidu; to work on the Douban tab you have to switch.
print(driver.current_url)

# To switch pages you first need the window handle; every page the driver opens
# gets a handle, and they are all stored in driver.window_handles.
# print(driver.window_handles)
driver.switch_to.window(driver.window_handles[1])
print(driver.current_url)
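To get back to the original tab, close the new one and switch on its handle again; a short sketch continuing from the code above:

# Close the Douban tab that the driver currently points at...
driver.close()
# ...then switch back to the original Baidu tab.
driver.switch_to.window(driver.window_handles[0])
print(driver.current_url)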
9. Using a proxy
# -*- coding: utf8 -*-
# Setting a proxy IP:
# If you crawl a site too frequently, the server may detect the crawler and ban
# your IP address. In that case you can switch to a proxy IP.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://113.124.87.163:9999')
driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
driver.get('http://httpbin.org/ip')
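httpbin.org/ip echoes the origin IP of the request, so you can confirm the proxy is actually in use by reading the page body. A small sketch continuing from the code above (the free proxy listed there may well be offline by now):

import time

time.sleep(3)  # give the page a moment to load
# If the proxy works, this prints the proxy's address rather than your own IP.
print(driver.find_element_by_tag_name('body').text)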
10. Extras
# -*- coding: utf8 -*-
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

submitBtn = driver.find_element_by_id('su')
# print(type(submitBtn))                  # selenium.webdriver.remote.webelement.WebElement
print(submitBtn.get_attribute('value'))   # read an attribute of the element
driver.save_screenshot('baidu.png')       # save a screenshot of the page
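A WebElement exposes a few more commonly used properties besides get_attribute; a short illustrative sketch using the same button:

# Other frequently used WebElement properties (illustrative):
print(submitBtn.text)       # visible text of the element (empty for this button)
print(submitBtn.location)   # {'x': ..., 'y': ...} position on the page
print(submitBtn.size)       # {'height': ..., 'width': ...}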
11. Hands-on: scraping Lagou with requests
# -*- coding: utf8 -*-
import re

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie': 'JSESSIONID=ABAAABAAAGFABEFFA5F21EB50BF5A6DCE619C8EEA6CB14A; SEARCH_ID=1146364cc73d498abea7c5b4dde4c1e3; user_trace_token=20190417144437-71ba273c-c709-43be-ae40-d1c531c2a4d7; X_HTTP_TOKEN=42daf4b72327b2817743845551bf5e71415983ed09'
}
# Lagou has anti-crawling measures and the cookie changes on every visit;
# how to deal with that is covered later.


def request_list_page():
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    data = {
        'first': False,
        'pn': 1,
        'kd': 'python'
    }
    for x in range(1, 14):
        data['pn'] = x
        response = requests.post(url, headers=headers, data=data)
        # .json() automatically loads a JSON response into a dict.
        result = response.json()
        positions = result['content']['positionResult']['result']
        for position in positions:
            positionId = position['positionId']
            position_url = 'http://www.lagou.com/jobs/%s.html' % positionId
            parse_position_detail(position_url)
            break
        break


def parse_position_detail(url):
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    position_name = html.xpath('//span[@class="name"]/text()')[0]
    job_request_spans = html.xpath('//dd[@class="job_request"]//span')
    salary = job_request_spans[0].xpath('.//text()')[0].strip()
    city = job_request_spans[1].xpath('.//text()')[0].strip()
    city = re.sub(r'[\s/]', '', city)
    work_years = job_request_spans[2].xpath('.//text()')[0].strip()
    work_years = re.sub(r'[\s/]', '', work_years)
    education = job_request_spans[3].xpath('.//text()')[0].strip()
    education = re.sub(r'[\s/]', '', education)
    desc = ''.join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    position = {
        'position_name': position_name,
        'salary': salary,
        'city': city,
        'work_years': work_years,
        'education': education,
        'desc': desc,
    }
    print(position)


def main():
    request_list_page()


if __name__ == '__main__':
    main()
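In the code above the parsed position dict is only printed. A hedged sketch of persisting the results to a CSV file instead (the helper name and file name are made up; the field names follow the dict keys used above):

import csv

def save_positions(positions, filename='lagou_positions.csv'):
    # Write a list of position dicts (as built in parse_position_detail) to CSV.
    fieldnames = ['position_name', 'salary', 'city', 'work_years', 'education', 'desc']
    with open(filename, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(positions)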
12. Hands-on: scraping Lagou with Selenium
# -*- coding: utf8 -*-
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LagouSpider(object):
    driver_path = r'D:\chromedriver\chromedriver'

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # Wait until the pager has loaded. Do not add text() to this XPath,
            # or an exception is raised.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="pager_container"]/span[last()]'))
            )
            # driver.page_source contains the full rendered source, including
            # data loaded through Ajax requests.
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                # After one page is done, click "next page" and keep crawling.
                next_btn = self.driver.find_element_by_xpath(
                    '//div[@class="pager_container"]/span[last()]')
                # On the last page the "next" button cannot be clicked, so stop there.
                if 'pager_next_disabled' in next_btn.get_attribute('class'):
                    break
                else:
                    next_btn.click()
                    time.sleep(7)
            except Exception:
                print(source)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            # Request the detail pages of the current list page one by one,
            # throttling the request rate.
            self.request_detail_page(link)
            time.sleep(10)

    def request_detail_page(self, url):
        # Open the detail page in a new window...
        self.driver.execute_script("window.open('%s')" % url)
        # ...and switch the driver to that new window.
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, '//span[@class="name"]'))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # close() only closes the current page.
        self.driver.close()
        # Switch back to the job-list page.
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        job_request_spans = html.xpath('//dd[@class="job_request"]//span')
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r'[\s/]', '', city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r'[\s/]', '', education)
        desc = ''.join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
    print(spider.positions)