Selenium

1. Introduction

Selenium can be regarded as one of the best tools for getting around anti-scraping measures: it is essentially the same as visiting the site with a real browser, so it can load dynamically rendered data and spares you from handling cookies manually. The major downside is efficiency, so Selenium is best reserved for sites that place heavy restrictions on ordinary crawlers.

2. Basic usage

#  -*-coding:utf8 -*-

#selenium + chromedriver for fetching dynamically loaded data
#selenium acts like a robot: it can simulate human actions in a browser, such as clicking, filling in data, deleting cookies, etc.
#chromedriver is the driver program that controls the Chrome browser; selenium needs it to drive the browser. Each browser has its own driver:
#1.Chrome:https://sites.google.com/a/chromium.org/chromedriver/downloads
#2.Firefox:https://github.com/mozilla/geckodriver/releases
#3.Edge:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver
#4.Safari:https://webkit.org/blog/6900/webdriver-support-in-safari-10/

#Install selenium:
#pip3 install selenium

#Install chromedriver: after downloading, put it in a plain-English path that does not require special permissions


from selenium import webdriver

driver_path=r'D:\chromedriver\chromedriver.exe'
driver=webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')
# print(driver.page_source)

import time
time.sleep(5)
#Closing
# driver.close()   #close only the current page/tab
# driver.quit()   #quit the entire browser
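Note: the snippets in this post use the Selenium 3 style webdriver.Chrome(executable_path=...). In newer Selenium 4 releases executable_path is no longer accepted and the driver path is passed through a Service object (recent versions can reportedly even locate the driver for you). A minimal sketch, assuming the same local chromedriver path as above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_path = r'D:\chromedriver\chromedriver.exe'   # same local path as above
service = Service(executable_path=driver_path)      # Selenium 4 way of passing the driver path
driver = webdriver.Chrome(service=service)
driver.get('https://www.baidu.com')
print(driver.title)
driver.quit()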

3. Finding elements

#  -*-coding:utf8 -*-

from selenium import webdriver

driver_path=r'D:\chromedriver\chromedriver.exe'
driver=webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')


from selenium.webdriver.common.by import By
#Locating elements
'''
1.find_element_by_id: find an element by its id
inputTag=driver.find_element_by_id('kw')
inputTag=driver.find_element(By.ID,'kw')
2.find_element_by_class_name: find an element by its class name
submitTag=driver.find_element_by_class_name('su')
submitTag=driver.find_element(By.CLASS_NAME,'su')
3.find_element_by_name: find an element by the value of its name attribute
submitTag=driver.find_element(By.NAME,'su')
submitTag=driver.find_element_by_name('su')
4.find_element_by_tag_name: find an element by its tag name
submitTag=driver.find_element_by_tag_name('div')
submitTag=driver.find_element(By.TAG_NAME,'div')
5.find_element_by_xpath: find an element with XPath syntax
submitTag=driver.find_element_by_xpath('//div')
submitTag=driver.find_element(By.XPATH,'//div')
6.find_element_by_css_selector: find an element with a CSS selector
submitTag=driver.find_element(By.CSS_SELECTOR,'div')
submitTag=driver.find_element_by_css_selector('div')

Each method comes in a find_element_by_ and a find_elements_by_ flavour: the former returns a single element, the latter a list of all matches
'''
# inputTag=driver.find_element_by_id('kw')
# inputTag=driver.find_element_by_name('wd')
# inputTag=driver.find_element_by_class_name('s_ipt')
# inputTag=driver.find_element_by_xpath('//input[@id="kw"]')
# inputTag=driver.find_element_by_css_selector('.quickdelete-wrap > input')
inputTag=driver.find_elements_by_css_selector('.quickdelete-wrap > input')[0]
inputTag.send_keys('python')

#1. If you only want to parse data out of the page, it is recommended to hand the page source to lxml,
#because lxml is implemented in C underneath and parses much faster.
#2. If you want to interact with elements, e.g. type text into an input box or click a button,
#then you must use the element-finding methods that selenium provides.

from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By

#Locate elements using the By class
driver_path=r'D:\chromedriver\chromedriver'
driver=webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

inputTag=driver.find_element(By.ID,'kw')
inputTag.send_keys('python')

# get_attribute('innerHTML') returns the HTML inside an element
# get_attribute('outerHTML') returns the element's own tag plus its inner HTML
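As mentioned in the comments above, when you only need to extract data it is usually faster to hand driver.page_source to lxml. A minimal sketch combining the two (the XPath here is only an illustration and may not match Baidu's current markup):

from selenium import webdriver
from lxml import etree

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver.exe')
driver.get('https://www.baidu.com')

# use selenium only to load the page, then parse the static snapshot with lxml
html = etree.HTML(driver.page_source)
# illustrative XPath: grab the text of every link on the page
for text in html.xpath('//a/text()'):
    print(text.strip())
driver.quit()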

4. Working with form elements

#  -*-coding:utf8 -*-

# Common form elements:
# button
# checkbox
# select (drop-down list)
# input

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
# driver.get('https://www.baidu.com')
#
# inputTag = driver.find_element(By.ID, 'kw')
# #type the text "python"
# inputTag.send_keys('python')
# time.sleep(5)
# #clear what was typed
# inputTag.clear()

#checkbox: tick it by clicking
# driver.get('https://www.douban.com')
# rememberBtn=driver.find_element_by_name('remember')
# rememberBtn.click()


#select drop-downs
# driver.get('http://www.dobai.cn/')
from selenium.webdriver.support.ui import Select
#to operate a <select>, wrap the element in the Select class first
# selectBtn=Select(driver.find_element_by_name('jumpMenu'))
#select by index
# selectBtn.select_by_index(1)
#select by value
# selectBtn.select_by_value('http://m.95xiu.com/')
#select by visible text
# selectBtn.select_by_visible_text('95秀客户端')
#deselect all selected options (only works on multi-selects)
# selectBtn.deselect_all()



#click events on buttons
driver.get('https://www.baidu.com')

inputTag=driver.find_element_by_id('kw')
inputTag.send_keys('python')
submitTag=driver.find_element_by_id('su')
time.sleep(5)
submitTag.click()
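If the demo site used above is unavailable, the Select workflow can still be tried against a tiny inline page. This is only a self-contained sketch; the data: URL page and its option values are made up for illustration:

from selenium import webdriver
from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver.exe')
# load a throwaway page containing a <select>, just to exercise the Select API
driver.get("data:text/html,<select id='city'>"
           "<option value='bj'>Beijing</option>"
           "<option value='sh'>Shanghai</option>"
           "<option value='gz'>Guangzhou</option></select>")

selectTag = Select(driver.find_element_by_id('city'))
selectTag.select_by_index(1)                  # -> Shanghai
selectTag.select_by_value('gz')               # -> Guangzhou
selectTag.select_by_visible_text('Beijing')   # -> Beijing
print(selectTag.first_selected_option.text)
driver.quit()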

5. Action chains

#  -*-coding:utf8 -*-

#Action chains
#Sometimes an operation on a page takes many steps; the mouse action chain class ActionChains can string them together.
#Action chains are not used very often in scraping.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

inputTag=driver.find_element_by_id('kw')
submitBtn=driver.find_element_by_id('su')

actions=ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag,'python')
actions.move_to_element(submitBtn)
actions.click()
actions.perform()

#There are many more mouse-related operations, for example:
#click_and_hold(element): press the mouse button without releasing it
#context_click(element): right-click
#double_click(element): double-click
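ActionChains calls can also be chained fluently, since every call just queues an action until perform() is invoked. A minimal sketch using the same Baidu elements as above:

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver.exe')
driver.get('https://www.baidu.com')

inputTag = driver.find_element_by_id('kw')
submitBtn = driver.find_element_by_id('su')

# each call only queues an action; nothing happens until perform()
ActionChains(driver) \
    .move_to_element(inputTag) \
    .send_keys_to_element(inputTag, 'selenium') \
    .move_to_element(submitBtn) \
    .click(submitBtn) \
    .perform()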

6. Cookie operations

#  -*-coding:utf8 -*-

#Cookie operations
#1. Get all cookies:
# driver.get_cookies()
#2. Get a cookie's value by its name:
# value=driver.get_cookie(key)
#3. Delete all cookies:
# driver.delete_all_cookies()
#4. Delete a single cookie:
# driver.delete_cookie(key)

import time
from selenium import webdriver
driver_path=r'D:\chromedriver\chromedriver'
driver=webdriver.Chrome(executable_path=driver_path)
#this only returns the cookies of the page currently loaded (https://www.baidu.com here); it cannot see cookies of other sites
driver.get('https://www.baidu.com')

# for cookie in driver.get_cookies():
#     print(cookie)

print(driver.get_cookie('PSTM'))

# driver.delete_cookie('PSTM')
# print(driver.get_cookie('PSTM'))

#delete all cookies
# driver.delete_all_cookies()
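The reverse direction also works: driver.add_cookie() injects a cookie, which is handy for reusing a saved login session. A sketch, where the cookie name and value are made-up placeholders standing in for a real saved session cookie (the browser must already be on the cookie's domain before adding it):

from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver')
# open the target domain first, otherwise Chrome rejects the cookie
driver.get('https://www.baidu.com')

# 'my_session' / 'abc123' are placeholder values
driver.add_cookie({'name': 'my_session', 'value': 'abc123'})
driver.refresh()   # reload so the new cookie is actually sent with the request
print(driver.get_cookie('my_session'))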

7. Implicit and explicit waits

#  -*-coding:utf8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)

# Page waits:
# More and more pages load data with Ajax, so the program cannot know in advance when a given element has finished loading.
# If the page takes longer than expected, a DOM element may not exist yet when the code tries to use that WebElement,
# and an exception (NoSuchElementException) is raised. To deal with this, Selenium provides
# two kinds of waits: implicit waits and explicit waits.

# 1. Implicit wait: call driver.implicitly_wait(seconds). Before giving up on an element that is not yet available, the driver keeps retrying for up to that many seconds.
driver.get('https://www.douban.com/')
#without a wait, the lookup fails immediately
# driver.find_element_by_id('shdiasjdsdas')
#with an implicit wait configured
# driver.implicitly_wait(20)
# driver.find_element_by_id('shdiasjdsdas')   #only raises after waiting up to 20s


# 2. Explicit wait: only fetch the element once a given condition holds, with a maximum
# timeout; if the condition is still not met when the timeout expires, an exception is raised.
# Explicit waits are the smarter of the two.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#until() takes a condition
# WebDriverWait(driver,10).until(
#     #wait for an element to be present; the locator is passed as a single tuple argument
#     EC.presence_of_element_located((By.ID,'asdasdasdasda'))
# )

#if the element can be found sooner, the wait returns immediately instead of using the full 10s
element=WebDriverWait(driver,10).until(
    EC.presence_of_element_located((By.ID,'anony-book'))
)
print(element)
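expected_conditions offers many more ready-made conditions besides presence_of_element_located, and until() also accepts any callable. A small sketch (the douban element id from above is reused here and is purely illustrative):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver')
driver.get('https://www.douban.com/')

# wait until the element is not only present but also visible and clickable
btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, 'anony-book'))
)

# until() also accepts any callable: here, wait for the document to finish loading
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script('return document.readyState') == 'complete'
)
print(btn)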

8. Opening and switching between windows

#  -*-coding:utf8 -*-

#Switching pages:
#A browser window often contains several tabs, and you will need to switch between them. selenium provides
#driver.switch_to.window() for this; the handle of the page you want can be found in driver.window_handles.

from selenium import webdriver
import time

driver_path = r'D:\chromedriver\chromedriver'
driver = webdriver.Chrome(executable_path=driver_path)

driver.get('https://www.baidu.com')
#open a second tab with douban
driver.execute_script("window.open('https://www.douban.com')")
#the current URL still shows baidu, because the driver is still focused on the first tab; to work on the douban tab we have to switch
print(driver.current_url)

#switch pages with driver.switch_to.window()
#to switch, you first need the window handle: every page the driver opens gets a handle,
#and all of them are stored in driver.window_handles
# print(driver.window_handles)
driver.switch_to.window(driver.window_handles[1])
print(driver.current_url)
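When more than two tabs are open, indexing window_handles gets fragile. A common pattern is to loop over the handles and pick the tab by its URL or title. A sketch that continues directly from the code above (driver already has both tabs open):

# switch to whichever tab currently shows douban, regardless of its position
for handle in driver.window_handles:
    driver.switch_to.window(handle)
    if 'douban.com' in driver.current_url:
        break
print(driver.current_url)

# and back to the original (first) tab
driver.switch_to.window(driver.window_handles[0])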

9. Using a proxy

#  -*-coding:utf8 -*-

#Setting a proxy IP:
#If you hit a site too frequently, the server may detect the crawler and block your IP address. In that case you can switch to a proxy IP.
from selenium import webdriver

options=webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://113.124.87.163:9999')
driver_path = r'D:\chromedriver\chromedriver'
driver=webdriver.Chrome(executable_path=driver_path,chrome_options=options)
driver.get('http://httpbin.org/ip')
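ChromeOptions can carry other useful flags at the same time, for example running headless or overriding the User-Agent. A sketch where the proxy address and the UA string are placeholders you would substitute yourself:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')                                    # run without opening a window
options.add_argument('--proxy-server=http://127.0.0.1:8888')          # placeholder proxy address
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)')  # placeholder UA

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver',
                          chrome_options=options)
driver.get('http://httpbin.org/ip')
print(driver.page_source)   # should show the proxy's IP instead of your own
driver.quit()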

10. Extras

#  -*-coding:utf8 -*-

from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
driver_path = r'D:\chromedriver\chromedriver'
driver=webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com')

submitBtn=driver.find_element_by_id('su')
# print(type(submitBtn))
print(submitBtn.get_attribute('value'))
driver.save_screenshot('baidu.png')   #save a screenshot of the current page
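driver.execute_script() is also worth knowing: it runs arbitrary JavaScript in the page, which is the usual way to scroll a lazily-loading page so more content appears before you grab page_source. A short self-contained sketch:

import time
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver')
driver.get('https://www.baidu.com')

# scroll to the bottom a few times so lazily-loaded content gets a chance to appear
for _ in range(3):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)

# execute_script can also return values from the page
print(driver.execute_script('return document.title'))
driver.quit()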

11. Hands-on: scraping Lagou with requests

#  -*-coding:utf8 -*-
import re
import requests
import time
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Cookie':'JSESSIONID=ABAAABAAAGFABEFFA5F21EB50BF5A6DCE619C8EEA6CB14A; SEARCH_ID=1146364cc73d498abea7c5b4dde4c1e3; user_trace_token=20190417144437-71ba273c-c709-43be-ae40-d1c531c2a4d7; X_HTTP_TOKEN=42daf4b72327b2817743845551bf5e71415983ed09'
}

#Lagou has anti-scraping measures and the cookies change on every visit; a common workaround is sketched after this code block
def request_list_page():
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    data = {
        'first': False,
        'pn': 1,
        'kd': 'python'
    }
    for x in range(1, 14):
        data['pn'] = x
        response = requests.post(url, headers=headers, data=data)
        result = response.json()
        positions = result['content']['positionResult']['result']
        for position in positions:
            positionId = position['positionId']
            position_url = 'http://www.lagou.com/jobs/%s.html' % positionId
            parse_position_detail(position_url)
            break
        break
        # print(response.json())
        # the .json() method automatically loads the body into a dict if the response is JSON


def parse_position_detail(url):
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    position_name = html.xpath('//span[@class="name"]/text()')[0]
    job_request_spans = html.xpath('//dd[@class="job_request"]//span')
    salary_span = job_request_spans[0]
    salary = salary_span.xpath('.//text()')[0].strip()
    city = job_request_spans[1].xpath('.//text()')[0].strip()
    city = re.sub(r'[\s/]', '', city, flags=re.S)
    work_years = job_request_spans[2].xpath('.//text()')[0].strip()
    work_years = re.sub(r'[\s/]', '', work_years, flags=re.S)
    education = job_request_spans[3].xpath('.//text()')[0].strip()
    education = re.sub(r'[\s/]', '', education, flags=re.S)
    desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    position={
        'position_name':position_name,
        'salary':salary,
        'city':city,
        'work_years':work_years,
        'education':education,
        'desc':desc,
    }
    print(position)

def main():
    request_list_page()


if __name__ == '__main__':
    main()
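As noted above, Lagou ties the Ajax endpoint to cookies issued when the list page is opened, so a bare POST with hard-coded cookies stops working quickly. A commonly used workaround (a sketch only, not verified against the current site) is to let a requests.Session visit the HTML list page first and then reuse the session cookies for the POST:

import requests

def request_list_page_with_session():
    list_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        'Referer': list_url,
    }
    data = {'first': 'false', 'pn': 1, 'kd': 'python'}

    session = requests.Session()
    # visiting the list page first makes the server set fresh cookies on the session
    session.get(list_url, headers=headers)
    response = session.post(ajax_url, headers=headers, data=data)
    print(response.json())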

12. Hands-on: scraping Lagou with Selenium

#  -*-coding:utf8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
from selenium.webdriver.common.by import By
import re
import time


class LagouSpider(object):
    driver_path = r'D:\chromedriver\chromedriver'

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # wait for the pager to be present before grabbing the source,
            # so the Ajax-loaded job list is already in the page
            WebDriverWait(driver=self.driver, timeout=10).until(
                # do not add text() to this XPath, otherwise it raises an exception
                EC.presence_of_element_located((By.XPATH, '//div[@class="pager_container"]/span[last()]'))
            )
            # self.driver.page_source contains the full rendered source, including data loaded via Ajax
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                # after one page is scraped, click "next page" and continue
                next_btn = self.driver.find_element_by_xpath('//div[@class="pager_container"]/span[last()]')
                # check whether "next page" can still be clicked: on the last page it is disabled, so stop there
                if "pager_next_disabled" in next_btn.get_attribute('class'):
                    break
                else:
                    next_btn.click()
                    time.sleep(7)
            except Exception:
                print(source)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            # request each position's detail page one by one,
            # sleeping in between to throttle the request rate
            self.request_detail_page(link)
            time.sleep(10)

    def request_detail_page(self, url):
        # open the detail page in a new tab
        self.driver.execute_script("window.open('%s')" % url)
        # switch the driver to the new tab
        self.driver.switch_to.window(self.driver.window_handles[1])
        # wait for the position name to be present before grabbing the source
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, '//span[@class="name"]'))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # close() only closes the current tab
        self.driver.close()
        # switch back to the job-list tab
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        job_request_spans = html.xpath('//dd[@class="job_request"]//span')
        salary_span = job_request_spans[0]
        salary = salary_span.xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r'[\s/]', '', city, flags=re.S)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years, flags=re.S)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r'[\s/]', '', education, flags=re.S)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name=html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name':company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
    print(spider.positions)
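Once the run finishes, the dictionaries collected in spider.positions can be persisted. A minimal sketch that dumps them to a JSON file (the file name is arbitrary):

import json

# assuming spider.run() has finished and spider.positions holds the scraped dicts
with open('lagou_positions.json', 'w', encoding='utf-8') as fp:
    json.dump(spider.positions, fp, ensure_ascii=False, indent=2)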

 
