selenium知识点
1. 导包
# Imports used throughout these notes (one statement per line; the fused
# single-line form is a SyntaxError).
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
2. 创建 webdriver
driver = webdriver.Chrome(executable_path=r'C:\python35\chromedriver.exe')
# executable_path: path where chromedriver.exe is stored
3. 操作
1. 常用操作
# Quick reference of frequently used WebDriver members. Calls are shown with
# empty parentheses as placeholders; supply the real arguments when using them.
driver.page_source                   # HTML of the loaded page (the rendered element tree)
driver.title                         # title of the requested page
driver.current_url                   # URL of the requested page
driver.close()                       # close the current browser window
driver.quit()                        # quit the browser entirely
driver.current_window_handle         # handle of the current window
driver.window_handles                # handles of all open windows
driver.switch_to.window()            # switch to another browser window (pass its handle)
driver.execute_script()              # run a JavaScript snippet (used when opening a new window)
driver.get_cookie()                  # get a cookie (pass the cookie name)
driver.find_element_by_class_name()  # find an element by its class
driver.find_element_by_xpath()       # find an element by XPath
4. 例子 -- 爬取拉勾网招聘信息
过程: 用 selenium 访问首页,然后用lxml解析首页上的每一个职位,获取职位的链接地址,然后再用selenium 访问该地址,用lxml提取职位详情的信息。
- get_attribute('class') # 获取标签的class属性值
-
self.driver.execute_script('window.open("https://www.baidu.com/")') # 打开一个新窗口 (请求详情页) 通过执行js脚本打开新的窗口
-
self.driver.switch_to.window(self.driver.window_handles[1]) # 将driver切换到新窗口(详情页窗口)
-
WebDriverWait(driver=self.driver, timeout=10).until(ec.presence_of_element_located((By.XPATH, '//span[@class="name"]'))) # 显式等待(以职位详情页出现职位标题为条件):在等待过程中如果出现要寻找的标签,则结束等待;如果一直没有出现,则最多等待10s,然后抛出 TimeoutException 异常。注意:这里只能定位某个标签,不能取标签里面的值,如这里的span标签中的值。
- 注意:在切换到新窗口后,如果想继续在原始窗口执行操作,需要再重新切回原始窗口,如这里的由详情页窗口切换到列表页窗口。
import csv
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait


class LaGouSpider:
    """Crawl Python job postings from lagou.com and append them to position.csv.

    The list page is loaded with Selenium and parsed with lxml to collect the
    link of every posting. Each link is then opened in a second browser
    window, parsed with lxml, written as one CSV row, and the window is closed
    again before moving on.
    """

    def __init__(self):
        # NOTE(review): `executable_path` is the Selenium 3 way of locating the
        # driver binary; newer Selenium releases expect a Service object
        # instead -- confirm against the installed version.
        self.driver = webdriver.Chrome(executable_path=r'C:\python35\chromedriver.exe')
        self.init_url = 'https://www.lagou.com/zhaopin/Python/'
        # Cleared once the "next page" button is found to be disabled.
        self.next_page = True
        self.position = None
        self.csv_header = ['职位名称', '职位要求', '薪水', '职位标签', '职位诱惑', '职位详情', '发布时间']
        # The CSV header is written once per spider instance.
        self.is_writer_header = False

    def request_list_page(self, url=None):
        """Process one list page and advance the browser to the next one.

        Args:
            url: Address to load first. When omitted, the page currently shown
                in the browser is processed (used after clicking "next").
        """
        if url:
            self.driver.get(url)
        html = etree.HTML(self.driver.page_source)
        # Collect the detail-page link of every posting on this list page.
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)  # throttle between detail requests
        # The last anchor of the pager is the "next page" button.
        next_btn = self.driver.find_element(
            By.XPATH, '//div[@class="pager_container"]/a[last()]')
        # get_attribute() returns None when the attribute is absent.
        btn_class = next_btn.get_attribute('class') or ''
        if 'page_no pager_next_disabled' not in btn_class:
            next_btn.click()
        else:
            self.next_page = False

    def request_detail_page(self, url):
        """Open *url* in a new window, parse it, then return to the list window.

        Args:
            url: Address of the posting's detail page.

        Raises:
            selenium.common.exceptions.TimeoutException: if the position title
                does not appear within 10 seconds. The detail window is still
                closed and focus is restored to the list window first.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script('window.open(arguments[0])', url)
        # Move the driver to the newly opened (detail page) window.
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            # Explicit wait: proceed as soon as the position title is present.
            WebDriverWait(driver=self.driver, timeout=10).until(
                ec.presence_of_element_located((By.XPATH, '//span[@class="name"]')))
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Always close the detail window and go back to the list window,
            # otherwise a failure here would leave the driver on the wrong one.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _clean_texts(texts):
        """Strip newlines, slashes and non-breaking spaces; drop empty items."""
        cleaned = (re.sub('\n|/|\\xa0', '', text).strip() for text in texts)
        return [text for text in cleaned if text != '']

    def parse_detail_page(self, source):
        """Extract the posting's fields from *source* and append them to the CSV.

        Fields that are missing from the page become ``None`` (single values)
        or an empty string (joined lists) instead of raising.

        Args:
            source: HTML text of a posting's detail page.
        """
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="name"]/text()')
        job_request = html.xpath('//dd[@class="job_request"]/p//text()')
        position_label = html.xpath('//ul[@class="position-label clearfix"]//text()')
        publish_time = html.xpath('//p[@class="publish_time"]//text()')
        job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')
        job_detail = html.xpath('//div[@class="job-detail"]/p/text()')

        # Clean the raw text nodes.
        position_name = position_name[0] if position_name else None
        job_request = self._clean_texts(job_request)
        # The first cleaned entry of the request paragraph is the salary.
        salary = job_request[0] if len(job_request) > 2 else None
        # Entries 2..4 are kept as the job requirements.
        job_request = ','.join(job_request[2:5])
        # Position labels.
        position_label = ','.join(self._clean_texts(position_label))
        # Job advantage.
        job_advantage = job_advantage[0] if job_advantage else None
        # Publish time: either "<n>天<x>" or "<hh>:<mm>" at the start of the text.
        print(publish_time)
        print(salary)
        matched = (re.match(r'\d+天\w|\d+:\d+', publish_time[0].strip())
                   if publish_time else None)
        publish_time = matched.group() if matched else None
        # Job detail paragraphs.
        job_detail = ','.join(a.strip() for a in job_detail)

        position = {
            '职位名称': position_name,
            '职位要求': job_request,
            '薪水': salary,
            '职位标签': position_label,
            '职位诱惑': job_advantage,
            '职位详情': job_detail,
            '发布时间': publish_time,
        }
        self.write_csv(position)

    def write_csv(self, position):
        """Append *position* as one row to position.csv.

        The header row is written before the first row written by this
        instance.

        Args:
            position: Mapping whose keys are the entries of ``self.csv_header``.
        """
        with open('position.csv', 'a+', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, self.csv_header)
            if not self.is_writer_header:
                writer.writeheader()
                self.is_writer_header = True
            writer.writerow(position)

    def run(self):
        """Crawl every list page, then quit the browser even if crawling fails."""
        try:
            self.request_list_page(self.init_url)
            while self.next_page:
                self.request_list_page()
        finally:
            self.driver.quit()


if __name__ == '__main__':
    lagou = LaGouSpider()
    lagou.run()
5. Selenium 的 WebDriverWait
https://blog.csdn.net/duzilonglove/article/details/78455051