爬虫请求库之selenium
一:简介#
1:介绍
(1)selenium最初是测试工具
(2)爬虫使用该模块的原因是request无法操作js代码 而selenium可以操作js代码
(3)selenium本质是操作浏览器内核 完全模拟浏览器行为 例如 输入内容 点击等
(4)因为直接操作浏览器 我们无需考虑请求头等
2:支持的浏览器
from selenium import webdriver browser=webdriver.Chrome() browser=webdriver.Firefox() browser=webdriver.PhantomJS() browser=webdriver.Safari() browser=webdriver.Edge()
二:安装使用#
1:安装
pip3 install selenium
下载chromedriver.exe放到python安装路径的scripts目录中即可 国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/ 最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
#注意: selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver 下载链接:https://github.com/mozilla/geckodriver/releases
2:基本使用
from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
1、find_element_by_id 根据id找 2、find_element_by_link_text 根据链接名字找到控件(a标签的文字) 3、find_element_by_partial_link_text 根据链接名字找到控件(a标签的文字)模糊查询 4、find_element_by_tag_name 根据标签名 5、find_element_by_class_name 根据类名 6、find_element_by_name 根据属性名 例如 name = 'xxx' 7、find_element_by_css_selector 根据css选择器 8、find_element_by_xpath 根据xpath选择
3:显式等待与隐式等待的作用
(1)大部分网页可能都是由ajax + js开发的 加载需要一定的时间 当我们通过代码进行操作的时候 可能有的标签还没渲染出来
(2)通过设置等待时间让标签能够被加载出来
4:显式等待
(1)设置最大的等待时间
(2)如果指定查询的元素在规定时间内查找出来 便会执行下一行代码
(3)如果在规定时间内没有查询出指定的元素便会抛出异常TimeoutException
wait=WebDriverWait(browser,10) wait.until(EC.presence_of_element_located((By.ID,'content_left')))
5:隐式等待
(1)设置最大的等待时间
(2)如果在规定时间内完成页面加载 会执行下一步
(3)否则会一直等到时间结束才会执行下一步
from selenium import webdriver bro=webdriver.Chrome() bro.get("http://www.baidu.com") bro.implicitly_wait(10) # 表示等待所有 等待时间10s中
6:模拟百度进行登录案例

from selenium import webdriver import time def login(url,browser): login_button = browser.find_element_by_link_text('登录') # 查找登录标签 login_button.click() # 点击登录 time.sleep(1) login_type = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn') # 选择用户名密码登录的方式 login_type.click() input_username = browser.find_element_by_id('TANGRAM__PSP_10__userName') # 获取输入用户名框 input_username.send_keys("123") # 输入用户名 input_password = browser.find_element_by_id('TANGRAM__PSP_10__password') # 获取输入用户密码框 input_password.send_keys("123") # 输入用户密码 login_submit = browser.find_element_by_id('TANGRAM__PSP_10__submit') # 用户名密码输入成功 进行登录 login_submit.click() # 点击登录 cookie = browser.get_cookies() # 获取所有的cookies 如果获取单个cookie需要知道某个cookie的名称 browser.close() # 进行关闭当前浏览器 if __name__ == '__main__': browser = webdriver.Chrome() browser.implicitly_wait(10) # 等待时间10s url = browser.get('https://www.baidu.com') # 进行百度请求 login(url,browser)
三:xpath#

# A complete tour of lxml XPath selection against an in-memory HTML document.
doc=''' <html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a> </div> </body> </html> '''
from lxml import etree
html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 all nodes
a=html.xpath('//*')  # matches every tag in the document
# 2 a specific node (result is always a list)
# a=html.xpath('//head')
# 3 children vs descendants: "/" is direct child, "//" is any descendant
a=html.xpath('//div/a')
a=html.xpath('//body/a')   # empty: the <a> tags are not direct children of <body>
a=html.xpath('//body//a')
# 4 parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # NB: XPath positions start at 1, not 0
# equivalent spelling using the parent axis
a=html.xpath('//body//a[1]/parent::*')
# 5 attribute match
a=html.xpath('//body//a[@href="image1.html"]')
# 6 text extraction
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//body//a/text()')
# 7 attribute extraction
# a=html.xpath('//body//a/@href')
# again: positions start at 1 (not 0)
a=html.xpath('//body//a[2]/@href')
# 8 multi-valued attributes: an <a> with several classes will not match
#   an exact @class comparison, so use contains()
# a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 combining several attribute conditions
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 positional selection
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# the last one
a=html.xpath('//a[last()]/@href')
# positions before 3
a=html.xpath('//a[position()<3]/@href')
# counting back from the end
a=html.xpath('//a[last()-2]/@href')
# 11 axis selection
# ancestor: all ancestor nodes (here * selects every ancestor)
a=html.xpath('//a/ancestor::*')
# only the <div> ancestors
a=html.xpath('//a/ancestor::div')
# attribute axis: all attribute values of the node
a=html.xpath('//a[1]/attribute::*')
# child axis: direct children only
a=html.xpath('//a[1]/child::*')
# descendant axis: all descendants
a=html.xpath('//a[6]/descendant::*')
# following axis: everything after the node in document order
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling axis: later siblings at the same level
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]/text()')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)
四:获取元素属性#

from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() browser.get('https://www.amazon.cn/') wait=WebDriverWait(browser,10) wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer'))) tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img') #获取标签属性, print(tag.get_attribute('src')) #获取标签ID,位置,名称,大小(了解) print(tag.id) print(tag.location) print(tag.tag_name) print(tag.size) browser.close()
五:元素交互操作#

from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() browser.get('https://www.amazon.cn/') wait=WebDriverWait(browser,10) input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox'))) input_tag.send_keys('iphone 8') button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input') button.click() import time time.sleep(3) input_tag=browser.find_element_by_id('twotabsearchtextbox') input_tag.clear() #清空输入框 input_tag.send_keys('iphone7plus') button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input') button.click()

from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By # 按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys # 键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素 import time driver = webdriver.Chrome() driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') wait=WebDriverWait(driver,3) # driver.implicitly_wait(3) # 使用隐式等待 try: driver.switch_to.frame('iframeResult') ##切换到iframeResult sourse=driver.find_element_by_id('draggable') target=driver.find_element_by_id('droppable') ``` #方式一:基于同一个动作链串行执行 # actions=ActionChains(driver) #拿到动作链对象 # actions.drag_and_drop(sourse,target) #把动作放到动作链中,准备串行执行 # actions.perform() #方式二:不同的动作链,每次移动的位移都不同 ``` ActionChains(driver).click_and_hold(sourse).perform() distance=target.location['x']-sourse.location['x'] ``` track=0 while track < distance: ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform() track+=2 ActionChains(driver).release().perform() time.sleep(10) ``` finally: driver.close() Action Chains

from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 try: browser=webdriver.Chrome() browser.get('https://www.baidu.com') browser.execute_script('alert("hello world")') #打印警告 finally: browser.close() 在交互动作比较难实现的时候可以自己写JS(万能方法)
六:其他操作#

#cookies from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.zhihu.com/explore') print(browser.get_cookies()) browser.add_cookie({'k1':'xxx','k2':'yyy'}) print(browser.get_cookies()) # browser.delete_all_cookies()

from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.zhihu.com/explore') print(browser.get_cookies()) browser.add_cookie({'k1':'xxx','k2':'yyy'}) print(browser.get_cookies())

import time from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.baidu.com') browser.execute_script('window.open()') print(browser.window_handles) #获取所有的选项卡 browser.switch_to_window(browser.window_handles[1]) browser.get('https://www.taobao.com') time.sleep(10) browser.switch_to_window(browser.window_handles[0]) browser.get('https://www.sina.com.cn') browser.close()

from selenium import webdriver from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException try: browser=webdriver.Chrome() browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') browser.switch_to.frame('iframssseResult') except TimeoutException as e: print(e) except NoSuchFrameException as e: print(e) finally: browser.close()
七:爬取示例#

from selenium import webdriver from selenium.webdriver.common.keys import Keys # 键盘按键操作 import time import os, requests, hashlib path = str(time.time()) def get_good(bro): goods_list = bro.find_elements_by_class_name('gl-item') for good in goods_list: good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href') # 获取商品详情 good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src') # 获取商品图片 if not good_img_url: # 此只有当鼠标滑动到商品的时候才会进行加载出来 good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img') good_price = good.find_element_by_css_selector('.p-price i').text good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title') good_comment = good.find_element_by_css_selector('.p-commit strong a').text response = requests.get(good_img_url) good_name = good_brief.split(' ')[0][:5] md5 = hashlib.md5() md5.update(good_img_url.encode('utf-8')) file_name = '%s%s.jpg' % (good_name, md5.hexdigest()) photo_path = 'photo' if not os.path.exists(photo_path): os.mkdir(photo_path) file_path = os.path.join(photo_path, file_name) with open(file_path, 'wb') as f: print("%s下载之中:" % good_brief) for line in response.iter_content(): f.write(line) next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em") # 点击下一页 继续获取商品 time.sleep(1) next_page.click() time.sleep(1) get_good(bro) # 循环调用函数 if __name__ == '__main__': name = input('商品名>>:') bro = webdriver.Chrome() bro.get("https://www.jd.com") bro.implicitly_wait(10) search_input = bro.find_element_by_id('key') search_input.send_keys(name) search_input.send_keys(Keys.ENTER) try: print('商品获取中') get_good(bro) except Exception as e: print("结束") finally: bro.close()

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time
import pymysql


def get_goods(bro):
    """Return the list of product elements on the current result page."""
    return bro.find_elements_by_class_name('gl-item')


def get_goods_info(goods_lists):
    """Yield (detail_url, img_url, price, comment, brief) for EVERY product.

    FIX: the original returned from inside the for-loop, so only one
    product per page was ever extracted and stored; generating one tuple
    per product fixes that.
    """
    for good in goods_lists:
        good_detail_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')  # product detail url
        good_img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')  # product image
        if not good_img_url:
            # lazy-loaded image: `src` appears only on hover, use data-lazy-img
            good_img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        good_price = good.find_element_by_css_selector('.p-price i').text
        good_brief = good.find_element_by_css_selector('.p-name a ').get_attribute('title')
        good_comment = good.find_element_by_css_selector('.p-commit strong a').text
        yield good_detail_url, good_img_url, good_price, good_comment, good_brief


def write_database(good_detail_url, good_img_url, good_price, good_comment, good_brief):
    """Insert one product row; commit on success, roll back on error."""
    db = pymysql.connect(host="localhost", user="root", password="123", db="syl", port=3306)
    cur = db.cursor()
    # FIX: parameterized query instead of str.format/repr — lets the driver
    # do the quoting and avoids SQL injection.
    sql_insert = """insert into goods(good_detail,good_image,good_price,good_comment,good_brief)
                    values(%s,%s,%s,%s,%s)"""
    try:
        cur.execute(sql_insert, (good_detail_url, good_img_url, good_price, good_comment, good_brief))
        db.commit()
    except Exception as e:
        # roll back the failed insert
        print(e)
        db.rollback()
    finally:
        db.close()


def next_get_good(bro):
    """Click through to the next result page.

    FIX: takes the driver explicitly — the original silently read the
    global `bro`, which only worked when run as a script.
    """
    next_page = bro.find_element_by_css_selector(".page .p-num .pn-next em")
    time.sleep(1)
    next_page.click()
    time.sleep(1)
    bro.implicitly_wait(10)


def main(bro):
    """Scrape page after page until the next-page click fails (last page).

    FIX: iterative loop replaces the original per-page recursion
    (main -> next_get_good -> main), which would eventually exhaust the
    recursion limit.
    """
    while True:
        goods_lists = get_goods(bro)
        for good_info in get_goods_info(goods_lists):
            write_database(*good_info)
        next_get_good(bro)


if __name__ == '__main__':
    name = input('商品名>>:')
    bro = webdriver.Chrome()
    bro.implicitly_wait(10)
    bro.get("https://www.jd.com")
    search_input = bro.find_element_by_id('key')
    search_input.send_keys(name)
    search_input.send_keys(Keys.ENTER)
    try:
        main(bro)
    except Exception as e:
        print(e)
        print("结束")
    finally:
        bro.close()

from selenium import webdriver import requests import os import hashlib path = 'photo' def get_url(base_url): browser.get(base_url) browser.implicitly_wait(10) def get_image_url(): images_list = browser.find_elements_by_css_selector('.goods-item .figure-img img ') yield images_list def get_image(images): image = images.get_attribute('src') image_title = images.get_attribute('alt') yield image, image_title def download_image(image, image_title): if not os.path.exists(path): # 判断存储路径是否存在 os.mkdir(path) md5 = hashlib.md5() md5.update(image_title.encode('utf-8')) file_name = '%s%s.jpg' % (image_title, md5.hexdigest()) # 防止文件名重复 file_path = os.path.join(path, file_name) # 拼接文件路径 response = requests.get(image) # 请求图片数据流 with open(file_path, 'wb') as f: print("%s下载之中:" % image_title) for line in response.iter_content(): f.write(line) def main(): for i in range(page_num): base_url = 'https://www.plmm.com.cn/tags-199-%s.html' % i get_url(base_url) images_list = list(get_image_url())[0] for images in images_list: images_detail = list(get_image(images))[0] image_detail, image_title = images_detail download_image(image_detail, image_title) if __name__ == '__main__': request_url = 'https://www.plmm.com.cn/tags-199-0.html' browser = webdriver.Chrome() browser.implicitly_wait(10) browser.get(request_url) page = browser.find_elements_by_class_name('page-num') # 获取所有的页面 page_num = len(page) + 1 # 通过len判断有多少页面 因为首页没有page-num +1补上首页 try: main() except Exception as e: print(e) finally: print('爬取结束') browser.close()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 一起来玩mcp_server_sqlite,让AI帮你做增删改查!!